author | Michal Marek <mmarek@suse.cz> | 2010-10-28 00:15:57 +0200
committer | Michal Marek <mmarek@suse.cz> | 2010-10-28 00:15:57 +0200
commit | b74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree | 6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /mm
parent | abb438526201c6a79949ad45375c051b6681c253 (diff)
parent | f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
Merge commit 'v2.6.36' into kbuild/misc
Update to be able to fix a recent change to scripts/basic/docproc.c
(commit eda603f).
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 28
-rw-r--r-- | mm/Makefile | 9
-rw-r--r-- | mm/backing-dev.c | 511
-rw-r--r-- | mm/bootmem.c | 218
-rw-r--r-- | mm/bounce.c | 3
-rw-r--r-- | mm/compaction.c | 606
-rw-r--r-- | mm/fadvise.c | 10
-rw-r--r-- | mm/failslab.c | 19
-rw-r--r-- | mm/filemap.c | 68
-rw-r--r-- | mm/filemap_xip.c | 3
-rw-r--r-- | mm/fremap.c | 9
-rw-r--r-- | mm/highmem.c | 11
-rw-r--r-- | mm/hugetlb.c | 142
-rw-r--r-- | mm/hwpoison-inject.c | 15
-rw-r--r-- | mm/init-mm.c | 6
-rw-r--r-- | mm/kmemleak.c | 101
-rw-r--r-- | mm/ksm.c | 106
-rw-r--r-- | mm/memblock.c | 541
-rw-r--r-- | mm/memcontrol.c | 2147
-rw-r--r-- | mm/memory-failure.c | 171
-rw-r--r-- | mm/memory.c | 309
-rw-r--r-- | mm/memory_hotplug.c | 62
-rw-r--r-- | mm/mempolicy.c | 469
-rw-r--r-- | mm/migrate.c | 116
-rw-r--r-- | mm/mincore.c | 265
-rw-r--r-- | mm/mlock.c | 62
-rw-r--r-- | mm/mmap.c | 325
-rw-r--r-- | mm/mmu_context.c | 4
-rw-r--r-- | mm/mmu_notifier.c | 1
-rw-r--r-- | mm/mmzone.c | 21
-rw-r--r-- | mm/mprotect.c | 1
-rw-r--r-- | mm/mremap.c | 10
-rw-r--r-- | mm/msync.c | 2
-rw-r--r-- | mm/nommu.c | 79
-rw-r--r-- | mm/oom_kill.c | 714
-rw-r--r-- | mm/page-writeback.c | 324
-rw-r--r-- | mm/page_alloc.c | 754
-rw-r--r-- | mm/page_cgroup.c | 49
-rw-r--r-- | mm/page_io.c | 3
-rw-r--r-- | mm/pagewalk.c | 47
-rw-r--r-- | mm/percpu-km.c | 104
-rw-r--r-- | mm/percpu-vm.c | 451
-rw-r--r-- | mm/percpu.c | 746
-rw-r--r-- | mm/percpu_up.c | 30
-rw-r--r-- | mm/quicklist.c | 1
-rw-r--r-- | mm/readahead.c | 9
-rw-r--r-- | mm/rmap.c | 432
-rw-r--r-- | mm/shmem.c | 274
-rw-r--r-- | mm/slab.c | 284
-rw-r--r-- | mm/slob.c | 22
-rw-r--r-- | mm/slub.c | 514
-rw-r--r-- | mm/sparse-vmemmap.c | 77
-rw-r--r-- | mm/sparse.c | 204
-rw-r--r-- | mm/swap.c | 4
-rw-r--r-- | mm/swap_state.c | 1
-rw-r--r-- | mm/swapfile.c | 102
-rw-r--r-- | mm/truncate.c | 39
-rw-r--r-- | mm/util.c | 32
-rw-r--r-- | mm/vmalloc.c | 123
-rw-r--r-- | mm/vmscan.c | 931
-rw-r--r-- | mm/vmstat.c | 295
61 files changed, 9318 insertions, 3698 deletions
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool +config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + def_bool y + depends on SPARSEMEM && X86_64 + config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE @@ -124,6 +128,9 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. +config HAVE_MEMBLOCK + boolean + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" @@ -168,17 +175,28 @@ config SPLIT_PTLOCK_CPUS default "4" # +# support for memory compaction +config COMPACTION + bool "Allow for memory compaction" + select MIGRATION + depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + help + Allows the compaction of memory for the allocation of huge pages. + +# # support for page migration # config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION help Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful for - example on NUMA systems to put pages nearer to the processors accessing - the page. + while the virtual addresses are not changed. This is useful in + two situations. The first is on NUMA systems to put pages nearer + to the processors accessing. The second is when allocating huge + pages as migration can relocate pages to satisfy a huge page + allocation instead of reclaiming. config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -195,7 +213,7 @@ config BOUNCE config NR_QUICK int depends on QUICKLIST - default "2" if SUPERH || AVR32 + default "2" if AVR32 default "1" config VIRT_TO_BUS diff --git a/mm/Makefile b/mm/Makefile index 7a68d2a..34b2546 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ $(mmu-y) obj-y += init-mm.o +obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o + obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HAS_DMA) += dmapool.o @@ -23,6 +25,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o @@ -33,7 +36,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -obj-$(CONFIG_SMP) += percpu.o +ifdef CONFIG_SMP +obj-y += percpu.o +else +obj-y += percpu_up.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e8ca03..65d4204 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,6 +10,9 @@ #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> +#include <trace/events/writeback.h> + +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -25,6 +28,12 @@ struct backing_dev_info default_backing_dev_info = { }; EXPORT_SYMBOL_GPL(default_backing_dev_info); +struct backing_dev_info 
noop_backing_dev_info = { + .name = "noop", + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); + static struct class *bdi_class; /* @@ -41,9 +50,6 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); -static void arm_supers_timer(void); - -static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -59,31 +65,25 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb; + struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; struct inode *inode; - /* - * inode lock is enough here, the bdi->wb_list is protected by - * RCU on the reader side - */ nr_wb = nr_dirty = nr_io = nr_more_io = 0; spin_lock(&inode_lock); - list_for_each_entry(wb, &bdi->wb_list, list) { - nr_wb++; - list_for_each_entry(inode, &wb->b_dirty, i_list) - nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_list) - nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_list) - nr_more_io++; - } + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; spin_unlock(&inode_lock); - get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -92,21 +92,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" "bdi_list: %8u\n" - "state: %8lx\n" - "wb_mask: %8lx\n" - "wb_list: %8u\n" - "wb_cnt: %8u\n", + "state: %8lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, - !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, - !list_empty(&bdi->wb_list), bdi->wb_cnt); + K(background_thresh), nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state); #undef K return 0; @@ -227,6 +222,9 @@ static struct device_attribute bdi_dev_attrs[] = { static __init int bdi_class_init(void) { bdi_class = class_create(THIS_MODULE, "bdi"); + if (IS_ERR(bdi_class)) + return PTR_ERR(bdi_class); + bdi_class->dev_attrs = bdi_dev_attrs; bdi_debug_init(); return 0; @@ -240,89 +238,18 @@ static int __init default_bdi_init(void) sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); BUG_ON(IS_ERR(sync_supers_tsk)); - init_timer(&sync_supers_timer); setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); - arm_supers_timer(); + bdi_arm_supers_timer(); err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); + err = bdi_init(&noop_backing_dev_info); return err; } subsys_initcall(default_bdi_init); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) -{ - memset(wb, 0, sizeof(*wb)); - - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - 
INIT_LIST_HEAD(&wb->b_more_io); -} - -static void bdi_task_init(struct backing_dev_info *bdi, - struct bdi_writeback *wb) -{ - struct task_struct *tsk = current; - - spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&wb->list, &bdi->wb_list); - spin_unlock(&bdi->wb_lock); - - tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(tsk, 0); -} - -static int bdi_start_fn(void *ptr) -{ - struct bdi_writeback *wb = ptr; - struct backing_dev_info *bdi = wb->bdi; - int ret; - - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - - bdi_task_init(bdi, wb); - - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - - ret = bdi_writeback_task(wb); - - /* - * Remove us from the list - */ - spin_lock(&bdi->wb_lock); - list_del_rcu(&wb->list); - spin_unlock(&bdi->wb_lock); - - /* - * Flush any work that raced with us exiting. No new work - * will be added, since this bdi isn't discoverable anymore. - */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); - - wb->task = NULL; - return ret; -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -331,21 +258,20 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) static void bdi_flush_io(struct backing_dev_info *bdi) { struct writeback_control wbc = { - .bdi = bdi, .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .range_cyclic = 1, .nr_to_write = 1024, }; - writeback_inodes_wbc(&wbc); + writeback_inodes_wb(&bdi->wb, &wbc); } /* - * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback tasks individually. + * bdi writeback thread individually. */ static int bdi_sync_supers(void *unused) { @@ -364,10 +290,13 @@ static int bdi_sync_supers(void *unused) return 0; } -static void arm_supers_timer(void) +void bdi_arm_supers_timer(void) { unsigned long next; + if (!dirty_writeback_interval) + return; + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; mod_timer(&sync_supers_timer, round_jiffies_up(next)); } @@ -375,142 +304,202 @@ static void arm_supers_timer(void) static void sync_supers_timer_fn(unsigned long unused) { wake_up_process(sync_supers_tsk); - arm_supers_timer(); + bdi_arm_supers_timer(); } -static int bdi_forker_task(void *ptr) +static void wakeup_timer_fn(unsigned long data) { - struct bdi_writeback *me = ptr; - - bdi_task_init(me->bdi, me); - - for (;;) { - struct backing_dev_info *bdi, *tmp; - struct bdi_writeback *wb; + struct backing_dev_info *bdi = (struct backing_dev_info *)data; + spin_lock_bh(&bdi->wb_lock); + if (bdi->wb.task) { + trace_writeback_wake_thread(bdi); + wake_up_process(bdi->wb.task); + } else { /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. 
*/ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) - wb_do_writeback(me, 0); + trace_writeback_wake_forker_thread(bdi); + wake_up_process(default_backing_dev_info.wb.task); + } + spin_unlock_bh(&bdi->wb_lock); +} - spin_lock_bh(&bdi_lock); +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; - /* - * Check if any existing bdi's have dirty data without - * a thread registered. If so, set that up. - */ - list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { - if (bdi->wb.task) - continue; - if (list_empty(&bdi->work_list) && - !bdi_has_dirty_io(bdi)) - continue; + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); +} - bdi_add_default_flusher_task(bdi); - } +/* + * Calculate the longest interval (jiffies) bdi threads are allowed to be + * inactive. + */ +static unsigned long bdi_longest_inactive(void) +{ + unsigned long interval; - set_current_state(TASK_INTERRUPTIBLE); + interval = msecs_to_jiffies(dirty_writeback_interval * 10); + return max(5UL * 60 * HZ, interval); +} - if (list_empty(&bdi_pending_list)) { - unsigned long wait; +static int bdi_forker_thread(void *ptr) +{ + struct bdi_writeback *me = ptr; - spin_unlock_bh(&bdi_lock); - wait = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout(wait); - try_to_freeze(); - continue; - } + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); - __set_current_state(TASK_RUNNING); + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); - /* - * This is our real job - check for pending entries in - * bdi_pending_list, and create the tasks that got added - */ - bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, - bdi_list); - list_del_init(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); + for (;;) { + struct task_struct *task = NULL; + struct backing_dev_info *bdi; + enum { + NO_ACTION, /* Nothing to do */ + FORK_THREAD, /* Fork bdi thread */ + KILL_THREAD, /* Kill inactive bdi thread */ + } action = NO_ACTION; - wb = &bdi->wb; - wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", - dev_name(bdi->dev)); /* - * If task creation fails, then readd the bdi to - * the pending list and force writeout of the bdi - * from this forker thread. That will free some memory - * and we can try again. + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info */ - if (IS_ERR(wb->task)) { - wb->task = NULL; - - /* - * Add this 'bdi' to the back, so we get - * a chance to flush other bdi's to free - * memory. 
- */ - spin_lock_bh(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock_bh(&bdi_lock); - - bdi_flush_io(bdi); + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { + del_timer(&me->wakeup_timer); + wb_do_writeback(me, 0); } - } - return 0; -} + spin_lock_bh(&bdi_lock); + set_current_state(TASK_INTERRUPTIBLE); -static void bdi_add_to_pending(struct rcu_head *head) -{ - struct backing_dev_info *bdi; + list_for_each_entry(bdi, &bdi_list, bdi_list) { + bool have_dirty_io; - bdi = container_of(head, struct backing_dev_info, rcu_head); - INIT_LIST_HEAD(&bdi->bdi_list); + if (!bdi_cap_writeback_dirty(bdi) || + bdi_cap_flush_forker(bdi)) + continue; - spin_lock(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock(&bdi_lock); + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi %p/%s is not registered!\n", bdi, bdi->name); - /* - * We are now on the pending list, wake up bdi_forker_task() - * to finish the job and add us back to the active bdi_list - */ - wake_up_process(default_backing_dev_info.wb.task); -} + have_dirty_io = !list_empty(&bdi->work_list) || + wb_has_dirty_io(&bdi->wb); -/* - * Add the default flusher task that gets created for any bdi - * that has dirty data pending writeout - */ -void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) -{ - if (!bdi_cap_writeback_dirty(bdi)) - return; + /* + * If the bdi has work to do, but the thread does not + * exist - create it. + */ + if (!bdi->wb.task && have_dirty_io) { + /* + * Set the pending bit - if someone will try to + * unregister this bdi - it'll wait on this bit. + */ + set_bit(BDI_pending, &bdi->state); + action = FORK_THREAD; + break; + } + + spin_lock(&bdi->wb_lock); - if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { - printk(KERN_ERR "bdi %p/%s is not registered!\n", - bdi, bdi->name); - return; - } + /* + * If there is no work to do and the bdi thread was + * inactive long enough - kill it. The wb_lock is taken + * to make sure no-one adds more work to this bdi and + * wakes the bdi thread up. + */ + if (bdi->wb.task && !have_dirty_io && + time_after(jiffies, bdi->wb.last_active + + bdi_longest_inactive())) { + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock(&bdi->wb_lock); + set_bit(BDI_pending, &bdi->state); + action = KILL_THREAD; + break; + } + spin_unlock(&bdi->wb_lock); + } + spin_unlock_bh(&bdi_lock); - /* - * Check with the helper whether to proceed adding a task. Will only - * abort if we two or more simultanous calls to - * bdi_add_default_flusher_task() occured, further additions will block - * waiting for previous additions to finish. - */ - if (!test_and_set_bit(BDI_pending, &bdi->state)) { - list_del_rcu(&bdi->bdi_list); + /* Keep working if default bdi still has things to do */ + if (!list_empty(&me->bdi->work_list)) + __set_current_state(TASK_RUNNING); + + switch (action) { + case FORK_THREAD: + __set_current_state(TASK_RUNNING); + task = kthread_create(bdi_writeback_thread, &bdi->wb, + "flush-%s", dev_name(bdi->dev)); + if (IS_ERR(task)) { + /* + * If thread creation fails, force writeout of + * the bdi from the thread. + */ + bdi_flush_io(bdi); + } else { + /* + * The spinlock makes sure we do not lose + * wake-ups when racing with 'bdi_queue_work()'. + * And as soon as the bdi thread is visible, we + * can start it. 
+ */ + spin_lock_bh(&bdi->wb_lock); + bdi->wb.task = task; + spin_unlock_bh(&bdi->wb_lock); + wake_up_process(task); + } + break; + + case KILL_THREAD: + __set_current_state(TASK_RUNNING); + kthread_stop(task); + break; + + case NO_ACTION: + if (!wb_has_dirty_io(me) || !dirty_writeback_interval) + /* + * There are no dirty data. The only thing we + * should now care about is checking for + * inactive bdi threads and killing them. Thus, + * let's sleep for longer time, save energy and + * be friendly for battery-driven devices. + */ + schedule_timeout(bdi_longest_inactive()); + else + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + try_to_freeze(); + /* Back to the main loop */ + continue; + } /* - * We must wait for the current RCU period to end before - * moving to the pending list. So schedule that operation - * from an RCU callback. + * Clear pending bit and wakeup anybody waiting to tear us down. */ - call_rcu(&bdi->rcu_head, bdi_add_to_pending); + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); } + + return 0; } /* @@ -529,23 +518,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { va_list args; - int ret = 0; struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ - goto exit; + return 0; va_start(args, fmt); dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); va_end(args); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto exit; - } - - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); + if (IS_ERR(dev)) + return PTR_ERR(dev); bdi->dev = dev; @@ -557,21 +539,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (bdi_cap_flush_forker(bdi)) { struct bdi_writeback *wb = &bdi->wb; - wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", dev_name(dev)); - if (IS_ERR(wb->task)) { - wb->task = NULL; - ret = -ENOMEM; - - bdi_remove_from_list(bdi); - goto exit; - } + if (IS_ERR(wb->task)) + return PTR_ERR(wb->task); } bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); -exit: - return ret; + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } EXPORT_SYMBOL(bdi_register); @@ -586,31 +568,29 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct bdi_writeback *wb; - if (!bdi_cap_writeback_dirty(bdi)) return; /* - * If setup is pending, wait for that to complete first + * Make sure nobody finds us on the bdi_list anymore */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + bdi_remove_from_list(bdi); /* - * Make sure nobody finds us on the bdi_list anymore + * If setup is pending, wait for that to complete first */ - bdi_remove_from_list(bdi); + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); /* - * Finally, kill the kernel threads. We don't need to be RCU + * Finally, kill the kernel thread. We don't need to be RCU * safe anymore, since the bdi is gone from visibility. Force * unfreeze of the thread before calling kthread_stop(), otherwise * it would never exet if it is currently stuck in the refrigerator. 
*/ - list_for_each_entry(wb, &bdi->wb_list, list) { - thaw_process(wb->task); - kthread_stop(wb->task); + if (bdi->wb.task) { + thaw_process(bdi->wb.task); + kthread_stop(bdi->wb.task); } } @@ -632,7 +612,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); + del_timer_sync(&bdi->wb.wakeup_timer); if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); @@ -643,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); +} + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -653,19 +647,11 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); - INIT_RCU_HEAD(&bdi->rcu_head); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->wb_list); INIT_LIST_HEAD(&bdi->work_list); bdi_wb_init(&bdi->wb, bdi); - /* - * Just one thread support for now, hard code mask and count - */ - bdi->wb_mask = 1; - bdi->wb_cnt = 1; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); if (err) @@ -712,6 +698,33 @@ void bdi_destroy(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_destroy); +/* + * For use from filesystems to quickly init and register a bdi associated + * with dirty writeback + */ +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, + unsigned int cap) +{ + char tmp[32]; + int err; + + bdi->name = name; + bdi->capabilities = cap; + err = bdi_init(bdi); + if (err) + return err; + + sprintf(tmp, "%.28s%s", name, "-%d"); + err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + if (err) { + bdi_destroy(bdi); + return err; + } + + return 0; +} +EXPORT_SYMBOL(bdi_setup_and_register); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) diff --git a/mm/bootmem.c b/mm/bootmem.c index 7d14868..142c84a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -10,9 +10,11 @@ */ #include <linux/init.h> #include <linux/pfn.h> +#include <linux/slab.h> #include <linux/bootmem.h> #include <linux/module.h> #include <linux/kmemleak.h> +#include <linux/range.h> #include <asm/bug.h> #include <asm/io.h> @@ -32,6 +34,7 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -142,7 +145,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } - +#endif /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -167,6 +170,53 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } +#ifdef CONFIG_NO_BOOTMEM +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & 
~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} +#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -227,6 +277,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -237,7 +288,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); +#ifdef CONFIG_NO_BOOTMEM + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +#else return free_all_bootmem_core(pgdat->bdata); +#endif } /** @@ -247,9 +303,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - return free_all_bootmem_core(NODE_DATA(0)->bdata); +#ifdef CONFIG_NO_BOOTMEM + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesnt have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +#else + unsigned long total_pages = 0; + bootmem_data_t *bdata; + + list_for_each_entry(bdata, &bdata_list, list) + total_pages += free_all_bootmem_core(bdata); + + return total_pages; +#endif } +#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -344,6 +418,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } +#endif /** * free_bootmem_node - mark a page range as usable @@ -358,6 +433,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(physaddr, physaddr + size); +#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -366,6 +444,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +#endif } /** @@ -379,6 +458,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(addr, addr + size); +#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -387,6 +469,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); +#endif } /** @@ -403,12 +486,17 @@ void 
__init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +#endif } /** @@ -424,14 +512,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); +#endif } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { @@ -582,12 +676,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } +#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { +#ifdef CONFIG_NO_BOOTMEM + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +#else bootmem_data_t *bdata; void *region; @@ -613,6 +728,7 @@ restart: } return NULL; +#endif } /** @@ -631,7 +747,13 @@ restart: void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem_nopanic(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem_nopanic(size, align, goal, limit); } static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, @@ -665,9 +787,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem(size, align, goal, limit); } +#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -684,6 +813,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } +#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -703,10 +833,58 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +#else + ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#endif + + return ptr; +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + if 
(WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); +#else + ptr = alloc_bootmem_core(pgdat->bdata, size, align, + new_goal, 0); +#endif + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + } #ifdef CONFIG_SPARSEMEM @@ -720,6 +898,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { +#ifdef CONFIG_NO_BOOTMEM + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -729,6 +917,7 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +#endif } #endif @@ -740,11 +929,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); +#endif if (ptr) return ptr; @@ -792,9 +986,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + void *ptr; + if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#else + ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +#endif + return ptr; } diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a5..1481de6 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/gfp.h> #include <linux/bio.h> #include <linux/pagemap.h> #include <linux/mempool.h> @@ -115,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) */ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - flush_dcache_page(tovec->bv_page); bounce_copy_vec(tovec, vfrom); + flush_dcache_page(tovec->bv_page); } } diff --git a/mm/compaction.c b/mm/compaction.c new file mode 100644 index 0000000..4d709ee --- /dev/null +++ b/mm/compaction.c @@ -0,0 +1,606 @@ +/* + * linux/mm/compaction.c + * + * Memory compaction for the reduction of external fragmentation. 
Note that + * this heavily depends upon page migration to do all the real heavy + * lifting + * + * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> + */ +#include <linux/swap.h> +#include <linux/migrate.h> +#include <linux/compaction.h> +#include <linux/mm_inline.h> +#include <linux/backing-dev.h> +#include <linux/sysctl.h> +#include <linux/sysfs.h> +#include "internal.h" + +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + + /* Account for isolated anon and file pages */ + unsigned long nr_anon; + unsigned long nr_file; + + unsigned int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +}; + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long count = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + list_del(&page->lru); + __free_page(page); + count++; + } + + return count; +} + +/* Isolate free pages onto a private freelist. Must hold zone->lock */ +static unsigned long isolate_freepages_block(struct zone *zone, + unsigned long blockpfn, + struct list_head *freelist) +{ + unsigned long zone_end_pfn, end_pfn; + int total_isolated = 0; + struct page *cursor; + + /* Get the last PFN we should scan for free pages at */ + zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); + + /* Find the first usable PFN in the block to initialse page cursor */ + for (; blockpfn < end_pfn; blockpfn++) { + if (pfn_valid_within(blockpfn)) + break; + } + cursor = pfn_to_page(blockpfn); + + /* Isolate free pages. 
This assumes the block is valid */ + for (; blockpfn < end_pfn; blockpfn++, cursor++) { + int isolated, i; + struct page *page = cursor; + + if (!pfn_valid_within(blockpfn)) + continue; + + if (!PageBuddy(page)) + continue; + + /* Found a free page, break it into order-0 pages */ + isolated = split_free_page(page); + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ + if (isolated) { + blockpfn += isolated - 1; + cursor += isolated - 1; + } + } + + return total_isolated; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE, allow migration */ + if (migratetype == MIGRATE_MOVABLE) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn; + unsigned long flags; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + pfn = cc->free_pfn; + low_pfn = cc->migrate_pfn + pageblock_nr_pages; + high_pfn = low_pfn; + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + spin_lock_irqsave(&zone->lock, flags); + for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* Found a block suitable for isolating free pages from */ + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + + /* + * Record the highest PFN we isolated pages from. 
When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) + high_pfn = max(high_pfn, pfn); + } + spin_unlock_irqrestore(&zone->lock, flags); + + /* split_free_page does not map the pages */ + list_for_each_entry(page, freelist, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } + + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; +} + +/* Update the number of anon and file isolated pages in the zone */ +static void acct_isolated(struct zone *zone, struct compact_control *cc) +{ + struct page *page; + unsigned int count[NR_LRU_LISTS] = { 0, }; + + list_for_each_entry(page, &cc->migratepages, lru) { + int lru = page_lru_base_type(page); + count[lru]++; + } + + cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); +} + +/* Similar to reclaim, but different enough that they don't share logic */ +static bool too_many_isolated(struct zone *zone) +{ + unsigned long active, inactive, isolated; + + inactive = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON); + active = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_FILE) + + zone_page_state(zone, NR_ISOLATED_ANON); + + return isolated > (inactive + active) / 2; +} + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. + */ +static unsigned long isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + struct list_head *migratelist = &cc->migratepages; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return 0; + } + + /* + * Ensure that there are not too many pages isolated from the LRU + * list by either parallel reclaimers or compaction. 
If there are, + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(zone))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + if (fatal_signal_pending(current)) + return 0; + } + + /* Time to isolate some pages for migration */ + spin_lock_irq(&zone->lru_lock); + for (; low_pfn < end_pfn; low_pfn++) { + struct page *page; + if (!pfn_valid_within(low_pfn)) + continue; + + /* Get the page and skip if free */ + page = pfn_to_page(low_pfn); + if (PageBuddy(page)) + continue; + + /* Try isolate the page */ + if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) + continue; + + /* Successfully isolated */ + del_page_from_lru_list(zone, page, page_lru(page)); + list_add(&page->lru, migratelist); + mem_cgroup_del_lru(page); + cc->nr_migratepages++; + + /* Avoid isolating too much */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; + } + + acct_isolated(zone, cc); + + spin_unlock_irq(&zone->lru_lock); + cc->migrate_pfn = low_pfn; + + return cc->nr_migratepages; +} + +/* + * This is a migrate-callback that "allocates" freepages by taking pages + * from the isolated freelists in the block we are migrating to. + */ +static struct page *compaction_alloc(struct page *migratepage, + unsigned long data, + int **result) +{ + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; + + /* Isolate free pages if necessary */ + if (list_empty(&cc->freepages)) { + isolate_freepages(cc->zone, cc); + + if (list_empty(&cc->freepages)) + return NULL; + } + + freepage = list_entry(cc->freepages.next, struct page, lru); + list_del(&freepage->lru); + cc->nr_freepages--; + + return freepage; +} + +/* + * We cannot control nr_migratepages and nr_freepages fully when migration is + * running as migrate_pages() has no knowledge of compact_control. When + * migration is complete, we count the number of pages on the lists by hand. + */ +static void update_nr_listpages(struct compact_control *cc) +{ + int nr_migratepages = 0; + int nr_freepages = 0; + struct page *page; + + list_for_each_entry(page, &cc->migratepages, lru) + nr_migratepages++; + list_for_each_entry(page, &cc->freepages, lru) + nr_freepages++; + + cc->nr_migratepages = nr_migratepages; + cc->nr_freepages = nr_freepages; +} + +static int compact_finished(struct zone *zone, + struct compact_control *cc) +{ + unsigned int order; + unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + + if (fatal_signal_pending(current)) + return COMPACT_PARTIAL; + + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) + return COMPACT_COMPLETE; + + /* Compaction run is not finished if the watermark is not met */ + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + return COMPACT_CONTINUE; + + if (cc->order == -1) + return COMPACT_CONTINUE; + + /* Direct compactor: Is a suitable page free? 
*/ + for (order = cc->order; order < MAX_ORDER; order++) { + /* Job done if page is free of the right migratetype */ + if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (order >= pageblock_order && zone->free_area[order].nr_free) + return COMPACT_PARTIAL; + } + + return COMPACT_CONTINUE; +} + +static int compact_zone(struct zone *zone, struct compact_control *cc) +{ + int ret; + + /* Setup to move all movable pages to the end of the zone */ + cc->migrate_pfn = zone->zone_start_pfn; + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); + + migrate_prep_local(); + + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + unsigned long nr_migrate, nr_remaining; + + if (!isolate_migratepages(zone, cc)) + continue; + + nr_migrate = cc->nr_migratepages; + migrate_pages(&cc->migratepages, compaction_alloc, + (unsigned long)cc, 0); + update_nr_listpages(cc); + nr_remaining = cc->nr_migratepages; + + count_vm_event(COMPACTBLOCKS); + count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); + if (nr_remaining) + count_vm_events(COMPACTPAGEFAILED, nr_remaining); + + /* Release LRU pages not migrated */ + if (!list_empty(&cc->migratepages)) { + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; + } + + } + + /* Release free pages and check accounting */ + cc->nr_freepages -= release_freepages(&cc->freepages); + VM_BUG_ON(cc->nr_freepages != 0); + + return ret; +} + +static unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask) +{ + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = order, + .migratetype = allocflags_to_migratetype(gfp_mask), + .zone = zone, + }; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + return compact_zone(zone, &cc); +} + +int sysctl_extfrag_threshold = 500; + +/** + * try_to_compact_pages - Direct compact to satisfy a high-order allocation + * @zonelist: The zonelist used for the current allocation + * @order: The order of the current allocation + * @gfp_mask: The GFP mask of the current allocation + * @nodemask: The allowed nodes to allocate from + * + * This is the main entry point for direct page compaction. + */ +unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + int may_enter_fs = gfp_mask & __GFP_FS; + int may_perform_io = gfp_mask & __GFP_IO; + unsigned long watermark; + struct zoneref *z; + struct zone *zone; + int rc = COMPACT_SKIPPED; + + /* + * Check whether it is worth even starting compaction. The order check is + * made because an assumption is made that the page allocator can satisfy + * the "cheaper" orders without taking special steps + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + return rc; + + count_vm_event(COMPACTSTALL); + + /* Compact each zone in the list */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + nodemask) { + int fragindex; + int status; + + /* + * Watermarks for order-0 must be met for compaction. Note + * the 2UL. 
This is because during migration, copies of + * pages need to be allocated and for a short time, the + * footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + continue; + + /* + * fragmentation index determines if allocation failures are + * due to low memory or external fragmentation + * + * index of -1 implies allocations might succeed depending + * on watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + continue; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { + rc = COMPACT_PARTIAL; + break; + } + + status = compact_zone_order(zone, order, gfp_mask); + rc = max(status, rc); + + if (zone_watermark_ok(zone, order, watermark, 0, 0)) + break; + } + + return rc; +} + + +/* Compact all zones within a node */ +static int compact_node(int nid) +{ + int zoneid; + pg_data_t *pgdat; + struct zone *zone; + + if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) + return -EINVAL; + pgdat = NODE_DATA(nid); + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = -1, + }; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + return 0; +} + +/* Compact all nodes in the system */ +static int compact_nodes(void) +{ + int nid; + + for_each_online_node(nid) + compact_node(nid); + + return COMPACT_COMPLETE; +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return compact_nodes(); + + return 0; +} + +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +ssize_t sysfs_compact_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + compact_node(dev->id); + + return count; +} +static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); + +int compaction_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_compact); +} + +void compaction_unregister_node(struct node *node) +{ + return sysdev_remove_file(&node->sysdev, &attr_compact); +} +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ diff --git a/mm/fadvise.c b/mm/fadvise.c index e433592..8d723c9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - file->f_ra.ra_pages = 0; + 
spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: if (!mapping->a_ops->readpage) { diff --git a/mm/failslab.c b/mm/failslab.c index 9339de5..c5f88f2 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,18 +1,21 @@ #include <linux/fault-inject.h> -#include <linux/gfp.h> +#include <linux/slab.h> static struct { struct fault_attr attr; u32 ignore_gfp_wait; + int cache_filter; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *ignore_gfp_wait_file; + struct dentry *cache_filter_file; #endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, + .cache_filter = 0, }; -bool should_failslab(size_t size, gfp_t gfpflags) +bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) { if (gfpflags & __GFP_NOFAIL) return false; @@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) return false; + if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + return false; + return should_fail(&failslab.attr, size); } @@ -30,7 +36,6 @@ static int __init setup_failslab(char *str) __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - static int __init failslab_debugfs_init(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; @@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void) debugfs_create_bool("ignore-gfp-wait", mode, dir, &failslab.ignore_gfp_wait); - if (!failslab.ignore_gfp_wait_file) { + failslab.cache_filter_file = + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); + + if (!failslab.ignore_gfp_wait_file || + !failslab.cache_filter_file) { err = -ENOMEM; + debugfs_remove(failslab.cache_filter_file); debugfs_remove(failslab.ignore_gfp_wait_file); cleanup_fault_attr_dentries(&failslab.attr); } diff --git a/mm/filemap.c b/mm/filemap.c index e373692..3d4df44 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -10,13 +10,13 @@ * the NFS filesystem used to do this differently, for example) */ #include <linux/module.h> -#include <linux/slab.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/mman.h> @@ -151,6 +151,7 @@ void remove_from_page_cache(struct page *page) spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); } +EXPORT_SYMBOL(remove_from_page_cache); static int sync_page(void *word) { @@ -441,7 +442,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, /* * Splice_read and readahead add shmem/tmpfs pages into the page cache * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * need to go on the anon lru below, and mem_cgroup_cache_charge * (called in add_to_page_cache) needs to know where they're going too. 
*/ if (mapping_cap_swap_backed(mapping)) @@ -452,7 +453,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, if (page_is_file_cache(page)) lru_cache_add_file(page); else - lru_cache_add_active_anon(page); + lru_cache_add_anon(page); } return ret; } @@ -461,9 +462,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } @@ -1099,6 +1106,12 @@ page_not_up_to_date_locked: } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); @@ -1117,7 +1130,7 @@ readpage: if (!PageUptodate(page)) { if (page->mapping == NULL) { /* - * invalidate_inode_pages got it + * invalidate_mapping_pages got it */ unlock_page(page); page_cache_release(page); @@ -1263,7 +1276,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg; + unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; @@ -1290,21 +1303,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); } - if (retval > 0) + if (retval > 0) { *ppos = pos + retval; - if (retval) { + count -= retval; + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } + count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; + loff_t offset = 0; + + /* + * If we did a short DIO read we need to skip the section of the + * iov that we've already read data into. 
+ */ + if (count) { + if (count > iov[seg].iov_len) { + count -= iov[seg].iov_len; + continue; + } + offset = count; + count = 0; + } desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; + desc.arg.buf = iov[seg].iov_base + offset; + desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0; @@ -1986,7 +2025,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = rlimit(RLIMIT_FSIZE); if (unlikely(*pos < 0)) return -EINVAL; @@ -2199,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file, do { struct page *page; - pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); @@ -2232,6 +2269,9 @@ again: if (unlikely(status)) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d..83364df 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/seqlock.h> #include <linux/mutex.h> +#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -194,7 +195,7 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); page_cache_release(page); diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85a..ec520c7 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); } } else { if (!pte_file(pte)) @@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, { struct mm_struct *mm = current->mm; struct address_space *mapping; - unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; @@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (start + size <= start) return err; + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return err; + /* Can we represent this offset inside this architecture's pte's? */ #if PTE_FILE_MAX_BITS < BITS_PER_LONG if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) @@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!(vma->vm_flags & VM_CAN_NONLINEAR)) goto out; - if (end <= start || start < vma->vm_start || end > vma->vm_end) + if (start < vma->vm_start || start + size > vma->vm_end) goto out; /* Must set VM_NONLINEAR before any pages are populated. 
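The iovec bookkeeping in the generic_file_aio_read() hunk above (skip whatever a short direct-I/O read already filled, then let buffered I/O finish the rest) can be modelled in ordinary user-space C; the segment sizes and the 5000-byte short read below are invented:

#include <stdio.h>
#include <sys/uio.h>

/* Model of the fallback: consume 'done' bytes of a short O_DIRECT read
 * across the iovec, then report where buffered I/O would continue. */
static void skip_done(const struct iovec *iov, int nr_segs, size_t done)
{
        int seg;

        for (seg = 0; seg < nr_segs; seg++) {
                size_t off = 0;

                if (done) {
                        if (done > iov[seg].iov_len) {
                                done -= iov[seg].iov_len;
                                continue;       /* segment already filled */
                        }
                        off = done;             /* partially filled */
                        done = 0;
                }
                printf("seg %d: resume at offset %zu, %zu bytes to read\n",
                       seg, off, iov[seg].iov_len - off);
        }
}

int main(void)
{
        struct iovec iov[3] = {
                { .iov_len = 4096 }, { .iov_len = 4096 }, { .iov_len = 4096 },
        };

        skip_done(iov, 3, 5000);        /* a 5000-byte short DIO read */
        return 0;
}

With these numbers, segment 0 is skipped entirely and buffered I/O resumes at offset 904 of segment 1, mirroring the offset/desc.count arithmetic in the hunk.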
*/ diff --git a/mm/highmem.c b/mm/highmem.c index 9c1e627..7a0aa1b 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/kgdb.h> #include <asm/tlbflush.h> /* @@ -220,7 +221,7 @@ EXPORT_SYMBOL(kmap_high); * @page: &struct page to pin * * Returns the page's current virtual memory address, or NULL if no mapping - * exists. When and only when a non null address is returned then a + * exists. If and only if a non null address is returned then a * matching call to kunmap_high() is necessary. * * This can be called from any context. @@ -422,7 +423,7 @@ void __init page_address_init(void) #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) +#ifdef CONFIG_DEBUG_HIGHMEM void debug_kmap_atomic(enum km_type type) { @@ -470,6 +471,12 @@ void debug_kmap_atomic(enum km_type type) warn_count--; } } +#ifdef CONFIG_KGDB_KDB + if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { + WARN_ON(1); + warn_count--; + } +#endif /* CONFIG_KGDB_KDB */ } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e91b81b..c032738 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2,7 +2,6 @@ * Generic hugetlb support. * (C) William Irwin, April 2004 */ -#include <linux/gfp.h> #include <linux/list.h> #include <linux/init.h> #include <linux/module.h> @@ -18,6 +17,10 @@ #include <linux/mutex.h> #include <linux/bootmem.h> #include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -465,11 +474,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. 
This check ensures that reservations are @@ -477,11 +488,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +511,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } @@ -546,7 +559,9 @@ static void free_huge_page(struct page *page) mapping = (struct address_space *) page_private(page); set_page_private(page, 0); + page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -600,6 +615,8 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } +EXPORT_SYMBOL_GPL(PageHuge); + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1038,7 +1055,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); - return ERR_PTR(-VM_FAULT_OOM); + return ERR_PTR(-VM_FAULT_SIGBUS); } } @@ -1515,10 +1532,9 @@ static struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h, - struct kobject *parent, - struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; int hi = h - hstates; @@ -2088,7 +2104,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); } } @@ -2125,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2136,6 +2153,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + return 1; + } else + return 0; +} + void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2194,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pte_none(pte)) continue; + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); @@ -2203,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2268,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct 
*mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2282,8 +2322,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2334,6 +2376,13 @@ retry_avoidcopy: return -PTR_ERR(new_page); } + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2345,11 +2394,19 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ + mmu_notifier_invalidate_range_start(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; + mmu_notifier_invalidate_range_end(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); } page_cache_release(new_page); page_cache_release(old_page); @@ -2448,8 +2505,29 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - } else + page_dup_rmap(page); + } else { lock_page(page); + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); + } + } else { + page_dup_rmap(page); + } + + /* + * Since memory error handler replaces pte into hwpoison swap entry + * at the time of error handling, a process which reserved but not have + * the mapping to the error hugepage does not have hwpoison swap entry. + * So we need to block accesses from such a process by checking + * PG_hwpoison bit here. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON; + goto backout_unlocked; } /* @@ -2501,10 +2579,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON; + } + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -2542,6 +2628,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. + * Note that locking order is always pagecache_page -> page, + * so no worry about deadlock. 
+ */ + page = pte_page(entry); + if (page != pagecache_page) + lock_page(page); + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) @@ -2559,7 +2656,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); out_page_table_lock: spin_unlock(&mm->page_table_lock); @@ -2568,6 +2665,7 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); @@ -2779,3 +2877,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_put_quota(inode->i_mapping, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. + */ +void __isolate_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + spin_lock(&hugetlb_lock); + list_del(&hpage->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + spin_unlock(&hugetlb_lock); +} diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 10ea719..0948f10 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/hugetlb.h> #include "internal.h" static struct dentry *hwpoison_dir; @@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; + struct page *hpage; int err; if (!capable(CAP_SYS_ADMIN)) @@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); + hpage = compound_head(p); /* * This implies unable to support free buddy pages. */ - if (!get_page_unless_zero(p)) + if (!get_page_unless_zero(hpage)) return 0; - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); /* * This implies unable to support non-LRU pages. */ - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) return 0; /* @@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val) * We temporarily take page lock for try_get_mem_cgroup_from_page(). * __memory_failure() will redo the check reliably inside page lock. 
*/ - lock_page(p); - err = hwpoison_filter(p); - unlock_page(p); + lock_page(hpage); + err = hwpoison_filter(hpage); + unlock_page(hpage); if (err) return 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index 57aba0d..1d29cdf 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -7,6 +7,11 @@ #include <asm/atomic.h> #include <asm/pgtable.h> +#include <asm/mmu.h> + +#ifndef INIT_MM_CONTEXT +#define INIT_MM_CONTEXT(name) +#endif struct mm_struct init_mm = { .mm_rb = RB_ROOT, @@ -17,4 +22,5 @@ struct mm_struct init_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .cpu_vm_mask = CPU_MASK_ALL, + INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5b069e4..bd9bc21 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -72,7 +72,6 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/prio_tree.h> -#include <linux/gfp.h> #include <linux/fs.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -212,6 +211,9 @@ static signed long jiffies_scan_wait; static int kmemleak_stack_scan = 1; /* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); +/* setting kmemleak=on, will set this var, skipping the disable */ +static int kmemleak_skip_disable; + /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -399,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) object = prio_tree_entry(node, struct kmemleak_object, tree_node); if (!alias && object->pointer != ptr) { - kmemleak_warn("Found object by alias"); + pr_warning("Found object by alias at 0x%08lx\n", ptr); + dump_stack(); + dump_object_info(object); object = NULL; } } else @@ -696,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color) } /* - * Make a object permanently as gray-colored so that it can no longer be + * Mark an object permanently as gray-colored so that it can no longer be * reported as a leak. This is used in general to mark a false positive. */ static void make_gray_object(unsigned long ptr) @@ -839,10 +843,19 @@ out: rcu_read_unlock(); } -/* - * Memory allocation function callback. This function is called from the - * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, - * vmalloc etc.). +/** + * kmemleak_alloc - register a newly allocated object + * @ptr: pointer to beginning of the object + * @size: size of the object + * @min_count: minimum number of references to this object. If during memory + * scanning a number of references less than @min_count is found, + * the object is reported as a memory leak. If @min_count is 0, + * the object is never reported as a leak. If @min_count is -1, + * the object is ignored (not scanned and not reported as a leak) + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the kernel allocators when a new object + * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -856,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, } EXPORT_SYMBOL_GPL(kmemleak_alloc); -/* - * Memory freeing function callback. This function is called from the kernel - * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 
+/** + * kmemleak_free - unregister a previously registered object + * @ptr: pointer to beginning of the object + * + * This function is called from the kernel allocators when an object (memory + * block) is freed (kmem_cache_free, kfree, vfree etc.). */ void __ref kmemleak_free(const void *ptr) { @@ -871,9 +887,14 @@ void __ref kmemleak_free(const void *ptr) } EXPORT_SYMBOL_GPL(kmemleak_free); -/* - * Partial memory freeing function callback. This function is usually called - * from bootmem allocator when (part of) a memory block is freed. +/** + * kmemleak_free_part - partially unregister a previously registered object + * @ptr: pointer to the beginning or inside the object. This also + * represents the start of the range to be freed + * @size: size to be unregistered + * + * This function is called when only a part of a memory block is freed + * (usually from the bootmem allocator). */ void __ref kmemleak_free_part(const void *ptr, size_t size) { @@ -886,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) } EXPORT_SYMBOL_GPL(kmemleak_free_part); -/* - * Mark an already allocated memory block as a false positive. This will cause - * the block to no longer be reported as leak and always be scanned. +/** + * kmemleak_not_leak - mark an allocated object as false positive + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to no longer + * be reported as leak and always be scanned. */ void __ref kmemleak_not_leak(const void *ptr) { @@ -901,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr) } EXPORT_SYMBOL(kmemleak_not_leak); -/* - * Ignore a memory block. This is usually done when it is known that the - * corresponding block is not a leak and does not contain any references to - * other allocated memory blocks. +/** + * kmemleak_ignore - ignore an allocated object + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to be + * ignored (not scanned and not reported as a leak). This is usually done when + * it is known that the corresponding block is not a leak and does not contain + * any references to other allocated memory blocks. */ void __ref kmemleak_ignore(const void *ptr) { @@ -917,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr) } EXPORT_SYMBOL(kmemleak_ignore); -/* - * Limit the range to be scanned in an allocated memory block. +/** + * kmemleak_scan_area - limit the range to be scanned in an allocated object + * @ptr: pointer to beginning or inside the object. This also + * represents the start of the scan area + * @size: size of the scan area + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is used when it is known that only certain parts of an object + * contain references to other objects. Kmemleak will only scan these areas + * reducing the number false negatives. */ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { @@ -931,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) } EXPORT_SYMBOL(kmemleak_scan_area); -/* - * Inform kmemleak not to scan the given memory block. +/** + * kmemleak_no_scan - do not scan an allocated object + * @ptr: pointer to beginning of the object + * + * This function notifies kmemleak not to scan the given memory block. Useful + * in situations where it is known that the given object does not contain any + * references to other objects. 
Kmemleak will not scan such objects reducing + * the number of false negatives. */ void __ref kmemleak_no_scan(const void *ptr) { @@ -1603,7 +1645,9 @@ static int kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") != 0) + else if (strcmp(str, "on") == 0) + kmemleak_skip_disable = 1; + else return -EINVAL; return 0; } @@ -1617,6 +1661,13 @@ void __init kmemleak_init(void) int i; unsigned long flags; +#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF + if (!kmemleak_skip_disable) { + kmemleak_disable(); + return; + } +#endif + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -33,6 +33,7 @@ #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> +#include <linux/hash.h> #include <asm/tlbflush.h> #include "internal.h" @@ -153,8 +154,9 @@ struct rmap_item { static struct rb_root root_stable_tree = RB_ROOT; static struct rb_root root_unstable_tree = RB_ROOT; -#define MM_SLOTS_HASH_HEADS 1024 -static struct hlist_head *mm_slots_hash; +#define MM_SLOTS_HASH_SHIFT 10 +#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) +static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; static struct mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), @@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) kmem_cache_free(mm_slot_cache, mm_slot); } -static int __init mm_slots_hash_init(void) -{ - mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), - GFP_KERNEL); - if (!mm_slots_hash) - return -ENOMEM; - return 0; -} - -static void __init mm_slots_hash_free(void) -{ - kfree(mm_slots_hash); -} - static struct mm_slot *get_mm_slot(struct mm_struct *mm) { struct mm_slot *mm_slot; struct hlist_head *bucket; struct hlist_node *node; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; + bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; hlist_for_each_entry(mm_slot, node, bucket, link) { if (mm == mm_slot->mm) return mm_slot; @@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, { struct hlist_head *bucket; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; + bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; mm_slot->mm = mm; hlist_add_head(&mm_slot->link, bucket); } @@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, struct anon_vma *anon_vma) { rmap_item->anon_vma = anon_vma; - atomic_inc(&anon_vma->ksm_refcount); + get_anon_vma(anon_vma); } -static void drop_anon_vma(struct rmap_item *rmap_item) +static void ksm_drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } + drop_anon_vma(anon_vma); } /* @@ -365,7 +346,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); page = follow_page(vma, addr, FOLL_GET); - if (!page) + if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, @@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. 
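As a usage sketch for the kmemleak annotations documented in the hunks above (hypothetical driver code; the structure and function names are invented): an allocation kmemleak cannot see on its own is registered by hand and unregistered again on teardown:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/kmemleak.h>

struct my_ctx {                         /* hypothetical */
        void *buf;
};

static int my_ctx_init(struct my_ctx *ctx)
{
        /* Page-allocator memory is not tracked automatically; register it
         * so kmemleak scans it and can report it if it leaks. */
        ctx->buf = (void *)__get_free_page(GFP_KERNEL);
        if (!ctx->buf)
                return -ENOMEM;
        kmemleak_alloc(ctx->buf, PAGE_SIZE, 1, GFP_KERNEL);
        return 0;
}

static void my_ctx_exit(struct my_ctx *ctx)
{
        kmemleak_free(ctx->buf);        /* unregister before freeing */
        free_page((unsigned long)ctx->buf);
        ctx->buf = NULL;
}

kmemleak_not_leak() and kmemleak_ignore() follow the same calling pattern when a reported object is known to be a false positive rather than a real leak.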
*/ - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -447,7 +428,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (!page) + if (IS_ERR_OR_NULL(page)) goto out; if (PageAnon(page)) { flush_anon_page(vma, page, addr); @@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -731,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (!ptep) goto out; - if (pte_write(*ptep)) { + if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; swapped = PageSwapCache(page); @@ -751,10 +732,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * page */ if (page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at_notify(mm, addr, ptep, entry); + set_pte_at(mm, addr, ptep, entry); goto out_unlock; } - entry = pte_wrprotect(entry); + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, addr, ptep, entry); } *orig_pte = *ptep; @@ -1086,7 +1069,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); - if (!tree_page) + if (IS_ERR_OR_NULL(tree_page)) return NULL; /* @@ -1294,7 +1277,7 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (*page && PageAnon(*page)) { + if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1308,7 +1291,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (*page) + if (!IS_ERR_OR_NULL(*page)) put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); @@ -1367,7 +1350,7 @@ next_mm: static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *page; + struct page *uninitialized_var(page); while (scan_npages--) { cond_resched(); @@ -1523,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page, { struct page *new_page; - unlock_page(page); /* any racers will COW it, not modify it */ - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { copy_user_highpage(new_page, page, address, vma); @@ -1540,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page, add_page_to_unevictable_list(new_page); } - page_cache_release(page); return new_page; } @@ -1563,10 +1543,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + anon_vma_lock(anon_vma); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= 
vma->vm_end) continue; @@ -1587,7 +1569,7 @@ again: if (!search_new_forks || !mapcount) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (!mapcount) goto out; } @@ -1614,10 +1596,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + anon_vma_lock(anon_vma); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) continue; @@ -1633,11 +1617,11 @@ again: ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; @@ -1664,10 +1648,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + anon_vma_lock(anon_vma); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) continue; @@ -1682,11 +1668,11 @@ again: ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; @@ -1937,15 +1923,11 @@ static int __init ksm_init(void) if (err) goto out; - err = mm_slots_hash_init(); - if (err) - goto out_free1; - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { printk(KERN_ERR "ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); - goto out_free2; + goto out_free; } #ifdef CONFIG_SYSFS @@ -1953,7 +1935,7 @@ static int __init ksm_init(void) if (err) { printk(KERN_ERR "ksm: register sysfs failed\n"); kthread_stop(ksm_thread); - goto out_free2; + goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ @@ -1969,9 +1951,7 @@ static int __init ksm_init(void) #endif return 0; -out_free2: - mm_slots_hash_free(); -out_free1: +out_free: ksm_slab_free(); out: return err; diff --git a/mm/memblock.c b/mm/memblock.c new file mode 100644 index 0000000..43840b3 --- /dev/null +++ b/mm/memblock.c @@ -0,0 +1,541 @@ +/* + * Procedures for maintaining information about logical memory blocks. + * + * Peter Bergner, IBM Corp. June 2001. + * Copyright (C) 2001 Peter Bergner. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/memblock.h> + +#define MEMBLOCK_ALLOC_ANYWHERE 0 + +struct memblock memblock; + +static int memblock_debug; + +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +static void memblock_dump(struct memblock_region *region, char *name) +{ + unsigned long long base, size; + int i; + + pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + + for (i = 0; i < region->cnt; i++) { + base = region->region[i].base; + size = region->region[i].size; + + pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", + name, i, base, base + size - 1, size); + } +} + +void memblock_dump_all(void) +{ + if (!memblock_debug) + return; + + pr_info("MEMBLOCK configuration:\n"); + pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); + pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); + + memblock_dump(&memblock.memory, "memory"); + memblock_dump(&memblock.reserved, "reserved"); +} + +static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, + u64 size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) +{ + if (base2 == base1 + size1) + return 1; + else if (base1 == base2 + size2) + return -1; + + return 0; +} + +static long memblock_regions_adjacent(struct memblock_region *rgn, + unsigned long r1, unsigned long r2) +{ + u64 base1 = rgn->region[r1].base; + u64 size1 = rgn->region[r1].size; + u64 base2 = rgn->region[r2].base; + u64 size2 = rgn->region[r2].size; + + return memblock_addrs_adjacent(base1, size1, base2, size2); +} + +static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) +{ + unsigned long i; + + for (i = r; i < rgn->cnt - 1; i++) { + rgn->region[i].base = rgn->region[i + 1].base; + rgn->region[i].size = rgn->region[i + 1].size; + } + rgn->cnt--; +} + +/* Assumption: base addr of region 1 < base addr of region 2 */ +static void memblock_coalesce_regions(struct memblock_region *rgn, + unsigned long r1, unsigned long r2) +{ + rgn->region[r1].size += rgn->region[r2].size; + memblock_remove_region(rgn, r2); +} + +void __init memblock_init(void) +{ + /* Create a dummy zero size MEMBLOCK which will get coalesced away later. + * This simplifies the memblock_add() code below... + */ + memblock.memory.region[0].base = 0; + memblock.memory.region[0].size = 0; + memblock.memory.cnt = 1; + + /* Ditto. */ + memblock.reserved.region[0].base = 0; + memblock.reserved.region[0].size = 0; + memblock.reserved.cnt = 1; +} + +void __init memblock_analyze(void) +{ + int i; + + memblock.memory.size = 0; + + for (i = 0; i < memblock.memory.cnt; i++) + memblock.memory.size += memblock.memory.region[i].size; +} + +static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) +{ + unsigned long coalesced = 0; + long adjacent, i; + + if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { + rgn->region[0].base = base; + rgn->region[0].size = size; + return 0; + } + + /* First try and coalesce this MEMBLOCK with another. 
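To show how an architecture would drive this new interface at early boot, here is a rough sketch (the function name, addresses and sizes are invented; memblock_analyze(), memblock_alloc() and memblock_dump_all() appear further down in this file):

#include <linux/init.h>
#include <linux/memblock.h>

void __init hypothetical_setup_memory(void)
{
        u64 early_table;

        memblock_init();

        /* Register the RAM banks reported by firmware. */
        memblock_add(0x00000000, 0x20000000);           /* 512 MB */
        memblock_add(0x40000000, 0x10000000);           /* 256 MB more */

        /* Keep the kernel image and firmware area out of later allocations. */
        memblock_reserve(0x00000000, 0x00400000);

        /* Compute memblock.memory.size (needed by memblock_phys_mem_size()). */
        memblock_analyze();

        /* Early allocation for a boot-time table: 64 KB, 4 KB aligned. */
        early_table = memblock_alloc(0x10000, 0x1000);
        (void)early_table;

        memblock_dump_all();            /* prints only with memblock=debug */
}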
*/ + for (i = 0; i < rgn->cnt; i++) { + u64 rgnbase = rgn->region[i].base; + u64 rgnsize = rgn->region[i].size; + + if ((rgnbase == base) && (rgnsize == size)) + /* Already have this region, so we're done */ + return 0; + + adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); + if (adjacent > 0) { + rgn->region[i].base -= size; + rgn->region[i].size += size; + coalesced++; + break; + } else if (adjacent < 0) { + rgn->region[i].size += size; + coalesced++; + break; + } + } + + if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { + memblock_coalesce_regions(rgn, i, i+1); + coalesced++; + } + + if (coalesced) + return coalesced; + if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) + return -1; + + /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ + for (i = rgn->cnt - 1; i >= 0; i--) { + if (base < rgn->region[i].base) { + rgn->region[i+1].base = rgn->region[i].base; + rgn->region[i+1].size = rgn->region[i].size; + } else { + rgn->region[i+1].base = base; + rgn->region[i+1].size = size; + break; + } + } + + if (base < rgn->region[0].base) { + rgn->region[0].base = base; + rgn->region[0].size = size; + } + rgn->cnt++; + + return 0; +} + +long memblock_add(u64 base, u64 size) +{ + struct memblock_region *_rgn = &memblock.memory; + + /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */ + if (base == 0) + memblock.rmo_size = size; + + return memblock_add_region(_rgn, base, size); + +} + +static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) +{ + u64 rgnbegin, rgnend; + u64 end = base + size; + int i; + + rgnbegin = rgnend = 0; /* supress gcc warnings */ + + /* Find the region where (base, size) belongs to */ + for (i=0; i < rgn->cnt; i++) { + rgnbegin = rgn->region[i].base; + rgnend = rgnbegin + rgn->region[i].size; + + if ((rgnbegin <= base) && (end <= rgnend)) + break; + } + + /* Didn't find the region */ + if (i == rgn->cnt) + return -1; + + /* Check to see if we are removing entire region */ + if ((rgnbegin == base) && (rgnend == end)) { + memblock_remove_region(rgn, i); + return 0; + } + + /* Check to see if region is matching at the front */ + if (rgnbegin == base) { + rgn->region[i].base = end; + rgn->region[i].size -= size; + return 0; + } + + /* Check to see if the region is matching at the end */ + if (rgnend == end) { + rgn->region[i].size -= size; + return 0; + } + + /* + * We need to split the entry - adjust the current one to the + * beginging of the hole and add the region after hole. + */ + rgn->region[i].size = base - rgn->region[i].base; + return memblock_add_region(rgn, end, rgnend - end); +} + +long memblock_remove(u64 base, u64 size) +{ + return __memblock_remove(&memblock.memory, base, size); +} + +long __init memblock_free(u64 base, u64 size) +{ + return __memblock_remove(&memblock.reserved, base, size); +} + +long __init memblock_reserve(u64 base, u64 size) +{ + struct memblock_region *_rgn = &memblock.reserved; + + BUG_ON(0 == size); + + return memblock_add_region(_rgn, base, size); +} + +long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) +{ + unsigned long i; + + for (i = 0; i < rgn->cnt; i++) { + u64 rgnbase = rgn->region[i].base; + u64 rgnsize = rgn->region[i].size; + if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + break; + } + + return (i < rgn->cnt) ? 
i : -1; +} + +static u64 memblock_align_down(u64 addr, u64 size) +{ + return addr & ~(size - 1); +} + +static u64 memblock_align_up(u64 addr, u64 size) +{ + return (addr + (size - 1)) & ~(size - 1); +} + +static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, + u64 size, u64 align) +{ + u64 base, res_base; + long j; + + base = memblock_align_down((end - size), align); + while (start <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) { + /* this area isn't reserved, take it */ + if (memblock_add_region(&memblock.reserved, base, size) < 0) + base = ~(u64)0; + return base; + } + res_base = memblock.reserved.region[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } + + return ~(u64)0; +} + +static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, + u64 (*nid_range)(u64, u64, int *), + u64 size, u64 align, int nid) +{ + u64 start, end; + + start = mp->base; + end = start + mp->size; + + start = memblock_align_up(start, align); + while (start < end) { + u64 this_end; + int this_nid; + + this_end = nid_range(start, end, &this_nid); + if (this_nid == nid) { + u64 ret = memblock_alloc_nid_unreserved(start, this_end, + size, align); + if (ret != ~(u64)0) + return ret; + } + start = this_end; + } + + return ~(u64)0; +} + +u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, + u64 (*nid_range)(u64 start, u64 end, int *nid)) +{ + struct memblock_region *mem = &memblock.memory; + int i; + + BUG_ON(0 == size); + + size = memblock_align_up(size, align); + + for (i = 0; i < mem->cnt; i++) { + u64 ret = memblock_alloc_nid_region(&mem->region[i], + nid_range, + size, align, nid); + if (ret != ~(u64)0) + return ret; + } + + return memblock_alloc(size, align); +} + +u64 __init memblock_alloc(u64 size, u64 align) +{ + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); +} + +u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) +{ + u64 alloc; + + alloc = __memblock_alloc_base(size, align, max_addr); + + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; +} + +u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr) +{ + long i, j; + u64 base = 0; + u64 res_base; + + BUG_ON(0 == size); + + size = memblock_align_up(size, align); + + /* On some platforms, make sure we allocate lowmem */ + /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */ + if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) + max_addr = MEMBLOCK_REAL_LIMIT; + + for (i = memblock.memory.cnt - 1; i >= 0; i--) { + u64 memblockbase = memblock.memory.region[i].base; + u64 memblocksize = memblock.memory.region[i].size; + + if (memblocksize < size) + continue; + if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) + base = memblock_align_down(memblockbase + memblocksize - size, align); + else if (memblockbase < max_addr) { + base = min(memblockbase + memblocksize, max_addr); + base = memblock_align_down(base - size, align); + } else + continue; + + while (base && memblockbase <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) { + /* this area isn't reserved, take it */ + if (memblock_add_region(&memblock.reserved, base, size) < 0) + return 0; + return base; + } + res_base = memblock.reserved.region[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } + } + return 0; +} + +/* You must call memblock_analyze() before 
this. */ +u64 __init memblock_phys_mem_size(void) +{ + return memblock.memory.size; +} + +u64 memblock_end_of_DRAM(void) +{ + int idx = memblock.memory.cnt - 1; + + return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); +} + +/* You must call memblock_analyze() after this. */ +void __init memblock_enforce_memory_limit(u64 memory_limit) +{ + unsigned long i; + u64 limit; + struct memblock_property *p; + + if (!memory_limit) + return; + + /* Truncate the memblock regions to satisfy the memory limit. */ + limit = memory_limit; + for (i = 0; i < memblock.memory.cnt; i++) { + if (limit > memblock.memory.region[i].size) { + limit -= memblock.memory.region[i].size; + continue; + } + + memblock.memory.region[i].size = limit; + memblock.memory.cnt = i + 1; + break; + } + + if (memblock.memory.region[0].size < memblock.rmo_size) + memblock.rmo_size = memblock.memory.region[0].size; + + memory_limit = memblock_end_of_DRAM(); + + /* And truncate any reserves above the limit also. */ + for (i = 0; i < memblock.reserved.cnt; i++) { + p = &memblock.reserved.region[i]; + + if (p->base > memory_limit) + p->size = 0; + else if ((p->base + p->size) > memory_limit) + p->size = memory_limit - p->base; + + if (p->size == 0) { + memblock_remove_region(&memblock.reserved, i); + i--; + } + } +} + +int __init memblock_is_reserved(u64 addr) +{ + int i; + + for (i = 0; i < memblock.reserved.cnt; i++) { + u64 upper = memblock.reserved.region[i].base + + memblock.reserved.region[i].size - 1; + if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) + return 1; + } + return 0; +} + +int memblock_is_region_reserved(u64 base, u64 size) +{ + return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; +} + +/* + * Given a <base, len>, find which memory regions belong to this range. + * Adjust the request and return a contiguous chunk. + */ +int memblock_find(struct memblock_property *res) +{ + int i; + u64 rstart, rend; + + rstart = res->base; + rend = rstart + res->size - 1; + + for (i = 0; i < memblock.memory.cnt; i++) { + u64 start = memblock.memory.region[i].base; + u64 end = start + memblock.memory.region[i].size - 1; + + if (start > rend) + return -1; + + if ((end >= rstart) && (start < rend)) { + /* adjust the request */ + if (rstart < start) + rstart = start; + if (rend > end) + rend = end; + res->base = rstart; + res->size = rend - rstart + 1; + return 0; + } + } + return -1; +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 954032b..9be3cf8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6,6 +6,10 @@ * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> * + * Memory thresholds + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. 
Shutemov + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -21,6 +25,7 @@ #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/smp.h> #include <linux/page-flags.h> @@ -32,17 +37,23 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/spinlock.h> +#include <linux/eventfd.h> +#include <linux/sort.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/mm_inline.h> #include <linux/page_cgroup.h> #include <linux/cpu.h> +#include <linux/oom.h> #include "internal.h" #include <asm/uaccess.h> +#include <trace/events/vmscan.h> + struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -55,7 +66,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #define do_swap_account (0) #endif -#define SOFTLIMIT_EVENTS_THRESH (1000) +/* + * Per memcg event counter is incremented at every pagein/pageout. This counter + * is used for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + * + * These values will be used as !((event) & ((1 <<(thresh)) - 1)) + */ +#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ +#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. @@ -69,62 +88,16 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ - MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ + MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ MEM_CGROUP_STAT_NSTATS, }; struct mem_cgroup_stat_cpu { s64 count[MEM_CGROUP_STAT_NSTATS]; -} ____cacheline_aligned_in_smp; - -struct mem_cgroup_stat { - struct mem_cgroup_stat_cpu cpustat[0]; }; -static inline void -__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx) -{ - stat->count[idx] = 0; -} - -static inline s64 -__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx) -{ - return stat->count[idx]; -} - -/* - * For accounting under irq disable, no need for increment preempt count. - */ -static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx, int val) -{ - stat->count[idx] += val; -} - -static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, - enum mem_cgroup_stat_index idx) -{ - int cpu; - s64 ret = 0; - for_each_possible_cpu(cpu) - ret += stat->cpustat[cpu].count[idx]; - return ret; -} - -static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) -{ - s64 ret; - - ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); - ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); - return ret; -} - /* * per-zone information in memory controller. 
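The pacing trick described in the comment above, the check !((event) & ((1 << thresh) - 1)), fires once every 2^thresh counter increments; a stand-alone C model (not kernel code) makes the two rates concrete:

#include <stdio.h>

static int event_check(unsigned long event, int shift)
{
        return !(event & ((1UL << shift) - 1));
}

int main(void)
{
        unsigned long ev;
        unsigned int thresholds = 0, softlimit = 0;

        for (ev = 1; ev <= 4096; ev++) {
                thresholds += event_check(ev, 7);       /* THRESHOLDS_EVENTS_THRESH */
                softlimit  += event_check(ev, 10);      /* SOFTLIMIT_EVENTS_THRESH */
        }
        /* Prints "32 threshold checks, 4 soft-limit updates per 4096 events". */
        printf("%u threshold checks, %u soft-limit updates per 4096 events\n",
               thresholds, softlimit);
        return 0;
}

So thresholds are evaluated once per 128 page-in/page-out events and the soft-limit tree once per 1024, matching the "once in 128" / "once in 1024" notes above.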
*/ @@ -174,6 +147,41 @@ struct mem_cgroup_tree { static struct mem_cgroup_tree soft_limit_tree __read_mostly; +struct mem_cgroup_threshold { + struct eventfd_ctx *eventfd; + u64 threshold; +}; + +/* For threshold */ +struct mem_cgroup_threshold_ary { + /* An array index points to threshold just below usage. */ + int current_threshold; + /* Size of entries[] */ + unsigned int size; + /* Array of thresholds */ + struct mem_cgroup_threshold entries[0]; +}; + +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *spare; +}; + +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; + +static void mem_cgroup_threshold(struct mem_cgroup *mem); +static void mem_cgroup_oom_notify(struct mem_cgroup *mem); + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -206,8 +214,6 @@ struct mem_cgroup { */ spinlock_t reclaim_param_lock; - int prev_priority; /* for recording reclaim priority */ - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. @@ -217,20 +223,77 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - unsigned long last_oom_jiffies; + atomic_t oom_lock; atomic_t refcnt; unsigned int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; /* set when res.limit == memsw.limit */ bool memsw_is_minimum; + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + /* - * statistics. This must be placed at the end of memcg. + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? */ - struct mem_cgroup_stat stat; + unsigned long move_charge_at_immigrate; + /* + * percpu counter. + */ + struct mem_cgroup_stat_cpu *stat; }; +/* Stuffs for move charges at task migration. */ +/* + * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a + * left-shifted bitmap of these types. 
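The threshold arrays added above back the eventfd-based notification interface: user space arms a threshold by writing "<eventfd> <fd of memory.usage_in_bytes> <bytes>" to cgroup.event_control and then waits on the eventfd. A sketch, assuming the memory controller is mounted and the cgroup directory below exists (both are hypothetical):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

#define CG "/cgroups/memory/mygroup"    /* hypothetical cgroup directory */

int main(void)
{
        char cmd[64];
        uint64_t ticks;
        int efd = eventfd(0, 0);
        int ufd = open(CG "/memory.usage_in_bytes", O_RDONLY);
        int cfd = open(CG "/cgroup.event_control", O_WRONLY);

        if (efd < 0 || ufd < 0 || cfd < 0)
                return 1;

        /* Arm a notification at 64 MB of memory usage. */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd) + 1) < 0)
                return 1;

        read(efd, &ticks, sizeof(ticks));       /* blocks until crossed */
        printf("usage crossed 64 MB\n");
        return 0;
}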
+ */ +enum move_type { + MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ + MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ + NR_MOVE_TYPE, +}; + +/* "mc" and its members are protected by cgroup_mutex */ +static struct move_charge_struct { + spinlock_t lock; /* for from, to, moving_task */ + struct mem_cgroup *from; + struct mem_cgroup *to; + unsigned long precharge; + unsigned long moved_charge; + unsigned long moved_swap; + struct task_struct *moving_task; /* a task moving charges */ + wait_queue_head_t waitq; /* a waitq for other context */ +} mc = { + .lock = __SPIN_LOCK_UNLOCKED(mc.lock), + .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), +}; + +static bool move_anon(void) +{ + return test_bit(MOVE_CHARGE_TYPE_ANON, + &mc.to->move_charge_at_immigrate); +} + +static bool move_file(void) +{ + return test_bit(MOVE_CHARGE_TYPE_FILE, + &mc.to->move_charge_at_immigrate); +} + /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. @@ -258,9 +321,12 @@ enum charge_type { /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) +#define _OOM_TYPE (2) #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* Used for OOM nofiier */ +#define OOM_CONTROL (0) /* * Reclaim flags for mem_cgroup_hierarchical_reclaim @@ -371,23 +437,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, spin_unlock(&mctz->lock); } -static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) -{ - bool ret = false; - int cpu; - s64 val; - struct mem_cgroup_stat_cpu *cpustat; - - cpu = get_cpu(); - cpustat = &mem->stat.cpustat[cpu]; - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); - if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { - __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); - ret = true; - } - put_cpu(); - return ret; -} static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) { @@ -481,17 +530,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) return mz; } +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) +{ + int cpu; + s64 val = 0; + + for_each_possible_cpu(cpu) + val += per_cpu(mem->stat->count[idx], cpu); + return val; +} + +static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) +{ + s64 ret; + + ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); + ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); + return ret; +} + static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, bool charge) { int val = (charge) ? 1 : -1; - struct mem_cgroup_stat *stat = &mem->stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu = get_cpu(); - - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); - put_cpu(); + this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, @@ -499,24 +562,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, bool charge) { int val = (charge) ? 
1 : -1; - struct mem_cgroup_stat *stat = &mem->stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu = get_cpu(); - cpustat = &stat->cpustat[cpu]; + preempt_disable(); + if (PageCgroupCache(pc)) - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); else - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); if (charge) - __mem_cgroup_stat_add_safe(cpustat, - MEM_CGROUP_STAT_PGPGIN_COUNT, 1); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); else - __mem_cgroup_stat_add_safe(cpustat, - MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); - put_cpu(); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + + preempt_enable(); } static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, @@ -534,6 +594,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, return total; } +static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +{ + s64 val; + + val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); + + return !(val & ((1 << event_mask_shift) - 1)); +} + +/* + * Check events in order. + * + */ +static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +{ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + mem_cgroup_threshold(mem); + if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + mem_cgroup_update_tree(mem, page); + } +} + static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, @@ -756,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; struct mem_cgroup *curr = NULL; + struct task_struct *p; - task_lock(task); - rcu_read_lock(); - curr = try_get_mem_cgroup_from_mm(task->mm); - rcu_read_unlock(); - task_unlock(task); + p = find_lock_task_mm(task); + if (!p) + return 0; + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); if (!curr) return 0; /* @@ -778,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * prev_priority control...this will be used in memory reclaim path. 
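
The statistics rework in the hunks above replaces the static per-CPU stat array with percpu counters that are updated through this_cpu ops and summed over all possible CPUs on read, and it samples a per-CPU event counter to decide when the more expensive threshold and soft-limit checks should run. Below is a compact userspace model of that scheme; an ordinary 2-D array stands in for the percpu allocation, and the CPU count and shift values are invented (the real THRESHOLDS_EVENTS_THRESH and SOFTLIMIT_EVENTS_THRESH are defined elsewhere in memcontrol.c).

#include <stdio.h>

#define NR_CPUS			4	/* hypothetical CPU count */
#define THRESHOLDS_SHIFT	7	/* hypothetical THRESHOLDS_EVENTS_THRESH */
#define SOFTLIMIT_SHIFT		10	/* hypothetical SOFTLIMIT_EVENTS_THRESH  */

enum { STAT_RSS, STAT_EVENTS, NR_STATS };

static unsigned long stat[NR_CPUS][NR_STATS];	/* one row per CPU, written locklessly */

/* Reader side: sum every CPU's slot, as mem_cgroup_read_stat() does. */
static unsigned long read_stat(int idx)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += stat[cpu][idx];
	return sum;
}

/* Mirrors the mask test in __memcg_event_check(). */
static int event_check(int cpu, int shift)
{
	return !(stat[cpu][STAT_EVENTS] & ((1UL << shift) - 1));
}

/* Writer side: charge one page on this CPU and run the cheap event check. */
static void charge_statistics(int cpu)
{
	stat[cpu][STAT_RSS]++;
	stat[cpu][STAT_EVENTS]++;
	if (event_check(cpu, THRESHOLDS_SHIFT)) {
		printf("cpu%d event %lu: threshold check", cpu,
		       stat[cpu][STAT_EVENTS]);
		if (event_check(cpu, SOFTLIMIT_SHIFT))
			printf(" + soft-limit tree update");
		printf("\n");
	}
}

int main(void)
{
	for (int i = 0; i < 3000; i++)
		charge_statistics(i % NR_CPUS);
	printf("total RSS pages: %lu\n", read_stat(STAT_RSS));
	return 0;
}

The point of the mask test is that the expensive checks fire only once every 2^shift charge/uncharge events on a given CPU, keeping the hot path cheap.
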
- */ -int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) -{ - int prev_priority; - - spin_lock(&mem->reclaim_param_lock); - prev_priority = mem->prev_priority; - spin_unlock(&mem->reclaim_param_lock); - - return prev_priority; -} - -void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - if (priority < mem->prev_priority) - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - -void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) { unsigned long active; @@ -864,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) { - int nid = zone->zone_pgdat->node_id; + int nid = zone_to_nid(zone); int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); @@ -874,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { - int nid = zone->zone_pgdat->node_id; + int nid = zone_to_nid(zone); int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); @@ -919,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, LIST_HEAD(pc_list); struct list_head *src; struct page_cgroup *pc, *tmp; - int nid = z->zone_pgdat->node_id; + int nid = zone_to_nid(z); int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; int lru = LRU_FILE * file + active; @@ -958,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, } *scanned = scan; + + trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, + 0, 0, 0, mode); + return nr_taken; } @@ -992,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } +/* A routine for testing mem is not under move_account */ + +static bool mem_cgroup_under_move(struct mem_cgroup *mem) +{ + struct mem_cgroup *from; + struct mem_cgroup *to; + bool ret = false; + /* + * Unlike task_move routines, we access mc.to, mc.from not under + * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. + */ + spin_lock(&mc.lock); + from = mc.from; + to = mc.to; + if (!from) + goto unlock; + if (from == mem || to == mem + || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) + || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) + ret = true; +unlock: + spin_unlock(&mc.lock); + return ret; +} + +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +{ + if (mc.moving_task && current != mc.moving_task) { + if (mem_cgroup_under_move(mem)) { + DEFINE_WAIT(wait); + prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); + /* moving charge context might have finished. */ + if (mc.moving_task) + schedule(); + finish_wait(&mc.waitq, &wait); + return true; + } + } + return false; +} + static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) { int *val = data; @@ -1000,7 +1100,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) } /** - * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 
* @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * @@ -1078,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem) } /* + * Return the memory (and swap, if configured) limit for a memcg. + */ +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +{ + u64 limit; + u64 memsw; + + limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + + total_swap_pages; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + /* + * If memsw is finite and limits the amount of swap space available + * to this memcg, return that limit. + */ + return min(limit, memsw); +} + +/* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use * that to reclaim free pages from. @@ -1174,7 +1292,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, } } } - if (!mem_cgroup_local_usage(&victim->stat)) { + if (!mem_cgroup_local_usage(victim)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; @@ -1182,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* we use swappiness of local cgroup */ if (check_soft) ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone, - zone->zone_pgdat->node_id); + noswap, get_swappiness(victim), zone); else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); @@ -1205,32 +1322,141 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return total; } -bool mem_cgroup_oom_called(struct task_struct *task) +static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) { - bool ret = false; - struct mem_cgroup *mem; - struct mm_struct *mm; + int *val = (int *)data; + int x; + /* + * Logically, we can stop scanning immediately when we find + * a memcg is already locked. But condidering unlock ops and + * creation/removal of memcg, scan-all is simple operation. + */ + x = atomic_inc_return(&mem->oom_lock); + *val = max(x, *val); + return 0; +} +/* + * Check OOM-Killer is already running under our hierarchy. + * If someone is running, return false. + */ +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) +{ + int lock_count = 0; - rcu_read_lock(); - mm = task->mm; - if (!mm) - mm = &init_mm; - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) - ret = true; - rcu_read_unlock(); - return ret; + mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); + + if (lock_count == 1) + return true; + return false; } -static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) { - mem->last_oom_jiffies = jiffies; + /* + * When a new child is created while the hierarchy is under oom, + * mem_cgroup_oom_lock() may not be called. We have to use + * atomic_add_unless() here. 
+ */ + atomic_add_unless(&mem->oom_lock, -1, 0); return 0; } -static void record_last_oom(struct mem_cgroup *mem) +static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); +} + +static DEFINE_MUTEX(memcg_oom_mutex); +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); + +struct oom_wait_info { + struct mem_cgroup *mem; + wait_queue_t wait; +}; + +static int memcg_oom_wake_function(wait_queue_t *wait, + unsigned mode, int sync, void *arg) +{ + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct oom_wait_info *oom_wait_info; + + oom_wait_info = container_of(wait, struct oom_wait_info, wait); + + if (oom_wait_info->mem == wake_mem) + goto wakeup; + /* if no hierarchy, no match */ + if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) + return 0; + /* + * Both of oom_wait_info->mem and wake_mem are stable under us. + * Then we can use css_is_ancestor without taking care of RCU. + */ + if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && + !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + return 0; + +wakeup: + return autoremove_wake_function(wait, mode, sync, arg); +} + +static void memcg_wakeup_oom(struct mem_cgroup *mem) +{ + /* for filtering, pass "mem" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); +} + +static void memcg_oom_recover(struct mem_cgroup *mem) +{ + if (mem && atomic_read(&mem->oom_lock)) + memcg_wakeup_oom(mem); +} + +/* + * try to call OOM killer. returns false if we should exit memory-reclaim loop. + */ +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) { - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); + struct oom_wait_info owait; + bool locked, need_to_kill; + + owait.mem = mem; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); + need_to_kill = true; + /* At first, try to OOM lock hierarchy under mem.*/ + mutex_lock(&memcg_oom_mutex); + locked = mem_cgroup_oom_lock(mem); + /* + * Even if signal_pending(), we can't quit charge() loop without + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL + * under OOM is always welcomed, use TASK_KILLABLE here. 
+ */ + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + if (!locked || mem->oom_kill_disable) + need_to_kill = false; + if (locked) + mem_cgroup_oom_notify(mem); + mutex_unlock(&memcg_oom_mutex); + + if (need_to_kill) { + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(mem, mask); + } else { + schedule(); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + mutex_lock(&memcg_oom_mutex); + mem_cgroup_oom_unlock(mem); + memcg_wakeup_oom(mem); + mutex_unlock(&memcg_oom_mutex); + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + return false; + /* Give chance to dying process */ + schedule_timeout(1); + return true; } /* @@ -1240,9 +1466,6 @@ static void record_last_oom(struct mem_cgroup *mem) void mem_cgroup_update_file_mapped(struct page *page, int val) { struct mem_cgroup *mem; - struct mem_cgroup_stat *stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu; struct page_cgroup *pc; pc = lookup_page_cgroup(page); @@ -1251,20 +1474,20 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) lock_page_cgroup(pc); mem = pc->mem_cgroup; - if (!mem) - goto done; - - if (!PageCgroupUsed(pc)) + if (!mem || !PageCgroupUsed(pc)) goto done; /* - * Preemption is already disabled, we don't need get_cpu() + * Preemption is already disabled. We can use __this_cpu_xxx */ - cpu = smp_processor_id(); - stat = &mem->stat; - cpustat = &stat->cpustat[cpu]; + if (val > 0) { + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + SetPageCgroupFileMapped(pc); + } else { + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + ClearPageCgroupFileMapped(pc); + } - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); done: unlock_page_cgroup(pc); } @@ -1330,7 +1553,7 @@ static void drain_local_stock(struct work_struct *dummy) /* * Cache charges(val) which is from res_counter, to local per_cpu area. - * This will be consumed by consumt_stock() function, later. + * This will be consumed by consume_stock() function, later. */ static void refill_stock(struct mem_cgroup *mem, int val) { @@ -1396,24 +1619,93 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, return NOTIFY_OK; } + +/* See __mem_cgroup_try_charge() for details */ +enum { + CHARGE_OK, /* success */ + CHARGE_RETRY, /* need to retry but retry is not bad */ + CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ + CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ + CHARGE_OOM_DIE, /* the current is killed because of OOM */ +}; + +static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + int csize, bool oom_check) +{ + struct mem_cgroup *mem_over_limit; + struct res_counter *fail_res; + unsigned long flags = 0; + int ret; + + ret = res_counter_charge(&mem->res, csize, &fail_res); + + if (likely(!ret)) { + if (!do_swap_account) + return CHARGE_OK; + ret = res_counter_charge(&mem->memsw, csize, &fail_res); + if (likely(!ret)) + return CHARGE_OK; + + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + flags |= MEM_CGROUP_RECLAIM_NOSWAP; + } else + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (csize > PAGE_SIZE) /* change csize and retry */ + return CHARGE_RETRY; + + if (!(gfp_mask & __GFP_WAIT)) + return CHARGE_WOULDBLOCK; + + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, + gfp_mask, flags); + /* + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. 
Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up + */ + if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + return CHARGE_RETRY; + + /* + * At task move, charge accounts can be doubly counted. So, it's + * better to wait until the end of task_move if something is going on. + */ + if (mem_cgroup_wait_acct_move(mem_over_limit)) + return CHARGE_RETRY; + + /* If we don't need to call oom-killer at el, return immediately */ + if (!oom_check) + return CHARGE_NOMEM; + /* check OOM */ + if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) + return CHARGE_OOM_DIE; + + return CHARGE_RETRY; +} + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom, struct page *page) + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) { - struct mem_cgroup *mem, *mem_over_limit; - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct res_counter *fail_res; + int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct mem_cgroup *mem = NULL; + int ret; int csize = CHARGE_SIZE; - if (unlikely(test_thread_flag(TIF_MEMDIE))) { - /* Don't account this! */ - *memcg = NULL; - return 0; - } + /* + * Unlike gloval-vm's OOM-kill, we're not in memory shortage + * in system level. So, allow to go ahead dying process in addition to + * MEMDIE process. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE) + || fatal_signal_pending(current))) + goto bypass; /* * We always charge the cgroup the mm_struct belongs to. @@ -1421,90 +1713,112 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - mem = *memcg; - if (likely(!mem)) { - mem = try_get_mem_cgroup_from_mm(mm); - *memcg = mem; - } else { + if (!*memcg && !mm) + goto bypass; +again: + if (*memcg) { /* css should be a valid one */ + mem = *memcg; + VM_BUG_ON(css_is_removed(&mem->css)); + if (mem_cgroup_is_root(mem)) + goto done; + if (consume_stock(mem)) + goto done; css_get(&mem->css); + } else { + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(mm->owner); + VM_BUG_ON(!p); + /* + * because we don't have task_lock(), "p" can exit while + * we're here. In that case, "mem" can point to root + * cgroup but never be NULL. (and task_struct itself is freed + * by RCU, cgroup itself is RCU safe.) Then, we have small + * risk here to get wrong cgroup. But such kind of mis-account + * by race always happens because we don't have cgroup_mutex(). + * It's overkill and we allow that small race, here. + */ + mem = mem_cgroup_from_task(p); + VM_BUG_ON(!mem); + if (mem_cgroup_is_root(mem)) { + rcu_read_unlock(); + goto done; + } + if (consume_stock(mem)) { + /* + * It seems dagerous to access memcg without css_get(). + * But considering how consume_stok works, it's not + * necessary. If consume_stock success, some charges + * from this memcg are cached on this cpu. So, we + * don't need to call css_get()/css_tryget() before + * calling consume_stock(). + */ + rcu_read_unlock(); + goto done; + } + /* after here, we may be blocked. 
we need to get refcnt */ + if (!css_tryget(&mem->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); } - if (unlikely(!mem)) - return 0; - VM_BUG_ON(css_is_removed(&mem->css)); - if (mem_cgroup_is_root(mem)) - goto done; + do { + bool oom_check; - while (1) { - int ret = 0; - unsigned long flags = 0; + /* If killed, bypass charge */ + if (fatal_signal_pending(current)) { + css_put(&mem->css); + goto bypass; + } - if (consume_stock(mem)) - goto charged; + oom_check = false; + if (oom && !nr_oom_retries) { + oom_check = true; + nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + } - ret = res_counter_charge(&mem->res, csize, &fail_res); - if (likely(!ret)) { - if (!do_swap_account) - break; - ret = res_counter_charge(&mem->memsw, csize, &fail_res); - if (likely(!ret)) - break; - /* mem+swap counter fails */ - res_counter_uncharge(&mem->res, csize); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - mem_over_limit = mem_cgroup_from_res_counter(fail_res, - memsw); - } else - /* mem counter fails */ - mem_over_limit = mem_cgroup_from_res_counter(fail_res, - res); + ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - /* reduce request size and retry */ - if (csize > PAGE_SIZE) { + switch (ret) { + case CHARGE_OK: + break; + case CHARGE_RETRY: /* not in OOM situation but retry */ csize = PAGE_SIZE; - continue; - } - if (!(gfp_mask & __GFP_WAIT)) + css_put(&mem->css); + mem = NULL; + goto again; + case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ + css_put(&mem->css); goto nomem; - - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); - if (ret) - continue; - - /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up - * - */ - if (mem_cgroup_check_under_limit(mem_over_limit)) - continue; - - if (!nr_retries--) { - if (oom) { - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); - record_last_oom(mem_over_limit); + case CHARGE_NOMEM: /* OOM routine works */ + if (!oom) { + css_put(&mem->css); + goto nomem; } - goto nomem; + /* If oom, we never return -ENOMEM */ + nr_oom_retries--; + break; + case CHARGE_OOM_DIE: /* Killed by OOM Killer */ + css_put(&mem->css); + goto bypass; } - } + } while (ret != CHARGE_OK); + if (csize > PAGE_SIZE) refill_stock(mem, csize - PAGE_SIZE); -charged: - /* - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. - */ - if (mem_cgroup_soft_limit_check(mem)) - mem_cgroup_update_tree(mem, page); + css_put(&mem->css); done: + *memcg = mem; return 0; nomem: - css_put(&mem->css); + *memcg = NULL; return -ENOMEM; +bypass: + *memcg = NULL; + return 0; } /* @@ -1512,14 +1826,19 @@ nomem: * This function is for that and do uncharge, put css's refcnt. * gotten by try_charge(). 
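
As a rough illustration of the control flow driven by the CHARGE_* return codes above, the sketch below models the retry loop of __mem_cgroup_try_charge() in plain userspace C. The resource is a bare counter, all numbers are made up, and the kernel's reclaim, css refcounting and OOM handling are deliberately left out; it only shows the shape of the policy: batch first, shrink to a single page on retry, and fall back to the OOM path only after the retry budget runs out.

#include <stdio.h>

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM };

#define RETRY_BUDGET	5	/* stands in for MEM_CGROUP_RECLAIM_RETRIES */

static long usage = 95, limit = 100;	/* invented numbers */

static int do_charge(long size)
{
	if (usage + size <= limit) {
		usage += size;
		return CHARGE_OK;
	}
	return size > 1 ? CHARGE_RETRY : CHARGE_NOMEM;
}

int main(void)
{
	long csize = 32;	/* batched request, in the spirit of CHARGE_SIZE */
	int retries = RETRY_BUDGET;
	int ret;

	do {
		ret = do_charge(csize);
		switch (ret) {
		case CHARGE_OK:
			printf("charged %ld page(s), usage=%ld/%ld\n",
			       csize, usage, limit);
			break;
		case CHARGE_RETRY:
			csize = 1;	/* shrink the request and try again */
			break;
		case CHARGE_NOMEM:
			if (!retries--) {
				printf("would invoke the OOM path here\n");
				return 1;
			}
			break;
		}
	} while (ret != CHARGE_OK);
	return 0;
}
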
*/ -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, + unsigned long count) { if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, PAGE_SIZE * count); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); } - css_put(&mem->css); +} + +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +{ + __mem_cgroup_cancel_charge(mem, 1); } /* @@ -1615,6 +1934,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, mem_cgroup_charge_statistics(mem, pc, true); unlock_page_cgroup(pc); + /* + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + memcg_check_events(mem, pc->page); } /** @@ -1622,61 +1947,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. + * @uncharge: whether we should call uncharge and css_put against @from. * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) * - the pc is locked, used, and ->mem_cgroup points to @from. * - * This function does "uncharge" from old cgroup but doesn't do "charge" to - * new cgroup. It should be done by a caller. + * This function doesn't do "charge" nor css_get to new cgroup. It should be + * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is + * true, this function does "uncharge" from old cgroup, but it doesn't if + * @uncharge is false, so a caller should do "uncharge". */ static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to) + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { - struct page *page; - int cpu; - struct mem_cgroup_stat *stat; - struct mem_cgroup_stat_cpu *cpustat; - VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); VM_BUG_ON(!PageCgroupLocked(pc)); VM_BUG_ON(!PageCgroupUsed(pc)); VM_BUG_ON(pc->mem_cgroup != from); - if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->res, PAGE_SIZE); - mem_cgroup_charge_statistics(from, pc, false); - - page = pc->page; - if (page_mapped(page) && !PageAnon(page)) { - cpu = smp_processor_id(); - /* Update mapped_file data for mem_cgroup "from" */ - stat = &from->stat; - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, - -1); - - /* Update mapped_file data for mem_cgroup "to" */ - stat = &to->stat; - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, - 1); + if (PageCgroupFileMapped(pc)) { + /* Update mapped_file data for mem_cgroup */ + preempt_disable(); + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + preempt_enable(); } + mem_cgroup_charge_statistics(from, pc, false); + if (uncharge) + /* This is not "cancel", but cancel_charge does all we need. */ + mem_cgroup_cancel_charge(from); - if (do_swap_account && !mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - css_put(&from->css); - - css_get(&to->css); + /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true); /* * We charges against "to" which may not have any tasks. 
Then, "to" * can be under rmdir(). But in current implementation, caller of - * this function is just force_empty() and it's garanteed that - * "to" is never removed. So, we don't check rmdir status here. + * this function is just force_empty() and move charge, so it's + * garanteed that "to" is never removed. So, we don't check rmdir + * status here. */ } @@ -1685,15 +1997,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * __mem_cgroup_move_account() */ static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to) + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - __mem_cgroup_move_account(pc, from, to); + __mem_cgroup_move_account(pc, from, to, uncharge); ret = 0; } unlock_page_cgroup(pc); + /* + * check events + */ + memcg_check_events(to, pc->page); + memcg_check_events(from, pc->page); return ret; } @@ -1722,15 +2039,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); if (ret || !parent) goto put_back; - ret = mem_cgroup_move_account(pc, child, parent); - if (!ret) - css_put(&parent->css); /* drop extra refcnt by try_charge() */ - else - mem_cgroup_cancel_charge(parent); /* does css_put */ + ret = mem_cgroup_move_account(pc, child, parent, true); + if (ret) + mem_cgroup_cancel_charge(parent); put_back: putback_lru_page(page); put: @@ -1746,10 +2061,9 @@ out: * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype, - struct mem_cgroup *memcg) + gfp_t gfp_mask, enum charge_type ctype) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; @@ -1759,8 +2073,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); if (ret || !mem) return ret; @@ -1787,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page, if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void @@ -1797,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) @@ -1818,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) { struct page_cgroup *pc; - pc = lookup_page_cgroup(page); if (!pc) return 0; @@ -1830,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (unlikely(!mm && !mem)) + if (unlikely(!mm)) mm = &init_mm; if (page_is_file_cache(page)) return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + MEM_CGROUP_CHARGE_TYPE_CACHE); /* shmem */ if (PageSwapCache(page)) { + struct mem_cgroup *mem = NULL; + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, MEM_CGROUP_CHARGE_TYPE_SHMEM); } else ret = 
mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); + MEM_CGROUP_CHARGE_TYPE_SHMEM); return ret; } @@ -1880,14 +2193,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); - /* drop extra refcnt from tryget */ + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, page); + return __mem_cgroup_try_charge(mm, mask, ptr, true); } static void @@ -1963,15 +2275,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; batch = ¤t->memcg_batch; /* @@ -1982,6 +2285,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->memcg) batch->memcg = mem; /* + * do_batch > 0 when unmapping pages or inode invalidate/truncate. + * In those cases, all pages freed continously can be expected to be in + * the same cgroup and we have chance to coalesce uncharges. + * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) + * because we want to do uncharge as soon as possible. + */ + + if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) + goto direct_uncharge; + + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. * If not, we uncharge res_counter ony by one. @@ -1997,6 +2311,8 @@ direct_uncharge: res_counter_uncharge(&mem->res, PAGE_SIZE); if (uncharge_memsw) res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (unlikely(batch->memcg != mem)) + memcg_oom_recover(mem); return; } @@ -2008,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return NULL; @@ -2033,7 +2348,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: case MEM_CGROUP_CHARGE_TYPE_DROP: - if (page_mapped(page)) + /* See mem_cgroup_prepare_migration() */ + if (page_mapped(page) || PageCgroupMigration(pc)) goto unlock_out; break; case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: @@ -2047,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); - if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -2061,14 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * special functions. 
*/ - mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); - - if (mem_cgroup_soft_limit_check(mem)) - mem_cgroup_update_tree(mem, page); - /* at swapout, this memcg will be accessed to record to swap */ - if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - css_put(&mem->css); + /* + * even after unlock, we have mem->res.usage here and this memcg + * will never be freed. + */ + memcg_check_events(mem, page); + if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { + mem_cgroup_swap_statistics(mem, true); + mem_cgroup_get(mem); + } + if (!mem_cgroup_is_root(mem)) + __do_uncharge(mem, ctype); return mem; @@ -2134,6 +2450,7 @@ void mem_cgroup_uncharge_end(void) res_counter_uncharge(&batch->memcg->res, batch->bytes); if (batch->memsw_bytes) res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; } @@ -2154,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) memcg = __mem_cgroup_uncharge_common(page, ctype); - /* record memcg information */ - if (do_swap_account && swapout && memcg) { + /* + * record memcg information, if swapout && memcg != NULL, + * mem_cgroup_get() was called in uncharge(). + */ + if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); - mem_cgroup_get(memcg); - } - if (swapout && memcg) - css_put(&memcg->css); } #endif @@ -2192,16 +2508,75 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) } rcu_read_unlock(); } + +/** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * @need_fixup: whether we should fixup res_counters and refcounts. + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). + */ +static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) +{ + unsigned short old_id, new_id; + + old_id = css_id(&from->css); + new_id = css_id(&to->css); + + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); + /* + * This function is only called from task migration context now. + * It postpones res_counter and refcount handling till the end + * of task migration(mem_cgroup_clear_mc()) for performance + * improvement. But we cannot postpone mem_cgroup_get(to) + * because if the process that has been moved to @to does + * swap-in, the refcount of @to might be decreased to 0. + */ + mem_cgroup_get(to); + if (need_fixup) { + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->memsw, PAGE_SIZE); + mem_cgroup_put(from); + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + if (!mem_cgroup_is_root(to)) + res_counter_uncharge(&to->res, PAGE_SIZE); + } + return 0; + } + return -EINVAL; +} +#else +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) +{ + return -EINVAL; +} #endif /* * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. 
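
The swap-record handoff above relies on swap_cgroup_cmpxchg() so that ownership of a swap entry only moves if the entry still belongs to the expected source cgroup. A toy, non-atomic model of that idea, with invented slot numbers and css ids:

#include <stdio.h>

#define NR_SWAP_ENTRIES	8

static unsigned short swap_owner[NR_SWAP_ENTRIES];	/* css id per swap slot */

static int move_swap_account(int entry, unsigned short from, unsigned short to)
{
	if (swap_owner[entry] != from)
		return -1;		/* someone else owns (or freed) it */
	swap_owner[entry] = to;		/* record now points at the target */
	return 0;
}

int main(void)
{
	swap_owner[3] = 2;	/* slot 3 charged to cgroup id 2 */

	printf("move 2->5 on slot 3: %s\n",
	       move_swap_account(3, 2, 5) ? "failed" : "ok");
	printf("move 2->5 again:     %s\n",
	       move_swap_account(3, 2, 5) ? "failed (id changed)" : "ok");
	return 0;
}

The second attempt fails because the stored id no longer matches "from", which is how the kernel side ends up reporting -EINVAL.
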
*/ -int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) +int mem_cgroup_prepare_migration(struct page *page, + struct page *newpage, struct mem_cgroup **ptr) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + enum charge_type ctype; int ret = 0; if (mem_cgroup_disabled()) @@ -2212,70 +2587,122 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); + /* + * At migrating an anonymous page, its mapcount goes down + * to 0 and uncharge() will be called. But, even if it's fully + * unmapped, migration may fail and this page has to be + * charged again. We set MIGRATION flag here and delay uncharge + * until end_migration() is called + * + * Corner Case Thinking + * A) + * When the old page was mapped as Anon and it's unmap-and-freed + * while migration was ongoing. + * If unmap finds the old page, uncharge() of it will be delayed + * until end_migration(). If unmap finds a new page, it's + * uncharged when it make mapcount to be 1->0. If unmap code + * finds swap_migration_entry, the new page will not be mapped + * and end_migration() will find it(mapcount==0). + * + * B) + * When the old page was mapped but migraion fails, the kernel + * remaps it. A charge for it is kept by MIGRATION flag even + * if mapcount goes down to 0. We can do remap successfully + * without charging it again. + * + * C) + * The "old" page is under lock_page() until the end of + * migration, so, the old page itself will not be swapped-out. + * If the new page is swapped out before end_migraton, our + * hook to usual swap-out path will catch the event. + */ + if (PageAnon(page)) + SetPageCgroupMigration(pc); } unlock_page_cgroup(pc); + /* + * If the page is not charged at this point, + * we return here. + */ + if (!mem) + return 0; - if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - page); - css_put(&mem->css); - } *ptr = mem; + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + css_put(&mem->css);/* drop extra refcnt */ + if (ret || *ptr == NULL) { + if (PageAnon(page)) { + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + /* + * The old page may be fully unmapped while we kept it. + */ + mem_cgroup_uncharge_page(page); + } + return -ENOMEM; + } + /* + * We charge new page before it's used/mapped. So, even if unlock_page() + * is called before end_migration, we can catch all events on this new + * page. In the case new page is migrated but not remapped, new page's + * mapcount will be finally 0 and we call uncharge in end_migration(). + */ + pc = lookup_page_cgroup(newpage); + if (PageAnon(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + else if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + __mem_cgroup_commit_charge(mem, pc, ctype); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage) { - struct page *target, *unused; + struct page *used, *unused; struct page_cgroup *pc; - enum charge_type ctype; if (!mem) return; + /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. 
*/ if (oldpage->mapping) { - target = oldpage; - unused = NULL; + used = oldpage; + unused = newpage; } else { - target = newpage; + used = newpage; unused = oldpage; } - - if (PageAnon(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; - else if (page_is_file_cache(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - else - ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - - /* unused page is not on radix-tree now. */ - if (unused) - __mem_cgroup_uncharge_common(unused, ctype); - - pc = lookup_page_cgroup(target); /* - * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. - * So, double-counting is effectively avoided. + * We disallowed uncharge of pages under migration because mapcount + * of the page goes down to zero, temporarly. + * Clear the flag and check the page should be charged. */ - __mem_cgroup_commit_charge(mem, pc, ctype); + pc = lookup_page_cgroup(oldpage); + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + + __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); /* - * Both of oldpage and newpage are still under lock_page(). - * Then, we don't have to care about race in radix-tree. - * But we have to be careful that this page is unmapped or not. - * - * There is a case for !page_mapped(). At the start of - * migration, oldpage was mapped. But now, it's zapped. - * But we know *target* page is not freed/reused under us. - * mem_cgroup_uncharge_page() does all necessary checks. + * If a page is a file cache, radix-tree replacement is very atomic + * and we can skip this check. When it was an Anon page, its mapcount + * goes down to 0. But because we added MIGRATION flage, it's not + * uncharged yet. There are several case but page->mapcount check + * and USED bit check in mem_cgroup_uncharge_page() will do enough + * check. (see prepare_charge() also) */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) - mem_cgroup_uncharge_page(target); + if (PageAnon(used)) + mem_cgroup_uncharge_page(used); /* - * At migration, we may charge account against cgroup which has no tasks + * At migration, we may charge account against cgroup which has no + * tasks. * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. 
*/ @@ -2313,10 +2740,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memswlimit; + u64 memswlimit, memlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); u64 curusage, oldusage; + int enlarge; /* * For keeping hierarchical_reclaim simple, how long we should retry @@ -2327,6 +2755,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + enlarge = 0; while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -2344,6 +2773,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + + memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); + if (memlimit < val) + enlarge = 1; + ret = res_counter_set_limit(&memcg->res, val); if (!ret) { if (memswlimit == val) @@ -2365,6 +2799,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } @@ -2373,9 +2809,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memlimit, oldusage, curusage; + u64 memlimit, memswlimit, oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; + int enlarge = 0; /* see mem_cgroup_resize_res_limit */ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; @@ -2397,6 +2834,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + if (memswlimit < val) + enlarge = 1; ret = res_counter_set_limit(&memcg->memsw, val); if (!ret) { if (memlimit == val) @@ -2419,12 +2859,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, - int zid) + gfp_t gfp_mask) { unsigned long nr_reclaimed = 0; struct mem_cgroup_per_zone *mz, *next_mz = NULL; @@ -2436,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, if (order > 0) return 0; - mctz = soft_limit_tree_node_zone(nid, zid); + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); /* * This loop can run a while, specially if mem_cgroup's continuously * keep exceeding their soft limit and putting the system under @@ -2545,7 +2986,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, pc = list_entry(list->prev, struct page_cgroup, lru); if (busy == pc) { list_move(&pc->lru, list); - busy = 0; + busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } @@ -2610,6 +3051,7 @@ move_account: if (ret) break; } + memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) goto try_to_free; @@ -2704,7 +3146,7 @@ static int mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) { struct mem_cgroup_idx_data *d = data; - d->val += mem_cgroup_read_stat(&mem->stat, d->idx); + d->val += mem_cgroup_read_stat(mem, d->idx); return 0; } @@ -2719,40 +3161,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, *val = d.val; } +static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) +{ + u64 idx_val, val; + + if (!mem_cgroup_is_root(mem)) { + if (!swap) + return res_counter_read_u64(&mem->res, RES_USAGE); + else + return 
res_counter_read_u64(&mem->memsw, RES_USAGE); + } + + mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + + if (swap) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT, &idx_val); + val += idx_val; + } + + return val << PAGE_SHIFT; +} + static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 idx_val, val; + u64 val; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE && mem_cgroup_is_root(mem)) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; - val <<= PAGE_SHIFT; - } else + if (name == RES_USAGE) + val = mem_cgroup_usage(mem, false); + else val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - if (name == RES_USAGE && mem_cgroup_is_root(mem)) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT, &idx_val); - val += idx_val; - val <<= PAGE_SHIFT; - } else + if (name == RES_USAGE) + val = mem_cgroup_usage(mem, true); + else val = res_counter_read_u64(&mem->memsw, name); break; default: @@ -2865,6 +3317,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } +static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; +} + +#ifdef CONFIG_MMU +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + if (val >= (1 << NR_MOVE_TYPE)) + return -EINVAL; + /* + * We check this value several times in both in can_attach() and + * attach(), so we need cgroup lock to prevent this value from being + * inconsistent. 
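
For the root cgroup the consolidated mem_cgroup_usage() above derives usage from the summed statistics instead of the res_counter. A tiny model of that arithmetic, with invented page counts and PAGE_SHIFT assumed to be 12 (4K pages):

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed 4K pages */

int main(void)
{
	unsigned long long cache = 300, rss = 200, swapped = 50;	/* pages */
	unsigned long long mem_usage = (cache + rss) << PAGE_SHIFT;
	unsigned long long memsw_usage = (cache + rss + swapped) << PAGE_SHIFT;

	printf("usage_in_bytes       = %llu\n", mem_usage);
	printf("memsw.usage_in_bytes = %llu\n", memsw_usage);
	return 0;
}
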
+ */ + cgroup_lock(); + mem->move_charge_at_immigrate = val; + cgroup_unlock(); + + return 0; +} +#else +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + return -ENOSYS; +} +#endif + /* For read statistics */ enum { @@ -2910,18 +3395,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s64 val; /* per cpu stat */ - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } @@ -3049,12 +3534,341 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return 0; } +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + u64 usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds.primary); + else + t = rcu_dereference(memcg->memsw_thresholds.primary); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = t->current_threshold; + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. + */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd, 1); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. 
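
The move_charge_at_immigrate handler above accepts a small bitmap (bit 0 for anon, bit 1 for file, matching the MOVE_CHARGE_TYPE_* enum earlier in this file). A userspace sketch of how a destination cgroup would enable it before a task is migrated into it; the /cgroup/B mount point and the pid are illustrative only.

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* "3" = move both anon (bit 0) and file (bit 1) charges. */
	if (write_str("/cgroup/B/memory.move_charge_at_immigrate", "3"))
		return 1;
	/* Moving pid 1234 into B now also moves its anon and file charges. */
	return write_str("/cgroup/B/tasks", "1234") ? 1 : 0;
}

Because move_anon()/move_file() test the destination's flags, it is the cgroup being moved into that decides whether charges follow the task.
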
+ */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd, 1); + + /* Update current_threshold */ + t->current_threshold = i - 1; +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + return _a->threshold - _b->threshold; +} + +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_eventfd_list *ev; + + list_for_each_entry(ev, &mem->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); +} + +static int mem_cgroup_usage_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + int type = MEMFILE_TYPE(cft->private); + u64 threshold, usage; + int i, size, ret; + + ret = res_counter_memparse_write_strategy(args, &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) + thresholds = &memcg->thresholds; + else if (type == _MEMSWAP) + thresholds = &memcg->memsw_thresholds; + else + BUG(); + + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds->primary) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + size = thresholds->primary ? thresholds->primary->size + 1 : 1; + + /* Allocate memory for new array of thresholds */ + new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto unlock; + } + new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds->primary) { + memcpy(new->entries, thresholds->primary->entries, (size - 1) * + sizeof(struct mem_cgroup_threshold)); + } + + /* Add new threshold */ + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(new->entries, size, sizeof(struct mem_cgroup_threshold), + compare_thresholds, NULL); + + /* Find current threshold */ + new->current_threshold = -1; + for (i = 0; i < size; i++) { + if (new->entries[i].threshold < usage) { + /* + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment + * it here. 
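
The backward/forward walk above starts from current_threshold, which always points at the threshold just below the last observed usage, so in the common case only one array element is touched. A compact userspace model of that scan over a sorted threshold array; the values are invented and printf() stands in for eventfd_signal().

#include <stdio.h>

struct threshold { unsigned long long value; };

static struct threshold entries[] = { {10}, {20}, {40}, {80} };
static int current_threshold = 1;	/* entries[1]==20 was just below old usage */

static void signal_threshold(int i)
{
	printf("threshold %llu crossed\n", entries[i].value);
}

static void check(unsigned long long usage)
{
	int size = sizeof(entries) / sizeof(entries[0]);
	int i = current_threshold;

	/* usage fell: notify every threshold we dropped back under */
	for (; i >= 0 && entries[i].value > usage; i--)
		signal_threshold(i);
	/* usage rose: notify every threshold we climbed over */
	for (i++; i < size && entries[i].value <= usage; i++)
		signal_threshold(i);
	current_threshold = i - 1;	/* again points just below usage */
}

int main(void)
{
	check(45);	/* crosses 40 upward */
	check(5);	/* crosses 40, 20 and 10 downward */
	return 0;
}
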
+ */ + ++new->current_threshold; + } + } + + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + int type = MEMFILE_TYPE(cft->private); + u64 usage; + int i, j, size; + + mutex_lock(&memcg->thresholds_lock); + if (type == _MEM) + thresholds = &memcg->thresholds; + else if (type == _MEMSWAP) + thresholds = &memcg->memsw_thresholds; + else + BUG(); + + /* + * Something went wrong if we trying to unregister a threshold + * if we don't have thresholds + */ + BUG_ON(!thresholds); + + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); + + /* Check if a threshold crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Calculate new number of threshold */ + size = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) + size++; + } + + new = thresholds->spare; + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + kfree(new); + new = NULL; + goto swap_buffers; + } + + new->size = size; + + /* Copy thresholds and find current threshold */ + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) + continue; + + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold < usage) { + /* + * new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } + j++; + } + +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + + mutex_unlock(&memcg->thresholds_lock); +} + +static int mem_cgroup_oom_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *event; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + mutex_lock(&memcg_oom_mutex); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? 
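
These register/unregister hooks back the cgroup eventfd notification interface, where userspace writes "<event_fd> <fd of the target file> <args>" to cgroup.event_control. A sketch of arming a usage threshold that way; the /cgroup/A paths and the 32M value are illustrative.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char cmd[64];
	uint64_t ticks;
	int efd = eventfd(0, 0);
	int tfd = open("/cgroup/A/memory.usage_in_bytes", O_RDONLY);
	int cfd = open("/cgroup/A/cgroup.event_control", O_WRONLY);

	if (efd < 0 || tfd < 0 || cfd < 0)
		return 1;

	/* "<eventfd> <fd of the control file> <threshold in bytes>" */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, tfd, 32ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	read(efd, &ticks, sizeof(ticks));	/* blocks until the threshold is crossed */
	printf("usage crossed 32M (%llu notifications)\n",
	       (unsigned long long)ticks);
	return 0;
}
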
*/ + if (atomic_read(&memcg->oom_lock)) + eventfd_signal(eventfd, 1); + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *ev, *tmp; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + + mutex_lock(&memcg_oom_mutex); + + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + mutex_unlock(&memcg_oom_mutex); +} + +static int mem_cgroup_oom_control_read(struct cgroup *cgrp, + struct cftype *cft, struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); + + if (atomic_read(&mem->oom_lock)) + cb->fill(cb, "under_oom", 1); + else + cb->fill(cb, "under_oom", 0); + return 0; +} + +static int mem_cgroup_oom_control_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *parent; + + /* cannot set to root cgroup and only 0 and 1 are allowed */ + if (!cgrp->parent || !((val == 0) || (val == 1))) + return -EINVAL; + + parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* oom-kill-disable is a flag for subhierarchy. */ + if ((parent->use_hierarchy) || + (mem->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); + return -EINVAL; + } + mem->oom_kill_disable = val; + if (!val) + memcg_oom_recover(mem); + cgroup_unlock(); + return 0; +} static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -3098,6 +3912,19 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_swappiness_read, .write_u64 = mem_cgroup_swappiness_write, }, + { + .name = "move_charge_at_immigrate", + .read_u64 = mem_cgroup_move_charge_read, + .write_u64 = mem_cgroup_move_charge_write, + }, + { + .name = "oom_control", + .read_map = mem_cgroup_oom_control_read, + .write_u64 = mem_cgroup_oom_control_write, + .register_event = mem_cgroup_oom_register_event, + .unregister_event = mem_cgroup_oom_unregister_event, + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -3106,6 +3933,8 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read_u64 = mem_cgroup_read, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -3180,24 +4009,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) kfree(mem->info.nodeinfo[node]); } -static int mem_cgroup_size(void) -{ - int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); - return sizeof(struct mem_cgroup) + cpustat_size; -} - static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *mem; - int size = mem_cgroup_size(); + int size = sizeof(struct mem_cgroup); + /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) mem = kmalloc(size, GFP_KERNEL); else mem = vmalloc(size); - if (mem) - memset(mem, 0, size); + if (!mem) + return NULL; + + memset(mem, 0, size); + 
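
A short userspace sketch for the memory.oom_control file added above: writing 1 sets oom_kill_disable so tasks wait instead of being killed, and reading back reports oom_kill_disable and under_oom. The /cgroup/A path is illustrative.

#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/cgroup/A/memory.oom_control", "w");

	if (!f)
		return 1;
	fprintf(f, "1\n");	/* oom_kill_disable=1: tasks wait instead of being killed */
	fclose(f);

	f = fopen("/cgroup/A/memory.oom_control", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* prints oom_kill_disable and under_oom */
	fclose(f);
	return 0;
}

An eventfd can also be attached to memory.oom_control through cgroup.event_control in the same way as a usage threshold, which is what mem_cgroup_oom_register_event() above serves.
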
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!mem->stat) { + if (size < PAGE_SIZE) + kfree(mem); + else + vfree(mem); + mem = NULL; + } return mem; } @@ -3222,7 +4056,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); - if (mem_cgroup_size() < PAGE_SIZE) + free_percpu(mem->stat); + if (sizeof(struct mem_cgroup) < PAGE_SIZE) kfree(mem); else vfree(mem); @@ -3233,9 +4068,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) atomic_inc(&mem->refcnt); } -static void mem_cgroup_put(struct mem_cgroup *mem) +static void __mem_cgroup_put(struct mem_cgroup *mem, int count) { - if (atomic_dec_and_test(&mem->refcnt)) { + if (atomic_sub_and_test(count, &mem->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(mem); __mem_cgroup_free(mem); if (parent) @@ -3243,6 +4078,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) } } +static void mem_cgroup_put(struct mem_cgroup *mem) +{ + __mem_cgroup_put(mem, 1); +} + /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -3319,10 +4159,10 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) INIT_WORK(&stock->work, drain_local_stock); } hotcpu_notifier(memcg_stock_cpu_callback, 0); - } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; + mem->oom_kill_disable = parent->oom_kill_disable; } if (parent && parent->use_hierarchy) { @@ -3341,10 +4181,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); + INIT_LIST_HEAD(&mem->oom_notify); if (parent) mem->swappiness = get_swappiness(parent); atomic_set(&mem->refcnt, 1); + mem->move_charge_at_immigrate = 0; + mutex_init(&mem->thresholds_lock); return &mem->css; free_out: __mem_cgroup_free(mem); @@ -3381,17 +4224,489 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, return ret; } +#ifdef CONFIG_MMU +/* Handlers for move charge at task migration. */ +#define PRECHARGE_COUNT_AT_ONCE 256 +static int mem_cgroup_do_precharge(unsigned long count) +{ + int ret = 0; + int batch_count = PRECHARGE_COUNT_AT_ONCE; + struct mem_cgroup *mem = mc.to; + + if (mem_cgroup_is_root(mem)) { + mc.precharge += count; + /* we don't need css_get for root */ + return ret; + } + /* try to charge at once */ + if (count > 1) { + struct res_counter *dummy; + /* + * "mem" cannot be under rmdir() because we've already checked + * by cgroup_lock_live_cgroup() that it is not removed and we + * are still under the same cgroup_mutex. So we can postpone + * css_get(). 
+ */
+ if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
+ goto one_by_one;
+ if (do_swap_account && res_counter_charge(&mem->memsw,
+ PAGE_SIZE * count, &dummy)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+ goto one_by_one;
+ }
+ mc.precharge += count;
+ return ret;
+ }
+one_by_one:
+ /* fall back to one by one charge */
+ while (count--) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ if (!batch_count--) {
+ batch_count = PRECHARGE_COUNT_AT_ONCE;
+ cond_resched();
+ }
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ if (ret || !mem)
+ /* mem_cgroup_clear_mc() will do uncharge later */
+ return -ENOMEM;
+ mc.precharge++;
+ }
+ return ret;
+}
+
+/**
+ * is_target_pte_for_mc - check whether a pte is a valid target for move charge
+ * @vma: the vma to which the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer where the target page or swap entry will be stored (can be NULL)
+ *
+ * Returns
+ * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ * move charge. If @target is not NULL, the page is stored in target->page
+ * with an extra refcount taken (callers should handle it).
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. If @target is not NULL, the entry is stored
+ * in target->ent.
+ *
+ * Called with pte lock held.
+ */
+union mc_target {
+ struct page *page;
+ swp_entry_t ent;
+};
+
+enum mc_target_type {
+ MC_TARGET_NONE, /* not used */
+ MC_TARGET_PAGE,
+ MC_TARGET_SWAP,
+};
+
+static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent)
+{
+ struct page *page = vm_normal_page(vma, addr, ptent);
+
+ if (!page || !page_mapped(page))
+ return NULL;
+ if (PageAnon(page)) {
+ /* we don't move shared anon */
+ if (!move_anon() || page_mapcount(page) > 2)
+ return NULL;
+ } else if (!move_file())
+ /* we ignore mapcount for file pages */
+ return NULL;
+ if (!get_page_unless_zero(page))
+ return NULL;
+
+ return page;
+}
+
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ int usage_count;
+ struct page *page = NULL;
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!move_anon() || non_swap_entry(ent))
+ return NULL;
+ usage_count = mem_cgroup_count_swap_user(ent, &page);
+ if (usage_count > 1) { /* we don't move shared anon */
+ if (page)
+ put_page(page);
+ return NULL;
+ }
+ if (do_swap_account)
+ entry->val = ent.val;
+
+ return page;
+}
+
+static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ struct page *page = NULL;
+ struct inode *inode;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+
+ if (!vma->vm_file) /* anonymous vma */
+ return NULL;
+ if (!move_file())
+ return NULL;
+
+ inode = vma->vm_file->f_path.dentry->d_inode;
+ mapping = vma->vm_file->f_mapping;
+ if (pte_none(ptent))
+ pgoff = linear_page_index(vma, addr);
+ else /* pte_file(ptent) is true */
+ pgoff = pte_to_pgoff(ptent);
+
+ /* page is moved even if it's not RSS of this task (page-faulted). */
+ if (!mapping_cap_swap_backed(mapping)) { /* normal file */
+ page = find_get_page(mapping, pgoff);
+ } else { /* shmem/tmpfs file. we should take account of swap too.
*/ + swp_entry_t ent; + mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + if (do_swap_account) + entry->val = ent.val; + } + + return page; +} + +static int is_target_pte_for_mc(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, union mc_target *target) +{ + struct page *page = NULL; + struct page_cgroup *pc; + int ret = 0; + swp_entry_t ent = { .val = 0 }; + + if (pte_present(ptent)) + page = mc_handle_present_pte(vma, addr, ptent); + else if (is_swap_pte(ptent)) + page = mc_handle_swap_pte(vma, addr, ptent, &ent); + else if (pte_none(ptent) || pte_file(ptent)) + page = mc_handle_file_pte(vma, addr, ptent, &ent); + + if (!page && !ent.val) + return 0; + if (page) { + pc = lookup_page_cgroup(page); + /* + * Do only loose check w/o page_cgroup lock. + * mem_cgroup_move_account() checks the pc is valid or not under + * the lock. + */ + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) + target->page = page; + } + if (!ret || !target) + put_page(page); + } + /* There is a swap entry and a page doesn't exist or isn't charged */ + if (ent.val && !ret && + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; + } + return ret; +} + +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + if (is_target_pte_for_mc(vma, addr, *pte, NULL)) + mc.precharge++; /* increment precharge temporarily */ + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + return 0; +} + +static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) +{ + unsigned long precharge; + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct mm_walk mem_cgroup_count_precharge_walk = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, + .mm = mm, + .private = vma, + }; + if (is_vm_hugetlb_page(vma)) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &mem_cgroup_count_precharge_walk); + } + up_read(&mm->mmap_sem); + + precharge = mc.precharge; + mc.precharge = 0; + + return precharge; +} + +static int mem_cgroup_precharge_mc(struct mm_struct *mm) +{ + return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + struct mem_cgroup *to = mc.to; + + /* we must uncharge all the leftover precharges from mc.to */ + if (mc.precharge) { + __mem_cgroup_cancel_charge(mc.to, mc.precharge); + mc.precharge = 0; + } + /* + * we didn't uncharge from mc.from at mem_cgroup_move_account(), so + * we must uncharge here. + */ + if (mc.moved_charge) { + __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + mc.moved_charge = 0; + } + /* we must fixup refcnts and charges */ + if (mc.moved_swap) { + /* uncharge swap account from the old cgroup */ + if (!mem_cgroup_is_root(mc.from)) + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); + __mem_cgroup_put(mc.from, mc.moved_swap); + + if (!mem_cgroup_is_root(mc.to)) { + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. 
+ */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); + } + /* we've already done mem_cgroup_get(mc.to) */ + + mc.moved_swap = 0; + } + spin_lock(&mc.lock); + mc.from = NULL; + mc.to = NULL; + mc.moving_task = NULL; + spin_unlock(&mc.lock); + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + int ret = 0; + struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); + + if (mem->move_charge_at_immigrate) { + struct mm_struct *mm; + struct mem_cgroup *from = mem_cgroup_from_task(p); + + VM_BUG_ON(from == mem); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + VM_BUG_ON(mc.moving_task); + spin_lock(&mc.lock); + mc.from = from; + mc.to = mem; + mc.precharge = 0; + mc.moved_charge = 0; + mc.moved_swap = 0; + mc.moving_task = current; + spin_unlock(&mc.lock); + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); + } + mmput(mm); + } + return ret; +} + +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + mem_cgroup_clear_mc(); +} + +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + int ret = 0; + struct vm_area_struct *vma = walk->private; + pte_t *pte; + spinlock_t *ptl; + +retry: + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; addr += PAGE_SIZE) { + pte_t ptent = *(pte++); + union mc_target target; + int type; + struct page *page; + struct page_cgroup *pc; + swp_entry_t ent; + + if (!mc.precharge) + break; + + type = is_target_pte_for_mc(vma, addr, ptent, &target); + switch (type) { + case MC_TARGET_PAGE: + page = target.page; + if (isolate_lru_page(page)) + goto put; + pc = lookup_page_cgroup(page); + if (!mem_cgroup_move_account(pc, + mc.from, mc.to, false)) { + mc.precharge--; + /* we uncharge from mc.from later. */ + mc.moved_charge++; + } + putback_lru_page(page); +put: /* is_target_pte_for_mc() gets the page */ + put_page(page); + break; + case MC_TARGET_SWAP: + ent = target.ent; + if (!mem_cgroup_move_swap_account(ent, + mc.from, mc.to, false)) { + mc.precharge--; + /* we fixup refcnts and charges later. */ + mc.moved_swap++; + } + break; + default: + break; + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + if (addr != end) { + /* + * We have consumed all precharges we got in can_attach(). + * We try charge one by one, but don't do any additional + * charges to mc.to if we have failed in charge once in attach() + * phase. + */ + ret = mem_cgroup_do_precharge(1); + if (!ret) + goto retry; + } + + return ret; +} + +static void mem_cgroup_move_charge(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + lru_add_drain_all(); + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int ret; + struct mm_walk mem_cgroup_move_charge_walk = { + .pmd_entry = mem_cgroup_move_charge_pte_range, + .mm = mm, + .private = vma, + }; + if (is_vm_hugetlb_page(vma)) + continue; + ret = walk_page_range(vma->vm_start, vma->vm_end, + &mem_cgroup_move_charge_walk); + if (ret) + /* + * means we have consumed all precharges and failed in + * doing additional charge. 
Just abandon here. + */ + break; + } + up_read(&mm->mmap_sem); +} + static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, struct task_struct *p, bool threadgroup) { - /* - * FIXME: It's better to move charges of this process from old - * memcg to new memcg. But it's just on TODO-List now. - */ + struct mm_struct *mm; + + if (!mc.to) + /* no need to move charge */ + return; + + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } + mem_cgroup_clear_mc(); +} +#else /* !CONFIG_MMU */ +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + return 0; } +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ +} +static void mem_cgroup_move_task(struct cgroup_subsys *ss, + struct cgroup *cont, + struct cgroup *old_cont, + struct task_struct *p, + bool threadgroup) +{ +} +#endif struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", @@ -3400,6 +4715,8 @@ struct cgroup_subsys mem_cgroup_subsys = { .pre_destroy = mem_cgroup_pre_destroy, .destroy = mem_cgroup_destroy, .populate = mem_cgroup_populate, + .can_attach = mem_cgroup_can_attach, + .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .early_init = 0, .use_id = 1, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17299fd..757f6b0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -44,6 +44,9 @@ #include <linux/migrate.h> #include <linux/page-isolation.h> #include <linux/suspend.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -180,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter); * signal. */ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) + unsigned long pfn, struct page *page) { struct siginfo si; int ret; @@ -195,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -232,7 +235,7 @@ void shake_page(struct page *p, int access) int nr; do { nr = shrink_slab(1000, GFP_KERNEL, 1000); - if (page_count(p) == 0) + if (page_count(p) == 1) break; } while (nr > 10); } @@ -324,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) + int fail, struct page *page, unsigned long pfn) { struct to_kill *tk, *next; @@ -349,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * process anyways. 
*/ else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) + pfn, page) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -383,9 +386,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ goto out; for_each_process (tsk) { + struct anon_vma_chain *vmac; + if (!task_early_kill(tsk)) continue; - list_for_each_entry (vma, &av->head, anon_vma_node) { + list_for_each_entry(vmac, &av->head, same_anon_vma) { + vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == tsk->mm) @@ -685,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) /* * Huge pages. Needs work. * Issues: - * No rmap support so we cannot find the original mapper. In theory could walk - * all MMs and look for the mappings, but that would be non atomic and racy. - * Need rmap for hugepages for this. Alternatively we could employ a heuristic, - * like just walking the current process and hoping it has it mapped (that - * should be usually true for the common "shared database cache" case) - * Should handle free huge pages and dequeue them too, but this needs to - * handle huge page accounting correctly. + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. + * - To support soft-offlining for hugepage, we need to support hugepage + * migration. */ static int me_huge_page(struct page *p, unsigned long pfn) { - return FAILED; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. + */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + __isolate_hwpoisoned_huge_page(hpage); + return RECOVERED; + } + return DELAYED; } /* @@ -833,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int i; int kill = 1; + struct page *hpage = compound_head(p); if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -841,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(p)) + if (!page_mapped(hpage)) return SWAP_SUCCESS; - if (PageCompound(p) || PageKsm(p)) + if (PageKsm(p)) return SWAP_FAIL; if (PageSwapCache(p)) { @@ -859,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). 
*/ - mapping = page_mapping(p); - if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { - if (page_mkclean(p)) { - SetPageDirty(p); + mapping = page_mapping(hpage); + if (!PageDirty(hpage) && mapping && + mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(hpage)) { + SetPageDirty(hpage); } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; @@ -881,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(p, &tokill); + collect_procs(hpage, &tokill); /* * try_to_unmap can fail temporarily due to races. * Try a few times (RED-PEN better strategy?) */ for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(p, ttu); + ret = try_to_unmap(hpage, ttu); if (ret == SWAP_SUCCESS) break; pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); @@ -896,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -907,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(p), trapno, - ret != SWAP_SUCCESS, pfn); + kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + ret != SWAP_SUCCESS, p, pfn); return ret; } +static void set_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + SetPageHWPoison(hpage + i); +} + +static void clear_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + ClearPageHWPoison(hpage + i); +} + int __memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; + struct page *hpage; int res; + unsigned int nr_pages; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -930,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); + hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; } - atomic_long_add(1, &mce_bad_pages); + nr_pages = 1 << compound_order(hpage); + atomic_long_add(nr_pages, &mce_bad_pages); /* * We need/can do nothing about count=0 pages. @@ -949,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(compound_head(p))) { + !get_page_unless_zero(hpage)) { if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; @@ -967,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); - if (!PageLRU(p)) { + if (!PageLRU(p) && !PageHuge(p)) { /* * shake_page could have turned it free. */ @@ -987,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. 
*/ - lock_page_nosync(p); + lock_page_nosync(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -999,12 +1039,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_dec(&mce_bad_pages); - unlock_page(p); - put_page(p); + atomic_long_sub(nr_pages, &mce_bad_pages); + unlock_page(hpage); + put_page(hpage); return 0; } + /* + * For error on the tail page, we should set PG_hwpoison + * on the head page to show that the hugepage is hwpoisoned + */ + if (PageTail(p) && TestSetPageHWPoison(hpage)) { + action_result(pfn, "hugepage already hardware poisoned", + IGNORED); + unlock_page(hpage); + put_page(hpage); + return 0; + } + /* + * Set PG_hwpoison on all pages in an error hugepage, + * because containment is done in hugepage unit for now. + * Since we have done TestSetPageHWPoison() for the head page with + * page lock held, we can safely set PG_hwpoison bits on tail pages. + */ + if (PageHuge(p)) + set_page_hwpoison_huge_page(hpage); + wait_on_page_writeback(p); /* @@ -1034,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } } out: - unlock_page(p); + unlock_page(hpage); return res; } EXPORT_SYMBOL_GPL(__memory_failure); @@ -1078,6 +1138,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + unsigned int nr_pages; if (!pfn_valid(pfn)) return -ENXIO; @@ -1090,9 +1151,11 @@ int unpoison_memory(unsigned long pfn) return 0; } + nr_pages = 1 << compound_order(page); + if (!get_page_unless_zero(page)) { if (TestClearPageHWPoison(p)) - atomic_long_dec(&mce_bad_pages); + atomic_long_sub(nr_pages, &mce_bad_pages); pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1104,11 +1167,13 @@ int unpoison_memory(unsigned long pfn) * the PG_hwpoison page will be caught and isolated on the entrance to * the free buddy page pool. */ - if (TestClearPageHWPoison(p)) { + if (TestClearPageHWPoison(page)) { pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_dec(&mce_bad_pages); + atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; } + if (PageHuge(p)) + clear_page_hwpoison_huge_page(page); unlock_page(page); put_page(page); @@ -1292,3 +1357,35 @@ done: /* keep elevated page count for bad page */ return ret; } + +/* + * The caller must hold current->mm->mmap_sem in read mode. 
+ */ +int is_hwpoison_address(unsigned long addr) +{ + pgd_t *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t pte, *ptep; + swp_entry_t entry; + + pgdp = pgd_offset(current->mm, addr); + if (!pgd_present(*pgdp)) + return 0; + pudp = pud_offset(pgdp, addr); + pud = *pudp; + if (!pud_present(pud) || pud_large(pud)) + return 0; + pmdp = pmd_offset(pudp, addr); + pmd = *pmdp; + if (!pmd_present(pmd) || pmd_large(pmd)) + return 0; + ptep = pte_offset_map(pmdp, addr); + pte = *ptep; + pte_unmap(ptep); + if (!is_swap_pte(pte)) + return 0; + entry = pte_to_swp_entry(pte); + return is_hwpoison_entry(entry); +} +EXPORT_SYMBOL_GPL(is_hwpoison_address); diff --git a/mm/memory.c b/mm/memory.c index 09e4b1b..0e18b4d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -56,6 +56,7 @@ #include <linux/kallsyms.h> #include <linux/swapops.h> #include <linux/elf.h> +#include <linux/gfp.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -121,6 +122,77 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); + +#if defined(SPLIT_RSS_COUNTING) + +static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +#endif + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -235,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb, { pgd_t *pgd; unsigned long next; - unsigned long start; /* * The next few lines have given us lots of grief... 
@@ -279,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb, if (addr > end - 1) return; - start = addr; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -300,7 +370,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { @@ -314,7 +384,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, @@ -376,12 +446,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + if (current->mm == mm) + sync_mm_rss(current, mm); + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -430,12 +508,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); - if (page) { - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); - } + if (page) + dump_page(page); printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); @@ -597,7 +671,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -632,7 +708,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - rss[PageAnon(page)]++; + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: @@ -648,11 +727,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; @@ -688,7 +768,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); - add_mm_rss(dst_mm, rss[0], rss[1]); + add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -816,8 +896,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -863,14 +944,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, 
set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -887,13 +968,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1139,8 +1225,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/* - * Do a quick page-table lookup for a single page. +/** + * follow_page - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1297,10 +1392,20 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - struct page *page = vm_normal_page(gate_vma, start, *pte); + struct page *page; + + page = vm_normal_page(gate_vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && + is_zero_pfn(pte_pfn(*pte))) + page = pte_page(*pte); + else { + pte_unmap(pte); + return i ? : -EFAULT; + } + } pages[i] = page; - if (page) - get_page(page); + get_page(page); } pte_unmap(pte); if (vmas) @@ -1527,7 +1632,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -1593,7 +1698,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ entry = pte_mkspecial(pfn_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? 
*/ retval = 0; out_unlock: @@ -1901,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long start = addr, end = addr + size; + unsigned long end = addr + size; int err; BUG_ON(addr >= end); - mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1913,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - mmu_notifier_invalidate_range_end(mm, start, end); + return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -2044,6 +2148,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(old_page); } reuse = reuse_swap_page(old_page); + if (reuse) + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2116,7 +2227,7 @@ reuse: entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); ret |= VM_FAULT_WRITE; goto unlock; } @@ -2163,11 +2274,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2185,7 +2296,7 @@ gotten: * new page to be mapped directly into the secondary page table. */ set_pte_at_notify(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); if (old_page) { /* * Only after switching the pte to the new page may @@ -2512,10 +2623,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page; + struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; struct mem_cgroup *ptr = NULL; + int exclusive = 0; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) @@ -2567,10 +2679,25 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - page = ksm_might_need_to_copy(page, vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; + /* + * Make sure try_to_free_swap or reuse_swap_page or swapoff did not + * release the swapcache from under us. The page pin, and pte_same + * test below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not changed. 
+ */ + if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + goto out_page; + + if (ksm_might_need_to_copy(page, vma, address)) { + swapcache = page; + page = ksm_does_need_to_copy(page, vma, address); + + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + swapcache = NULL; + goto out_page; + } } if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { @@ -2604,15 +2731,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + exclusive = 1; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); + do_page_add_anon_rmap(page, vma, address, exclusive); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2620,6 +2750,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); + if (swapcache) { + /* + * Hold the lock to avoid the swap entry to be reused + * until we take the PT lock for the pte_same() check + * (to avoid false positives from pte_same). For + * further safety release the lock after the swap_free + * so that the swap count won't change under a + * parallel locked swapcache. + */ + unlock_page(swapcache); + page_cache_release(swapcache); + } if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); @@ -2629,7 +2771,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2641,10 +2783,48 @@ out_page: unlock_page(page); out_release: page_cache_release(page); + if (swapcache) { + unlock_page(swapcache); + page_cache_release(swapcache); + } return ret; } /* + * This is like a special single-page "expand_{down|up}wards()", + * except we must first make sure that 'address{-|+}PAGE_SIZE' + * doesn't hit another vma. + */ +static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) +{ + address &= PAGE_MASK; + if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { + struct vm_area_struct *prev = vma->vm_prev; + + /* + * Is there a mapping abutting this one below? + * + * That's only ok if it's the same stack mapping + * that has gotten split.. + */ + if (prev && prev->vm_end == address) + return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; + + expand_stack(vma, address - PAGE_SIZE); + } + if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { + struct vm_area_struct *next = vma->vm_next; + + /* As VM_GROWSDOWN but s/below/above/ */ + if (next && next->vm_start == address + PAGE_SIZE) + return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; + + expand_upwards(vma, address + PAGE_SIZE); + } + return 0; +} + +/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. 
@@ -2657,19 +2837,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; + pte_unmap(page_table); + + /* Check if we need to add a guard page to the stack */ + if (check_stack_guard_page(vma, address) < 0) + return VM_FAULT_SIGBUS; + + /* Use the zero-page for reads */ if (!(flags & FAULT_FLAG_WRITE)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), vma->vm_page_prot)); - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; goto setpte; } /* Allocate our own private page. */ - pte_unmap(page_table); - if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2688,13 +2872,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); return 0; @@ -2842,10 +3026,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -2855,7 +3039,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); } else { if (charged) mem_cgroup_uncharge_page(page); @@ -2992,7 +3176,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pte); } else { /* * This is needed only for protection faults but the arch code @@ -3023,6 +3207,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 030ce8a..dd186c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -28,6 +28,7 @@ #include <linux/pfn.h> #include <linux/suspend.h> #include <linux/mm_inline.h> +#include <linux/firmware-map.h> #include <asm/tlbflush.h> @@ -414,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) * This means the page allocator ignores this zone. * So, zonelist must be updated after online. 
*/ + mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -428,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + if (need_zonelists_rebuild) + build_all_zonelists(zone); + else + zone_pcp_update(zone); - zone_pcp_update(zone); + mutex_unlock(&zonelists_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -437,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } - if (need_zonelists_rebuild) - build_all_zonelists(); - else - vm_total_pages = nr_free_pagecache_pages(); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); @@ -481,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } +/* + * called by cpu_up() to online a node without onlined memory. + */ +int mem_online_node(int nid) +{ + pg_data_t *pgdat; + int ret; + + lock_system_sleep(); + pgdat = hotadd_new_pgdat(nid, 0); + if (pgdat) { + ret = -ENOMEM; + goto out; + } + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + +out: + unlock_system_sleep(); + return ret; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -523,6 +550,9 @@ int __ref add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } + /* create new memmap entry */ + firmware_map_add_hotplug(start, start + size, "System RAM"); + goto out; error: @@ -554,19 +584,19 @@ static inline int pageblock_free(struct page *page) /* Return the start of the next active pageblock after a given page */ static struct page *next_active_pageblock(struct page *page) { - int pageblocks_stride; - /* Ensure the starting page is pageblock-aligned */ BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); - /* Move forward by at least 1 * pageblock_nr_pages */ - pageblocks_stride = 1; - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) - pageblocks_stride += page_order(page) - pageblock_order; + if (pageblock_free(page)) { + int order; + /* be careful. we don't have locks, page_order can be changed.*/ + order = page_order(page); + if ((order < MAX_ORDER) && (order >= pageblock_order)) + return page + (1 << order); + } - return page + (pageblocks_stride * pageblock_nr_pages); + return page + pageblock_nr_pages; } /* Checks if this range of memory is likely to be hot-removable. 
*/ @@ -684,9 +714,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (page_count(page)) not_managed++; #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "removing from LRU failed" - " %lx/%d/%lx\n", - pfn, page_count(page), page->flags); + printk(KERN_ALERT "removing pfn %lx from LRU failed\n", + pfn); + dump_page(page); #endif } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 290fb5b..f969da5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -73,7 +73,6 @@ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/cpuset.h> -#include <linux/gfp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> @@ -120,7 +119,22 @@ struct mempolicy default_policy = { static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; /* Check that the nodemask contains at least one populated zone */ @@ -128,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask) { int nd, k; - /* Check that there is something useful in this mask */ - k = policy_zone; - for_each_node_mask(nd, *nodemask) { struct zone *z; @@ -146,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask) static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); + return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, @@ -278,12 +289,19 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { } -static void mpol_rebind_nodemask(struct mempolicy *pol, - const nodemask_t *nodes) +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -292,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); } - pol->v.nodes = tmp; + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + if (!node_isset(current->il_next, tmp)) { current->il_next = next_node(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) @@ -308,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes) + const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -331,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, } } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && step == 0 && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->mode].rebind(pol, newmask); + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); } /* @@ -350,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) { - mpol_rebind_policy(tsk->mempolicy, new); + mpol_rebind_policy(tsk->mempolicy, new, step); } /* @@ -367,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); up_write(&mm->mmap_sem); } @@ -563,24 +631,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) } /* Step 2: apply policy to a range and do splits. 
*/ -static int mbind_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct mempolicy *new) +static int mbind_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; - int err; + struct vm_area_struct *prev; + struct vm_area_struct *vma; + int err = 0; + pgoff_t pgoff; + unsigned long vmstart; + unsigned long vmend; - err = 0; - for (; vma && vma->vm_start < end; vma = next) { + vma = find_vma_prev(mm, start, &prev); + if (!vma || vma->vm_start > start) + return -EFAULT; + + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; - if (vma->vm_start < start) - err = split_vma(vma->vm_mm, vma, start, 1); - if (!err && vma->vm_end > end) - err = split_vma(vma->vm_mm, vma, end, 0); - if (!err) - err = policy_vma(vma, new); + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol); + if (prev) { + vma = prev; + next = vma->vm_next; + continue; + } + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + goto out; + } + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + goto out; + } + err = policy_vma(vma, new_pol); if (err) - break; + goto out; } + + out: return err; } @@ -780,9 +874,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = 0; if (nmask) { - task_lock(current); - get_policy_nodemask(pol, nmask); - task_unlock(current); + if (mpol_store_user_nodemask(pol)) { + *nmask = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, nmask); + task_unlock(current); + } } out: @@ -830,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) @@ -862,36 +960,36 @@ int do_migrate_pages(struct mm_struct *mm, if (err) goto out; -/* - * Find a 'source' bit set in 'tmp' whose corresponding 'dest' - * bit in 'to' is not also set in 'tmp'. Clear the found 'source' - * bit in 'tmp', and return that <source, dest> pair for migration. - * The pair of nodemasks 'to' and 'from' define the map. - * - * If no pair of bits is found that way, fallback to picking some - * pair of 'source' and 'dest' bits that are not the same. If the - * 'source' and 'dest' bits are the same, this represents a node - * that will be migrating to itself, so no pages need move. - * - * If no bits are left in 'tmp', or if all remaining bits left - * in 'tmp' correspond to the same bit in 'to', return false - * (nothing left to migrate). - * - * This lets us pick a pair of nodes to migrate between, such that - * if possible the dest node is not already occupied by some other - * source node, minimizing the risk of overloading the memory on a - * node that would happen if we migrated incoming memory to a node - * before migrating outgoing memory source that same node. - * - * A single scan of tmp is sufficient. As we go, we remember the - * most recent <s, d> pair that moved (s != d). If we find a pair - * that not only moved, but what's better, moved to an empty slot - * (d is not set in tmp), then we break out then, with that pair. 
- * Otherwise when we finish scannng from_tmp, we at least have the - * most recent <s, d> pair that moved. If we get all the way through - * the scan of tmp without finding any node that moved, much less - * moved to an empty node, then there is nothing left worth migrating. - */ + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that <source, dest> pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent <s, d> pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent <s, d> pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ tmp = *from_nodes; while (!nodes_empty(tmp)) { @@ -1047,7 +1145,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!IS_ERR(vma)) { int nr_failed = 0; - err = mbind_range(vma, start, end, new); + err = mbind_range(mm, start, end, new); if (!list_empty(&pagelist)) nr_failed = migrate_pages(&pagelist, new_vma_page, @@ -1177,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, new_nodes) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; + struct mm_struct *mm = NULL; struct task_struct *task; - nodemask_t old; - nodemask_t new; nodemask_t task_nodes; int err; + nodemask_t *old; + nodemask_t *new; + NODEMASK_SCRATCH(scratch); + + if (!scratch) + return -ENOMEM; + + old = &scratch->mask1; + new = &scratch->mask2; - err = get_nodes(&old, old_nodes, maxnode); + err = get_nodes(old, old_nodes, maxnode); if (err) - return err; + goto out; - err = get_nodes(&new, new_nodes, maxnode); + err = get_nodes(new, new_nodes, maxnode); if (err) - return err; + goto out; /* Find the mm_struct */ read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); - return -ESRCH; + err = -ESRCH; + goto out; } mm = get_task_mm(task); read_unlock(&tasklist_lock); + err = -EINVAL; if (!mm) - return -EINVAL; + goto out; /* * Check if this process has the right to modify the specified @@ -1224,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? 
*/ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { + if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } - if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { + if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { err = -EINVAL; goto out; } @@ -1238,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, if (err) goto out; - err = do_migrate_pages(mm, &old, &new, + err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: - mmput(mm); + if (mm) + mmput(mm); + NODEMASK_SCRATCH_FREE(scratch); + return err; } @@ -1415,15 +1525,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) /* * Normally, MPOL_BIND allocations are node-local within the * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node is part of the mask, we use the zonelist for + * current node isn't part of the mask, we use the zonelist for * the first node in the mask instead. */ if (unlikely(gfp & __GFP_THISNODE) && unlikely(!node_isset(nd, policy->v.nodes))) nd = first_node(policy->v.nodes); break; - case MPOL_INTERLEAVE: /* should not happen */ - break; default: BUG(); } @@ -1543,6 +1651,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. + * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1588,6 +1698,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) if (!(mask && current->mempolicy)) return false; + task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: @@ -1607,11 +1718,56 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) default: BUG(); } + task_unlock(current); return true; } #endif +/* + * mempolicy_nodemask_intersects + * + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default + * policy. Otherwise, check for intersection between mask and the policy + * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * policy, always return true since it may allocate elsewhere on fallback. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. + */ +bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) + return ret; + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (!mempolicy) + goto out; + + switch (mempolicy->mode) { + case MPOL_PREFERRED: + /* + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to + * allocate from, they may fallback to other nodes when oom. + * Thus, it's possible for tsk to have allocated memory from + * nodes in mask. + */ + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + ret = nodes_intersects(mempolicy->v.nodes, *mask); + break; + default: + BUG(); + } +out: + task_unlock(tsk); + return ret; +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. 
*/ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, @@ -1654,13 +1810,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1670,12 +1830,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1700,18 +1863,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); @@ -1721,6 +1889,9 @@ EXPORT_SYMBOL(alloc_pages_current); * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. 
*/ /* Slow path of a mempolicy duplicate */ @@ -1730,11 +1901,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - mpol_rebind_policy(old, &mems); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } - *new = *old; + rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; } @@ -1761,16 +1945,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, return tompol; } -static int mpol_match_intent(const struct mempolicy *a, - const struct mempolicy *b) -{ - if (a->flags != b->flags) - return 0; - if (!mpol_store_user_nodemask(a)) - return 1; - return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); -} - /* Slow path of a mempolicy comparison */ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -1778,8 +1952,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return 0; if (a->mode != b->mode) return 0; - if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) + if (a->flags != b->flags) return 0; + if (mpol_store_user_nodemask(a)) + if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) + return 0; + switch (a->mode) { case MPOL_BIND: /* Fall through */ @@ -1972,31 +2150,29 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) NODEMASK_SCRATCH(scratch); if (!scratch) - return; + goto put_mpol; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) { - mpol_put(mpol); /* drop our ref on sb mpol */ - NODEMASK_SCRATCH_FREE(scratch); - return; /* no valid nodemask intersection */ - } + if (IS_ERR(new)) + goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (ret) { - NODEMASK_SCRATCH_FREE(scratch); - mpol_put(new); - return; - } + if (ret) + goto put_new; /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + +put_new: mpol_put(new); /* drop initial ref */ +free_scratch: NODEMASK_SCRATCH_FREE(scratch); +put_mpol: + mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } @@ -2101,9 +2277,15 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) -static const char * const policy_types[] = - { "default", "prefer", "bind", "interleave", "local" }; +#define MPOL_LOCAL MPOL_MAX +static const char * const policy_modes[] = +{ + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "prefer", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local" +}; #ifdef CONFIG_TMPFS @@ -2128,12 +2310,11 @@ static const char * const policy_types[] = int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) { struct mempolicy *new = NULL; - unsigned short uninitialized_var(mode); + unsigned short mode; unsigned short uninitialized_var(mode_flags); nodemask_t 
nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int i; int err = 1; if (nodelist) { @@ -2149,13 +2330,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (i = 0; i <= MPOL_LOCAL; i++) { - if (!strcmp(str, policy_types[i])) { - mode = i; + for (mode = 0; mode <= MPOL_LOCAL; mode++) { + if (!strcmp(str, policy_modes[mode])) { break; } } - if (i > MPOL_LOCAL) + if (mode > MPOL_LOCAL) goto out; switch (mode) { @@ -2167,8 +2347,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) char *rest = nodelist; while (isdigit(*rest)) rest++; - if (!*rest) - err = 0; + if (*rest) + goto out; } break; case MPOL_INTERLEAVE: @@ -2177,7 +2357,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) */ if (!nodelist) nodes = node_states[N_HIGH_MEMORY]; - err = 0; break; case MPOL_LOCAL: /* @@ -2187,11 +2366,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) goto out; mode = MPOL_PREFERRED; break; - - /* - * case MPOL_BIND: mpol_new() enforces non-empty nodemask. - * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. - */ + case MPOL_DEFAULT: + /* + * Insist on a empty nodelist + */ + if (!nodelist) + err = 0; + goto out; + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) + goto out; } mode_flags = 0; @@ -2205,13 +2392,17 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else - err = 1; + goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) - err = 1; - else { + goto out; + + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } else { int ret; NODEMASK_SCRATCH(scratch); if (scratch) { @@ -2222,13 +2413,11 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) ret = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); if (ret) { - err = 1; mpol_put(new); - } else if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; + goto out; } } + err = 0; out: /* Restore string for error message */ @@ -2297,11 +2486,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) BUG(); } - l = strlen(policy_types[mode]); + l = strlen(policy_modes[mode]); if (buffer + maxlen < p + l + 1) return -ENOSPC; - strcpy(p, policy_types[mode]); + strcpy(p, policy_modes[mode]); p += l; if (flags & MPOL_MODE_FLAGS) { diff --git a/mm/migrate.c b/mm/migrate.c index efddbf0..38e7cad 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/memcontrol.h> #include <linux/syscalls.h> +#include <linux/gfp.h> #include "internal.h" @@ -39,7 +40,8 @@ /* * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). + * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is + * undesirable, use migrate_prep_local() */ int migrate_prep(void) { @@ -54,26 +56,29 @@ int migrate_prep(void) return 0; } +/* Do the necessary work of migrate_prep but not if it involves other CPUs */ +int migrate_prep_local(void) +{ + lru_add_drain(); + + return 0; +} + /* * Add isolated pages on the list back to the LRU under page lock * to avoid leaking evictable pages back onto unevictable list. - * - * returns the number of pages put back. 
*/ -int putback_lru_pages(struct list_head *l) +void putback_lru_pages(struct list_head *l) { struct page *page; struct page *page2; - int count = 0; list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); - count++; } - return count; } /* @@ -134,7 +139,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, page_add_file_rmap(new); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, pte); + update_mmu_cache(vma, addr, ptep); unlock: pte_unmap_unlock(ptep, ptl); out: @@ -275,8 +280,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ static void migrate_page_copy(struct page *newpage, struct page *page) { - int anon; - copy_highpage(newpage, page); if (PageError(page)) @@ -313,8 +316,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page) ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); - /* page->mapping contains a flag for PageAnon() */ - anon = PageAnon(page); page->mapping = NULL; /* @@ -493,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping, * < 0 - error code * == 0 - success */ -static int move_to_new_page(struct page *newpage, struct page *page) +static int move_to_new_page(struct page *newpage, struct page *page, + int remap_swapcache) { struct address_space *mapping; int rc; @@ -528,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page) else rc = fallback_migrate_page(mapping, newpage, page); - if (!rc) - remove_migration_ptes(page, newpage); - else + if (rc) { newpage->mapping = NULL; + } else { + if (remap_swapcache) + remove_migration_ptes(page, newpage); + } unlock_page(newpage); @@ -548,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); + int remap_swapcache = 1; int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; + struct anon_vma *anon_vma = NULL; if (!newpage) return -ENOMEM; @@ -584,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; @@ -607,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (PageAnon(page)) { rcu_read_lock(); rcu_locked = 1; + + /* Determine how to safely use anon_vma */ + if (!page_mapped(page)) { + if (!PageSwapCache(page)) + goto rcu_unlock; + + /* + * We cannot be sure that the anon_vma of an unmapped + * swapcache page is safe to use because we don't + * know in advance if the VMA that this page belonged + * to still exists. If the VMA and others sharing the + * data have been freed, then the anon_vma could + * already be invalid. 
+ * + * To avoid this possibility, swapcache pages get + * migrated but are not remapped when migration + * completes + */ + remap_swapcache = 0; + } else { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + anon_vma = page_anon_vma(page); + get_anon_vma(anon_vma); + } } /* @@ -641,11 +675,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page); + rc = move_to_new_page(newpage, page, remap_swapcache); - if (rc) + if (rc && remap_swapcache) remove_migration_ptes(page, page); rcu_unlock: + + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + drop_anon_vma(anon_vma); + if (rcu_locked) rcu_read_unlock(); uncharge: @@ -912,6 +951,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, goto out_pm; err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_pm; + if (!node_state(node, N_HIGH_MEMORY)) goto out_pm; @@ -999,33 +1041,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; - unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; - int err; - for (i = 0; i < nr_pages; i += chunk_nr) { - if (chunk_nr > nr_pages - i) - chunk_nr = nr_pages - i; + while (nr_pages) { + unsigned long chunk_nr; - err = copy_from_user(chunk_pages, &pages[i], - chunk_nr * sizeof(*chunk_pages)); - if (err) { - err = -EFAULT; - goto out; - } + chunk_nr = nr_pages; + if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) + chunk_nr = DO_PAGES_STAT_CHUNK_NR; + + if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) + break; do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - err = copy_to_user(&status[i], chunk_status, - chunk_nr * sizeof(*chunk_status)); - if (err) { - err = -EFAULT; - goto out; - } - } - err = 0; + if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + break; -out: - return err; + pages += chunk_nr; + status += chunk_nr; + nr_pages -= chunk_nr; + } + return nr_pages ? -EFAULT : 0; } /* diff --git a/mm/mincore.c b/mm/mincore.c index 7a3436e..9ac42dc 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -7,8 +7,8 @@ /* * The mincore() system call. */ -#include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/syscalls.h> @@ -19,6 +19,40 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> +static void mincore_hugetlb_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct hstate *h; + + h = hstate_vma(vma); + while (1) { + unsigned char present; + pte_t *ptep; + /* + * Huge pages are always in RAM for now, but + * theoretically it needs to be checked. + */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = ptep && !huge_pte_none(huge_ptep_get(ptep)); + while (1) { + *vec = present; + vec++; + addr += PAGE_SIZE; + if (addr == end) + return; + /* check hugepage border */ + if (!(addr & ~huge_page_mask(h))) + break; + } + } +#else + BUG(); +#endif +} + /* * Later we can get more picky about what "in core" means precisely. 
* For now, simply check to see if the page is in the page cache, @@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) return present; } -/* - * Do a chunk of "sys_mincore()". We've already checked - * all the arguments, we hold the mmap semaphore: we should - * just return the amount of info we're asked for. - */ -static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) +static void mincore_unmapped_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; - unsigned long nr; + unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; - pgoff_t pgoff; - struct vm_area_struct *vma = find_vma(current->mm, addr); - /* - * find_vma() didn't find anything above us, or we're - * in an unmapped hole in the address space: ENOMEM. - */ - if (!vma || addr < vma->vm_start) - return -ENOMEM; - -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) { - struct hstate *h; - unsigned long nr_huge; - unsigned char present; + if (vma->vm_file) { + pgoff_t pgoff; - i = 0; - nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); - h = hstate_vma(vma); - nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) - - (addr >> huge_page_shift(h)) + 1; - nr_huge = min(nr_huge, - (vma->vm_end - addr) >> huge_page_shift(h)); - while (1) { - /* hugepage always in RAM for now, - * but generally it needs to be check */ - ptep = huge_pte_offset(current->mm, - addr & huge_page_mask(h)); - present = !!(ptep && - !huge_pte_none(huge_ptep_get(ptep))); - while (1) { - vec[i++] = present; - addr += PAGE_SIZE; - /* reach buffer limit */ - if (i == nr) - return nr; - /* check hugepage border */ - if (!((addr & ~huge_page_mask(h)) - >> PAGE_SHIFT)) - break; - } - } - return nr; + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; } -#endif - - /* - * Calculate how many pages there are left in the last level of the - * PTE array for our address. 
- */ - nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)); - - /* - * Don't overrun this vma - */ - nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT); - - /* - * Don't return more than the caller asked for - */ - nr = min(nr, pages); +} - pgd = pgd_offset(vma->vm_mm, addr); - if (pgd_none_or_clear_bad(pgd)) - goto none_mapped; - pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) - goto none_mapped; - pmd = pmd_offset(pud, addr); - if (pmd_none_or_clear_bad(pmd)) - goto none_mapped; +static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + spinlock_t *ptl; + pte_t *ptep; ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { - unsigned char present; + do { pte_t pte = *ptep; + pgoff_t pgoff; - if (pte_present(pte)) { - present = 1; - - } else if (pte_none(pte)) { - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - present = mincore_page(vma->vm_file->f_mapping, - pgoff); - } else - present = 0; - - } else if (pte_file(pte)) { + next = addr + PAGE_SIZE; + if (pte_none(pte)) + mincore_unmapped_range(vma, addr, next, vec); + else if (pte_present(pte)) + *vec = 1; + else if (pte_file(pte)) { pgoff = pte_to_pgoff(pte); - present = mincore_page(vma->vm_file->f_mapping, pgoff); - + *vec = mincore_page(vma->vm_file->f_mapping, pgoff); } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); + if (is_migration_entry(entry)) { /* migration entries are always uptodate */ - present = 1; + *vec = 1; } else { #ifdef CONFIG_SWAP pgoff = entry.val; - present = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(&swapper_space, pgoff); #else WARN_ON(1); - present = 1; + *vec = 1; #endif } } + vec++; + } while (ptep++, addr = next, addr != end); + pte_unmap_unlock(ptep - 1, ptl); +} - vec[i] = present; - } - pte_unmap_unlock(ptep-1, ptl); +static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pmd_t *pmd; - return nr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pte_range(vma, pmd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pmd++, addr = next, addr != end); +} -none_mapped: - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - for (i = 0; i < nr; i++, pgoff++) - vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { - for (i = 0; i < nr; i++) - vec[i] = 0; +static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pud_t *pud; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pmd_range(vma, pud, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pud++, addr = next, addr != end); +} + +static void mincore_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pgd_t *pgd; + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pud_range(vma, pgd, addr, next, vec); + vec += (next - addr) >> 
PAGE_SHIFT; + } while (pgd++, addr = next, addr != end); +} + +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. + */ +static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) +{ + struct vm_area_struct *vma; + unsigned long end; + + vma = find_vma(current->mm, addr); + if (!vma || addr < vma->vm_start) + return -ENOMEM; + + end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + + if (is_vm_hugetlb_page(vma)) { + mincore_hugetlb_page_range(vma, addr, end, vec); + return (end - addr) >> PAGE_SHIFT; } - return nr; + end = pmd_addr_end(addr, end); + + if (is_vm_hugetlb_page(vma)) + mincore_hugetlb_page_range(vma, addr, end, vec); + else + mincore_page_range(vma, addr, end, vec); + + return (end - addr) >> PAGE_SHIFT; } /* @@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, * the temporary buffer size. */ down_read(&current->mm->mmap_sem); - retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); up_read(&current->mm->mmap_sem); if (retval <= 0) @@ -25,7 +25,7 @@ int can_do_mlock(void) { if (capable(CAP_IPC_LOCK)) return 1; - if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + if (rlimit(RLIMIT_MEMLOCK) != 0) return 1; return 0; } @@ -135,6 +135,13 @@ void munlock_vma_page(struct page *page) } } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_stack_continue(vma->vm_prev, addr); +} + /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma @@ -167,6 +174,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) gup_flags |= FOLL_WRITE; + /* We don't try to access the guard page of a stack vma */ + if (stack_guard_page(vma, start)) { + addr += PAGE_SIZE; + nr_pages--; + } + while (nr_pages > 0) { int i; @@ -487,7 +500,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) locked = len >> PAGE_SHIFT; locked += current->mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; /* check against resource limits */ @@ -550,7 +563,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) down_write(&current->mm->mmap_sem); - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; @@ -584,7 +597,7 @@ int user_shm_lock(size_t size, struct user_struct *user) int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit == RLIM_INFINITY) allowed = 1; lock_limit >>= PAGE_SHIFT; @@ -607,44 +620,3 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } - -int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, - size_t size) -{ - unsigned long lim, vm, pgsz; - int error = -ENOMEM; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(&mm->mmap_sem); - - lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = mm->total_vm + pgsz; - if (lim < vm) - goto out; - - lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = mm->locked_vm + pgsz; - if (lim < vm) - goto out; - - mm->total_vm += pgsz; - mm->locked_vm += pgsz; - - error = 0; - out: - 
up_write(&mm->mmap_sem); - return error; -} - -void refund_locked_memory(struct mm_struct *mm, size_t size) -{ - unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(&mm->mmap_sem); - - mm->total_vm -= pgsz; - mm->locked_vm -= pgsz; - - up_write(&mm->mmap_sem); -} @@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + (mm->end_data - mm->start_data) > rlim) goto out; @@ -388,17 +388,23 @@ static inline void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { + struct vm_area_struct *next; + + vma->vm_prev = prev; if (prev) { - vma->vm_next = prev->vm_next; + next = prev->vm_next; prev->vm_next = vma; } else { mm->mmap = vma; if (rb_parent) - vma->vm_next = rb_entry(rb_parent, + next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); else - vma->vm_next = NULL; + next = NULL; } + vma->vm_next = next; + if (next) + next->vm_prev = vma; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -437,7 +443,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { __vma_link_list(mm, vma, prev, rb_parent); __vma_link_rb(mm, vma, rb_link, rb_parent); - __anon_vma_link(vma); } static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, @@ -453,12 +458,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mapping->i_mmap_lock); vma->vm_truncate_count = mapping->truncate_count; } - anon_vma_lock(vma); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); - anon_vma_unlock(vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -486,7 +489,11 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - prev->vm_next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next; + + prev->vm_next = next; + if (next) + next->vm_prev = prev; rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; @@ -499,7 +506,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -void vma_adjust(struct vm_area_struct *vma, unsigned long start, +int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { struct mm_struct *mm = vma->vm_mm; @@ -507,12 +514,14 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct prio_tree_root *root = NULL; - struct file *file = vma->vm_file; struct anon_vma *anon_vma = NULL; + struct file *file = vma->vm_file; long adjust_next = 0; int remove_next = 0; if (next && !insert) { + struct vm_area_struct *exporter = NULL; + if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -520,7 +529,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, */ again: remove_next = 1 + (end > next->vm_end); end = next->vm_end; - anon_vma = next->anon_vma; + exporter = next; importer = vma; } else if (end > next->vm_start) { /* @@ -528,7 +537,7 @@ again: remove_next = 1 + (end > next->vm_end); * mprotect case 5 shifting the boundary up. 
*/ adjust_next = (end - next->vm_start) >> PAGE_SHIFT; - anon_vma = next->anon_vma; + exporter = next; importer = vma; } else if (end < vma->vm_end) { /* @@ -537,9 +546,20 @@ again: remove_next = 1 + (end > next->vm_end); * mprotect case 4 shifting the boundary down. */ adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); - anon_vma = next->anon_vma; + exporter = vma; importer = next; } + + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. + */ + if (exporter && exporter->anon_vma && !importer->anon_vma) { + if (anon_vma_clone(importer, exporter)) + return -ENOMEM; + importer->anon_vma = exporter->anon_vma; + } } if (file) { @@ -568,22 +588,14 @@ again: remove_next = 1 + (end > next->vm_end); } /* - * When changing only vma->vm_end, we don't really need - * anon_vma lock. + * When changing only vma->vm_end, we don't really need anon_vma + * lock. This is a fairly rare case by itself, but the anon_vma + * lock may be shared between many sibling processes. Skipping + * the lock for brk adjustments makes a difference sometimes. */ - if (vma->anon_vma && (insert || importer || start != vma->vm_start)) + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { anon_vma = vma->anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (importer && !importer->anon_vma) { - importer->anon_vma = anon_vma; - __anon_vma_link(importer); - } + anon_vma_lock(anon_vma); } if (root) { @@ -616,8 +628,6 @@ again: remove_next = 1 + (end > next->vm_end); __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); - if (next->anon_vma) - __anon_vma_merge(vma, next); } else if (insert) { /* * split_vma has split insert from vma, and needs @@ -628,7 +638,7 @@ again: remove_next = 1 + (end > next->vm_end); } if (anon_vma) - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -638,6 +648,8 @@ again: remove_next = 1 + (end > next->vm_end); if (next->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); } + if (next->anon_vma) + anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); @@ -653,6 +665,8 @@ again: remove_next = 1 + (end > next->vm_end); } validate_mm(mm); + + return 0; } /* @@ -759,6 +773,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; + int err; /* * We later require that vma->vm_flags == vm_flags, @@ -792,11 +807,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma)) { /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL); } else /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL); + if (err) + return NULL; return prev; } @@ -808,11 +825,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen)) { if (prev && addr < prev->vm_end) /* case 4 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); else /* cases 3, 8 */ - vma_adjust(area, 
addr, next->vm_end, + err = vma_adjust(area, addr, next->vm_end, next->vm_pgoff - pglen, NULL); + if (err) + return NULL; return area; } @@ -820,6 +839,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* + * Rough compatbility check to quickly see if it's even worth looking + * at sharing an anon_vma. + * + * They need to have the same vm_file, and the flags can only differ + * in things that mprotect may change. + * + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that + * we can merge the two vma's. For example, we refuse to merge a vma if + * there is a vm_ops->close() function, because that indicates that the + * driver is doing some kind of reference counting. But that doesn't + * really matter for the anon_vma sharing case. + */ +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) +{ + return a->vm_end == b->vm_start && + mpol_equal(vma_policy(a), vma_policy(b)) && + a->vm_file == b->vm_file && + !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); +} + +/* + * Do some basic sanity checking to see if we can re-use the anon_vma + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be + * the same as 'old', the other will be the new one that is trying + * to share the anon_vma. + * + * NOTE! This runs with mm_sem held for reading, so it is possible that + * the anon_vma of 'old' is concurrently in the process of being set up + * by another page fault trying to merge _that_. But that's ok: if it + * is being set up, that automatically means that it will be a singleton + * acceptable for merging, so we can do all of this optimistically. But + * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. + * + * IOW: that the "list_is_singular()" test on the anon_vma_chain only + * matters for the 'stable anon_vma' case (ie the thing we want to avoid + * is to return an anon_vma that is "complex" due to having gone through + * a fork). + * + * We also make sure that the two vma's are compatible (adjacent, + * and with the same memory policies). That's all stable, even with just + * a read lock on the mm_sem. + */ +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) +{ + if (anon_vma_compatible(a, b)) { + struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); + + if (anon_vma && list_is_singular(&old->anon_vma_chain)) + return anon_vma; + } + return NULL; +} + +/* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive @@ -829,28 +903,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + struct anon_vma *anon_vma; struct vm_area_struct *near; - unsigned long vm_flags; near = vma->vm_next; if (!near) goto try_prev; - /* - * Since only mprotect tries to remerge vmas, match flags - * which might be mprotected into each other later on. - * Neither mlock nor madvise tries to remerge at present, - * so leave their flags as obstructing a merge. 
- */ - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && vma->vm_end == near->vm_start && - mpol_equal(vma_policy(vma), vma_policy(near)) && - can_vma_merge_before(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) - return near->anon_vma; + anon_vma = reusable_anon_vma(near, vma, near); + if (anon_vma) + return anon_vma; try_prev: /* * It is potentially slow to have to call find_vma_prev here. @@ -863,14 +925,9 @@ try_prev: if (!near) goto none; - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && near->vm_end == vma->vm_start && - mpol_equal(vma_policy(near), vma_policy(vma)) && - can_vma_merge_after(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff)) - return near->anon_vma; + anon_vma = reusable_anon_vma(near, near, vma); + if (anon_vma) + return anon_vma; none: /* * There's no absolute need to look only at touching neighbours: @@ -967,7 +1024,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -1083,6 +1140,30 @@ out: return retval; } +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + /* * Some shared mappigns will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1205,6 +1286,7 @@ munmap_back: vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; + INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { error = -EINVAL; @@ -1265,13 +1347,8 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - /* - * makes pages present; downgrades, drops, reacquires mmap_sem - */ - long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); - if (nr_pages < 0) - return nr_pages; /* vma gone! 
*/ - mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1599,7 +1676,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns return -ENOMEM; /* Stack limit test */ - if (size > rlim[RLIMIT_STACK].rlim_cur) + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -1607,7 +1684,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; } @@ -1638,9 +1716,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ -#ifndef CONFIG_IA64 -static -#endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1654,7 +1729,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1665,7 +1740,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (address < PAGE_ALIGN(address+4)) address = PAGE_ALIGN(address+4); else { - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return -ENOMEM; } error = 0; @@ -1678,10 +1753,12 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) grow = (address - vma->vm_end) >> PAGE_SHIFT; error = acct_stack_growth(vma, size, grow); - if (!error) + if (!error) { vma->vm_end = address; + perf_event_mmap(vma); + } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1706,7 +1783,7 @@ static int expand_downwards(struct vm_area_struct *vma, if (error) return error; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1725,9 +1802,10 @@ static int expand_downwards(struct vm_area_struct *vma, if (!error) { vma->vm_start = address; vma->vm_pgoff -= grow; + perf_event_mmap(vma); } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } @@ -1754,8 +1832,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { - if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) - return NULL; /* vma gone! */ + mlock_vma_pages_range(prev, addr, prev->vm_end); } return prev; } @@ -1783,8 +1860,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) { - if (mlock_vma_pages_range(vma, addr, start) < 0) - return NULL; /* vma gone! */ + mlock_vma_pages_range(vma, addr, start); } return vma; } @@ -1846,6 +1922,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr; insertion_point = (prev ? 
&prev->vm_next : &mm->mmap); + vma->vm_prev = NULL; do { rb_erase(&vma->vm_rb, &mm->mm_rb); mm->map_count--; @@ -1853,6 +1930,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; + if (vma) + vma->vm_prev = prev; tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; @@ -1871,6 +1950,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, { struct mempolicy *pol; struct vm_area_struct *new; + int err = -ENOMEM; if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) @@ -1878,11 +1958,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) - return -ENOMEM; + goto out_err; /* most fields are the same, copy all, and then fixup */ *new = *vma; + INIT_LIST_HEAD(&new->anon_vma_chain); + if (new_below) new->vm_end = addr; else { @@ -1892,11 +1974,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new); - return PTR_ERR(pol); + err = PTR_ERR(pol); + goto out_free_vma; } vma_set_policy(new, pol); + if (anon_vma_clone(new, vma)) + goto out_free_mpol; + if (new->vm_file) { get_file(new->vm_file); if (vma->vm_flags & VM_EXECUTABLE) @@ -1907,12 +1992,30 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_ops->open(new); if (new_below) - vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); else - vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - return 0; + /* Success. */ + if (!err) + return 0; + + /* Clean everything up if vma_adjust failed. 
*/ + if (new->vm_ops && new->vm_ops->close) + new->vm_ops->close(new); + if (new->vm_file) { + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + fput(new->vm_file); + } + unlink_anon_vmas(new); + out_free_mpol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new); + out_err: + return err; } /* @@ -2074,7 +2177,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -2122,6 +2225,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; } + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2130,6 +2234,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) @@ -2258,10 +2363,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma) { *new_vma = *vma; pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new_vma); - return NULL; - } + if (IS_ERR(pol)) + goto out_free_vma; + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; vma_set_policy(new_vma, pol); new_vma->vm_start = addr; new_vma->vm_end = addr + len; @@ -2277,6 +2383,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } } return new_vma; + + out_free_mempol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; } /* @@ -2288,7 +2400,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) unsigned long cur = mm->total_vm; /* pages */ unsigned long lim; - lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; if (cur + npages > lim) return 0; @@ -2354,6 +2466,7 @@ int install_special_mapping(struct mm_struct *mm, if (unlikely(vma == NULL)) return -ENOMEM; + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2380,23 +2493,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->lock. If some other vma in this mm shares + * anon_vma->root->lock. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->lock. + * anon_vma->root->lock. 
*/ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); } } @@ -2454,6 +2567,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = -EINTR; BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2471,7 +2585,8 @@ int mm_take_all_locks(struct mm_struct *mm) if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) - vm_lock_anon_vma(mm, vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); } ret = 0; @@ -2485,7 +2600,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -2496,12 +2611,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->lock. + * anon_vma->root->lock. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } } @@ -2526,13 +2641,15 @@ static void vm_unlock_mapping(struct address_space *mapping) void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; BUG_ON(down_read_trylock(&mm->mmap_sem)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) - vm_unlock_anon_vma(vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081..9e82e93 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/mmu_context.h> +#include <linux/module.h> #include <linux/sched.h> #include <asm/mmu_context.h> @@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); } +EXPORT_SYMBOL_GPL(use_mm); /* * unuse_mm @@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); + sync_mm_rss(tsk, mm); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); task_unlock(tsk); } +EXPORT_SYMBOL_GPL(unuse_mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 7e33f2c..438951d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -16,6 +16,7 @@ #include <linux/err.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/slab.h> /* * This function can't run concurrently against mmu_notifier_register diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d17..e35bfb8 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +#ifdef CONFIG_SMP +/* Called when a more accurate view of NR_FREE_PAGES is needed */ +unsigned long zone_nr_free_pages(struct zone *zone) +{ + unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); + + /* + * While kswapd is awake, it is considered the zone is under some + * memory pressure. 
Under pressure, there is a risk that + * per-cpu-counter-drift will allow the min watermark to be breached + * potentially causing a live-lock. While kswapd is awake and + * free pages are low, get a better estimate for free pages + */ + if (nr_free_pages < zone->percpu_drift_mark && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return zone_page_state_snapshot(zone, NR_FREE_PAGES); + + return nr_free_pages; +} +#endif /* CONFIG_SMP */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 8bc969d..2d1bf7c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -10,7 +10,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <linux/slab.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/fs.h> diff --git a/mm/mremap.c b/mm/mremap.c index 8451908..cde56ee 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <linux/slab.h> #include <linux/shm.h> #include <linux/ksm.h> #include <linux/mman.h> @@ -285,7 +284,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); locked += new_len - old_len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) goto Eagain; @@ -460,8 +459,11 @@ unsigned long do_mremap(unsigned long addr, if (vma_expandable(vma, new_len - old_len)) { int pages = (new_len - old_len) >> PAGE_SHIFT; - vma_adjust(vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL); + if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + ret = -ENOMEM; + goto out; + } mm->total_vm += pages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); @@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = vfs_fsync(file, file->f_path.dentry, 0); + error = vfs_fsync(file, 0); fput(file); if (error || start >= end) goto out; @@ -36,11 +36,6 @@ #include <asm/mmu_context.h> #include "internal.h" -static inline __attribute__((format(printf, 1, 2))) -void no_printk(const char *fmt, ...) -{ -} - #if 0 #define kenter(FMT, ...) 
\ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) @@ -162,7 +157,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (vmas) vmas[i] = vma; - start += PAGE_SIZE; + start = (start + PAGE_SIZE) & PAGE_MASK; } return i; @@ -609,7 +604,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, **pp; + struct vm_area_struct *pvma, **pp, *next; struct address_space *mapping; struct rb_node **p, *parent; @@ -669,8 +664,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) break; } - vma->vm_next = *pp; + next = *pp; *pp = vma; + vma->vm_next = next; + if (next) + next->vm_prev = vma; } /* @@ -918,14 +916,6 @@ static int validate_mmap_request(struct file *file, if (!(capabilities & BDI_CAP_MAP_DIRECT)) return -ENODEV; - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) - ) { - printk("MAP_SHARED not completely supported on !MMU\n"); - return -EINVAL; - } - /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; } @@ -941,6 +931,20 @@ static int validate_mmap_request(struct file *file, capabilities &= ~BDI_CAP_MAP_DIRECT; } + if (capabilities & BDI_CAP_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || + ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || + ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + ) { + capabilities &= ~BDI_CAP_MAP_DIRECT; + if (flags & MAP_SHARED) { + printk(KERN_WARNING + "MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + } + } + /* handle executable mappings and implied executable * mappings */ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { @@ -996,22 +1000,20 @@ static unsigned long determine_vm_flags(struct file *file, unsigned long vm_flags; vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); - vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* vm_flags |= mm->def_flags; */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) vm_flags |= VM_MAYSHARE; - } - else { + } else { /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ + vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); if (flags & MAP_SHARED) - vm_flags |= VM_MAYSHARE | VM_SHARED; - else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) - vm_flags |= VM_MAYSHARE; + vm_flags |= VM_SHARED; } /* refuse to let anyone share private mappings with this process if @@ -1040,10 +1042,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) if (ret != -ENOSYS) return ret; - /* getting an ENOSYS error indicates that direct mmap isn't - * possible (as opposed to tried but failed) so we'll fall - * through to making a private copy of the data and mapping - * that if we can */ + /* getting -ENOSYS indicates that direct mmap isn't possible (as + * opposed to tried but failed) so we can only give a suitable error as + * it's not possible to make a private copy if MAP_SHARED was given */ return -ENODEV; } @@ -1209,7 +1210,7 @@ unsigned long do_mmap_pgoff(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = 
pgoff; - INIT_LIST_HEAD(&vma->anon_vma_node); + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; @@ -1428,6 +1429,30 @@ out: return retval; } +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f52481b..4029583 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -4,6 +4,8 @@ * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes * * The routines in this file are used to kill a process when * we're seriously out of memory. This gets called from __alloc_pages() @@ -18,6 +20,7 @@ #include <linux/oom.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/swap.h> #include <linux/timex.h> @@ -26,171 +29,193 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/mempolicy.h> #include <linux/security.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; -int sysctl_oom_dump_tasks; +int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* #define DEBUG */ + +#ifdef CONFIG_NUMA +/** + * has_intersects_mems_allowed() - check task eligiblity for kill + * @tsk: task struct of which task to consider + * @mask: nodemask passed to page allocator for mempolicy ooms + * + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. + */ +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct task_struct *start = tsk; + + do { + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. + */ + if (mempolicy_nodemask_intersects(tsk, mask)) + return true; + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + if (cpuset_mems_allowed_intersects(current, tsk)) + return true; + } + } while_each_thread(start, tsk); + + return false; +} +#else +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + return true; +} +#endif /* CONFIG_NUMA */ /* - * Is all threads of the target process nodes overlap ours? + * If this is a system OOM (not a memcg OOM) and the task selected to be + * killed is not already running at high (RT) priorities, speed up the + * recovery by boosting the dying task to the lowest FIFO priority. + * That helps with the recovery and avoids interfering with RT tasks. 
*/ -static int has_intersects_mems_allowed(struct task_struct *tsk) +static void boost_dying_task_prio(struct task_struct *p, + struct mem_cgroup *mem) { - struct task_struct *t; + struct sched_param param = { .sched_priority = 1 }; + + if (mem) + return; + + if (!rt_task(p)) + sched_setscheduler_nocheck(p, SCHED_FIFO, &param); +} + +/* + * The process p may have detached its own ->mm while exiting or through + * use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. + */ +struct task_struct *find_lock_task_mm(struct task_struct *p) +{ + struct task_struct *t = p; - t = tsk; do { - if (cpuset_mems_allowed_intersects(current, t)) - return 1; - t = next_thread(t); - } while (t != tsk); + task_lock(t); + if (likely(t->mm)) + return t; + task_unlock(t); + } while_each_thread(p, t); - return 0; + return NULL; +} + +/* return true if the task is not adequate as candidate victim task. */ +static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *mem, const nodemask_t *nodemask) +{ + if (is_global_init(p)) + return true; + if (p->flags & PF_KTHREAD) + return true; + + /* When mem_cgroup_out_of_memory() and p is not member of the group */ + if (mem && !task_in_mem_cgroup(p, mem)) + return true; + + /* p may not have freeable memory in nodemask */ + if (!has_intersects_mems_allowed(p, nodemask)) + return true; + + return false; } /** - * badness - calculate a numeric value for how bad this task has been + * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate - * @uptime: current uptime in seconds - * - * The formula used is relatively simple and documented inline in the - * function. The main rationale is that we want to select a good task - * to kill when we run out of memory. + * @totalpages: total present RAM allowed for page allocation * - * Good in this context means that: - * 1) we lose the minimum amount of work done - * 2) we recover a large amount of memory - * 3) we don't kill anything innocent of eating tons of memory - * 4) we want to kill the minimum amount of processes (one) - * 5) we try to kill the process the user expects us to kill, this - * algorithm has been meticulously tuned to meet the principle - * of least surprise ... (be careful when you change it) + * The heuristic for determining which task to kill is made to be as simple and + * predictable as possible. The goal is to return the highest value for the + * task consuming the most memory to avoid subsequent oom failures. */ - -unsigned long badness(struct task_struct *p, unsigned long uptime) +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points, cpu_time, run_time; - struct mm_struct *mm; - struct task_struct *child; - int oom_adj = p->signal->oom_adj; - struct task_cputime task_time; - unsigned long utime; - unsigned long stime; + int points; - if (oom_adj == OOM_DISABLE) + if (oom_unkillable_task(p, mem, nodemask)) return 0; - task_lock(p); - mm = p->mm; - if (!mm) { - task_unlock(p); + p = find_lock_task_mm(p); + if (!p) return 0; - } /* - * The memory size of the process is the basis for the badness. + * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't + * need to be executed for something that cannot be killed. 
*/ - points = mm->total_vm; - - /* - * After this unlock we can no longer dereference local variable `mm' - */ - task_unlock(p); - - /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_OOM_ORIGIN) - return ULONG_MAX; - - /* - * Processes which fork a lot of child processes are likely - * a good choice. We add half the vmsize of the children if they - * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of children. In case a single - * child is eating the vast majority of memory, adding only half - * to the parents will make the child our kill candidate of choice. - */ - list_for_each_entry(child, &p->children, sibling) { - task_lock(child); - if (child->mm != mm && child->mm) - points += child->mm->total_vm/2 + 1; - task_unlock(child); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + task_unlock(p); + return 0; } /* - * CPU time is in tens of seconds and run time is in thousands - * of seconds. There is no particular reason for this other than - * that it turned out to work very well in practice. + * When the PF_OOM_ORIGIN bit is set, it indicates the task should have + * priority for oom killing. */ - thread_group_cputime(p, &task_time); - utime = cputime_to_jiffies(task_time.utime); - stime = cputime_to_jiffies(task_time.stime); - cpu_time = (utime + stime) >> (SHIFT_HZ + 3); - - - if (uptime >= p->start_time.tv_sec) - run_time = (uptime - p->start_time.tv_sec) >> 10; - else - run_time = 0; - - if (cpu_time) - points /= int_sqrt(cpu_time); - if (run_time) - points /= int_sqrt(int_sqrt(run_time)); + if (p->flags & PF_OOM_ORIGIN) { + task_unlock(p); + return 1000; + } /* - * Niced processes are most likely less important, so double - * their badness points. + * The memory controller may have a limit of 0 bytes, so avoid a divide + * by zero, if necessary. */ - if (task_nice(p) > 0) - points *= 2; + if (!totalpages) + totalpages = 1; /* - * Superuser processes are usually more important, so we make it - * less likely that we kill those. + * The baseline for the badness score is the proportion of RAM that each + * task's rss and swap space use. */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN) || - has_capability_noaudit(p, CAP_SYS_RESOURCE)) - points /= 4; + points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / + totalpages; + task_unlock(p); /* - * We don't want to kill a process with direct hardware access. - * Not only could that mess up the hardware, but usually users - * tend to only have this flag set on applications they think - * of as important. + * Root processes get 3% bonus, just like the __vm_enough_memory() + * implementation used by LSMs. */ - if (has_capability_noaudit(p, CAP_SYS_RAWIO)) - points /= 4; + if (has_capability_noaudit(p, CAP_SYS_ADMIN)) + points -= 30; /* - * If p's nodes don't overlap ours, it may still help to kill p - * because p may have allocated or otherwise mapped memory on - * this node before. However it will be less likely. + * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may + * either completely disable oom killing or always prefer a certain + * task. */ - if (!has_intersects_mems_allowed(p)) - points /= 8; + points += p->signal->oom_score_adj; /* - * Adjust the score by oom_adj. + * Never return 0 for an eligible task that may be killed since it's + * possible that no single user task uses more than 0.1% of memory and + * no single admin tasks uses more than 3.0%. 
*/ - if (oom_adj) { - if (oom_adj > 0) { - if (!points) - points = 1; - points <<= oom_adj; - } else - points >>= -(oom_adj); - } - -#ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", - p->pid, p->comm, points); -#endif - return points; + if (points <= 0) + return 1; + return (points < 1000) ? points : 1000; } /* @@ -198,12 +223,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) */ #ifdef CONFIG_NUMA static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); + bool cpuset_limited = false; + int nid; + /* Default to all available memory */ + *totalpages = totalram_pages + total_swap_pages; + + if (!zonelist) + return CONSTRAINT_NONE; /* * Reach here only when __GFP_NOFAIL is used. So, we should avoid * to kill current.We have to random task kill in this case. @@ -213,26 +246,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, return CONSTRAINT_NONE; /* - * The nodemask here is a nodemask passed to alloc_pages(). Now, - * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy - * feature. mempolicy is an only user of nodemask here. - * check mempolicy's nodemask contains all N_HIGH_MEMORY + * This is not a __GFP_THISNODE allocation, so a truncated nodemask in + * the page allocator means a mempolicy is in effect. Cpuset policy + * is enforced in get_page_from_freelist(). */ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, *nodemask) + *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; + } /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) - return CONSTRAINT_CPUSET; + cpuset_limited = true; + if (cpuset_limited) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, cpuset_current_mems_allowed) + *totalpages += node_spanned_pages(nid); + return CONSTRAINT_CPUSET; + } return CONSTRAINT_NONE; } #else static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { + *totalpages = totalram_pages + total_swap_pages; return CONSTRAINT_NONE; } #endif @@ -243,28 +287,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem) +static struct task_struct *select_bad_process(unsigned int *ppoints, + unsigned long totalpages, struct mem_cgroup *mem, + const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *chosen = NULL; - struct timespec uptime; *ppoints = 0; - do_posix_clock_monotonic_gettime(&uptime); for_each_process(p) { - unsigned long points; + unsigned int points; - /* - * skip kernel threads and tasks which have already released - * their mm. 
- */ - if (!p->mm) - continue; - /* skip the init task */ - if (is_global_init(p)) - continue; - if (mem && !task_in_mem_cgroup(p, mem)) + if (oom_unkillable_task(p, mem, nodemask)) continue; /* @@ -289,19 +323,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if (p->flags & PF_EXITING) { + if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { if (p != current) return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + *ppoints = 1000; } - if (p->signal->oom_adj == OOM_DISABLE) - continue; - - points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + points = oom_badness(p, mem, nodemask, totalpages); + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -312,175 +343,184 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, /** * dump_tasks - dump current memory state of all system tasks - * @mem: target memory controller + * @mem: current's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * - * Dumps the current memory state of all system tasks, excluding kernel threads. + * Dumps the current memory state of all eligible tasks. Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * score, and name. - * - * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are - * shown. + * value, oom_score_adj value, and name. * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem) +static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *g, *p; - - printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " - "name\n"); - do_each_thread(g, p) { - struct mm_struct *mm; + struct task_struct *p; + struct task_struct *task; - if (mem && !task_in_mem_cgroup(p, mem)) - continue; - if (!thread_group_leader(p)) + pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); + for_each_process(p) { + if (oom_unkillable_task(p, mem, nodemask)) continue; - task_lock(p); - mm = p->mm; - if (!mm) { + task = find_lock_task_mm(p); + if (!task) { /* - * total_vm and rss sizes do not exist for tasks with no - * mm so there's no need to report them; they can't be - * oom killed anyway. + * This is a kthread or all of p's threads have already + * detached their mm's. There's no need to report + * them; they can't be oom killed anyway. 
*/ - task_unlock(p); continue; } - printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, - p->comm); - task_unlock(p); - } while_each_thread(g, p); + + pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", + task->pid, task_uid(task), task->tgid, + task->mm->total_vm, get_mm_rss(task->mm), + task_cpu(task), task->signal->oom_adj, + task->signal->oom_score_adj, task->comm); + task_unlock(task); + } } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem) + struct mem_cgroup *mem, const nodemask_t *nodemask) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj); task_lock(current); + pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " + "oom_adj=%d, oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj, + current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); show_mem(); if (sysctl_oom_dump_tasks) - dump_tasks(mem); + dump_tasks(mem, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) - -/* - * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO - * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO - * set. - */ -static void __oom_kill_task(struct task_struct *p, int verbose) +static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { - if (is_global_init(p)) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill init!\n"); - return; - } - - task_lock(p); - if (!p->mm) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", - task_pid_nr(p), p->comm); - task_unlock(p); - return; - } + p = find_lock_task_mm(p); + if (!p) + return 1; - if (verbose) - printk(KERN_ERR "Killed process %d (%s) " - "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", - task_pid_nr(p), p->comm, - K(p->mm->total_vm), - K(get_mm_counter(p->mm, anon_rss)), - K(get_mm_counter(p->mm, file_rss))); + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + task_pid_nr(p), p->comm, K(p->mm->total_vm), + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); + + set_tsk_thread_flag(p, TIF_MEMDIE); + force_sig(SIGKILL, p); + /* * We give our sacrificial lamb high priority and access to * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->rt.time_slice = HZ; - set_tsk_thread_flag(p, TIF_MEMDIE); - - force_sig(SIGKILL, p); -} - -static int oom_kill_task(struct task_struct *p) -{ - /* WARNING: mm may not be dereferenced since we did not obtain its - * value from get_task_mm(p). This is OK since all we need to do is - * compare mm to q->mm below. - * - * Furthermore, even if mm contains a non-NULL value, p->mm may - * change to NULL at any time since we do not hold task_lock(p). - * However, this is of no concern to us. 
- */ - if (!p->mm || p->signal->oom_adj == OOM_DISABLE) - return 1; - - __oom_kill_task(p, 1); + boost_dying_task_prio(p, mem); return 0; } +#undef K static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *mem, nodemask_t *nodemask, const char *message) { - struct task_struct *c; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t = p; + unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem); + dump_header(p, gfp_mask, order, mem, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ if (p->flags & PF_EXITING) { - __oom_kill_task(p, 0); + set_tsk_thread_flag(p, TIF_MEMDIE); + boost_dying_task_prio(p, mem); return 0; } - printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, task_pid_nr(p), p->comm, points); + task_lock(p); + pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + task_unlock(p); - /* Try to kill a child first */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == p->mm) - continue; - if (!oom_kill_task(c)) - return 0; + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + do { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, mem, nodemask, + totalpages); + if (child_points > victim_points) { + victim = child; + victim_points = child_points; + } + } + } while_each_thread(p, t); + + return oom_kill_task(victim, mem); +} + +/* + * Determines whether the kernel must panic because of the panic_on_oom sysctl. + */ +static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask) +{ + if (likely(!sysctl_panic_on_oom)) + return; + if (sysctl_panic_on_oom != 2) { + /* + * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel + * does not panic for cpuset, mempolicy, or memcg allocation + * failures. + */ + if (constraint != CONSTRAINT_NONE) + return; } - return oom_kill_task(p); + read_lock(&tasklist_lock); + dump_header(NULL, gfp_mask, order, NULL, nodemask); + read_unlock(&tasklist_lock); + panic("Out of memory: %s panic_on_oom is enabled\n", + sysctl_panic_on_oom == 2 ? 
"compulsory" : "system-wide"); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) { - unsigned long points = 0; + unsigned long limit; + unsigned int points = 0; struct task_struct *p; + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); + limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem); - if (PTR_ERR(p) == -1UL) + p = select_bad_process(&points, limit, mem, NULL); + if (!p || PTR_ERR(p) == -1UL) goto out; - if (!p) - p = current; - - if (oom_kill_process(p, gfp_mask, 0, points, mem, + if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, "Memory cgroup out of memory")) goto retry; out: @@ -507,7 +547,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) +int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; @@ -524,7 +564,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zone_oom() doesn't succeed + * parallel invocation of try_set_zonelist_oom() doesn't succeed * when it shouldn't. */ zone_set_flag(zone, ZONE_OOM_LOCKED); @@ -553,73 +593,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) } /* - * Must be called with tasklist_lock held for read. + * Try to acquire the oom killer lock for all system zones. Returns zero if a + * parallel oom killing is taking place, otherwise locks all zones and returns + * non-zero. */ -static void __out_of_memory(gfp_t gfp_mask, int order) +static int try_set_system_oom(void) { - struct task_struct *p; - unsigned long points; - - if (sysctl_oom_kill_allocating_task) - if (!oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)")) - return; -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); - - if (PTR_ERR(p) == -1UL) - return; - - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - dump_header(NULL, gfp_mask, order, NULL); - panic("Out of memory and no killable processes...\n"); - } + struct zone *zone; + int ret = 1; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + spin_lock(&zone_scan_lock); + for_each_populated_zone(zone) + if (zone_is_oom_locked(zone)) { + ret = 0; + goto out; + } + for_each_populated_zone(zone) + zone_set_flag(zone, ZONE_OOM_LOCKED); +out: + spin_unlock(&zone_scan_lock); + return ret; } /* - * pagefault handler calls into here because it is out of memory but - * doesn't know exactly how or why. + * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation + * attempts or page faults may now recall the oom killer, if necessary. */ -void pagefault_out_of_memory(void) +static void clear_system_oom(void) { - unsigned long freed = 0; - - blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) - /* Got some memory back in the last second. */ - return; - - /* - * If this is from memcg, oom-killer is already invoked. - * and not worth to go system-wide-oom. 
- */ - if (mem_cgroup_oom_called(current)) - goto rest_and_return; - - if (sysctl_panic_on_oom) - panic("out of memory from page fault. panic_on_oom is selected.\n"); - - read_lock(&tasklist_lock); - __out_of_memory(0, 0); /* unknown gfp_mask and order */ - read_unlock(&tasklist_lock); + struct zone *zone; - /* - * Give "p" a good chance of killing itself before we - * retry to allocate memory. - */ -rest_and_return: - if (!test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); + spin_lock(&zone_scan_lock); + for_each_populated_zone(zone) + zone_clear_flag(zone, ZONE_OOM_LOCKED); + spin_unlock(&zone_scan_lock); } /** @@ -627,6 +634,7 @@ rest_and_return: * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -636,49 +644,93 @@ rest_and_return: void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask) { + const nodemask_t *mpol_mask; + struct task_struct *p; + unsigned long totalpages; unsigned long freed = 0; - enum oom_constraint constraint; + unsigned int points; + enum oom_constraint constraint = CONSTRAINT_NONE; + int killed = 0; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ return; - if (sysctl_panic_on_oom == 2) { - dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. Compulsory panic_on_oom is selected.\n"); + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + boost_dying_task_prio(current, NULL); + return; } /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask, nodemask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask, + &totalpages); + mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + read_lock(&tasklist_lock); + if (sysctl_oom_kill_allocating_task && + !oom_unkillable_task(current, NULL, nodemask) && + (current->signal->oom_adj != OOM_DISABLE)) { + /* + * oom_kill_process() needs tasklist_lock held. If it returns + * non-zero, current could not be killed so we must fallback to + * the tasklist scan. + */ + if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, + NULL, nodemask, + "Out of memory (oom_kill_allocating_task)")) + goto out; + } - switch (constraint) { - case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, 0, NULL, - "No available memory (MPOL_BIND)"); - break; +retry: + p = select_bad_process(&points, totalpages, NULL, mpol_mask); + if (PTR_ERR(p) == -1UL) + goto out; - case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) { - dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. panic_on_oom is selected\n"); - } - /* Fall-through */ - case CONSTRAINT_CPUSET: - __out_of_memory(gfp_mask, order); - break; + /* Found nothing?!?! Either we hang forever, or we panic. 
*/ + if (!p) { + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, + nodemask, "Out of memory")) + goto retry; + killed = 1; +out: read_unlock(&tasklist_lock); /* * Give "p" a good chance of killing itself before we * retry to allocate memory unless "p" is current */ + if (killed && !test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); +} + +/* + * The pagefault handler calls here because it is out of memory, so kill a + * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel + * oom killing is already in progress so do nothing. If a task is found with + * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + */ +void pagefault_out_of_memory(void) +{ + if (try_set_system_oom()) { + out_of_memory(NULL, 0, 0, NULL); + clear_system_oom(); + } if (!test_thread_flag(TIF_MEMDIE)) schedule_timeout_uninterruptible(1); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..e3bccac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> #include <linux/pagevec.h> +#include <trace/events/writeback.h> /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited @@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, } } -/* - * Clip the earned share of dirty pages to that which is actually available. - * This avoids exceeding the total dirty_limit when the floating averages - * fluctuate too quickly. - */ -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty, unsigned long *pbdi_dirty) -{ - unsigned long avail_dirty; - - avail_dirty = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_WRITEBACK) + - global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP); - - if (avail_dirty < dirty) - avail_dirty = dirty - avail_dirty; - else - avail_dirty = 0; - - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + - bdi_stat(bdi, BDI_WRITEBACK); - - *pbdi_dirty = min(*pbdi_dirty, avail_dirty); -} - static inline void task_dirties_fraction(struct task_struct *tsk, long *numerator, long *denominator) { @@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk, } /* - * scale the dirty limit + * task_dirty_limit - scale down dirty throttling threshold for one task * * task specific dirty limit: * * dirty -= (dirty/8) * p_{t} + * + * To protect light/slow dirtying tasks from heavier/fast ones, we start + * throttling individual tasks before reaching the bdi dirty limit. + * Relatively low thresholds will be allocated to heavy dirtiers. So when + * dirty pages grow large, heavy dirtiers will be throttled first, which will + * effectively curb the growth of dirty pages. Light dirtiers with high enough + * dirty threshold may never get throttled. 
*/ -static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) +static unsigned long task_dirty_limit(struct task_struct *tsk, + unsigned long bdi_dirty) { long numerator, denominator; - unsigned long dirty = *pdirty; + unsigned long dirty = bdi_dirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); @@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) do_div(inv, denominator); dirty -= inv; - if (dirty < *pdirty/2) - dirty = *pdirty/2; - *pdirty = dirty; + return max(dirty, bdi_dirty/2); } /* @@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -void -get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi) +/* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * runtime tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; @@ -450,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, } *pbackground = background; *pdirty = dirty; +} + +/* + * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * + * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * - starving fast devices + * - piling up dirty pages (that will take long time to sync) on slow devices + * + * The bdi's share of dirty limit will be adapting to its throughput and + * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + */ +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +{ + u64 bdi_dirty; + long numerator, denominator; - if (bdi) { - u64 bdi_dirty; - long numerator, denominator; + /* + * Calculate this BDI's share of the dirty ratio. + */ + bdi_writeout_fraction(bdi, &numerator, &denominator); - /* - * Calculate this BDI's share of the dirty ratio. 
- */ - bdi_writeout_fraction(bdi, &numerator, &denominator); - - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; - - *pbdi_dirty = bdi_dirty; - clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); - task_dirty_limit(current, pbdi_dirty); - } + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + return bdi_dirty; } /* @@ -490,30 +488,22 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long pause = 1; - + bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { struct writeback_control wbc = { - .bdi = bdi, .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, .range_cyclic = 1, }; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); - - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) - break; + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Throttle it only when the background writeback cannot @@ -524,24 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping, (background_thresh + dirty_thresh) / 2) break; - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. - */ - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wbc(&wbc); - pages_written += write_chunk - wbc.nr_to_write; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - } + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -556,16 +530,45 @@ static void balance_dirty_pages(struct address_space *mapping, if (bdi_thresh < 2*bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); - } else if (bdi_nr_reclaimable) { + } else { bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + /* + * The bdi thresh is somehow "soft" limit derived from the + * global "hard" limit. The former helps to prevent heavy IO + * bdi or process from holding back light ones; The latter is + * the last resort safeguard. 
+ */ + dirty_exceeded = + (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) + || (nr_reclaimable + nr_writeback >= dirty_thresh); + + if (!dirty_exceeded) break; - if (pages_written >= write_chunk) - break; /* We've done our duty */ + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. + */ + trace_wbc_balance_dirty_start(&wbc, bdi); + if (bdi_nr_reclaimable > bdi_thresh) { + writeback_inodes_wb(&bdi->wb, &wbc); + pages_written += write_chunk - wbc.nr_to_write; + trace_wbc_balance_dirty_written(&wbc, bdi); + if (pages_written >= write_chunk) + break; /* We've done our duty */ + } + trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_INTERRUPTIBLE); io_schedule_timeout(pause); @@ -578,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping, pause = HZ / 10; } - if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && - bdi->dirty_exceeded) + if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) @@ -594,10 +596,8 @@ static void balance_dirty_pages(struct address_space *mapping, * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) - > background_thresh))) - bdi_start_writeback(bdi, NULL, 0); + (!laptop_mode && (nr_reclaimable > background_thresh))) + bdi_start_background_writeback(bdi); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -660,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) unsigned long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -static void laptop_timer_fn(unsigned long unused); - -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); - /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ @@ -694,24 +690,23 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); + bdi_arm_supers_timer(); return 0; } -static void do_laptop_sync(struct work_struct *work) -{ - wakeup_flusher_threads(0); - kfree(work); -} - -static void laptop_timer_fn(unsigned long unused) +#ifdef CONFIG_BLOCK +void laptop_mode_timer_fn(unsigned long data) { - struct work_struct *work; + struct request_queue *q = (struct request_queue *)data; + int nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - INIT_WORK(work, do_laptop_sync); - schedule_work(work); - } + /* + * We want to write everything out, not just down to the dirty + * threshold + */ + if (bdi_has_dirty_io(&q->backing_dev_info)) + bdi_start_writeback(&q->backing_dev_info, nr_pages); } /* @@ -719,9 +714,9 @@ static void laptop_timer_fn(unsigned long unused) * of all dirty data a few seconds from now. 
If the flush is already scheduled * then push it back - the user is still using the disk. */ -void laptop_io_completion(void) +void laptop_io_completion(struct backing_dev_info *info) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); + mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -731,8 +726,16 @@ void laptop_io_completion(void) */ void laptop_sync_completion(void) { - del_timer(&laptop_mode_wb_timer); + struct backing_dev_info *bdi; + + rcu_read_lock(); + + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + del_timer(&bdi->laptop_mode_wb_timer); + + rcu_read_unlock(); } +#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload @@ -803,6 +806,42 @@ void __init page_writeback_init(void) } /** + * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * @mapping: address space structure to write + * @start: starting page index + * @end: ending page index (inclusive) + * + * This function scans the page range from @start to @end (inclusive) and tags + * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is + * that write_cache_pages (or whoever calls this function) will then use + * TOWRITE tag to identify pages eligible for writeback. This mechanism is + * used to avoid livelocking of writeback by a process steadily creating new + * dirty pages in the file (thus it is important for this function to be quick + * so that it can tag pages faster than a dirtying process can create them). + */ +/* + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + */ +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ +#define WRITEBACK_TAG_BATCH 4096 + unsigned long tagged; + + do { + spin_lock_irq(&mapping->tree_lock); + tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, + &start, end, WRITEBACK_TAG_BATCH, + PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); + cond_resched(); + /* We check 'start' to handle wrapping when end == ~0UL */ + } while (tagged >= WRITEBACK_TAG_BATCH && start); +} +EXPORT_SYMBOL(tag_pages_for_writeback); + +/** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write @@ -816,6 +855,13 @@ void __init page_writeback_init(void) * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. + * + * To avoid livelocks (when other process dirties new pages), we first tag + * pages which should be written back with TOWRITE tag and only then start + * writing them. For data-integrity sync we have to be careful so that we do + * not miss some pages (e.g., because some other process has cleared TOWRITE + * tag we set). The rule we follow is that TOWRITE tag can be cleared only + * by the process clearing the DIRTY tag (and submitting the page for IO). 
*/ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -831,7 +877,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - long nr_to_write = wbc->nr_to_write; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -849,13 +895,18 @@ int write_cache_pages(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -913,6 +964,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -931,25 +983,18 @@ continue_unlock: done = 1; break; } - } + } - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; - } + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; } } pagevec_release(&pvec); @@ -966,11 +1011,8 @@ continue_unlock: end = writeback_index - 1; goto retry; } - if (!wbc->no_nrwrite_index_update) { - if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = done_index; - wbc->nr_to_write = nr_to_write; - } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; return ret; } @@ -1084,6 +1126,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) task_io_account_write(PAGE_CACHE_SIZE); } } +EXPORT_SYMBOL(account_page_dirtied); /* * For address_spaces which do not use buffers. 
Just tag the page as dirty in @@ -1315,6 +1358,9 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0..f12ad18 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,12 +49,30 @@ #include <linux/debugobjects.h> #include <linux/kmemleak.h> #include <linux/memory.h> +#include <linux/compaction.h> #include <trace/events/kmem.h> +#include <linux/ftrace_event.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +DEFINE_PER_CPU(int, numa_node); +EXPORT_PER_CPU_SYMBOL(numa_node); +#endif + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. + * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. + * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() + * defined in <linux/topology.h>. + */ +DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ +EXPORT_PER_CPU_SYMBOL(_numa_mem_); +#endif + /* * Array of node states. */ @@ -76,6 +94,31 @@ unsigned long totalreserve_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with pm_mutex held (gfp_allowed_mask also should + * only be modified with pm_mutex held, unless the suspend/hibernate code is + * guaranteed not to run in parallel with that modification). 
+ */ +void set_gfp_allowed_mask(gfp_t mask) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask = mask; +} + +gfp_t clear_gfp_allowed_mask(gfp_t mask) +{ + gfp_t ret = gfp_allowed_mask; + + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask &= ~mask; + return ret; +} +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; #endif @@ -263,10 +306,7 @@ static void bad_page(struct page *page) printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); + dump_page(page); dump_stack(); out: @@ -452,6 +492,8 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; + unsigned long combined_idx; + struct page *buddy; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -465,9 +507,6 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - unsigned long combined_idx; - struct page *buddy; - buddy = __page_find_buddy(page, page_idx, order); if (!page_is_buddy(page, buddy, order)) break; @@ -482,8 +521,29 @@ static inline void __free_one_page(struct page *page, order++; } set_page_order(page, order); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + + /* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ + if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = __find_combined_index(page_idx, order); + higher_page = page + combined_idx - page_idx; + higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); + goto out; + } + } + + list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); +out: zone->free_area[order].nr_free++; } @@ -528,13 +588,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int to_free = count; spin_lock(&zone->lock); - zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count); - while (count) { + while (to_free) { struct page *page; struct list_head *list; @@ -559,8 +619,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, page_private(page)); trace_mm_page_pcpu_drain(page, 0, page_private(page)); - } while (--count && --batch_free && !list_empty(list)); + } while (--to_free && --batch_free && !list_empty(list)); } + __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -568,27 +629,31 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->all_unreclaimable = 0; zone->pages_scanned = 0; - 
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } -static void __free_pages_ok(struct page *page, unsigned int order) +static bool free_pages_prepare(struct page *page, unsigned int order) { - unsigned long flags; int i; int bad = 0; - int wasMlocked = __TestClearPageMlocked(page); + trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0 ; i < (1 << order) ; ++i) - bad += free_pages_check(page + i); + for (i = 0; i < (1 << order); i++) { + struct page *pg = page + i; + + if (PageAnon(pg)) + pg->mapping = NULL; + bad += free_pages_check(pg); + } if (bad) - return; + return false; if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); @@ -598,6 +663,17 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); + return true; +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; + int wasMlocked = __TestClearPageMlocked(page); + + if (!free_pages_prepare(page, order)) + return; + local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); @@ -1009,10 +1085,10 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); @@ -1073,8 +1149,9 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page + * cold == 1 ? free a cold page : free a hot page */ -static void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1082,21 +1159,9 @@ static void free_hot_cold_page(struct page *page, int cold) int migratetype; int wasMlocked = __TestClearPageMlocked(page); - kmemcheck_free_shadow(page, 0); - - if (PageAnon(page)) - page->mapping = NULL; - if (free_pages_check(page)) + if (!free_pages_prepare(page, 0)) return; - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), PAGE_SIZE); - debug_check_no_obj_freed(page_address(page), PAGE_SIZE); - } - arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - - pcp = &zone_pcp(zone, get_cpu())->pcp; migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); local_irq_save(flags); @@ -1119,6 +1184,7 @@ static void free_hot_cold_page(struct page *page, int cold) migratetype = MIGRATE_MOVABLE; } + pcp = &this_cpu_ptr(zone->pageset)->pcp; if (cold) list_add_tail(&page->lru, &pcp->lists[migratetype]); else @@ -1131,15 +1197,8 @@ static void free_hot_cold_page(struct page *page, int cold) out: local_irq_restore(flags); - put_cpu(); } -void free_hot_page(struct page *page) -{ - trace_mm_page_free_direct(page, 0); - free_hot_cold_page(page, 0); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] @@ -1169,6 +1228,51 @@ void split_page(struct page *page, unsigned int order) } /* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. 
+ * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + unsigned long watermark; + struct zone *zone; + + BUG_ON(!PageBuddy(page)); + + zone = page_zone(page); + order = page_order(page); + + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + /* Remove page from free list */ + list_del(&page->lru); + zone->free_area[order].nr_free--; + rmv_page_order(page); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } + + return 1 << order; +} + +/* * Really, prep_compound_page() should be called from __rmqueue_bulk(). But * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. @@ -1181,17 +1285,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; again: - cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; - pcp = &zone_pcp(zone, cpu)->pcp; - list = &pcp->lists[migratetype]; local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, @@ -1232,7 +1334,6 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - put_cpu(); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1241,7 +1342,6 @@ again: failed: local_irq_restore(flags); - put_cpu(); return NULL; } @@ -1362,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; + long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -1639,7 +1739,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct page *page; /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zone_oom(zonelist, gfp_mask)) { + if (!try_set_zonelist_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -1660,6 +1760,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (high_zoneidx < ZONE_NORMAL) + goto out; /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 
@@ -1678,6 +1781,62 @@ out: return page; } +#ifdef CONFIG_COMPACTION +/* Try memory compaction for high-order allocations before reclaim */ +static struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page; + + if (!order || compaction_deferred(preferred_zone)) + return NULL; + + *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + nodemask); + if (*did_some_progress != COMPACT_SKIPPED) { + + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); + + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags, preferred_zone, + migratetype); + if (page) { + preferred_zone->compact_considered = 0; + preferred_zone->compact_defer_shift = 0; + count_vm_event(COMPACTSUCCESS); + return page; + } + + /* + * It's bad if compaction run occurs and fails. + * The most likely reason is that pages exist, + * but not enough to satisfy watermarks. + */ + count_vm_event(COMPACTFAIL); + defer_compaction(preferred_zone); + + cond_resched(); + } + + return NULL; +} +#else +static inline struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + return NULL; +} +#endif /* CONFIG_COMPACTION */ + /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, @@ -1688,6 +1847,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; struct reclaim_state reclaim_state; struct task_struct *p = current; + bool drained = false; cond_resched(); @@ -1706,14 +1866,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); - if (order != 0) - drain_all_pages(); + if (unlikely(!(*did_some_progress))) + return NULL; - if (likely(*did_some_progress)) - page = get_page_from_freelist(gfp_mask, nodemask, order, +retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists. Drain them and try again + */ + if (!page && !drained) { + drain_all_pages(); + drained = true; + goto retry; + } + return page; } @@ -1864,6 +2035,15 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; + /* Try direct compaction */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1888,15 +2068,23 @@ rebalance: if (page) goto got_pg; - /* - * The OOM killer does not trigger for high-order - * ~__GFP_NOFAIL allocations so if no progress is being - * made, there are no other options and retrying is - * unlikely to help. 
- */ - if (order > PAGE_ALLOC_COSTLY_ORDER && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; + if (!(gfp_mask & __GFP_NOFAIL)) { + /* + * The oom killer is not called for high-order + * allocations that may fail, so if no progress + * is being made, there are no other options and + * retrying is unlikely to help. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; + /* + * The oom killer is not called for lowmem + * allocations to prevent needlessly killing + * innocent tasks. + */ + if (high_zoneidx < ZONE_NORMAL) + goto nopage; + } goto restart; } @@ -1955,10 +2143,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -1968,6 +2159,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; @@ -2013,9 +2205,8 @@ void __pagevec_free(struct pagevec *pvec) void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { - trace_mm_page_free_direct(page, order); if (order == 0) - free_hot_page(page); + free_hot_cold_page(page, 0); else __free_pages_ok(page, order); } @@ -2180,7 +2371,7 @@ void show_free_areas(void) for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, cpu); + pageset = per_cpu_ptr(zone->pageset, cpu); printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, @@ -2245,7 +2436,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone_nr_free_pages(zone)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -2271,7 +2462,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone_is_all_unreclaimable(zone) ? "yes" : "no") + (zone->all_unreclaimable ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -2420,8 +2611,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, strncpy((char*)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) - build_all_zonelists(); + } else if (oldval != user_zonelist_order) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } } out: mutex_unlock(&zl_order_mutex); @@ -2565,10 +2759,10 @@ static int default_zonelist_order(void) struct zone *z; int average_size; /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. + * ZONE_DMA and ZONE_DMA32 can be very small area in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + * This function detect ZONE_DMA/DMA32 size and configures zone order. */ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) 
*/ low_kmem_size = 0; @@ -2580,6 +2774,15 @@ static int default_zonelist_order(void) if (zone_type < ZONE_NORMAL) low_kmem_size += z->present_pages; total_size += z->present_pages; + } else if (zone_type == ZONE_NORMAL) { + /* + * If any node has only lowmem, then node order + * is preferred to allow kernel allocations + * locally; otherwise, they can easily infringe + * on other nodes when there is an abundance of + * lowmem available to allocate from. + */ + return ZONELIST_ORDER_NODE; } } } @@ -2693,6 +2896,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * Return node id of node used for "local" allocations. + * I.e., first node id of first zone in arg node's generic zonelist. + * Used for initializing percpu 'numa_mem', which is used primarily + * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. + */ +int local_memory_node(int node) +{ + struct zone *zone; + + (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + gfp_zone(GFP_KERNEL), + NULL, + &zone); + return zone->node; +} +#endif #else /* CONFIG_NUMA */ @@ -2745,10 +2966,36 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static void setup_zone_pageset(struct zone *zone); + +/* + * Global mutex to protect against size modification of zonelists + * as well as to serialize pageset setup for the new populated zone. + */ +DEFINE_MUTEX(zonelists_mutex); + /* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *dummy) +static __init_refok int __build_all_zonelists(void *data) { int nid; + int cpu; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -2759,10 +3006,53 @@ static int __build_all_zonelists(void *dummy) build_zonelists(pgdat); build_zonelist_cache(pgdat); } + +#ifdef CONFIG_MEMORY_HOTPLUG + /* Setup real pagesets for the new zone */ + if (data) { + struct zone *zone = data; + setup_zone_pageset(zone); + } +#endif + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). 
+ */ + for_each_possible_cpu(cpu) { + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * We now know the "local memory node" for each node-- + * i.e., the node of the first zone in the generic zonelist. + * Set up numa_mem percpu variable for on-line cpus. During + * boot, only the boot cpu should be on-line; we'll init the + * secondary cpus' numa_mem as they come on-line. During + * node/memory hotplug, we'll fixup all on-line cpus. + */ + if (cpu_online(cpu)) + set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); +#endif + } + return 0; } -void build_all_zonelists(void) +/* + * Called with zonelists_mutex held always + * unless system_state == SYSTEM_BOOTING. + */ +void build_all_zonelists(void *data) { set_zonelist_order(); @@ -2773,7 +3063,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, data, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -3096,121 +3386,36 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } - -#ifdef CONFIG_NUMA -/* - * Boot pageset table. One per cpu which is going to be used for all - * zones and all nodes. The parameters will be set in such a way - * that an item put on a list will immediately be handed over to - * the buddy list. This is safe since pageset manipulation is done - * with interrupts disabled. - * - * Some NUMA counter updates may also be caught by the boot pagesets. - * - * The boot_pagesets must be kept even after bootup is complete for - * unused processors and/or zones. They do play a role for bootstrapping - * hotplugged processors. - * - * zoneinfo_show() and maybe other functions do - * not check if the processor is online before following the pageset pointer. - * Other parts of the kernel may not check if the zone is available. - */ -static struct per_cpu_pageset boot_pageset[NR_CPUS]; - -/* - * Dynamically allocate memory for the - * per cpu pageset array in struct zone. 
- */ -static int __cpuinit process_zones(int cpu) +static __meminit void setup_zone_pageset(struct zone *zone) { - struct zone *zone, *dzone; - int node = cpu_to_node(cpu); + int cpu; - node_set_state(node, N_CPU); /* this node has a cpu */ + zone->pageset = alloc_percpu(struct per_cpu_pageset); - for_each_populated_zone(zone) { - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, node); - if (!zone_pcp(zone, cpu)) - goto bad; + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + setup_pageset(pcp, zone_batchsize(zone)); if (percpu_pagelist_fraction) - setup_pagelist_highmark(zone_pcp(zone, cpu), - (zone->present_pages / percpu_pagelist_fraction)); - } - - return 0; -bad: - for_each_zone(dzone) { - if (!populated_zone(dzone)) - continue; - if (dzone == zone) - break; - kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = &boot_pageset[cpu]; - } - return -ENOMEM; -} - -static inline void free_zone_pagesets(int cpu) -{ - struct zone *zone; - - for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); - - /* Free per_cpu_pageset if it is slab allocated */ - if (pset != &boot_pageset[cpu]) - kfree(pset); - zone_pcp(zone, cpu) = &boot_pageset[cpu]; + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); } } -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - int ret = NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - free_zone_pagesets(cpu); - break; - default: - break; - } - return ret; -} - -static struct notifier_block __cpuinitdata pageset_notifier = - { &pageset_cpuup_callback, NULL, 0 }; - +/* + * Allocate per cpu pagesets and initialize them. + * Before this call only boot pagesets were available. + */ void __init setup_per_cpu_pageset(void) { - int err; + struct zone *zone; - /* Initialize per_cpu_pageset for cpu 0. - * A cpuup callback will do this for every cpu - * as it comes online - */ - err = process_zones(smp_processor_id()); - BUG_ON(err); - register_cpu_notifier(&pageset_notifier); + for_each_populated_zone(zone) + setup_zone_pageset(zone); } -#endif - static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { @@ -3260,11 +3465,11 @@ static int __zone_pcp_update(void *data) int cpu; unsigned long batch = zone_batchsize(zone), flags; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; local_irq_save(flags); @@ -3282,21 +3487,17 @@ void zone_pcp_update(struct zone *zone) static __meminit void zone_pcp_init(struct zone *zone) { - int cpu; - unsigned long batch = zone_batchsize(zone); + /* + * per cpu subsystem is not up at this point. The following code + * relies on the ability of the linker to provide the + * offset of a (static) per cpu variable into the per cpu area. + */ + zone->pageset = &boot_pageset; - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. 
Slab allocator not functional yet */ - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } if (zone->present_pages) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", + zone->name, zone->present_pages, + zone_batchsize(zone)); } __meminit int init_currently_empty_zone(struct zone *zone, @@ -3435,6 +3636,69 @@ void __init free_bootmem_with_active_regions(int nid, } } +int __init add_from_early_node_map(struct range *range, int az, + int nr_range, int nid) +{ + int i; + u64 start, end; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } + return nr_range; +} + +#ifdef CONFIG_NO_BOOTMEM +void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + void *ptr; + + if (limit > get_max_mapped()) + limit = get_max_mapped(); + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + addr = find_early_area(ei_start, ei_last, + goal, limit, size, align); + + if (addr == -1ULL) + continue; + +#if 0 + printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", + nid, + ei_start, ei_last, goal, limit, size, + align, addr); +#endif + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + reserve_early_without_check(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. 
+ */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; + } + + return NULL; +} +#endif + + void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3849,8 +4113,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone->prev_priority = DEF_PRIORITY; - zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); @@ -4377,8 +4639,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s %0#10lx -> %0#10lx\n", - zone_names[i], + printk(" %-8s ", zone_names[i]); + if (arch_zone_lowest_possible_pfn[i] == + arch_zone_highest_possible_pfn[i]) + printk("empty\n"); + else + printk("%0#10lx -> %0#10lx\n", arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); } @@ -4467,7 +4733,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { +#ifndef CONFIG_NO_BOOTMEM + .bdata = &bootmem_node_data[0] +#endif + }; EXPORT_SYMBOL(contig_page_data); #endif @@ -4810,10 +5080,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark(zone_pcp(zone, cpu), high); + setup_pagelist_highmark( + per_cpu_ptr(zone->pageset, cpu), high); } } return 0; @@ -4911,9 +5182,9 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", tablename, - (1U << log2qty), + (1UL << log2qty), ilog2(size) - PAGE_SHIFT, size); @@ -5159,3 +5430,80 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } #endif + +static struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_buddy, "buddy" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif + {-1UL, NULL }, +}; + +static void dump_page_flags(unsigned long flags) +{ + const char *delim = ""; + unsigned long mask; + int i; + + printk(KERN_ALERT "page flags: %#lx(", flags); + + /* remove zone id */ + flags &= 
(1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; pageflag_names[i].name && flags; i++) { + + mask = pageflag_names[i].mask; + if ((flags & mask) != mask) + continue; + + flags &= ~mask; + printk("%s%s", delim, pageflag_names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + printk("%s%#lx", delim, flags); + + printk(")\n"); +} + +void dump_page(struct page *page) +{ + printk(KERN_ALERT + "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, page_count(page), page_mapcount(page), + page->mapping, page->index); + dump_page_flags(page->flags); +} diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d5..5bffada 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -9,6 +9,7 @@ #include <linux/vmalloc.h> #include <linux/cgroup.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) @@ -126,6 +127,12 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) if (!base) base = vmalloc(table_size); } + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); } else { /* * We don't have to allocate page_cgroup again, but @@ -284,6 +291,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { struct page **map; unsigned long length; + spinlock_t lock; }; struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; @@ -335,6 +343,43 @@ not_enough_page: } /** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @end: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup useing 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + int type = swp_type(ent); + unsigned long offset = swp_offset(ent); + unsigned long idx = offset / SC_PER_PAGE; + unsigned long pos = offset & SC_POS_MASK; + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + ctrl = &swap_cgroup_ctrl[type]; + + mappage = ctrl->map[idx]; + sc = page_address(mappage); + sc += pos; + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** * swap_cgroup_record - record mem_cgroup for this swp_entry. 
* @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded @@ -352,14 +397,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) struct page *mappage; struct swap_cgroup *sc; unsigned short old; + unsigned long flags; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; + spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); return old; } @@ -411,6 +459,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) mutex_lock(&swap_cgroup_mutex); ctrl->length = length; ctrl->map = array; + spin_lock_init(&ctrl->lock); if (swap_cgroup_prepare(type)) { /* memory shortage */ ctrl->map = NULL; diff --git a/mm/page_io.c b/mm/page_io.c index a19af95..2dee975 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/bio.h> @@ -105,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7b47a57..8b1a2ce 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + pte_t *pte; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = huge_pte_offset(walk->mm, addr & hmask); + if (pte && walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + if (err) + return err; + } while (addr = next, addr != end); + + return 0; +} +#endif + /** * walk_page_range - walk a memory map's page tables with a callback * @mm: memory map to walk @@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end, vma = find_vma(walk->mm, addr); #ifdef CONFIG_HUGETLB_PAGE if (vma && is_vm_hugetlb_page(vma)) { - pte_t *pte; - struct hstate *hs; - if (vma->vm_end < next) next = vma->vm_end; - hs = hstate_vma(vma); - pte = huge_pte_offset(walk->mm, - addr & huge_page_mask(hs)); - if (pte && !huge_pte_none(huge_ptep_get(pte)) - && walk->hugetlb_entry) - err = walk->hugetlb_entry(pte, addr, - next, walk); + /* + * Hugepage is very tightly coupled with vma, so + * walk through hugetlb entries within a given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); if (err) break; + pgd = pgd_offset(walk->mm, next); continue; } #endif diff --git a/mm/percpu-km.c b/mm/percpu-km.c new file mode 100644 index 0000000..df68085 --- /dev/null +++ b/mm/percpu-km.c @@ -0,0 +1,104 @@ +/* + * mm/percpu-km.c - kernel memory based chunk allocation + * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. 
+ * + * Chunks are allocated as a contiguous kernel memory using gfp + * allocation. This is to be used on nommu architectures. + * + * To use percpu-km, + * + * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig. + * + * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's + * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work + * fine. + * + * - NUMA is not supported. When setting up the first chunk, + * @cpu_distance_fn should be NULL or report all CPUs to be nearer + * than or at LOCAL_DISTANCE. + * + * - It's best if the chunk size is power of two multiple of + * PAGE_SIZE. Because each chunk is allocated as a contiguous + * kernel memory block using alloc_pages(), memory will be wasted if + * chunk size is not aligned. percpu-km code will whine about it. + */ + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#error "contiguous percpu allocation is incompatible with paged first chunk" +#endif + +#include <linux/log2.h> + +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + /* noop */ + return 0; +} + +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + /* nada */ +} + +static struct pcpu_chunk *pcpu_create_chunk(void) +{ + const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; + struct pcpu_chunk *chunk; + struct page *pages; + int i; + + chunk = pcpu_alloc_chunk(); + if (!chunk) + return NULL; + + pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); + if (!pages) { + pcpu_free_chunk(chunk); + return NULL; + } + + for (i = 0; i < nr_pages; i++) + pcpu_set_page_chunk(nth_page(pages, i), chunk); + + chunk->data = pages; + chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + return chunk; +} + +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) +{ + const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; + + if (chunk && chunk->data) + __free_pages(chunk->data, order_base_2(nr_pages)); + pcpu_free_chunk(chunk); +} + +static struct page *pcpu_addr_to_page(void *addr) +{ + return virt_to_page(addr); +} + +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) +{ + size_t nr_pages, alloc_pages; + + /* all units must be in a single group */ + if (ai->nr_groups != 1) { + printk(KERN_CRIT "percpu: can't handle more than one groups\n"); + return -EINVAL; + } + + nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT; + alloc_pages = roundup_pow_of_two(nr_pages); + + if (alloc_pages > nr_pages) + printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", + alloc_pages - nr_pages); + + return 0; +} diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c new file mode 100644 index 0000000..7d9c1d0 --- /dev/null +++ b/mm/percpu-vm.c @@ -0,0 +1,451 @@ +/* + * mm/percpu-vm.c - vmalloc area based chunk allocation + * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * Chunks are mapped into vmalloc areas and populated page by page. + * This is the default chunk allocator. 
+ */ + +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); + + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); +} + +/** + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * @chunk: chunk of interest + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array + * + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. + */ +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) +{ + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } + + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + + *bitmapp = bitmap; + return pages; +} + +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } +} + +/** + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). 
+ */ +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} + +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} + +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; + + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; + } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); + } + + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. 
+ */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); +} + +/** + * pcpu_map_pages - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). + */ +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu, tcpu; + int i, err; + + for_each_possible_cpu(cpu) { + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], + page_end - page_start); + if (err < 0) + goto err; + } + + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); + } + + return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate in bytes + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
+ */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; + unsigned int cpu; + int rs, re, rc; + + /* quick path, check whether all pages are already there */ + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + goto clear; + + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); + + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; + + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; + } + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + return 0; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + return; + + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. 
+ */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +} + +static struct pcpu_chunk *pcpu_create_chunk(void) +{ + struct pcpu_chunk *chunk; + struct vm_struct **vms; + + chunk = pcpu_alloc_chunk(); + if (!chunk) + return NULL; + + vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + if (!vms) { + pcpu_free_chunk(chunk); + return NULL; + } + + chunk->data = vms; + chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; + return chunk; +} + +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) +{ + if (chunk && chunk->data) + pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); + pcpu_free_chunk(chunk); +} + +static struct page *pcpu_addr_to_page(void *addr) +{ + return vmalloc_to_page(addr); +} + +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) +{ + /* no extra restriction */ + return 0; +} diff --git a/mm/percpu.c b/mm/percpu.c index 083e7c9..c76ef38 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1,5 +1,5 @@ /* - * linux/mm/percpu.c - percpu memory allocator + * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> @@ -7,14 +7,13 @@ * This file is released under the GPLv2. * * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of boot-time determined number of units and the - * first chunk is used for static percpu variables in the kernel image + * areas. Percpu areas are allocated in chunks. Each chunk is + * consisted of boot-time determined number of units and the first + * chunk is used for static percpu variables in the kernel image * (special boot time alloc/init handling necessary as these areas * need to be brought up before allocation services are running). * Unit grows as necessary and all units grow or shrink in unison. - * When a chunk is filled up, another chunk is allocated. ie. in - * vmalloc area + * When a chunk is filled up, another chunk is allocated. 
* * c0 c1 c2 * ------------------- ------------------- ------------ @@ -80,13 +79,15 @@ /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) + (void __percpu *)((unsigned long)(addr) - \ + (unsigned long)pcpu_base_addr + \ + (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) + (void __force *)((unsigned long)(ptr) + \ + (unsigned long)pcpu_base_addr - \ + (unsigned long)__per_cpu_start) #endif struct pcpu_chunk { @@ -97,7 +98,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct vm_struct **vms; /* mapped vmalloc regions */ + void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -175,6 +176,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +static bool pcpu_addr_in_first_chunk(void *addr) +{ + void *first_start = pcpu_first_chunk->base_addr; + + return addr >= first_start && addr < first_start + pcpu_unit_size; +} + +static bool pcpu_addr_in_reserved_chunk(void *addr) +{ + void *first_start = pcpu_first_chunk->base_addr; + + return addr >= first_start && + addr < first_start + pcpu_reserved_chunk_limit; +} + static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ @@ -196,27 +212,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) return pcpu_size_to_slot(chunk->free_size); } -static int pcpu_page_idx(unsigned int cpu, int page_idx) -{ - return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; -} - -static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + - (page_idx << PAGE_SHIFT); -} - -static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - /* must not be used on pre-mapped chunk */ - WARN_ON(chunk->immutable); - - return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); -} - /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { @@ -229,13 +224,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } -static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); +} + +static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, + int *rs, int *re, int end) { *rs = find_next_zero_bit(chunk->populated, end, *rs); *re = find_next_bit(chunk->populated, end, *rs + 1); } -static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, + int *rs, int *re, int end) { *rs = 
find_next_bit(chunk->populated, end, *rs); *re = find_next_zero_bit(chunk->populated, end, *rs + 1); @@ -273,6 +282,9 @@ static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) */ static void *pcpu_mem_alloc(size_t size) { + if (WARN_ON_ONCE(!slab_is_available())) + return NULL; + if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else { @@ -324,36 +336,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } /** - * pcpu_chunk_addr_search - determine chunk containing specified address - * @addr: address for which the chunk needs to be determined. - * - * RETURNS: - * The address of the found chunk. - */ -static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) -{ - void *first_start = pcpu_first_chunk->base_addr; - - /* is it in the first chunk? */ - if (addr >= first_start && addr < first_start + pcpu_unit_size) { - /* is it in the reserved area? */ - if (addr < first_start + pcpu_reserved_chunk_limit) - return pcpu_reserved_chunk; - return pcpu_first_chunk; - } - - /* - * The address is relative to unit0 which might be unused and - * thus unmapped. Offset the address to the unit space of the - * current processor before looking it up in the vmalloc - * space. Note that any possible cpu id can be used here, so - * there's no need to worry about preemption or cpu hotplug. - */ - addr += pcpu_unit_offsets[raw_smp_processor_id()]; - return pcpu_get_page_chunk(vmalloc_to_page(addr)); -} - -/** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest * @@ -411,14 +393,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) goto out_unlock; old_size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, old_size); + old = chunk->map; - /* - * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is - * one of the first chunks and still using static map. - */ - if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - old = chunk->map; + memcpy(new, old, old_size); chunk->map_alloc = new_alloc; chunk->map = new; @@ -621,436 +598,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) pcpu_chunk_relocate(chunk, oslot); } -/** - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap - * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap - * @may_alloc: may allocate the array - * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx(). The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. - * - * RETURNS: - * Pointer to temp pages array on success, NULL on failure. 
- */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) -{ - static struct page **pages; - static unsigned long *bitmap; - size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_alloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_alloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - - memset(pages, 0, pages_size); - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); - - *bitmapp = bitmap; - return pages; -} - -/** - * pcpu_free_pages - free pages which were allocated for @chunk - * @chunk: chunk pages were allocated for - * @pages: array of pages to be freed, indexed by pcpu_page_idx() - * @populated: populated bitmap - * @page_start: page index of the first page to be freed - * @page_end: page index of the last page to be freed + 1 - * - * Free pages [@page_start and @page_end) in @pages for all units. - * The pages were allocated for @chunk. - */ -static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) -{ - unsigned int cpu; - int i; - - for_each_possible_cpu(cpu) { - for (i = page_start; i < page_end; i++) { - struct page *page = pages[pcpu_page_idx(cpu, i)]; - - if (page) - __free_page(page); - } - } -} - -/** - * pcpu_alloc_pages - allocates pages for @chunk - * @chunk: target chunk - * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap - * @page_start: page index of the first page to be allocated - * @page_end: page index of the last page to be allocated + 1 - * - * Allocate pages [@page_start,@page_end) into @pages for all units. - * The allocation is for @chunk. Percpu core doesn't care about the - * content of @pages and will pass it verbatim to pcpu_map_pages(). - */ -static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) +static struct pcpu_chunk *pcpu_alloc_chunk(void) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - unsigned int cpu; - int i; - - for_each_possible_cpu(cpu) { - for (i = page_start; i < page_end; i++) { - struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; - - *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); - if (!*pagep) { - pcpu_free_pages(chunk, pages, populated, - page_start, page_end); - return -ENOMEM; - } - } - } - return 0; -} - -/** - * pcpu_pre_unmap_flush - flush cache prior to unmapping - * @chunk: chunk the regions to be flushed belongs to - * @page_start: page index of the first page to be flushed - * @page_end: page index of the last page to be flushed + 1 - * - * Pages in [@page_start,@page_end) of @chunk are about to be - * unmapped. Flush cache. As each flushing trial can be very - * expensive, issue flush on the whole region at once rather than - * doing it for each cpu. This could be an overkill but is more - * scalable. 
- */ -static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, - int page_start, int page_end) -{ - flush_cache_vunmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); -} - -static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) -{ - unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); -} - -/** - * pcpu_unmap_pages - unmap pages out of a pcpu_chunk - * @chunk: chunk of interest - * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * Corresponding elements in @pages were cleared by the caller and can - * be used to carry information to pcpu_free_pages() which will be - * called after all unmaps are finished. The caller should call - * proper pre/post flush functions. - */ -static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) -{ - unsigned int cpu; - int i; + struct pcpu_chunk *chunk; - for_each_possible_cpu(cpu) { - for (i = page_start; i < page_end; i++) { - struct page *page; + chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); + if (!chunk) + return NULL; - page = pcpu_chunk_page(chunk, cpu, i); - WARN_ON(!page); - pages[pcpu_page_idx(cpu, i)] = page; - } - __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), - page_end - page_start); + chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) { + kfree(chunk); + return NULL; } - for (i = page_start; i < page_end; i++) - __clear_bit(i, populated); -} + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; -/** - * pcpu_post_unmap_tlb_flush - flush TLB after unmapping - * @chunk: pcpu_chunk the regions to be flushed belong to - * @page_start: page index of the first page to be flushed - * @page_end: page index of the last page to be flushed + 1 - * - * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush - * TLB for the regions. This can be skipped if the area is to be - * returned to vmalloc as vmalloc will handle TLB flushing lazily. - * - * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once - * for the whole region. - */ -static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, - int page_start, int page_end) -{ - flush_tlb_kernel_range( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); -} + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; -static int __pcpu_map_pages(unsigned long addr, struct page **pages, - int nr_pages) -{ - return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, - PAGE_KERNEL, pages); + return chunk; } -/** - * pcpu_map_pages - map pages into a pcpu_chunk - * @chunk: chunk of interest - * @pages: pages array containing pages to be mapped - * @populated: populated bitmap - * @page_start: page index of the first page to map - * @page_end: page index of the last page to map + 1 - * - * For each cpu, map pages [@page_start,@page_end) into @chunk. The - * caller is responsible for calling pcpu_post_map_flush() after all - * mappings are complete. 
- * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). - */ -static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) +static void pcpu_free_chunk(struct pcpu_chunk *chunk) { - unsigned int cpu, tcpu; - int i, err; - - for_each_possible_cpu(cpu) { - err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), - &pages[pcpu_page_idx(cpu, page_start)], - page_end - page_start); - if (err < 0) - goto err; - } - - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) - pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], - chunk); - __set_bit(i, populated); - } - - return 0; - -err: - for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; - __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), - page_end - page_start); - } - return err; + if (!chunk) + return; + pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); + kfree(chunk); } -/** - * pcpu_post_map_flush - flush cache after mapping - * @chunk: pcpu_chunk the regions to be flushed belong to - * @page_start: page index of the first page to be flushed - * @page_end: page index of the last page to be flushed + 1 - * - * Pages [@page_start,@page_end) of @chunk have been mapped. Flush - * cache. - * - * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once - * for the whole region. +/* + * Chunk management implementation. + * + * To allow different implementations, chunk alloc/free and + * [de]population are implemented in a separate file which is pulled + * into this file and compiled together. The following functions + * should be implemented. + * + * pcpu_populate_chunk - populate the specified range of a chunk + * pcpu_depopulate_chunk - depopulate the specified range of a chunk + * pcpu_create_chunk - create a new chunk + * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop + * pcpu_addr_to_page - translate address to physical address + * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ -static void pcpu_post_map_flush(struct pcpu_chunk *chunk, - int page_start, int page_end) -{ - flush_cache_vmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); -} +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); +static struct pcpu_chunk *pcpu_create_chunk(void); +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); +static struct page *pcpu_addr_to_page(void *addr); +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); + +#ifdef CONFIG_NEED_PER_CPU_KM +#include "percpu-km.c" +#else +#include "percpu-vm.c" +#endif /** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. * - * CONTEXT: - * pcpu_alloc_mutex. 
+ * RETURNS: + * The address of the found chunk. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - struct page **pages; - unsigned long *populated; - int rs, re; - - /* quick path, check whether it's empty already */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - if (rs == page_start && re == page_end) - return; - break; + /* is it in the first chunk? */ + if (pcpu_addr_in_first_chunk(addr)) { + /* is it in the reserved area? */ + if (pcpu_addr_in_reserved_chunk(addr)) + return pcpu_reserved_chunk; + return pcpu_first_chunk; } - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); - /* - * If control reaches here, there must have been at least one - * successful population attempt so the temp pages array must - * be available now. + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. */ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); - BUG_ON(!pages); - - /* unmap and free */ - pcpu_pre_unmap_flush(chunk, page_start, page_end); - - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); - - /* no need to flush tlb, vmalloc will handle it lazily */ - - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); -} - -/** - * pcpu_populate_chunk - populate and map an area of a pcpu_chunk - * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes - * - * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. - * - * CONTEXT: - * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
- */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) -{ - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; - struct page **pages; - unsigned long *populated; - unsigned int cpu; - int rs, re, rc; - - /* quick path, check whether all pages are already there */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { - if (rs == page_start && re == page_end) - goto clear; - break; - } - - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); - if (!pages) - return -ENOMEM; - - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_free; - free_end = re; - } - - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; - } - pcpu_post_map_flush(chunk, page_start, page_end); - - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - return rc; -} - -static void free_pcpu_chunk(struct pcpu_chunk *chunk) -{ - if (!chunk) - return; - if (chunk->vms) - pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - kfree(chunk); -} - -static struct pcpu_chunk *alloc_pcpu_chunk(void) -{ - struct pcpu_chunk *chunk; - - chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); - if (!chunk) - return NULL; - - chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; - - chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, - GFP_KERNEL); - if (!chunk->vms) { - free_pcpu_chunk(chunk); - return NULL; - } - - INIT_LIST_HEAD(&chunk->list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; - chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; - - return chunk; + addr += pcpu_unit_offsets[raw_smp_processor_id()]; + return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } /** @@ -1067,7 +700,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) { static int warn_limit = 10; struct pcpu_chunk *chunk; @@ -1142,7 +775,7 @@ restart: /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = alloc_pcpu_chunk(); + chunk = pcpu_create_chunk(); if (!chunk) { err = "failed to allocate new chunk"; goto fail_unlock_mutex; @@ -1196,7 +829,7 @@ fail_unlock_mutex: * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -void *__alloc_percpu(size_t size, size_t align) +void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false); } @@ -1217,7 +850,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_reserved_percpu(size_t size, size_t align) +void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true); } @@ -1254,7 +887,7 @@ static void pcpu_reclaim(struct work_struct *work) list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); - free_pcpu_chunk(chunk); + pcpu_destroy_chunk(chunk); } mutex_unlock(&pcpu_alloc_mutex); @@ -1269,7 +902,7 @@ static void pcpu_reclaim(struct work_struct *work) * CONTEXT: * Can be called from atomic context. */ -void free_percpu(void *ptr) +void free_percpu(void __percpu *ptr) { void *addr; struct pcpu_chunk *chunk; @@ -1304,6 +937,32 @@ void free_percpu(void *ptr) EXPORT_SYMBOL_GPL(free_percpu); /** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area. Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. + */ +bool is_kernel_percpu_address(unsigned long addr) +{ + const size_t static_size = __per_cpu_end - __per_cpu_start; + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + + if ((void *)addr >= start && (void *)addr < start + static_size) + return true; + } + return false; +} + +/** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * @@ -1317,25 +976,39 @@ EXPORT_SYMBOL_GPL(free_percpu); */ phys_addr_t per_cpu_ptr_to_phys(void *addr) { - if ((unsigned long)addr < VMALLOC_START || - (unsigned long)addr >= VMALLOC_END) - return __pa(addr); - else - return page_to_phys(vmalloc_to_page(addr)); -} - -static inline size_t pcpu_calc_fc_sizes(size_t static_size, - size_t reserved_size, - ssize_t *dyn_sizep) -{ - size_t size_sum; + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + bool in_first_chunk = false; + unsigned long first_start, first_end; + unsigned int cpu; - size_sum = PFN_ALIGN(static_size + reserved_size + - (*dyn_sizep >= 0 ? *dyn_sizep : 0)); - if (*dyn_sizep != 0) - *dyn_sizep = size_sum - static_size - reserved_size; + /* + * The following test on first_start/end isn't strictly + * necessary but will speed up lookups of addresses which + * aren't in the first chunk. 
+ */ + first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); + first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_start && + (unsigned long)addr < first_end) { + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + + if (addr >= start && addr < start + pcpu_unit_size) { + in_first_chunk = true; + break; + } + } + } - return size_sum; + if (in_first_chunk) { + if ((unsigned long)addr < VMALLOC_START || + (unsigned long)addr >= VMALLOC_END) + return __pa(addr); + else + return page_to_phys(vmalloc_to_page(addr)); + } else + return page_to_phys(pcpu_addr_to_page(addr)); } /** @@ -1396,7 +1069,7 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) /** * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @@ -1414,15 +1087,15 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. */ -struct pcpu_alloc_info * __init pcpu_build_alloc_info( - size_t reserved_size, ssize_t dyn_size, +static struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0, nr_groups = 1, nr_units = 0; + int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ int last_allocs, group, unit; @@ -1432,7 +1105,12 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); - memset(group_cnt, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_cnt)); + + /* calculate size_sum and ensure dyn_size is enough for early alloc */ + size_sum = PFN_ALIGN(static_size + reserved_size + + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + dyn_size = size_sum - static_size - reserved_size; /* * Determine min_unit_size, alloc_size and max_upa such that @@ -1440,7 +1118,6 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( * which can accomodate 4k aligned segments which are equal to * or larger than min_unit_size. */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); alloc_size = roundup(min_unit_size, atom_size); @@ -1466,7 +1143,6 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( } group_map[cpu] = group; group_cnt[group]++; - group_cnt_max = max(group_cnt_max, group_cnt[group]); } /* @@ -1488,7 +1164,7 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( } /* - * Don't accept if wastage is over 25%. The + * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check. 
*/ @@ -1662,7 +1338,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { static char cpus_buf[4096] __initdata; - static int smap[2], dmap[2]; + static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; + static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; @@ -1685,14 +1362,14 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } while (0) /* sanity checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || - ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); PCPU_SETUP_BUG_ON(!ai->static_size); PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); + PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); @@ -1724,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; + pcpu_last_unit_cpu = cpu; } } - pcpu_last_unit_cpu = cpu; pcpu_nr_units = unit; for_each_possible_cpu(cpu) @@ -1843,7 +1520,7 @@ early_param("percpu_alloc", percpu_alloc_setup); /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @alloc_fn: function to allocate percpu page @@ -1864,10 +1541,7 @@ early_param("percpu_alloc", percpu_alloc_setup); * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * - * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. When @dyn_size is auto, - * @dyn_size is just big enough to fill page alignment after static - * and reserved areas. + * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using @free_fn. @@ -1875,7 +1549,7 @@ early_param("percpu_alloc", percpu_alloc_setup); * RETURNS: * 0 on success, -errno on failure. */ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, +int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_alloc_fn_t alloc_fn, @@ -2006,7 +1680,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); @@ -2132,3 +1806,33 @@ void __init setup_per_cpu_areas(void) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +/* + * First and reserved chunks are initialized with temporary allocation + * map in initdata so that they can be used before slab is online. 
+ * This function is called after slab is brought up and replaces those + * with properly allocated maps. + */ +void __init percpu_init_late(void) +{ + struct pcpu_chunk *target_chunks[] = + { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; + struct pcpu_chunk *chunk; + unsigned long flags; + int i; + + for (i = 0; (chunk = target_chunks[i]); i++) { + int *map; + const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); + + BUILD_BUG_ON(size > PAGE_SIZE); + + map = pcpu_mem_alloc(size); + BUG_ON(!map); + + spin_lock_irqsave(&pcpu_lock, flags); + memcpy(map, chunk->map, size); + chunk->map = map; + spin_unlock_irqrestore(&pcpu_lock, flags); + } +} diff --git a/mm/percpu_up.c b/mm/percpu_up.c new file mode 100644 index 0000000..db884fa --- /dev/null +++ b/mm/percpu_up.c @@ -0,0 +1,30 @@ +/* + * mm/percpu_up.c - dummy percpu memory allocator implementation for UP + */ + +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/slab.h> + +void __percpu *__alloc_percpu(size_t size, size_t align) +{ + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > SMP_CACHE_BYTES); + return (void __percpu __force *)kzalloc(size, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +void free_percpu(void __percpu *p) +{ + kfree(this_cpu_ptr(p)); +} +EXPORT_SYMBOL_GPL(free_percpu); + +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + return __pa(addr); +} diff --git a/mm/quicklist.c b/mm/quicklist.c index 6633965..2876349 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -14,6 +14,7 @@ */ #include <linux/kernel.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> diff --git a/mm/readahead.c b/mm/readahead.c index 033bc13..77506a2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/blkdev.h> @@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + /* be dumb */ + if (filp && (filp->f_mode & FMODE_RANDOM)) { + force_page_cache_readahead(mapping, filp, offset, req_size); + return; + } + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, false, offset, req_size); } @@ -516,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @req_size: hint: total size of the read which the caller is performing in * pagecache pages * - * page_cache_async_ondemand() should be called when a page is used which + * page_cache_async_readahead() should be called when a page is used which * has the PG_readahead flag; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. 
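The percpu hunks above annotate __alloc_percpu(), __alloc_reserved_percpu() and free_percpu() with __percpu and split chunk management between mm/percpu.c (with percpu-vm.c/percpu-km.c) on SMP and the new mm/percpu_up.c stub on UP. Below is a minimal, hypothetical module sketch of how a caller sees that API; it is not part of this commit, struct my_stats and the my_stats_* functions are invented names, and the explicit zeroing is redundant (the allocator clears the area) but shows per_cpu_ptr() addressing.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/percpu.h>

struct my_stats {
        unsigned long hits;
        unsigned long misses;
};

static struct my_stats __percpu *stats;    /* sparse-checked percpu pointer */

static int __init my_stats_init(void)
{
        int cpu;

        /*
         * alloc_percpu() expands to the __alloc_percpu() shown above and
         * resolves to either the SMP allocator (mm/percpu.c) or the UP
         * stub (mm/percpu_up.c).
         */
        stats = alloc_percpu(struct my_stats);
        if (!stats)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct my_stats *s = per_cpu_ptr(stats, cpu);

                s->hits = 0;    /* redundant: the area comes back zeroed */
                s->misses = 0;
        }
        return 0;
}

static void __exit my_stats_exit(void)
{
        /* free_percpu() likewise takes a __percpu pointer after this patch. */
        free_percpu(stats);
}

module_init(my_stats_init);
module_exit(my_stats_exit);
MODULE_LICENSE("GPL");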
@@ -56,12 +56,14 @@ #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> #include "internal.h" static struct kmem_cache *anon_vma_cachep; +static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { @@ -73,6 +75,16 @@ void anon_vma_free(struct anon_vma *anon_vma) kmem_cache_free(anon_vma_cachep, anon_vma); } +static inline struct anon_vma_chain *anon_vma_chain_alloc(void) +{ + return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); +} + +void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +{ + kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); +} + /** * anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question @@ -103,80 +115,183 @@ void anon_vma_free(struct anon_vma *anon_vma) int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *allocated; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_enomem; + anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) - return -ENOMEM; + goto out_enomem_free_avc; allocated = anon_vma; + /* + * This VMA had no anon_vma yet. This anon_vma is + * the root of any anon_vma tree that might form. + */ + anon_vma->root = anon_vma; } - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + avc->anon_vma = anon_vma; + avc->vma = vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); allocated = NULL; + avc = NULL; } spin_unlock(&mm->page_table_lock); + anon_vma_unlock(anon_vma); - spin_unlock(&anon_vma->lock); if (unlikely(allocated)) anon_vma_free(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); } return 0; + + out_enomem_free_avc: + anon_vma_chain_free(avc); + out_enomem: + return -ENOMEM; } -void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) +static void anon_vma_chain_link(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { - BUG_ON(vma->anon_vma != next->anon_vma); - list_del(&next->anon_vma_node); + avc->vma = vma; + avc->anon_vma = anon_vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + + anon_vma_lock(anon_vma); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_unlock(anon_vma); } -void __anon_vma_link(struct vm_area_struct *vma) +/* + * Attach the anon_vmas from src to dst. + * Returns 0 on success, -ENOMEM on failure. 
+ */ +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc, *pavc; - if (anon_vma) - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(); + if (!avc) + goto enomem_failure; + anon_vma_chain_link(dst, avc, pavc->anon_vma); + } + return 0; + + enomem_failure: + unlink_anon_vmas(dst); + return -ENOMEM; } -void anon_vma_link(struct vm_area_struct *vma) +/* + * Attach vma to its own anon_vma, as well as to the anon_vmas that + * the corresponding VMA in the parent process is attached to. + * Returns 0 on success, non-zero on failure. + */ +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; + struct anon_vma *anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); - } + /* Don't bother if the parent process has no anon_vma here. */ + if (!pvma->anon_vma) + return 0; + + /* + * First, attach the new VMA to the parent VMA's anon_vmas, + * so rmap can find non-COWed pages in child processes. + */ + if (anon_vma_clone(vma, pvma)) + return -ENOMEM; + + /* Then add our own anon_vma. */ + anon_vma = anon_vma_alloc(); + if (!anon_vma) + goto out_error; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_error_free_anon_vma; + + /* + * The root anon_vma's spinlock is the lock actually used when we + * lock any of the anon_vmas in this anon_vma tree. + */ + anon_vma->root = pvma->anon_vma->root; + /* + * With KSM refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned + * until this anon_vma is freed, because the lock lives in the root. + */ + get_anon_vma(anon_vma->root); + /* Mark this anon_vma as the one where our new (COWed) pages go. */ + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + + return 0; + + out_error_free_anon_vma: + anon_vma_free(anon_vma); + out_error: + unlink_anon_vmas(vma); + return -ENOMEM; } -void anon_vma_unlink(struct vm_area_struct *vma) +static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = anon_vma_chain->anon_vma; int empty; + /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ if (!anon_vma) return; - spin_lock(&anon_vma->lock); - list_del(&vma->anon_vma_node); + anon_vma_lock(anon_vma); + list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); - spin_unlock(&anon_vma->lock); + empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); + anon_vma_unlock(anon_vma); - if (empty) + if (empty) { + /* We no longer need the root anon_vma */ + if (anon_vma->root != anon_vma) + drop_anon_vma(anon_vma->root); anon_vma_free(anon_vma); + } +} + +void unlink_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. 
+ */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + anon_vma_unlink(avc); + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } } static void anon_vma_ctor(void *data) @@ -184,7 +299,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); - ksm_refcount_init(anon_vma); + anonvma_external_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } @@ -192,6 +307,7 @@ void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); } /* @@ -200,7 +316,7 @@ void __init anon_vma_init(void) */ struct anon_vma *page_lock_anon_vma(struct page *page) { - struct anon_vma *anon_vma; + struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; rcu_read_lock(); @@ -211,8 +327,21 @@ struct anon_vma *page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - return anon_vma; + root_anon_vma = ACCESS_ONCE(anon_vma->root); + spin_lock(&root_anon_vma->lock); + + /* + * If this page is still mapped, then its anon_vma cannot have been + * freed. But if it has been unmapped, we have no security against + * the anon_vma structure being freed and reused (for another anon_vma: + * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot + * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting + * anon_vma->root before page_unlock_anon_vma() is called to unlock. + */ + if (page_mapped(page)) + return anon_vma; + + spin_unlock(&root_anon_vma->lock); out: rcu_read_unlock(); return NULL; @@ -220,7 +349,7 @@ out: void page_unlock_anon_vma(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); rcu_read_unlock(); } @@ -235,6 +364,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); unsigned long address; + if (unlikely(is_vm_hugetlb_page(vma))) + pgoff = page->index << huge_page_order(page_hstate(page)); address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { /* page should be within @vma mapping range */ @@ -245,12 +376,18 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? - * checking that the page matches the vma. + * Caller should check the page is actually part of the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { if (PageAnon(page)) { - if (vma->anon_vma != page_anon_vma(page)) + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* + * Note: swapoff's unuse_vma() is more efficient with this + * check, and needs it to match anon_vma when KSM is active. 
+ */ + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || @@ -279,6 +416,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte_t *pte; spinlock_t *ptl; + if (unlikely(PageHuge(page))) { + pte = huge_pte_offset(mm, address); + ptl = &mm->page_table_lock; + goto check; + } + pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) return NULL; @@ -299,6 +442,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, } ptl = pte_lockptr(mm, pmd); +check: spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -396,7 +540,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int referenced = 0; anon_vma = page_lock_anon_vma(page); @@ -404,7 +548,8 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -511,9 +656,6 @@ int page_referenced(struct page *page, int referenced = 0; int we_locked = 0; - if (TestClearPageReferenced(page)) - referenced++; - *vm_flags = 0; if (page_mapped(page) && page_rmapping(page)) { if (!is_locked && (!PageAnon(page) || PageKsm(page))) { @@ -614,17 +756,63 @@ int page_mkclean(struct page *page) EXPORT_SYMBOL_GPL(page_mkclean); /** + * page_move_anon_rmap - move a page to our anon_vma + * @page: the page to move to our anon_vma + * @vma: the vma the page belongs to + * @address: the user virtual address mapped + * + * When a page belongs exclusively to one process after a COW event, + * that page can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling + * processes. + */ +void page_move_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!anon_vma); + VM_BUG_ON(page->index != linear_page_index(vma, address)); + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; +} + +/** * __page_set_anon_rmap - setup new anonymous rmap * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, int exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); + + /* + * If the page isn't exclusively mapped into this vma, + * we must use the _oldest_ possible anon_vma for the + * page mapping! + */ + if (!exclusive) { + if (PageAnon(page)) + return; + anon_vma = anon_vma->root; + } else { + /* + * In this case, swapped-out-but-not-discarded swap-cache + * is remapped. So, no need to update page->mapping here. + * We convice anon_vma poitned by page->mapping is not obsolete + * because vma->anon_vma is necessary to be a family of it. 
+ */ + if (PageAnon(page)) + return; + } + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; page->index = linear_page_index(vma, address); @@ -652,9 +840,7 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - struct anon_vma *anon_vma = vma->anon_vma; - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - BUG_ON(page->mapping != (struct address_space *)anon_vma); + BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); BUG_ON(page->index != linear_page_index(vma, address)); #endif } @@ -673,6 +859,17 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + do_page_add_anon_rmap(page, vma, address, 0); +} + +/* + * Special version of the above for do_swap_page, which often runs + * into pages that are exclusively owned by the current process. + * Everybody else should continue to use page_add_anon_rmap above. + */ +void do_page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ int first = atomic_inc_and_test(&page->_mapcount); if (first) __inc_zone_page_state(page, NR_ANON_PAGES); @@ -682,7 +879,7 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (first) - __page_set_anon_rmap(page, vma, address); + __page_set_anon_rmap(page, vma, address, exclusive); else __page_check_anon_rmap(page, vma, address); } @@ -704,7 +901,7 @@ void page_add_new_anon_rmap(struct page *page, SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ __inc_zone_page_state(page, NR_ANON_PAGES); - __page_set_anon_rmap(page, vma, address); + __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else @@ -748,6 +945,12 @@ void page_remove_rmap(struct page *page) page_clear_dirty(page); set_page_dirty(page); } + /* + * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED + * and not charged by memcg for now. 
+ */ + if (unlikely(PageHuge(page))) + return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, NR_ANON_PAGES); @@ -815,9 +1018,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { if (PageAnon(page)) - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { @@ -839,7 +1042,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration @@ -857,7 +1061,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); page_remove_rmap(page); page_cache_release(page); @@ -996,7 +1200,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); @@ -1005,6 +1209,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } +static bool is_vma_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method @@ -1024,15 +1242,30 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = SWAP_AGAIN; anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - unsigned long address = vma_address(page, vma); + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address; + + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && + is_vma_temporary_stack(vma)) + continue; + + address = vma_address(page, vma); if (address == -EFAULT) continue; ret = try_to_unmap_one(page, vma, address, flags); @@ -1213,6 +1446,42 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) +/* + * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root + * if necessary. Be careful to do all the tests under the lock. 
Once + * we know we are the last user, nobody else can get a reference and we + * can do the freeing without the lock. + */ +void drop_anon_vma(struct anon_vma *anon_vma) +{ + BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { + struct anon_vma *root = anon_vma->root; + int empty = list_empty(&anon_vma->head); + int last_root_user = 0; + int root_empty = 0; + + /* + * The refcount on a non-root anon_vma got dropped. Drop + * the refcount on the root and check if we need to free it. + */ + if (empty && anon_vma != root) { + BUG_ON(atomic_read(&root->external_refcount) <= 0); + last_root_user = atomic_dec_and_test(&root->external_refcount); + root_empty = list_empty(&root->head); + } + anon_vma_unlock(anon_vma); + + if (empty) { + anon_vma_free(anon_vma); + if (root_empty && last_root_user) + anon_vma_free(root); + } + } +} +#endif + #ifdef CONFIG_MIGRATION /* * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): @@ -1222,22 +1491,21 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = SWAP_AGAIN; /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma() * because that depends on page_mapped(); but not all its usages - * are holding mmap_sem, which also gave the necessary guarantee - * (that this anon_vma's slab has not already been destroyed). - * This needs to be reviewed later: avoiding page_lock_anon_vma() - * is risky, and currently limits the usefulness of rmap_walk(). + * are holding mmap_sem. Users without mmap_sem are required to + * take a reference count to prevent the anon_vma disappearing */ anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + anon_vma_lock(anon_vma); + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1245,7 +1513,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); return ret; } @@ -1291,3 +1559,49 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, return rmap_walk_file(page, rmap_one, arg); } #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_HUGETLB_PAGE +/* + * The following three functions are for anonymous (private mapped) hugepages. + * Unlike common anonymous pages, anonymous hugepages have no accounting code + * and no lru code, because we handle hugepages differently from common pages. 
+ */ +static void __hugepage_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + + if (PageAnon(page)) + return; + if (!exclusive) + anon_vma = anon_vma->root; + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + page->index = linear_page_index(vma, address); +} + +void hugepage_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int first; + + BUG_ON(!PageLocked(page)); + BUG_ON(!anon_vma); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + first = atomic_inc_and_test(&page->_mapcount); + if (first) + __hugepage_set_anon_rmap(page, vma, address, 0); +} + +void hugepage_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(&page->_mapcount, 0); + __hugepage_set_anon_rmap(page, vma, address, 1); +} +#endif /* CONFIG_HUGETLB_PAGE */ @@ -28,6 +28,7 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/percpu_counter.h> #include <linux/swap.h> static struct vfsmount *shm_mnt; @@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks += pages; + percpu_counter_add(&sbinfo->used_blocks, -pages); + spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } } @@ -416,25 +417,21 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long if (sgp == SGP_READ) return shmem_swp_map(ZERO_PAGE(0)); /* - * Test free_blocks against 1 not 0, since we have 1 data + * Test used_blocks against 1 less max_blocks, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: * a waste to allocate index if we cannot allocate data. */ if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks <= 1) { - spin_unlock(&sbinfo->stat_lock); + if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) return ERR_PTR(-ENOSPC); - } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); - if (page) - set_page_private(page, 0); spin_lock(&info->lock); if (!page) { @@ -729,10 +726,11 @@ done2: if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { /* * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since vmtruncate or - * generic_delete_inode did it, before we lowered next_index. - * Also, though shmem_getpage checks i_size before adding to - * cache, no recheck after: so fix the narrow window there too. + * may have swizzled a page in from swap since + * truncate_pagecache or generic_delete_inode did it, before we + * lowered next_index. Also, though shmem_getpage checks + * i_size before adding to cache, no recheck after: so fix the + * narrow window there too. 
* * Recalling truncate_inode_pages_range and unmap_mapping_range * every time for punch_hole (which never got a chance to clear @@ -762,19 +760,21 @@ done2: } } -static void shmem_truncate(struct inode *inode) -{ - shmem_truncate_range(inode, inode->i_size, (loff_t)-1); -} - static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - struct page *page = NULL; + loff_t newsize = attr->ia_size; int error; - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size < inode->i_size) { + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) + && newsize != inode->i_size) { + struct page *page = NULL; + + if (newsize < inode->i_size) { /* * If truncating down to a partial page, then * if that page is already allocated, hold it @@ -782,9 +782,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * truncate_partial_page cannnot miss it were * it assigned to swap. */ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (newsize & (PAGE_CACHE_SIZE-1)) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + newsize >> PAGE_CACHE_SHIFT, &page, SGP_READ, NULL); if (page) unlock_page(page); @@ -796,36 +796,38 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * if it's being fully truncated to zero-length: the * nrpages check is efficient enough in that case. */ - if (attr->ia_size) { + if (newsize) { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); info->flags &= ~SHMEM_PAGEIN; spin_unlock(&info->lock); } } + + /* XXX(truncate): truncate_setsize should be called last */ + truncate_setsize(inode, newsize); + if (page) + page_cache_release(page); + shmem_truncate_range(inode, newsize, (loff_t)-1); } - error = inode_change_ok(inode, attr); - if (!error) - error = inode_setattr(inode, attr); + setattr_copy(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL - if (!error && (attr->ia_valid & ATTR_MODE)) + if (attr->ia_valid & ATTR_MODE) error = generic_acl_chmod(inode); #endif - if (page) - page_cache_release(page); return error; } -static void shmem_delete_inode(struct inode *inode) +static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - if (inode->i_op->truncate == shmem_truncate) { + if (inode->i_mapping->a_ops == &shmem_aops) { truncate_inode_pages(inode->i_mapping, 0); shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; - shmem_truncate(inode); + shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->swaplist)) { mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); @@ -834,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode) } BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); - clear_inode(inode); + end_writeback(inode); } static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) @@ -931,7 +933,7 @@ found: /* * Move _head_ to start search for next from here. - * But be careful: shmem_delete_inode checks list_empty without taking + * But be careful: shmem_evict_inode checks list_empty without taking * mutex, and there's an instant in list_move_tail when info->swaplist * would appear empty, if it were the only one on shmem_swaplist. We * could avoid doing it if inode NULL; or use this minor optimization. 
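The rmap.c hunks above replace the per-VMA anon_vma_node list with anon_vma_chain links, so every reverse-map walker now iterates chain entries and follows avc->vma. A hedged sketch of that traversal pattern, modeled on the rmap_walk_anon() and try_to_unmap_anon() hunks, is shown here; my_walk_one() is an invented callback, and vma_address() is the static helper defined earlier in mm/rmap.c, so this would have to live in that file.

static int my_rmap_walk_anon(struct page *page,
                             int (*my_walk_one)(struct page *page,
                                                struct vm_area_struct *vma,
                                                unsigned long address))
{
        struct anon_vma *anon_vma;
        struct anon_vma_chain *avc;
        int ret = SWAP_AGAIN;

        /* Takes the root anon_vma's lock, per the ->root convention above. */
        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return ret;

        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);

                if (address == -EFAULT)
                        continue;
                ret = my_walk_one(page, vma, address);
                if (ret != SWAP_AGAIN)
                        break;
        }

        page_unlock_anon_vma(anon_vma);
        return ret;
}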
@@ -1221,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct shmem_sb_info *sbinfo; struct page *filepage = *pagep; struct page *swappage; + struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; gfp_t gfp; @@ -1245,7 +1248,6 @@ repeat: filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; - error = 0; gfp = mapping_gfp_mask(mapping); if (!filepage) { /* @@ -1256,7 +1258,19 @@ repeat: if (error) goto failed; radix_tree_preload_end(); + if (sgp != SGP_READ && !prealloc_page) { + /* We don't care if this fails */ + prealloc_page = shmem_alloc_page(gfp, info, idx); + if (prealloc_page) { + if (mem_cgroup_cache_charge(prealloc_page, + current->mm, GFP_KERNEL)) { + page_cache_release(prealloc_page); + prealloc_page = NULL; + } + } + } } + error = 0; spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1385,17 +1399,16 @@ repeat: shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks == 0 || + if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || shmem_acct_block(info->flags)) { - spin_unlock(&sbinfo->stat_lock); spin_unlock(&info->lock); error = -ENOSPC; goto failed; } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; @@ -1405,28 +1418,38 @@ repeat: if (!filepage) { int ret; - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); + if (!prealloc_page) { + spin_unlock(&info->lock); + filepage = shmem_alloc_page(gfp, info, idx); + if (!filepage) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + error = -ENOMEM; + goto failed; + } + SetPageSwapBacked(filepage); - /* Precharge page while we can wait, compensate after */ - error = mem_cgroup_cache_charge(filepage, current->mm, - GFP_KERNEL); - if (error) { - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - goto failed; + /* + * Precharge page while we can wait, compensate + * after + */ + error = mem_cgroup_cache_charge(filepage, + current->mm, GFP_KERNEL); + if (error) { + page_cache_release(filepage); + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + filepage = NULL; + goto failed; + } + + spin_lock(&info->lock); + } else { + filepage = prealloc_page; + prealloc_page = NULL; + SetPageSwapBacked(filepage); } - spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) error = PTR_ERR(entry); @@ -1467,13 +1490,19 @@ repeat: } done: *pagep = filepage; - return 0; + error = 0; + goto out; failed: if (*pagep != filepage) { unlock_page(filepage); page_cache_release(filepage); } +out: + if (prealloc_page) { + mem_cgroup_uncharge_cache_page(prealloc_page); + page_cache_release(prealloc_page); + } return error; } @@ -1545,8 +1574,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, int mode, - dev_t dev, unsigned long flags) +static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, + int 
mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -1557,9 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, inode = new_inode(sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -1791,17 +1818,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; - spin_lock(&sbinfo->stat_lock); if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_bavail = buf->f_bfree = + sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ - spin_unlock(&sbinfo->stat_lock); return 0; } @@ -1814,7 +1840,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, NULL, NULL); @@ -1833,11 +1859,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) #else error = 0; #endif - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -1957,7 +1978,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); if (!inode) return -ENOSPC; @@ -1992,8 +2013,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s unlock_page(page); page_cache_release(page); } - if (dir->i_mode & S_ISGID) - inode->i_gid = dir->i_gid; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -2033,7 +2052,6 @@ static const struct inode_operations shmem_symlink_inline_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { - .truncate = shmem_truncate, .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, @@ -2071,14 +2089,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name, size, flags); } -static struct xattr_handler shmem_xattr_security_handler = { +static const struct xattr_handler shmem_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = shmem_xattr_security_list, .get = shmem_xattr_security_get, .set = shmem_xattr_security_set, }; -static struct xattr_handler *shmem_xattr_handlers[] = { +static const struct xattr_handler *shmem_xattr_handlers[] = { &generic_acl_access_handler, &generic_acl_default_handler, &shmem_xattr_security_handler, @@ -2250,7 +2268,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info config = *sbinfo; - unsigned long 
blocks; unsigned long inodes; int error = -EINVAL; @@ -2258,9 +2275,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) return error; spin_lock(&sbinfo->stat_lock); - blocks = sbinfo->max_blocks - sbinfo->free_blocks; inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (config.max_blocks < blocks) + if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) goto out; if (config.max_inodes < inodes) goto out; @@ -2277,7 +2293,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) error = 0; sbinfo->max_blocks = config.max_blocks; - sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -2310,7 +2325,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) static void shmem_put_super(struct super_block *sb) { - kfree(sb->s_fs_info); + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + percpu_counter_destroy(&sbinfo->used_blocks); + kfree(sbinfo); sb->s_fs_info = NULL; } @@ -2352,7 +2370,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - sbinfo->free_blocks = sbinfo->max_blocks; + if (percpu_counter_init(&sbinfo->used_blocks, 0)) + goto failed; sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = SHMEM_MAX_BYTES; @@ -2366,7 +2385,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_POSIXACL; #endif - inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -2444,14 +2463,13 @@ static const struct file_operations shmem_file_operations = { .write = do_sync_write, .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, #endif }; static const struct inode_operations shmem_inode_operations = { - .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_POSIX_ACL @@ -2505,7 +2523,7 @@ static const struct super_operations shmem_ops = { .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif - .delete_inode = shmem_delete_inode, + .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, }; @@ -2570,6 +2588,45 @@ out4: return error; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. 
+ */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + swp_entry_t entry = { .val = 0 }, *ptr; + struct page *page = NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, pgoff, NULL); +#ifdef CONFIG_SWAP + if (ptr && ptr->val) { + entry.val = ptr->val; + page = find_get_page(&swapper_space, entry.val); + } else +#endif + page = find_get_page(inode->i_mapping, pgoff); + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); +out: + *pagep = page; + *ent = entry; +} +#endif + #else /* !CONFIG_SHMEM */ /* @@ -2609,9 +2666,34 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + struct page *page = NULL; + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + page = find_get_page(inode->i_mapping, pgoff); +out: + *pagep = page; + *ent = (swp_entry_t){ .val = 0 }; +} +#endif + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) +#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE @@ -2655,7 +2737,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags path.mnt = mntget(shm_mnt); error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); + inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto put_dentry; @@ -102,7 +102,6 @@ #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> -#include <linux/kmemtrace.h> #include <linux/rcupdate.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -115,6 +114,7 @@ #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> #include <linux/kmemcheck.h> +#include <linux/memory.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -144,30 +144,6 @@ #define BYTES_PER_WORD sizeof(void *) #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) -#ifndef ARCH_KMALLOC_MINALIGN -/* - * Enforce a minimum alignment for the kmalloc caches. - * Usually, the kmalloc caches are cache_line_size() aligned, except when - * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. - * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than the alignment of a 64-bit integer. - * ARCH_KMALLOC_MINALIGN allows that. - * Note that increasing this value may disable some debug features. - */ -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -/* - * Enforce a minimum alignment for all caches. 
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD - * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. - * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables - * some debug features. - */ -#define ARCH_SLAB_MINALIGN 0 -#endif - #ifndef ARCH_KMALLOC_FLAGS #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif @@ -418,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -844,7 +820,7 @@ static void init_reap_node(int cpu) { int node; - node = next_node(cpu_to_node(cpu), node_online_map); + node = next_node(cpu_to_mem(cpu), node_online_map); if (node == MAX_NUMNODES) node = first_node(node_online_map); @@ -884,7 +860,7 @@ static void __cpuinit start_cpu_timer(int cpu) */ if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_DELAYED_WORK(reap_work, cache_reap); + INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, __round_jiffies_relative(HZ, cpu)); } @@ -935,7 +911,6 @@ static int transfer_objects(struct array_cache *to, from->avail -= nr; to->avail += nr; - to->touched = 1; return nr; } @@ -983,13 +958,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, gfp, node); + ac_ptr = kzalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { - if (i == node || !node_online(i)) { - ac_ptr[i] = NULL; + if (i == node || !node_online(i)) continue; - } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) @@ -1076,7 +1049,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = numa_mem_id(); /* * Make sure we are not freeing a object from another node to the array @@ -1105,11 +1078,57 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } #endif +/* + * Allocates and initializes nodelists for a node on each slab cache, used for + * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 + * will be allocated off-node since memory is not yet online for the new node. + * When hotplugging memory or a cpu, existing nodelists are not replaced if + * already in use. + * + * Must hold cache_chain_mutex. + */ +static int init_cache_nodelists_node(int node) +{ + struct kmem_cache *cachep; + struct kmem_list3 *l3; + const int memsize = sizeof(struct kmem_list3); + + list_for_each_entry(cachep, &cache_chain, next) { + /* + * Set up the size64 kmemlist for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + if (!cachep->nodelists[node]) { + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) + return -ENOMEM; + kmem_list3_init(l3); + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. 
+ */ + cachep->nodelists[node] = l3; + } + + spin_lock_irq(&cachep->nodelists[node]->list_lock); + cachep->nodelists[node]->free_limit = + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&cachep->nodelists[node]->list_lock); + } + return 0; +} + static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { @@ -1174,8 +1193,8 @@ static int __cpuinit cpuup_prepare(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); - const int memsize = sizeof(struct kmem_list3); + int node = cpu_to_mem(cpu); + int err; /* * We need to do this right in the beginning since @@ -1183,35 +1202,9 @@ static int __cpuinit cpuup_prepare(long cpu) * kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ - - list_for_each_entry(cachep, &cache_chain, next) { - /* - * Set up the size64 kmemlist for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - if (!cachep->nodelists[node]) { - l3 = kmalloc_node(memsize, GFP_KERNEL, node); - if (!l3) - goto bad; - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - /* - * The l3s don't come and go as CPUs come and - * go. cache_chain_mutex is sufficient - * protection here. - */ - cachep->nodelists[node] = l3; - } - - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); - } + err = init_cache_nodelists_node(node); + if (err < 0) + goto bad; /* * Now we can go ahead with allocating the shared arrays and @@ -1327,18 +1320,82 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, mutex_unlock(&cache_chain_mutex); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpucache_notifier = { &cpuup_callback, NULL, 0 }; +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +/* + * Drains freelist for a node on each slab cache, used for memory hot-remove. + * Returns -EBUSY if all objects cannot be drained so that the node is not + * removed. + * + * Must hold cache_chain_mutex. 
+ */ +static int __meminit drain_cache_nodelists_node(int node) +{ + struct kmem_cache *cachep; + int ret = 0; + + list_for_each_entry(cachep, &cache_chain, next) { + struct kmem_list3 *l3; + + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + drain_freelist(cachep, l3, l3->free_objects); + + if (!list_empty(&l3->slabs_full) || + !list_empty(&l3->slabs_partial)) { + ret = -EBUSY; + break; + } + } + return ret; +} + +static int __meminit slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int ret = 0; + int nid; + + nid = mnb->status_change_nid; + if (nid < 0) + goto out; + + switch (action) { + case MEM_GOING_ONLINE: + mutex_lock(&cache_chain_mutex); + ret = init_cache_nodelists_node(nid); + mutex_unlock(&cache_chain_mutex); + break; + case MEM_GOING_OFFLINE: + mutex_lock(&cache_chain_mutex); + ret = drain_cache_nodelists_node(nid); + mutex_unlock(&cache_chain_mutex); + break; + case MEM_ONLINE: + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } +out: + return ret ? notifier_from_errno(ret) : NOTIFY_OK; +} +#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ + /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, - int nodeid) +static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) { struct kmem_list3 *ptr; @@ -1421,7 +1478,7 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ - node = numa_node_id(); + node = numa_mem_id(); /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); @@ -1583,6 +1640,14 @@ void __init kmem_cache_init_late(void) */ register_cpu_notifier(&cpucache_notifier); +#ifdef CONFIG_NUMA + /* + * Register a memory hotplug callback that initializes and frees + * nodelists. + */ + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +#endif + /* * The reap timers are started later, with a module init call: That part * of the kernel is not yet operational. @@ -2055,7 +2120,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } } - cachep->nodelists[numa_node_id()]->next_reap = + cachep->nodelists[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; @@ -2223,8 +2288,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < align) { ralign = align; } - /* disable debug if necessary */ - if (ralign > __alignof__(unsigned long long)) + /* disable debug if not aligning with REDZONE_ALIGN */ + if (ralign & (__alignof__(unsigned long long) - 1)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. 
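[Editorial aside, not part of the commit: the hunk just above replaces the old test "ralign > __alignof__(unsigned long long)" with a bitmask test, so red-zone debugging is only dropped when the requested alignment is not a multiple of the 8-byte red-zone word, rather than for every alignment larger than 8. The userspace sketch below is hypothetical and only contrasts the two predicates; it assumes __alignof__(unsigned long long) == 8, as on typical 64-bit builds, and the function names are invented for the demo.]

#include <stdio.h>

/*
 * Hypothetical userspace demo, not kernel code: contrast the old and the
 * new predicate that kmem_cache_create() uses to decide whether red-zone
 * debugging has to be dropped for a given object alignment.
 */
static int old_disables_debug(unsigned long ralign)
{
	/* old test: any alignment larger than the 8-byte word disables debug */
	return ralign > __alignof__(unsigned long long);
}

static int new_disables_debug(unsigned long ralign)
{
	/* new test: only alignments that are not a multiple of 8 disable debug */
	return (ralign & (__alignof__(unsigned long long) - 1)) != 0;
}

int main(void)
{
	unsigned long aligns[] = { 8, 16, 32, 64, 12 };
	unsigned int i;

	for (i = 0; i < sizeof(aligns) / sizeof(aligns[0]); i++)
		printf("align %2lu: old=%d new=%d\n", aligns[i],
		       old_disables_debug(aligns[i]),
		       new_disables_debug(aligns[i]));
	return 0;
}

[Running it shows the old test disabling red zoning and user tracking for alignments of 16, 32 and 64 even though they hold the 8-byte red-zone words exactly, while the new test only rejects values such as 12 that are not multiples of REDZONE_ALIGN; this is also why the next hunk sizes the red zone by align instead of a fixed 2 * sizeof(unsigned long long).]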
@@ -2250,8 +2315,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += sizeof(unsigned long long); - size += 2 * sizeof(unsigned long long); + cachep->obj_offset += align; + size += align + sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of @@ -2265,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - size; + && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } #endif @@ -2386,7 +2451,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); + assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); #endif } @@ -2413,7 +2478,7 @@ static void do_drain(void *arg) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_node_id(); + int node = numa_mem_id(); check_irq_off(); ac = cpu_cache_get(cachep); @@ -2946,7 +3011,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) retry: check_irq_off(); - node = numa_node_id(); + node = numa_mem_id(); ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2963,8 +3028,10 @@ retry: spin_lock(&l3->list_lock); /* See if we can refill from the shared array */ - if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { + l3->shared->touched = 1; goto alloc_done; + } while (batchcount > 0) { struct list_head *entry; @@ -3101,7 +3168,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) if (cachep == &cache_cache) return false; - return should_failslab(obj_size(cachep), flags); + return should_failslab(obj_size(cachep), flags, cachep->flags); } static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) @@ -3148,11 +3215,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; - nid_alloc = nid_here = numa_node_id(); + nid_alloc = nid_here = numa_mem_id(); + get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) - nid_alloc = cpuset_mem_spread_node(); + nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3179,6 +3248,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3210,7 +3280,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_node_id()); + obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3234,6 +3304,7 @@ retry: } } } + put_mems_allowed(); return obj; } @@ -3317,6 
+3388,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, { unsigned long save_flags; void *ptr; + int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; @@ -3329,7 +3401,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_save(save_flags); if (nodeid == -1) - nodeid = numa_node_id(); + nodeid = slab_node; if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ @@ -3337,7 +3409,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == slab_node) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback @@ -3381,8 +3453,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_mem_id()); out: return objp; @@ -3479,7 +3551,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); batchcount = ac->batchcount; #if DEBUG @@ -3603,21 +3675,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace); */ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) { - unsigned long addr = (unsigned long)ptr; - unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = BYTES_PER_WORD - 1; unsigned long size = cachep->buffer_size; struct page *page; - if (unlikely(addr < min_addr)) - goto out; - if (unlikely(addr > (unsigned long)high_memory - size)) - goto out; - if (unlikely(addr & align_mask)) - goto out; - if (unlikely(!kern_addr_valid(addr))) - goto out; - if (unlikely(!kern_addr_valid(addr + size - 1))) + if (unlikely(!kern_ptr_validate(ptr, size))) goto out; page = virt_to_page(ptr); if (unlikely(!PageSlab(page))) @@ -3924,7 +3985,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return -ENOMEM; for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) @@ -3946,9 +4007,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); + spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4054,7 +4115,7 @@ static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) @@ -4228,10 +4289,11 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, - reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu 
" + "%4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees, overflows); } /* cpu stats */ { @@ -66,8 +66,10 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> -#include <linux/kmemtrace.h> #include <linux/kmemleak.h> + +#include <trace/events/kmem.h> + #include <asm/atomic.h> /* @@ -394,6 +396,7 @@ static void slob_free(void *block, int size) slob_t *prev, *next, *b = (slob_t *)block; slobidx_t units; unsigned long flags; + struct list_head *slob_list; if (unlikely(ZERO_OR_NULL_PTR(block))) return; @@ -422,7 +425,13 @@ static void slob_free(void *block, int size) set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); - set_slob_page_free(sp, &free_slob_small); + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + set_slob_page_free(sp, slob_list); goto out; } @@ -467,14 +476,6 @@ out: * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) -#endif - void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; @@ -647,7 +648,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); - INIT_RCU_HEAD(&slob_rcu->head); slob_rcu->size = c->size; call_rcu(&slob_rcu->head, kmem_rcu_free); } else { @@ -17,7 +17,6 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/kmemtrace.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -107,11 +106,17 @@ * the fast path and disables lockless freelists. 
*/ +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) + +static inline int kmem_cache_debug(struct kmem_cache *s) +{ #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG 1 + return unlikely(s->flags & SLAB_DEBUG_FLAGS); #else -#define SLABDEBUG 0 + return 0; #endif +} /* * Issues still to be resolved: @@ -151,26 +156,19 @@ * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA | SLAB_NOTRACK) -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif - #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ -#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ +#define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */ static int kmem_size = sizeof(struct kmem_cache); @@ -217,10 +215,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) #endif -static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) +static inline void stat(struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - c->stat[si]++; + __this_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -242,15 +240,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) -{ -#ifdef CONFIG_SMP - return s->cpu_slab[cpu]; -#else - return &s->cpu_slab; -#endif -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -269,13 +258,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -/* - * Slow version of get and set free pointer. - * - * This version requires touching the cache lines of kmem_cache which - * we avoid to do in the fast alloc free paths. There we obtain the offset - * from the page struct. - */ static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -1020,6 +1002,9 @@ static int __init setup_slub_debug(char *str) case 't': slub_debug |= SLAB_TRACE; break; + case 'a': + slub_debug |= SLAB_FAILSLAB; + break; default: printk(KERN_ERR "slub_debug option '%c' " "unknown. 
skipped\n", *str); @@ -1093,10 +1078,10 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, flags |= __GFP_NOTRACK; - if (node == -1) + if (node == NUMA_NO_NODE) return alloc_pages(flags, order); else - return alloc_pages_node(node, flags, order); + return alloc_pages_exact_node(node, flags, order); } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1124,7 +1109,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; - stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + stat(s, ORDER_FALLBACK); } if (kmemcheck_enabled @@ -1177,9 +1162,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_TRACE)) - __SetPageSlubDebug(page); start = page_address(page); @@ -1206,14 +1188,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (unlikely(SLABDEBUG && PageSlubDebug(page))) { + if (kmem_cache_debug(s)) { void *p; slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) check_object(s, page, p, 0); - __ClearPageSlubDebug(page); } kmemcheck_free_shadow(page, compound_order(page)); @@ -1380,6 +1361,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1389,10 +1371,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } @@ -1403,10 +1388,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int searchnode = (node == -1) ? numa_node_id() : node; + int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; page = get_partial_node(get_node(s, searchnode)); - if (page || (flags & __GFP_THISNODE)) + if (page || node != -1) return page; return get_any_partial(s, flags); @@ -1422,23 +1407,21 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); __ClearPageSlubFrozen(page); if (page->inuse) { if (page->freelist) { add_partial(n, page, tail); - stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { - stat(c, DEACTIVATE_FULL); - if (SLABDEBUG && PageSlubDebug(page) && - (s->flags & SLAB_STORE_USER)) + stat(s, DEACTIVATE_FULL); + if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); } else { - stat(c, DEACTIVATE_EMPTY); + stat(s, DEACTIVATE_EMPTY); if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order @@ -1454,7 +1437,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { slab_unlock(page); - stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); + stat(s, FREE_SLAB); discard_slab(s, page); } } @@ -1469,7 +1452,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) int tail = 1; if (page->freelist) - stat(c, DEACTIVATE_REMOTE_FREES); + stat(s, DEACTIVATE_REMOTE_FREES); /* * Merge cpu freelist into slab freelist. Typically we get here * because both freelists are empty. So this is unlikely @@ -1482,10 +1465,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) /* Retrieve object from cpu_freelist */ object = c->freelist; - c->freelist = c->freelist[c->offset]; + c->freelist = get_freepointer(s, c->freelist); /* And put onto the regular freelist */ - object[c->offset] = page->freelist; + set_freepointer(s, object, page->freelist); page->freelist = object; page->inuse--; } @@ -1495,7 +1478,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(c, CPUSLAB_FLUSH); + stat(s, CPUSLAB_FLUSH); slab_lock(c->page); deactivate_slab(s, c); } @@ -1507,7 +1490,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (likely(c && c->page)) flush_slab(s, c); @@ -1532,7 +1515,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct kmem_cache_cpu *c, int node) { #ifdef CONFIG_NUMA - if (node != -1 && c->node != node) + if (node != NUMA_NO_NODE && c->node != node) return 0; #endif return 1; @@ -1635,22 +1618,22 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!node_match(c, node))) goto another_slab; - stat(c, ALLOC_REFILL); + stat(s, ALLOC_REFILL); load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) + if (kmem_cache_debug(s)) goto debug; - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); - stat(c, ALLOC_SLOWPATH); + stat(s, ALLOC_SLOWPATH); return object; another_slab: @@ -1660,7 +1643,7 @@ new_slab: new = get_partial(s, gfpflags, node); if (new) { c->page = new; - stat(c, ALLOC_FROM_PARTIAL); + stat(s, ALLOC_FROM_PARTIAL); goto load_freelist; } @@ -1673,8 +1656,8 @@ new_slab: local_irq_disable(); if (new) { - c = get_cpu_slab(s, smp_processor_id()); - stat(c, ALLOC_SLAB); + c = __this_cpu_ptr(s->cpu_slab); + stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); slab_lock(new); @@ -1690,7 +1673,7 @@ debug: goto another_slab; c->page->inuse++; - c->page->freelist = object[c->offset]; + c->page->freelist = get_freepointer(s, object); c->node = 
-1; goto unlock_out; } @@ -1711,42 +1694,40 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned int objsize; gfpflags &= gfp_allowed_mask; lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); - if (should_failslab(s->objsize, gfpflags)) + if (should_failslab(s->objsize, gfpflags, s->flags)) return NULL; local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - objsize = c->objsize; - if (unlikely(!c->freelist || !node_match(c, node))) + c = __this_cpu_ptr(s->cpu_slab); + object = c->freelist; + if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = c->freelist; - c->freelist = object[c->offset]; - stat(c, ALLOC_FASTPATH); + c->freelist = get_freepointer(s, object); + stat(s, ALLOC_FASTPATH); } local_irq_restore(flags); if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, objsize); + memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); - kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); @@ -1757,7 +1738,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc_notrace); #endif @@ -1794,26 +1775,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr, unsigned int offset) + void *x, unsigned long addr) { void *prior; void **object = (void *)x; - struct kmem_cache_cpu *c; - c = get_cpu_slab(s, raw_smp_processor_id()); - stat(c, FREE_SLOWPATH); + stat(s, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SLABDEBUG && PageSlubDebug(page))) + if (kmem_cache_debug(s)) goto debug; checks_ok: - prior = object[offset] = page->freelist; + prior = page->freelist; + set_freepointer(s, object, prior); page->freelist = object; page->inuse--; if (unlikely(PageSlubFrozen(page))) { - stat(c, FREE_FROZEN); + stat(s, FREE_FROZEN); goto out_unlock; } @@ -1826,7 +1806,7 @@ checks_ok: */ if (unlikely(!prior)) { add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(c, FREE_ADD_PARTIAL); + stat(s, FREE_ADD_PARTIAL); } out_unlock: @@ -1839,10 +1819,10 @@ slab_empty: * Slab still on the partial list. 
*/ remove_partial(s, page); - stat(c, FREE_REMOVE_PARTIAL); + stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); - stat(c, FREE_SLAB); + stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1872,17 +1852,17 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - kmemcheck_slab_free(s, object, c->objsize); - debug_check_no_locks_freed(object, c->objsize); + c = __this_cpu_ptr(s->cpu_slab); + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, c->objsize); + debug_check_no_obj_freed(object, s->objsize); if (likely(page == c->page && c->node >= 0)) { - object[c->offset] = c->freelist; + set_freepointer(s, object, c->freelist); c->freelist = object; - stat(c, FREE_FASTPATH); + stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr, c->offset); + __slab_free(s, page, x, addr); local_irq_restore(flags); } @@ -2069,19 +2049,6 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static void init_kmem_cache_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - c->page = NULL; - c->freelist = NULL; - c->node = 0; - c->offset = s->offset / sizeof(void *); - c->objsize = s->objsize; -#ifdef CONFIG_SLUB_STATS - memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); -#endif -} - static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { @@ -2095,131 +2062,25 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -#ifdef CONFIG_SMP -/* - * Per cpu array for per cpu structures. - * - * The per cpu array places all kmem_cache_cpu structures from one processor - * close together meaning that it becomes possible that multiple per cpu - * structures are contained in one cacheline. This may be particularly - * beneficial for the kmalloc caches. - * - * A desktop system typically has around 60-80 slabs. With 100 here we are - * likely able to get per cpu structures for all caches from the array defined - * here. We must be able to cover all kmalloc caches during bootstrap. - * - * If the per cpu array is exhausted then fall back to kmalloc - * of individual cachelines. No sharing is possible then. 
- */ -#define NR_KMEM_CACHE_CPU 100 - -static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], - kmem_cache_cpu); - -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); - -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, - int cpu, gfp_t flags) -{ - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); - - if (c) - per_cpu(kmem_cache_cpu_free, cpu) = - (void *)c->freelist; - else { - /* Table overflow: So allocate ourselves */ - c = kmalloc_node( - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), - flags, cpu_to_node(cpu)); - if (!c) - return NULL; - } - - init_kmem_cache_cpu(s, c); - return c; -} - -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) -{ - if (c < per_cpu(kmem_cache_cpu, cpu) || - c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { - kfree(c); - return; - } - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); - per_cpu(kmem_cache_cpu_free, cpu) = c; -} - -static void free_kmem_cache_cpus(struct kmem_cache *s) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) { - s->cpu_slab[cpu] = NULL; - free_kmem_cache_cpu(c, cpu); - } - } -} +static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); -static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) + /* + * Boot time creation of the kmalloc array. Use static per cpu data + * since the per cpu allocator is not available yet. + */ + s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); + else + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); - if (c) - continue; + if (!s->cpu_slab) + return 0; - c = alloc_kmem_cache_cpu(s, cpu, flags); - if (!c) { - free_kmem_cache_cpus(s); - return 0; - } - s->cpu_slab[cpu] = c; - } return 1; } -/* - * Initialize the per cpu array. - */ -static void init_alloc_cpu_cpu(int cpu) -{ - int i; - - if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) - return; - - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - - cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); -} - -static void __init init_alloc_cpu(void) -{ - int cpu; - - for_each_online_cpu(cpu) - init_alloc_cpu_cpu(cpu); - } - -#else -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} -static inline void init_alloc_cpu(void) {} - -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - init_kmem_cache_cpu(s, &s->cpu_slab); - return 1; -} -#endif - #ifdef CONFIG_NUMA /* * No kmalloc_node yet so do it by hand. 
We know that this is the first @@ -2276,7 +2137,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; - if (n && n != &s->local_node) + if (n) kmem_cache_free(kmalloc_caches, n); s->node[node] = NULL; } @@ -2285,32 +2146,22 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { int node; - int local_node; - - if (slab_state >= UP) - local_node = page_to_nid(virt_to_page(s)); - else - local_node = 0; for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n; - if (local_node == node) - n = &s->local_node; - else { - if (slab_state == DOWN) { - early_kmem_cache_node_alloc(gfpflags, node); - continue; - } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); - - if (!n) { - free_kmem_cache_nodes(s); - return 0; - } + if (slab_state == DOWN) { + early_kmem_cache_node_alloc(gfpflags, node); + continue; + } + n = kmem_cache_alloc_node(kmalloc_caches, + gfpflags, node); + if (!n) { + free_kmem_cache_nodes(s); + return 0; } + s->node[node] = n; init_kmem_cache_node(n, s); } @@ -2502,6 +2353,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) @@ -2519,6 +2371,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) { struct page *page; + if (!kern_ptr_validate(object, s->size)) + return 0; + page = get_object_page(object); if (!page || s != page->slab) @@ -2559,9 +2414,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - DECLARE_BITMAP(map, page->objects); + long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), + GFP_ATOMIC); - bitmap_zero(map, page->objects); + if (!map) + return; slab_err(s, page, "%s", text); slab_lock(page); for_each_free_object(p, s, page->freelist) @@ -2576,6 +2433,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); + kfree(map); #endif } @@ -2609,9 +2467,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - + free_percpu(s->cpu_slab); /* Attempt to free all objects */ - free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2633,7 +2490,6 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); - up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -2642,8 +2498,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } else - up_write(&slub_lock); + } + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2651,7 +2507,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2741,6 +2597,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) char *text; size_t realsize; unsigned long slabflags; + int i; s = kmalloc_caches_dma[index]; if (s) @@ -2760,7 
+2617,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) realsize = kmalloc_caches[index].objsize; text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize); - s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + s = NULL; + for (i = 0; i < KMALLOC_CACHES; i++) + if (!kmalloc_caches[i].size) + break; + + BUG_ON(i >= KMALLOC_CACHES); + s = kmalloc_caches + i; /* * Must defer sysfs creation to a workqueue because we don't know @@ -2772,9 +2636,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) if (slab_state >= SYSFS) slabflags |= __SYSFS_ADD_DEFERRED; - if (!s || !text || !kmem_cache_open(s, flags, text, + if (!text || !kmem_cache_open(s, flags, text, realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - kfree(s); + s->size = 0; kfree(text); goto unlock_out; } @@ -2863,7 +2727,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -3086,7 +2950,7 @@ static void slab_mem_offline_callback(void *arg) /* * if n->nr_slabs > 0, slabs still exist on the node * that is going down. We were unable to free them, - * and offline_pages() function shoudn't call this + * and offline_pages() function shouldn't call this * callback. So, we must fail. */ BUG_ON(slabs_node(s, offline_node)); @@ -3176,8 +3040,6 @@ void __init kmem_cache_init(void) int i; int caches = 0; - init_alloc_cpu(); - #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the @@ -3255,14 +3117,19 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) - kmalloc_caches[i]. name = - kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); + + BUG_ON(!s); + kmalloc_caches[i].name = s; + } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#endif +#ifdef CONFIG_NUMA + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); #else kmem_size = sizeof(struct kmem_cache); #endif @@ -3351,31 +3218,19 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { - int cpu; - s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. 
*/ s->objsize = max(s->objsize, (int)size); - - /* - * And then we need to update the object size in the - * per cpu structures - */ - for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->objsize = s->objsize; - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - up_write(&slub_lock); if (sysfs_slab_alias(s, name)) { - down_write(&slub_lock); s->refcount--; - up_write(&slub_lock); goto err; } + up_write(&slub_lock); return s; } @@ -3384,14 +3239,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); - up_write(&slub_lock); if (sysfs_slab_add(s)) { - down_write(&slub_lock); list_del(&s->list); - up_write(&slub_lock); kfree(s); goto err; } + up_write(&slub_lock); return s; } kfree(s); @@ -3420,29 +3273,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_alloc_cpu_cpu(cpu); - down_read(&slub_lock); - list_for_each_entry(s, &slab_caches, list) - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, - GFP_KERNEL); - up_read(&slub_lock); - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); - free_kmem_cache_cpu(c, cpu); - s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -3471,7 +3310,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); /* Honor the call site pointer we recieved. 
*/ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -3485,8 +3324,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) - return kmalloc_large_node(size, gfpflags, node); + if (unlikely(size > SLUB_MAX_SIZE)) { + ret = kmalloc_large_node(size, gfpflags, node); + + trace_kmalloc_node(caller, ret, + size, PAGE_SIZE << get_order(size), + gfpflags, node); + + return ret; + } s = get_slab(size, gfpflags); @@ -3547,16 +3393,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, } else printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", s->name, page); - - if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug not set " - "on slab 0x%p\n", s->name, page); - } else { - if (PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug set on " - "slab 0x%p\n", s->name, page); - } } static int validate_slab_node(struct kmem_cache *s, @@ -3798,10 +3634,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc) + struct page *page, enum track_item alloc, + long *map) { void *addr = page_address(page); - DECLARE_BITMAP(map, page->objects); void *p; bitmap_zero(map, page->objects); @@ -3820,11 +3656,14 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); - if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) + if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) { + kfree(map); return sprintf(buf, "Out of memory\n"); - + } /* Push back cpu slabs */ flush_all(s); @@ -3838,9 +3677,9 @@ static int list_locations(struct kmem_cache *s, char *buf, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -3891,6 +3730,7 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); + kfree(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; @@ -3928,7 +3768,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (!c || c->node < 0) continue; @@ -4171,6 +4011,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(trace); +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4353,7 +4210,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = get_cpu_slab(s, 
cpu)->stat[si]; + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -4376,7 +4233,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ @@ -4467,6 +4324,10 @@ static struct attribute *slab_attrs[] = { &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, #endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + NULL }; @@ -4519,7 +4380,7 @@ static void kmem_cache_release(struct kobject *kobj) kfree(s); } -static struct sysfs_ops slab_sysfs_ops = { +static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, }; @@ -4538,7 +4399,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops slab_uevent_ops = { +static const struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; @@ -4631,6 +4492,13 @@ static int sysfs_slab_add(struct kmem_cache *s) static void sysfs_slab_remove(struct kmem_cache *s) { + if (slab_state < SYSFS) + /* + * Sysfs has not been setup yet so no need to remove the + * cache from sysfs. + */ + return; + kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -4676,8 +4544,11 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; + down_write(&slub_lock); + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { + up_write(&slub_lock); printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -4702,6 +4573,7 @@ static int __init slab_sysfs_init(void) kfree(al); } + up_write(&slub_lock); resiliency_test(); return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bd..aa33fd6 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -22,6 +22,7 @@ #include <linux/bootmem.h> #include <linux/highmem.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/sched.h> @@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); + return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); } +static void *vmemmap_buf; +static void *vmemmap_buf_end; void * __meminit vmemmap_alloc_block(unsigned long size, int node) { @@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) __pa(MAX_DMA_ADDRESS)); } +/* need to make sure size is all the same during early stage */ +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +{ + void *ptr; + + if (!vmemmap_buf) + return vmemmap_alloc_block(size, node); + + /* take the from buf */ + ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); + if (ptr + size > vmemmap_buf_end) + return vmemmap_alloc_block(size, node); + + vmemmap_buf = ptr + size; + + return ptr; +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); @@ -163,3 +184,55 @@ struct page * __meminit 
sparse_mem_map_populate(unsigned long pnum, int nid) return map; } + +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + void *vmemmap_buf_start; + + size = ALIGN(size, PMD_SIZE); + vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, + PMD_SIZE, __pa(MAX_DMA_ADDRESS)); + + if (vmemmap_buf_start) { + vmemmap_buf = vmemmap_buf_start; + vmemmap_buf_end = vmemmap_buf_start + size * map_count; + } + + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } + + if (vmemmap_buf_start) { + /* need to free left buf */ +#ifdef CONFIG_NO_BOOTMEM + free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); + if (vmemmap_buf_start < vmemmap_buf) { + char name[15]; + + snprintf(name, sizeof(name), "MEMMAP %d", nodeid); + reserve_early_without_check(__pa(vmemmap_buf_start), + __pa(vmemmap_buf), name); + } +#else + free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); +#endif + vmemmap_buf = NULL; + vmemmap_buf_end = NULL; + } +} diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab..95ac219 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -2,6 +2,7 @@ * sparse memory mappings. */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/mmzone.h> #include <linux/bootmem.h> #include <linux/highmem.h> @@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { unsigned long section_nr; @@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) * this problem. 
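The vmemmap flavour of sparse_mem_maps_populate_node() above rounds one section's mem_map up to PMD_SIZE and then grabs a single buffer large enough for every present section on the node, freeing whatever is left over afterwards. A small arithmetic sketch of that buffer sizing; the constants are illustrative only (they roughly match an x86_64 setup with 128 MiB sections) and are not taken from the patch:

#include <stdio.h>

#define ALIGN(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
    unsigned long page_struct_size  = 64;           /* assumed sizeof(struct page) */
    unsigned long pages_per_section = 1UL << 15;    /* 128 MiB / 4 KiB */
    unsigned long pmd_size          = 2UL << 20;    /* 2 MiB large page */
    unsigned long map_count         = 12;           /* present sections on the node */

    unsigned long per_section = ALIGN(page_struct_size * pages_per_section, pmd_size);
    unsigned long node_buffer = per_section * map_count;

    printf("per-section mem_map: %lu KiB, per-node buffer: %lu MiB\n",
           per_section >> 10, node_buffer >> 20);
    return 0;
}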
*/ section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size(), section_nr); + return alloc_bootmem_section(usemap_size() * count, section_nr); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #else static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { return NULL; } @@ -339,44 +342,117 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) +static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long usemap_count, int nodeid) { - unsigned long *usemap; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - - usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); - if (usemap) - return usemap; + void *usemap; + unsigned long pnum; + int size = usemap_size(); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), + usemap_count); if (usemap) { - check_usemap_section_nr(nid, usemap); - return usemap; + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + } + return; } - /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ - nid = 0; + usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); + if (usemap) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); + } + return; + } printk(KERN_WARNING "%s: allocation failed\n", __func__); - return NULL; } #ifndef CONFIG_SPARSEMEM_VMEMMAP struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) { struct page *map; + unsigned long size; map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; - map = alloc_bootmem_pages_node(NODE_DATA(nid), - PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); + size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); + map = __alloc_bootmem_node_high(NODE_DATA(nid), size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); return map; } +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + void *map; + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + + map = alloc_remap(nodeid, size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + size = PAGE_ALIGN(size); + map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + /* fallback */ + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = sparse_mem_map_populate(pnum, 
nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } +} #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER +static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, + map_count, nodeid); +} +#else static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; @@ -392,10 +468,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) ms->section_mem_map = 0; return NULL; } +#endif void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -407,6 +485,14 @@ void __init sparse_init(void) unsigned long *usemap; unsigned long **usemap_map; int size; + int nodeid_begin = 0; + unsigned long pnum_begin = 0; + unsigned long usemap_count; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + unsigned long map_count; + int size2; + struct page **map_map; +#endif /* * map is using big page (aka 2M in x86 64 bit) @@ -425,10 +511,81 @@ void __init sparse_init(void) panic("can not allocate usemap_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + if (!present_section_nr(pnum)) continue; - usemap_map[pnum] = sparse_early_usemap_alloc(pnum); + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; } + usemap_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + usemap_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, + usemap_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + usemap_count = 1; + } + /* ok, last chunk */ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, + usemap_count, nodeid_begin); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = alloc_bootmem(size2); + if (!map_map) + panic("can not allocate map_map\n"); + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; + } + map_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + map_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, + map_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + map_count = 1; + } + /* ok, last chunk */ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +#endif for (pnum = 0; 
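sparse_init() above runs the same pattern twice: find the first present section, count sections until the node id changes, hand the whole run to a per-node batch allocator, then flush the final run after the loop. A compact userspace model of that run-grouping loop; alloc_node_run() stands in for sparse_early_usemaps_alloc_node()/sparse_early_mem_maps_alloc_node() and the section table is made up:

#include <stdio.h>

#define NR_SECTIONS 16

/* Stand-in for the per-node batch allocators used by the patch. */
static void alloc_node_run(int begin, int end, int count, int nid)
{
    printf("node %d: sections [%d, %d), %d present\n", nid, begin, end, count);
}

int main(void)
{
    /* -1 means "section not present"; otherwise the owning node id. */
    int nid_of[NR_SECTIONS] = { -1, 0, 0, -1, 0, 1, 1, -1, 1, 1, 2, -1, -1, 2, 2, 2 };
    int i, nid_begin = -1, begin = 0, count = 0;

    for (i = 0; i < NR_SECTIONS; i++) {
        if (nid_of[i] < 0)
            continue;                   /* hole: like !present_section_nr() */
        if (nid_begin < 0) {            /* first present section starts a run */
            nid_begin = nid_of[i];
            begin = i;
            count = 1;
        } else if (nid_of[i] == nid_begin) {
            count++;                    /* same node: extend the current run */
        } else {                        /* node changed: flush, then start over */
            alloc_node_run(begin, i, count, nid_begin);
            nid_begin = nid_of[i];
            begin = i;
            count = 1;
        }
    }
    if (nid_begin >= 0)                 /* last run, as after the kernel loop */
        alloc_node_run(begin, NR_SECTIONS, count, nid_begin);
    return 0;
}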
pnum < NR_MEM_SECTIONS; pnum++) { if (!present_section_nr(pnum)) @@ -438,7 +595,11 @@ void __init sparse_init(void) if (!usemap) continue; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + map = map_map[pnum]; +#else map = sparse_early_mem_map_alloc(pnum); +#endif if (!map) continue; @@ -448,6 +609,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + free_bootmem(__pa(map_map), size2); +#endif free_bootmem(__pa(usemap_map), size); } @@ -30,6 +30,7 @@ #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include <linux/gfp.h> #include "internal.h" @@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } - free_hot_page(page); + free_hot_cold_page(page, 0); } static void put_compound_page(struct page *page) @@ -223,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } +EXPORT_SYMBOL(__lru_cache_add); /** * lru_cache_add_lru - add a page to a page list diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d1daeb..e10f583 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -8,6 +8,7 @@ */ #include <linux/module.h> #include <linux/mm.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/swapops.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b..7c703ff 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,7 +139,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); if (err) return err; cond_resched(); @@ -150,7 +150,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); if (err) break; @@ -189,7 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) + nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) break; } @@ -574,6 +574,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { + struct gendisk *disk = p->bdev->bd_disk; if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -583,6 +584,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + if ((p->flags & SWP_BLKDEV) && + disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, offset); } return usage; @@ -679,6 +683,24 @@ int try_to_free_swap(struct page *page) if (page_swapcount(page)) return 0; + /* + * Once hibernation has begun to create its image of memory, + * there's a danger that one of the calls to try_to_free_swap() + * - most probably a call from __try_to_reclaim_swap() while + * hibernation is allocating its own swap pages for the image, + * but conceivably even a call from memory reclaim - will free + * the swap from a page which has already been recorded in the + * image as a clean swapcache page, and then reuse its swap for + * another page of the image. 
On waking from hibernation, the + * original page might be freed under memory pressure, then + * later read back in from swap, now with the wrong data. + * + * Hibernation clears bits from gfp_allowed_mask to prevent + * memory reclaim from writing to disk, so check that here. + */ + if (!(gfp_allowed_mask & __GFP_IO)) + return 0; + delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -723,6 +745,37 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_count_swap_user - count the user of a swap entry + * @ent: the swap entry to be checked + * @pagep: the pointer for the swap cache page of the entry to be stored + * + * Returns the number of the user of the swap entry. The number is valid only + * for swaps of anonymous pages. + * If the entry is found on swap cache, the page is stored to pagep with + * refcount of it being incremented. + */ +int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) +{ + struct page *page; + struct swap_info_struct *p; + int count = 0; + + page = find_get_page(&swapper_space, ent.val); + if (page) + count += page_mapcount(page); + p = swap_info_get(ent); + if (p) { + count += swap_count(p->swap_map[swp_offset(ent)]); + spin_unlock(&swap_lock); + } + + *pagep = page; + return count; +} +#endif + #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). @@ -840,7 +893,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); @@ -1759,11 +1813,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned int type; int i, prev; int error; - union swap_header *swap_header = NULL; - unsigned int nr_good_pages = 0; + union swap_header *swap_header; + unsigned int nr_good_pages; int nr_extents = 0; sector_t span; - unsigned long maxpages = 1; + unsigned long maxpages; unsigned long swapfilepages; unsigned char *swap_map = NULL; struct page *page = NULL; @@ -1852,6 +1906,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error < 0) goto bad_swap; p->bdev = bdev; + p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); @@ -1922,9 +1977,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * swap pte. 
*/ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + if (maxpages > swap_header->info.last_page) { + maxpages = swap_header->info.last_page + 1; + /* p->max is an unsigned int: don't overflow it */ + if ((unsigned int)maxpages == 0) + maxpages = UINT_MAX; + } p->highest_bit = maxpages - 1; error = -EINVAL; @@ -1948,23 +2007,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } memset(swap_map, 0, maxpages); + nr_good_pages = maxpages - 1; /* omit header page */ + for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) { error = -EINVAL; goto bad_swap; } - swap_map[page_nr] = SWAP_MAP_BAD; + if (page_nr < maxpages) { + swap_map[page_nr] = SWAP_MAP_BAD; + nr_good_pages--; + } } error = swap_cgroup_swapon(type, maxpages); if (error) goto bad_swap; - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; @@ -1987,7 +2047,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (random32() % p->highest_bit); } - if (discard_swap(p) == 0) + if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) p->flags |= SWP_DISCARDABLE; } @@ -2155,7 +2215,11 @@ void swap_shmem_alloc(swp_entry_t entry) } /* - * increase reference count of swap entry by 1. + * Increase reference count of swap entry by 1. + * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required + * but could not be atomically allocated. Returns 0, just as if it succeeded, + * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which + * might occur if a page table entry has got corrupted. */ int swap_duplicate(swp_entry_t entry) { diff --git a/mm/truncate.c b/mm/truncate.c index e87e372..ba887bf 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/module.h> @@ -540,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) EXPORT_SYMBOL(truncate_pagecache); /** + * truncate_setsize - update inode and pagecache for a new file size + * @inode: inode + * @newsize: new file size + * + * truncate_setsize updastes i_size update and performs pagecache + * truncation (if necessary) for a file size updates. It will be + * typically be called from the filesystem's setattr function when + * ATTR_SIZE is passed in. + * + * Must be called with inode_mutex held and after all filesystem + * specific block truncation has been performed. + */ +void truncate_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + + truncate_pagecache(inode, oldsize, newsize); +} +EXPORT_SYMBOL(truncate_setsize); + +/** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used * @offset: file offset to start truncating * - * NOTE! 
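The swapon() hunk above derives the largest offset a swap pte can encode, then clamps it to the size recorded in the swap header while making sure the result still fits in the unsigned int p->max. A small model of just that clamping arithmetic; pte_max_offset is an illustrative input, not a kernel symbol:

#include <limits.h>
#include <stdio.h>

/* Number of usable swap slots: header page plus data pages, clamped. */
static unsigned long clamp_maxpages(unsigned long pte_max_offset,
                                    unsigned long last_page)
{
    unsigned long maxpages = pte_max_offset + 1;    /* offsets 0..pte_max_offset */

    if (maxpages > last_page) {
        maxpages = last_page + 1;
        /* p->max is an unsigned int: don't let the +1 wrap it to zero */
        if ((unsigned int)maxpages == 0)
            maxpages = UINT_MAX;
    }
    return maxpages;
}

int main(void)
{
    printf("%lu\n", clamp_maxpages(1UL << 24, 1000));           /* small device */
    printf("%lu\n", clamp_maxpages(~0UL >> 8, 0xffffffffUL));   /* wrap guarded */
    return 0;
}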
We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. + * This function is deprecated and truncate_setsize or truncate_pagecache + * should be used instead, together with filesystem specific block truncation. */ int vmtruncate(struct inode *inode, loff_t offset) { - loff_t oldsize; int error; error = inode_newsize_ok(inode, offset); if (error) return error; - oldsize = inode->i_size; - i_size_write(inode, offset); - truncate_pagecache(inode, oldsize, offset); + + truncate_setsize(inode, offset); if (inode->i_op->truncate) inode->i_op->truncate(inode); - - return error; + return 0; } EXPORT_SYMBOL(vmtruncate); @@ -186,6 +186,27 @@ void kzfree(const void *p) } EXPORT_SYMBOL(kzfree); +int kern_ptr_validate(const void *ptr, unsigned long size) +{ + unsigned long addr = (unsigned long)ptr; + unsigned long min_addr = PAGE_OFFSET; + unsigned long align_mask = sizeof(void *) - 1; + + if (unlikely(addr < min_addr)) + goto out; + if (unlikely(addr > (unsigned long)high_memory - size)) + goto out; + if (unlikely(addr & align_mask)) + goto out; + if (unlikely(!kern_addr_valid(addr))) + goto out; + if (unlikely(!kern_addr_valid(addr + size - 1))) + goto out; + return 1; +out: + return 0; +} + /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -204,15 +225,10 @@ char *strndup_user(const char __user *s, long n) if (length > n) return ERR_PTR(-EINVAL); - p = kmalloc(length, GFP_KERNEL); + p = memdup_user(s, length); - if (!p) - return ERR_PTR(-ENOMEM); - - if (copy_from_user(p, s, length)) { - kfree(p); - return ERR_PTR(-EFAULT); - } + if (IS_ERR(p)) + return p; p[length - 1] = '\0'; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d55d905..6b8889d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,6 +31,7 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +bool vmap_lazy_unmap __read_mostly = true; /*** Page table manipulation functions ***/ @@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void) { unsigned int log; + if (!vmap_lazy_unmap) + return 0; + log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); @@ -509,6 +513,9 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + /* * Purges all lazily-freed vmap areas. 
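kern_ptr_validate() above layers cheap range and alignment tests before the more expensive kern_addr_valid() lookups on both ends of the object. A userspace analogue of the same layering that validates a pointer against a private arena instead of the kernel direct map; arena and arena_ptr_validate are invented for the example:

#include <stdint.h>
#include <stdio.h>

static long arena[512];

/* Returns 1 if [ptr, ptr + size) lies inside the arena and is pointer-aligned. */
static int arena_ptr_validate(const void *ptr, unsigned long size)
{
    uintptr_t addr = (uintptr_t)ptr;
    uintptr_t min_addr = (uintptr_t)arena;
    uintptr_t max_addr = min_addr + sizeof(arena);
    uintptr_t align_mask = sizeof(void *) - 1;

    if (addr < min_addr || addr >= max_addr)
        return 0;
    if (size > max_addr - addr)         /* rejects objects running past the end */
        return 0;
    if (addr & align_mask)
        return 0;
    return 1;
}

int main(void)
{
    printf("%d\n", arena_ptr_validate(&arena[8], 128));         /* inside, aligned */
    printf("%d\n", arena_ptr_validate((char *)arena + 1, 8));   /* misaligned */
    printf("%d\n", arena_ptr_validate(&arena[500], 4096));      /* overruns the end */
    return 0;
}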
* @@ -539,6 +546,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } else spin_lock(&purge_lock); + if (sync) + purge_fragmented_blocks_allcpus(); + rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { @@ -667,8 +677,6 @@ static bool vmap_initialized __read_mostly = false; struct vmap_block_queue { spinlock_t lock; struct list_head free; - struct list_head dirty; - unsigned int nr_dirty; }; struct vmap_block { @@ -678,10 +686,9 @@ struct vmap_block { unsigned long free, dirty; DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); - union { - struct list_head free_list; - struct rcu_head rcu_head; - }; + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -729,7 +736,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) node, gfp_mask); if (unlikely(IS_ERR(va))) { kfree(vb); - return ERR_PTR(PTR_ERR(va)); + return ERR_CAST(va); } err = radix_tree_preload(gfp_mask); @@ -757,7 +764,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); - list_add(&vb->free_list, &vbq->free); + list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); put_cpu_var(vmap_block_queue); @@ -776,8 +783,6 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - BUG_ON(!list_empty(&vb->free_list)); - vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); @@ -788,12 +793,61 @@ static void free_vmap_block(struct vmap_block *vb) call_rcu(&vb->rcu_head, rcu_free_vb); } +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + continue; + + spin_lock(&vb->lock); + if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { + vb->free = 0; /* prevent further allocs after releasing lock */ + vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ + bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + spin_unlock(&vb->lock); + list_add_tail(&vb->purge, &purge); + } else + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + + list_for_each_entry_safe(vb, n_vb, &purge, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + +static void purge_fragmented_blocks_thiscpu(void) +{ + purge_fragmented_blocks(smp_processor_id()); +} + +static void purge_fragmented_blocks_allcpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + purge_fragmented_blocks(cpu); +} + static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -806,24 +860,38 @@ again: int i; spin_lock(&vb->lock); + if (vb->free < 1UL << order) + goto next; + i = bitmap_find_free_region(vb->alloc_map, VMAP_BBMAP_BITS, order); - if (i >= 0) { - addr = vb->va->va_start + (i << PAGE_SHIFT); - 
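purge_fragmented_blocks() above only reclaims a per-CPU vmap block when every bit is either free or dirty (no allocation is still live) and yet not everything is dirty, meaning the block is holding fragmented free space worth recovering. A tiny model of that predicate and of the "poison before purging" step; vmap_block_model is an invented name:

#include <stdbool.h>
#include <stdio.h>

#define VMAP_BBMAP_BITS 1024

struct vmap_block_model {
    unsigned long free;     /* bits still available for allocation */
    unsigned long dirty;    /* bits freed but not yet flushed */
};

/* Fragmented and without outstanding allocations: worth purging. */
static bool purgeable(const struct vmap_block_model *vb)
{
    return vb->free + vb->dirty == VMAP_BBMAP_BITS &&
           vb->dirty != VMAP_BBMAP_BITS;
}

int main(void)
{
    struct vmap_block_model in_use     = { .free = 100, .dirty = 200 };
    struct vmap_block_model fragmented = { .free = 300, .dirty = 724 };
    struct vmap_block_model all_dirty  = { .free = 0,   .dirty = 1024 };

    printf("in use %d, fragmented %d, fully dirty %d\n",
           purgeable(&in_use), purgeable(&fragmented), purgeable(&all_dirty));

    /* Before queueing a block for purge, the patch makes it unusable: */
    if (purgeable(&fragmented)) {
        fragmented.free = 0;                    /* no further allocations */
        fragmented.dirty = VMAP_BBMAP_BITS;     /* and no second purge */
    }
    return 0;
}

A block that becomes entirely dirty is instead freed straight from vb_free(), as the later hunk shows.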
BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); - vb->free -= 1UL << order; - if (vb->free == 0) { - spin_lock(&vbq->lock); - list_del_init(&vb->free_list); - spin_unlock(&vbq->lock); + if (i < 0) { + if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { + /* fragmented and no outstanding allocations */ + BUG_ON(vb->dirty != VMAP_BBMAP_BITS); + purge = 1; } - spin_unlock(&vb->lock); - break; + goto next; } + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; +next: spin_unlock(&vb->lock); } + + if (purge) + purge_fragmented_blocks_thiscpu(); + put_cpu_var(vmap_block_queue); rcu_read_unlock(); @@ -860,11 +928,11 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(!vb); spin_lock(&vb->lock); - bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { - BUG_ON(vb->free || !list_empty(&vb->free_list)); + BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else @@ -1033,8 +1101,6 @@ void __init vmalloc_init(void) vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); - INIT_LIST_HEAD(&vbq->dirty); - vbq->nr_dirty = 0; } /* Import existing vmlist entries. */ @@ -2341,7 +2407,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) - seq_printf(m, " phys=%lx", v->phys_addr); + seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) seq_printf(m, " ioremap"); @@ -2375,8 +2441,11 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) + if (NUMA_BUILD) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + if (ptr == NULL) + return -ENOMEM; + } ret = seq_open(file, &vmalloc_op); if (!ret) { struct seq_file *m = file->private_data; diff --git a/mm/vmscan.c b/mm/vmscan.c index c26986c..c5dfabf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -13,7 +13,7 @@ #include <linux/mm.h> #include <linux/module.h> -#include <linux/slab.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> @@ -48,6 +48,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/vmscan.h> + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -73,10 +76,14 @@ struct scan_control { int swappiness; - int all_unreclaimable; - int order; + /* + * Intend to reclaim enough contenious memory rather than to reclaim + * enough amount memory. I.e, it's the mode for high order allocation. + */ + bool lumpy_reclaim_mode; + /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -85,12 +92,6 @@ struct scan_control { * are scanned. 
*/ nodemask_t *nodemask; - - /* Pluggable isolate pages callback */ - unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, - unsigned long *scanned, int order, int mode, - struct zone *z, struct mem_cgroup *mem_cont, - int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -215,8 +216,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + unsigned long max_pass; + max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -244,8 +246,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); + shrink_ret = (*shrinker->shrink)(shrinker, this_scan, + gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) @@ -262,27 +265,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) -{ - struct address_space *mapping; - - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; - - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; - - mapping = page_mapping(page); - if (!mapping) - return 0; - - /* File is mmap'd by somebody? */ - return mapping_mapped(mapping); -} - static inline int is_page_cache_freeable(struct page *page) { /* @@ -319,7 +301,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi) static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page(page); + lock_page_nosync(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); @@ -419,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } + trace_mm_vmscan_writepage(page, + trace_reclaim_flags(page, sync_writeback)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -579,6 +563,83 @@ redo: put_page(page); /* drop ref from isolate */ } +enum page_references { + PAGEREF_RECLAIM, + PAGEREF_RECLAIM_CLEAN, + PAGEREF_KEEP, + PAGEREF_ACTIVATE, +}; + +static enum page_references page_check_references(struct page *page, + struct scan_control *sc) +{ + int referenced_ptes, referenced_page; + unsigned long vm_flags; + + referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_page = TestClearPageReferenced(page); + + /* Lumpy reclaim - ignore references */ + if (sc->lumpy_reclaim_mode) + return PAGEREF_RECLAIM; + + /* + * Mlock lost the isolation race with us. Let try_to_unmap() + * move the page to the unevictable list. + */ + if (vm_flags & VM_LOCKED) + return PAGEREF_RECLAIM; + + if (referenced_ptes) { + if (PageAnon(page)) + return PAGEREF_ACTIVATE; + /* + * All mapped pages start out with page table + * references from the instantiating fault, so we need + * to look twice if a mapped file page is used more + * than once. + * + * Mark it and spare it for another trip around the + * inactive list. 
Another page table reference will + * lead to its activation. + * + * Note: the mark is set for activated pages as well + * so that recently deactivated but used pages are + * quickly recovered. + */ + SetPageReferenced(page); + + if (referenced_page) + return PAGEREF_ACTIVATE; + + return PAGEREF_KEEP; + } + + /* Reclaim if clean, defer dirty pages to writeback */ + if (referenced_page) + return PAGEREF_RECLAIM_CLEAN; + + return PAGEREF_RECLAIM; +} + +static noinline_for_stack void free_page_list(struct list_head *free_pages) +{ + struct pagevec freed_pvec; + struct page *page, *tmp; + + pagevec_init(&freed_pvec, 1); + + list_for_each_entry_safe(page, tmp, free_pages, lru) { + list_del(&page->lru); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } + } + + pagevec_free(&freed_pvec); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -587,19 +648,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, enum pageout_io sync_writeback) { LIST_HEAD(ret_pages); - struct pagevec freed_pvec; + LIST_HEAD(free_pages); int pgactivate = 0; unsigned long nr_reclaimed = 0; - unsigned long vm_flags; cond_resched(); - pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { + enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; - int referenced; cond_resched(); @@ -641,17 +700,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, - sc->mem_cgroup, &vm_flags); - /* - * In active use or really unfreeable? Activate it. - * If page which have PG_mlocked lost isoltation race, - * try_to_unmap moves it to unevictable list - */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page) - && !(vm_flags & VM_LOCKED)) + references = page_check_references(page, sc); + switch (references) { + case PAGEREF_ACTIVATE: goto activate_locked; + case PAGEREF_KEEP: + goto keep_locked; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ + } /* * Anonymous process memory has backing store? @@ -685,7 +743,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) goto keep_locked; @@ -770,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, __clear_page_locked(page); free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } + + /* + * Is there need to periodically free_page_list? It would + * appear not as the counts should be low + */ + list_add(&page->lru, &free_pages); continue; cull_mlocked: @@ -796,18 +856,14 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + + free_page_list(&free_pages); + list_splice(&ret_pages, page_list); - if (pagevec_count(&freed_pvec)) - __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } -/* LRU Isolation modes. */ -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ -#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ - /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. 
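page_check_references() above replaces the old page_referenced()/page_mapping_inuse() test with an explicit four-way decision. A condensed userspace model of that decision table; the booleans stand in for page_referenced(), TestClearPageReferenced(), PageAnon() and VM_LOCKED, and the "mark for a second trip" side effect is only noted in a comment:

#include <stdbool.h>
#include <stdio.h>

enum page_references {
    PAGEREF_RECLAIM,        /* reclaim, writing back if dirty */
    PAGEREF_RECLAIM_CLEAN,  /* reclaim only if no writeback is needed */
    PAGEREF_KEEP,           /* leave on the inactive list for another pass */
    PAGEREF_ACTIVATE,       /* promote back to the active list */
};

static enum page_references check_refs(bool lumpy, bool vm_locked,
                                       int referenced_ptes, bool referenced_page,
                                       bool anon)
{
    if (lumpy)
        return PAGEREF_RECLAIM;     /* lumpy reclaim ignores references */
    if (vm_locked)
        return PAGEREF_RECLAIM;     /* let try_to_unmap() park it as unevictable */
    if (referenced_ptes) {
        if (anon)
            return PAGEREF_ACTIVATE;
        /* mapped file pages get marked and need a second referenced trip */
        return referenced_page ? PAGEREF_ACTIVATE : PAGEREF_KEEP;
    }
    return referenced_page ? PAGEREF_RECLAIM_CLEAN : PAGEREF_RECLAIM;
}

int main(void)
{
    /* referenced, mapped file page on its first trip: kept, not yet activated */
    printf("%d\n", check_refs(false, false, 1, false, false) == PAGEREF_KEEP);
    /* the same page found referenced again on the next pass: activated */
    printf("%d\n", check_refs(false, false, 1, true, false) == PAGEREF_ACTIVATE);
    return 0;
}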
Pages which are being @@ -885,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; + unsigned long nr_lumpy_taken = 0; + unsigned long nr_lumpy_dirty = 0; + unsigned long nr_lumpy_failed = 0; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { @@ -962,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); nr_taken++; + nr_lumpy_taken++; + if (PageDirty(cursor_page)) + nr_lumpy_dirty++; scan++; + } else { + if (mode == ISOLATE_BOTH && + page_count(cursor_page)) + nr_lumpy_failed++; } } } *scanned = scan; + + trace_mm_vmscan_lru_isolate(order, + nr_to_scan, scan, + nr_taken, + nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, + mode); return nr_taken; } @@ -975,7 +1047,6 @@ static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, - struct mem_cgroup *mem_cont, int active, int file) { int lru = LRU_BASE; @@ -1005,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list, ClearPageActive(page); nr_active++; } - count[lru]++; + if (count) + count[lru]++; } return nr_active; @@ -1082,176 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file, } /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number - * of reclaimed pages + * TODO: Try merging with migrations version of putback_lru_pages */ -static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc, - int priority, int file) +static noinline_for_stack void +putback_lru_pages(struct zone *zone, struct scan_control *sc, + unsigned long nr_anon, unsigned long nr_file, + struct list_head *page_list) { - LIST_HEAD(page_list); + struct page *page; struct pagevec pvec; - unsigned long nr_scanned = 0; - unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int lumpy_reclaim = 0; - - while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - /* We are about to die and free our memory. Return now. */ - if (fatal_signal_pending(current)) - return SWAP_CLUSTER_MAX; - } + pagevec_init(&pvec, 1); /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. + * Put back any unfreeable pages. 
*/ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_reclaim = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - lumpy_reclaim = 1; + spin_lock(&zone->lru_lock); + while (!list_empty(page_list)) { + int lru; + page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); + list_del(&page->lru); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + reclaim_stat->recent_rotated[file]++; + } + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - pagevec_init(&pvec, 1); + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); +} - lru_add_drain(); - spin_lock_irq(&zone->lru_lock); - do { - struct page *page; - unsigned long nr_taken; - unsigned long nr_scan; - unsigned long nr_freed; - unsigned long nr_active; - unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; - unsigned long nr_anon; - unsigned long nr_file; - - nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, sc->order, mode, - zone, sc->mem_cgroup, 0, file); +static noinline_for_stack void update_isolated_counts(struct zone *zone, + struct scan_control *sc, + unsigned long *nr_anon, + unsigned long *nr_file, + struct list_head *isolated_list) +{ + unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - if (scanning_global_lru(sc)) { - zone->pages_scanned += nr_scan; - if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, - nr_scan); - else - __count_zone_vm_events(PGSCAN_DIRECT, zone, - nr_scan); - } + nr_active = clear_active_flags(isolated_list, count); + __count_vm_events(PGDEACTIVATE, nr_active); + + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); + + reclaim_stat->recent_scanned[0] += *nr_anon; + reclaim_stat->recent_scanned[1] += *nr_file; +} - if (nr_taken == 0) - goto done; +/* + * Returns true if the caller should wait to clean dirty/writeback pages. + * + * If we are direct reclaiming for contiguous pages and we do not reclaim + * everything in the list, try again and wait for writeback IO to complete. + * This will stall high-order allocations noticeably. Only do that when really + * need to free the pages under high memory pressure. 
+ */ +static inline bool should_reclaim_stall(unsigned long nr_taken, + unsigned long nr_freed, + int priority, + struct scan_control *sc) +{ + int lumpy_stall_priority; + + /* kswapd should not stall on sync IO */ + if (current_is_kswapd()) + return false; - nr_active = clear_active_flags(&page_list, count); - __count_vm_events(PGDEACTIVATE, nr_active); + /* Only stall on lumpy reclaim */ + if (!sc->lumpy_reclaim_mode) + return false; - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - -count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); + /* If we have relaimed everything on the isolated list, no stall */ + if (nr_freed == nr_taken) + return false; - nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); + /* + * For high-order allocations, there are two stall thresholds. + * High-cost allocations stall immediately where as lower + * order allocations such as stacks require the scanning + * priority to be much higher before stalling. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_stall_priority = DEF_PRIORITY; + else + lumpy_stall_priority = DEF_PRIORITY / 3; - reclaim_stat->recent_scanned[0] += nr_anon; - reclaim_stat->recent_scanned[1] += nr_file; + return priority <= lumpy_stall_priority; +} - spin_unlock_irq(&zone->lru_lock); +/* + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages + */ +static noinline_for_stack unsigned long +shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, + struct scan_control *sc, int priority, int file) +{ + LIST_HEAD(page_list); + unsigned long nr_scanned; + unsigned long nr_reclaimed = 0; + unsigned long nr_taken; + unsigned long nr_active; + unsigned long nr_anon; + unsigned long nr_file; + + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } - nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + + if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode ? + ISOLATE_BOTH : ISOLATE_INACTIVE, + zone, 0, file); + zone->pages_scanned += nr_scanned; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scanned); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scanned); + } else { + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode ? + ISOLATE_BOTH : ISOLATE_INACTIVE, + zone, sc->mem_cgroup, + 0, file); /* - * If we are direct reclaiming for contiguous pages and we do - * not reclaim everything in the list, try again and wait - * for IO to complete. This will stall high-order allocations - * but that should be acceptable to the caller + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. 
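should_reclaim_stall() above decides whether direct reclaim should wait for writeback: never for kswapd or non-lumpy scans, never when everything isolated was freed, and otherwise only when the scan priority has fallen to a threshold that costly orders (above PAGE_ALLOC_COSTLY_ORDER) hit immediately while cheaper orders hit only under much heavier pressure. A userspace restatement of that rule, using the usual values of DEF_PRIORITY and PAGE_ALLOC_COSTLY_ORDER:

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY            12
#define PAGE_ALLOC_COSTLY_ORDER 3

static bool should_stall(bool is_kswapd, bool lumpy, unsigned long nr_taken,
                         unsigned long nr_freed, int order, int priority)
{
    int lumpy_stall_priority;

    if (is_kswapd)
        return false;               /* kswapd never stalls on sync IO */
    if (!lumpy)
        return false;               /* only lumpy reclaim stalls */
    if (nr_freed == nr_taken)
        return false;               /* the whole isolated list was reclaimed */

    /* costly orders stall at once; cheap orders only under real pressure */
    lumpy_stall_priority = order > PAGE_ALLOC_COSTLY_ORDER ?
                           DEF_PRIORITY : DEF_PRIORITY / 3;

    return priority <= lumpy_stall_priority;
}

int main(void)
{
    /* order-9 (huge page) request on the first pass: stalls as soon as it misses */
    printf("%d\n", should_stall(false, true, 32, 10, 9, DEF_PRIORITY));
    /* order-2 request at the same priority: keeps going without stalling */
    printf("%d\n", should_stall(false, true, 32, 10, 2, DEF_PRIORITY));
    return 0;
}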
*/ - if (nr_freed < nr_taken && !current_is_kswapd() && - lumpy_reclaim) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + } - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. - */ - nr_active = clear_active_flags(&page_list, count); - count_vm_events(PGDEACTIVATE, nr_active); + if (nr_taken == 0) { + spin_unlock_irq(&zone->lru_lock); + return 0; + } - nr_freed += shrink_page_list(&page_list, sc, - PAGEOUT_IO_SYNC); - } + update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + + spin_unlock_irq(&zone->lru_lock); - nr_reclaimed += nr_freed; + nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); - local_irq_disable(); - if (current_is_kswapd()) - __count_vm_events(KSWAPD_STEAL, nr_freed); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); + /* Check if we should syncronously wait for writeback */ + if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { + congestion_wait(BLK_RW_ASYNC, HZ/10); - spin_lock(&zone->lru_lock); /* - * Put back any unfreeable pages. + * The attempt at page out may have made some + * of the pages active, mark them inactive again. */ - while (!list_empty(&page_list)) { - int lru; - page = lru_to_page(&page_list); - VM_BUG_ON(PageLRU(page)); - list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { - spin_unlock_irq(&zone->lru_lock); - putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); - continue; - } - SetPageLRU(page); - lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; - } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + nr_active = clear_active_flags(&page_list, NULL); + count_vm_events(PGDEACTIVATE, nr_active); - } while (nr_scanned < max_scan); + nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + } -done: - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); - return nr_reclaimed; -} + local_irq_disable(); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_STEAL, nr_reclaimed); + __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); -/* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. - */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) -{ - if (priority < zone->prev_priority) - zone->prev_priority = priority; + putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + return nr_reclaimed; } /* @@ -1320,16 +1428,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, lru_add_drain(); spin_lock_irq(&zone->lru_lock); - nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1, file); - /* - * zone->pages_scanned is used for detect zone's oom - * mem_cgroup remembers nr_scan by itself. 
- */ if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + 1, file); zone->pages_scanned += pgscanned; + } else { + nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_cgroup, 1, file); + /* + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. + */ } + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); @@ -1350,9 +1465,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - /* page_referenced clears PageReferenced */ - if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { nr_rotated++; /* * Identify referenced, file-backed active pages and @@ -1485,21 +1598,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, } /* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= SWAP_CLUSTER_MAX) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} + +/* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * percent[0] specifies how much pressure to put on ram/swap backed - * memory, while percent[1] determines pressure on the file LRUs. + * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_ratio(struct zone *zone, struct scan_control *sc, - unsigned long *percent) +static void get_scan_count(struct zone *zone, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + u64 fraction[2], denominator; + enum lru_list l; + int noswap = 0; + + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + fraction[0] = 0; + fraction[1] = 1; + denominator = 1; + goto out; + } anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); @@ -1511,13 +1655,21 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= high_wmark_pages(zone))) { - percent[0] = 100; - percent[1] = 0; - return; + fraction[0] = 1; + fraction[1] = 0; + denominator = 1; + goto out; } } /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* * OK, so we have swap space and a fair amount of page cache * pages. We use the recently rotated / recently scanned * ratios to determine how valuable each cache is. 
@@ -1528,28 +1680,18 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; - spin_unlock_irq(&zone->lru_lock); } if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[1] /= 2; reclaim_stat->recent_rotated[1] /= 2; - spin_unlock_irq(&zone->lru_lock); } /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; - - /* * The amount of pressure on anon vs file pages is inversely * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. @@ -1559,30 +1701,39 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; + spin_unlock_irq(&zone->lru_lock); + + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp + 1; +out: + for_each_evictable_lru(l) { + int file = is_file_lru(l); + unsigned long scan; - /* Normalize to percentages */ - percent[0] = 100 * ap / (ap + fp + 1); - percent[1] = 100 - percent[0]; + scan = zone_nr_lru_pages(zone, sc, l); + if (priority || noswap) { + scan >>= priority; + scan = div64_u64(scan * fraction[file], denominator); + } + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l]); + } } -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) { - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = 1; else - nr = 0; - - return nr; + sc->lumpy_reclaim_mode = 0; } /* @@ -1593,33 +1744,13 @@ static void shrink_zone(int priority, struct zone *zone, { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int noswap = 0; - - /* If we have no swap space, do not bother scanning anon pages. 
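get_scan_count() above replaces the old percentage split with an integer fraction: each LRU's scan target is scaled down by the priority, multiplied by its fraction and divided by the common denominator, then batched by nr_scan_try_batch() until at least SWAP_CLUSTER_MAX pages have accumulated. A simplified model of that arithmetic; the recent_scanned/recent_rotated numbers are made up for illustration:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Deposit small requests until a full batch is available. */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
                                       unsigned long *nr_saved_scan)
{
    *nr_saved_scan += nr_to_scan;
    if (*nr_saved_scan < SWAP_CLUSTER_MAX)
        return 0;
    nr_to_scan = *nr_saved_scan;
    *nr_saved_scan = 0;
    return nr_to_scan;
}

int main(void)
{
    unsigned long swappiness = 60;
    unsigned long anon_prio = swappiness, file_prio = 200 - swappiness;

    /* Pressure is inversely proportional to how much of each list rotated back. */
    unsigned long recent_scanned[2] = { 4000, 12000 };  /* anon, file */
    unsigned long recent_rotated[2] = { 3000, 1000 };

    unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
    unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);
    unsigned long denominator = ap + fp + 1;

    unsigned long lru_size = 100000, saved = 0;
    int priority = 10;
    unsigned long scan = (lru_size >> priority) * fp / denominator;

    printf("file pages this pass: %lu of %lu, batched: %lu\n",
           scan, lru_size >> priority, nr_scan_try_batch(scan, &saved));
    return 0;
}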
*/ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - percent[0] = 0; - percent[1] = 100; - } else - get_scan_ratio(zone, sc, percent); - for_each_evictable_lru(l) { - int file = is_file_lru(l); - unsigned long scan; + get_scan_count(zone, sc, nr, priority); - scan = zone_nr_lru_pages(zone, sc, l); - if (priority || noswap) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); - } + set_lumpy_reclaim_mode(priority, sc); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -1676,13 +1807,11 @@ static void shrink_zone(int priority, struct zone *zone, static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; - sc->all_unreclaimable = 1; - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - sc->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1692,26 +1821,46 @@ static void shrink_zones(int priority, struct zonelist *zonelist, if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - sc->all_unreclaimable = 0; - } else { - /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. - */ - sc->all_unreclaimable = 0; - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); } shrink_zone(priority, zone, sc); } } +static bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + +/* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. It can't handle OOM during hibernation. + * So let's check zone's unreclaimable in direct reclaim as well as kswapd. + */ +static bool all_unreclaimable(struct zonelist *zonelist, + struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + bool all_unreclaimable = true; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!populated_zone(zone)) + continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + if (zone_reclaimable(zone)) { + all_unreclaimable = false; + break; + } + } + + return all_unreclaimable; +} + /* * This is the main entry point to direct page reclaim. * @@ -1732,31 +1881,17 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - unsigned long ret = 0; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); - /* - * mem_cgroup will not do shrink_slab. 
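The new zone_reclaimable()/all_unreclaimable() pair above replaces the old sc->all_unreclaimable bookkeeping: a zone still counts as reclaimable while it has been scanned less than six times the number of pages that could plausibly be reclaimed from it, and direct reclaim only reports failure when every eligible zone has crossed that line. The heuristic itself is a one-liner; a trivial model with invented page counts:

#include <stdbool.h>
#include <stdio.h>

/* Give up on a zone only after scanning it six times over. */
static bool zone_reclaimable(unsigned long pages_scanned,
                             unsigned long reclaimable_pages)
{
    return pages_scanned < reclaimable_pages * 6;
}

int main(void)
{
    printf("%d\n", zone_reclaimable(5000, 1000));   /* still worth scanning */
    printf("%d\n", zone_reclaimable(6000, 1000));   /* treated as unreclaimable */
    return 0;
}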
- */ - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - } - } for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; @@ -1768,6 +1903,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scanning_global_lru(sc)) { + unsigned long lru_pages = 0; + for_each_zone_zonelist(zone, z, zonelist, + gfp_zone(sc->gfp_mask)) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + lru_pages += zone_reclaimable_pages(zone); + } + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -1775,10 +1919,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->nr_to_reclaim) { - ret = sc->nr_reclaimed; + if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; - } /* * Try to write back as many pages as we just scanned. This @@ -1798,9 +1940,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); } - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (!sc->all_unreclaimable && scanning_global_lru(sc)) - ret = sc->nr_reclaimed; + out: /* * Now that we've scanned all the zones at this priority level, note @@ -1812,25 +1952,23 @@ out: if (priority < 0) priority = 0; - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + delayacct_freepages_end(); + put_mems_allowed(); - zone->prev_priority = priority; - } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + if (sc->nr_reclaimed) + return sc->nr_reclaimed; - delayacct_freepages_end(); + /* top priority shrink_zones still had more to do? 
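do_try_to_free_pages() now recomputes lru_pages inside the priority loop, summing only the zones the caller's cpuset allows, and hands that sum to shrink_slab() so slab pressure roughly tracks the ratio of pages scanned to pages on the LRUs. A standalone sketch of that accumulation (plain C, invented zone sizes; the proportionality comment is an interpretation, not a quote of shrink_slab()):

#include <stdio.h>

#define NZONES 3

int main(void)
{
	unsigned long zone_lru[NZONES] = { 120000, 64000, 8000 };
	int allowed[NZONES] = { 1, 1, 0 };	/* third zone outside the cpuset */
	unsigned long lru_pages = 0, nr_scanned = 4096;

	for (int i = 0; i < NZONES; i++)
		if (allowed[i])
			lru_pages += zone_lru[i];

	/* slab caches are then pressed roughly in this proportion */
	printf("lru_pages=%lu, scan ratio ~ %.4f\n",
	       lru_pages, (double)nr_scanned / lru_pages);
	return 0;
}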
don't OOM, then */ + if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + return 1; - return ret; + return 0; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { + unsigned long nr_reclaimed; struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, @@ -1840,11 +1978,18 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, .nodemask = nodemask, }; - return do_try_to_free_pages(zonelist, &sc); + trace_mm_vmscan_direct_reclaim_begin(order, + sc.may_writepage, + gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1852,24 +1997,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness, - struct zone *zone, int nid) + struct zone *zone) { struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, - .isolate_pages = mem_cgroup_isolate_pages, }; - nodemask_t nm = nodemask_of_node(nid); - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - sc.nodemask = &nm; - sc.nr_reclaimed = 0; - sc.nr_scanned = 0; + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -1878,6 +2023,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. 
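try_to_free_pages() and mem_cgroup_shrink_node_zone() are now bracketed by tracepoints, and the memcg path builds its gfp_mask by keeping the caller's reclaim-behaviour bits and taking everything else from GFP_HIGHUSER_MOVABLE. A sketch of that merge pattern with invented bit values (the real GFP_RECLAIM_MASK and GFP_HIGHUSER_MOVABLE are kernel-internal masks, not the constants below):

#include <stdio.h>

#define FAKE_RECLAIM_MASK	0x00ffu		/* assumed behaviour bits */
#define FAKE_HIGHUSER_MOVABLE	0x3500u		/* assumed placement default */

/* Caller controls how reclaim behaves; placement/movability comes from
 * the fixed default, mirroring the sc.gfp_mask line in the hunk above.
 */
static unsigned int merge_gfp(unsigned int caller)
{
	return (caller & FAKE_RECLAIM_MASK) |
	       (FAKE_HIGHUSER_MOVABLE & ~FAKE_RECLAIM_MASK);
}

int main(void)
{
	printf("0x%04x\n", merge_gfp(0x1234u));		/* -> 0x3534 */
	return 0;
}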
*/ shrink_zone(0, zone, &sc); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + return sc.nr_reclaimed; } @@ -1887,6 +2035,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, unsigned int swappiness) { struct zonelist *zonelist; + unsigned long nr_reclaimed; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1895,14 +2044,22 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, - .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; - return do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #endif @@ -1922,7 +2079,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) continue; if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), @@ -1973,24 +2130,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to - * free_pages == high_wmark_pages(zone). - */ - int temp_priority[MAX_NR_ZONES]; - loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; @@ -2012,8 +2158,7 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* @@ -2051,30 +2196,21 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; - int nid, zid; if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, - high_wmark_pages(zone), end_zone, 0)) - all_zones_ok = 0; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); - nid = pgdat->node_id; - zid = zone_idx(zone); /* * Call soft limit reclaim before calling shrink_zone. * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, - nid, zid); + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
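sleeping_prematurely() now consults zone->all_unreclaimable directly, and balance_pgdat() drops the temp_priority/prev_priority bookkeeping. The sleep check itself is unchanged in spirit: kswapd refuses to doze off while any populated, still-reclaimable zone sits below its high watermark. A simplified standalone sketch (the real test is order-aware and uses zone_watermark_ok()):

#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	bool populated;
	bool all_unreclaimable;
	unsigned long free_pages;
	unsigned long high_wmark;
};

static bool sleeping_prematurely(const struct fake_zone *zones, int n)
{
	for (int i = 0; i < n; i++) {
		if (!zones[i].populated || zones[i].all_unreclaimable)
			continue;
		if (zones[i].free_pages < zones[i].high_wmark)
			return true;	/* still work to do, keep balancing */
	}
	return false;
}

int main(void)
{
	struct fake_zone z[2] = {
		{ true, false,  900, 1000 },	/* below its high watermark */
		{ true, false, 5000, 1000 },
	};
	printf("%d\n", sleeping_prematurely(z, 2));	/* 1: don't sleep yet */
	return 0;
}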
@@ -2087,12 +2223,10 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && zone->pages_scanned >= - (zone_reclaimable_pages(zone) * 6)) - zone_set_flag(zone, - ZONE_ALL_UNRECLAIMABLE); + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -2102,13 +2236,18 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - /* - * We are still under min water mark. it mean we have - * GFP_ATOMIC allocation failure risk. Hurry up! - */ - if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), - end_zone, 0)) - has_under_min_watermark_zone = 1; + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) { + all_zones_ok = 0; + /* + * We are still under min water mark. This + * means that we have a GFP_ATOMIC allocation + * failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, + min_wmark_pages(zone), end_zone, 0)) + has_under_min_watermark_zone = 1; + } } if (all_zones_ok) @@ -2134,16 +2273,6 @@ loop_again: break; } out: - /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. - */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = temp_priority[i]; - } if (!all_zones_ok) { cond_resched(); @@ -2247,9 +2376,10 @@ static int kswapd(void *p) * premature sleep. If not, then go fully * to sleep until explicitly woken up */ - if (!sleeping_prematurely(pgdat, order, remaining)) + if (!sleeping_prematurely(pgdat, order, remaining)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); schedule(); - else { + } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else @@ -2269,8 +2399,10 @@ static int kswapd(void *p) * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) + if (!ret) { + trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); balance_pgdat(pgdat, order); + } } return 0; } @@ -2290,6 +2422,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) @@ -2353,7 +2486,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .hibernation_mode = 1, .swappiness = vm_swappiness, .order = 0, - .isolate_pages = isolate_pages_global, }; struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -2538,11 +2670,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, - .isolate_pages = isolate_pages_global, }; - unsigned long slab_reclaimable; + unsigned long nr_slab_pages0, nr_slab_pages1; - disable_swap_token(); cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -2550,6 +2680,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * and RECLAIM_SWAP. 
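Within balance_pgdat(), the zone watermark test is now nested: failing the high watermark merely marks the node as not yet balanced, while also failing the min watermark flags a GFP_ATOMIC-failure risk so kswapd skips its congestion waits; a zone is marked all_unreclaimable once slab shrinking frees nothing and zone_reclaimable() says further scanning is pointless. A sketch of the nested check (standalone C, invented numbers):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long free_pages = 180, min_wmark = 200, high_wmark = 300;
	bool all_zones_ok = true, under_min_watermark = false;

	if (free_pages < high_wmark) {
		all_zones_ok = false;			/* keep balancing */
		if (free_pages < min_wmark)
			under_min_watermark = true;	/* hurry, no throttling */
	}
	printf("all_zones_ok=%d under_min_watermark=%d\n",
	       all_zones_ok, under_min_watermark);
	return 0;
}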
*/ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -2560,14 +2691,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } - slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (slab_reclaimable > zone->min_slab_pages) { + nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages0 > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how * many pages were freed in this zone. So we take the current @@ -2578,21 +2708,32 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) > - slab_reclaimable - nr_pages) - ; + for (;;) { + unsigned long lru_pages = zone_reclaimable_pages(zone); + + /* No reclaimable slab or very low memory pressure */ + if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + break; + + /* Freed enough memory */ + nr_slab_pages1 = zone_page_state(zone, + NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) + break; + } /* * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - sc.nr_reclaimed += slab_reclaimable - - zone_page_state(zone, NR_SLAB_RECLAIMABLE); + nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 < nr_slab_pages0) + sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; } p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } @@ -2615,7 +2756,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) return ZONE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fba..355a9e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,22 +12,24 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/vmstat.h> #include <linux/sched.h> +#include <linux/math64.h> #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); -static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) +static void sum_vm_events(unsigned long *ret) { int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu(cpu, cpumask) { + for_each_online_cpu(cpu) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) @@ -43,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) void all_vm_events(unsigned long *ret) { get_online_cpus(); - sum_vm_events(ret, cpu_online_mask); + sum_vm_events(ret); put_online_cpus(); } EXPORT_SYMBOL_GPL(all_vm_events); @@ -136,10 +138,24 @@ static void refresh_zone_stat_thresholds(void) int threshold; for_each_populated_zone(zone) { + unsigned long max_drift, tolerate_drift; + threshold = calculate_threshold(zone); 
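__zone_reclaim() replaces the open-coded while loop with an explicit shrink-slab loop that stops either when nothing reclaimable is left or when the slab footprint has dropped by the number of pages the caller asked for, and it now records the reclaim context with lockdep. A standalone sketch of the termination logic (fake_shrink_slab() is a stand-in, not the kernel API):

#include <stdio.h>

static unsigned long slab_pages = 10000;	/* pretend NR_SLAB_RECLAIMABLE */

static unsigned long fake_shrink_slab(void)
{
	unsigned long freed = slab_pages >= 300 ? 300 : 0;

	slab_pages -= freed;
	return freed;		/* 0 means nothing left worth reclaiming */
}

int main(void)
{
	unsigned long nr_pages = 1024;		/* reclaim target */
	unsigned long nr_slab_pages0 = slab_pages;

	for (;;) {
		if (!fake_shrink_slab())
			break;				/* no reclaimable slab */
		if (slab_pages + nr_pages <= nr_slab_pages0)
			break;				/* freed enough */
	}
	printf("slab pages: %lu -> %lu\n", nr_slab_pages0, slab_pages);
	return 0;
}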
for_each_online_cpu(cpu) - zone_pcp(zone, cpu)->stat_threshold = threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; } } @@ -149,7 +165,8 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); + s8 *p = pcp->vm_stat_diff + item; long x; @@ -202,7 +219,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -223,7 +240,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -300,7 +317,7 @@ void refresh_cpu_vm_stats(int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *p; - p = zone_pcp(zone, cpu); + p = per_cpu_ptr(zone->pageset, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { @@ -376,7 +393,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) } #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_COMPACTION +struct contig_page_info { + unsigned long free_pages; + unsigned long free_blocks_total; + unsigned long free_blocks_suitable; +}; + +/* + * Calculate the number of free pages in a zone, how many contiguous + * pages are free and how many are large enough to satisfy an allocation of + * the target size. Note that this function makes no attempt to estimate + * how many suitable free blocks there *might* be if MOVABLE pages were + * migrated. Calculating that is possible, but expensive and can be + * figured out from userspace + */ +static void fill_contig_page_info(struct zone *zone, + unsigned int suitable_order, + struct contig_page_info *info) +{ + unsigned int order; + + info->free_pages = 0; + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + + for (order = 0; order < MAX_ORDER; order++) { + unsigned long blocks; + + /* Count number of free blocks */ + blocks = zone->free_area[order].nr_free; + info->free_blocks_total += blocks; + + /* Count free base pages */ + info->free_pages += blocks << order; + + /* Count the suitable free blocks */ + if (order >= suitable_order) + info->free_blocks_suitable += blocks << + (order - suitable_order); + } +} + +/* + * A fragmentation index only makes sense if an allocation of a requested + * size would fail. If that is true, the fragmentation index indicates + * whether external fragmentation or a lack of memory was the problem. 
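refresh_zone_stat_thresholds() now also derives percpu_drift_mark: if the per-CPU vmstat counters could collectively drift by more than the gap between the low and min watermarks, a mark above the high watermark is recorded so NR_FREE_PAGES readers know when the cheap counter could hide a breached min watermark. A sketch of that calculation (standalone C, invented values):

#include <stdio.h>

int main(void)
{
	unsigned long min_wmark = 1000, low_wmark = 1250, high_wmark = 1500;
	unsigned long threshold = 32;		/* per-CPU stat threshold */
	unsigned long ncpus = 16;

	unsigned long tolerate_drift = low_wmark - min_wmark;
	unsigned long max_drift = ncpus * threshold;
	unsigned long percpu_drift_mark = 0;

	if (max_drift > tolerate_drift)
		percpu_drift_mark = high_wmark + max_drift;

	printf("max_drift=%lu tolerate=%lu drift_mark=%lu\n",
	       max_drift, tolerate_drift, percpu_drift_mark);
	return 0;
}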
+ * The value can be used to determine if page reclaim or compaction + * should be used + */ +static int __fragmentation_index(unsigned int order, struct contig_page_info *info) +{ + unsigned long requested = 1UL << order; + + if (!info->free_blocks_total) + return 0; + + /* Fragmentation index only makes sense when a request would fail */ + if (info->free_blocks_suitable) + return -1000; + + /* + * Index is between 0 and 1 so return within 3 decimal places + * + * 0 => allocation would fail due to lack of memory + * 1 => allocation would fail due to fragmentation + */ + return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); +} + +/* Same as __fragmentation index but allocs contig_page_info on stack */ +int fragmentation_index(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + return __fragmentation_index(order, &info); +} +#endif + +#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -429,7 +525,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, spin_unlock_irqrestore(&zone->lock, flags); } } +#endif +#ifdef CONFIG_PROC_FS static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -690,6 +788,16 @@ static const char * const vmstat_text[] = { "allocstall", "pgrotated", + +#ifdef CONFIG_COMPACTION + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", + "compact_stall", + "compact_fail", + "compact_success", +#endif + #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", @@ -718,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_page_state(zone, NR_FREE_PAGES), + zone_nr_free_pages(zone), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -741,7 +849,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, i); + pageset = per_cpu_ptr(zone->pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" @@ -758,11 +866,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n all_unreclaimable: %u" - "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone_is_all_unreclaimable(zone), - zone->prev_priority, + zone->all_unreclaimable, zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); @@ -905,7 +1011,9 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + refresh_zone_stat_thresholds(); start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -950,3 +1058,162 @@ static int __init setup_vmstat(void) return 0; } module_init(setup_vmstat) + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) +#include <linux/debugfs.h> + +static struct dentry *extfrag_debug_root; + +/* + * Return an index indicating how much of the available free memory is + * unusable for an allocation of the requested size. + */ +static int unusable_free_index(unsigned int order, + struct contig_page_info *info) +{ + /* No free memory is interpreted as all free memory is unusable */ + if (info->free_pages == 0) + return 1000; + + /* + * Index should be a value between 0 and 1. 
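__fragmentation_index() boils fill_contig_page_info() down to a single 0..1000 value: 0 means an allocation of the requested order would fail for lack of memory, values near 1000 mean it would fail because free memory is fragmented into small blocks, and -1000 means it would not fail at all. A standalone worked example combining the two helpers above (plain C, invented buddy free counts):

#include <stdio.h>

#define MAX_ORDER 11

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/* Standalone copy of the counting logic: total free blocks, total free
 * base pages, and how many blocks are big enough for suitable_order.
 */
static void fill_info(const unsigned long *nr_free, unsigned int suitable_order,
		      struct contig_page_info *info)
{
	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;
	for (unsigned int order = 0; order < MAX_ORDER; order++) {
		info->free_blocks_total += nr_free[order];
		info->free_pages += nr_free[order] << order;
		if (order >= suitable_order)
			info->free_blocks_suitable +=
				nr_free[order] << (order - suitable_order);
	}
}

static int frag_index(unsigned int order, const struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;
	if (info->free_blocks_suitable)
		return -1000;	/* the allocation would not fail */
	return 1000 - (1000 + info->free_pages * 1000 / requested) /
		      info->free_blocks_total;
}

int main(void)
{
	/* plenty of order-0/1 pages, nothing at order >= 2: fragmented */
	unsigned long nr_free[MAX_ORDER] = { 600, 100 };
	struct contig_page_info info;

	fill_info(nr_free, 4, &info);
	printf("order-4 fragmentation index: %d\n", frag_index(4, &info));
	return 0;
}

With 800 free pages but no block of order 2 or higher, this prints 928, i.e. an index of roughly 0.928: reclaim would not help an order-4 request here, which is the case compaction is meant to address.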
Return a value to 3 + * decimal places. + * + * 0 => no fragmentation + * 1 => high fragmentation + */ + return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); + +} + +static void unusable_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display unusable free space index + * + * The unusable free space index measures how much of the available free + * memory cannot be used to satisfy an allocation of a given size and is a + * value between 0 and 1. The higher the value, the more of free memory is + * unusable and by implication, the worse the external fragmentation is. This + * can be expressed as a percentage by multiplying by 100. + */ +static int unusable_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + return 0; + + walk_zones_in_node(m, pgdat, unusable_show_print); + + return 0; +} + +static const struct seq_operations unusable_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = unusable_show, +}; + +static int unusable_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unusable_op); +} + +static const struct file_operations unusable_file_ops = { + .open = unusable_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void extfrag_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + + /* Alloc on stack as interrupts are disabled for zone walk */ + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = __fragmentation_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display fragmentation index for orders that allocations would fail for + */ +static int extfrag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, extfrag_show_print); + + return 0; +} + +static const struct seq_operations extfrag_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = extfrag_show, +}; + +static int extfrag_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &extfrag_op); +} + +static const struct file_operations extfrag_file_ops = { + .open = extfrag_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init extfrag_debug_init(void) +{ + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); + if (!extfrag_debug_root) + return -ENOMEM; + + if (!debugfs_create_file("unusable_index", 0444, + extfrag_debug_root, NULL, &unusable_file_ops)) + return -ENOMEM; + + if (!debugfs_create_file("extfrag_index", 0444, + extfrag_debug_root, NULL, &extfrag_file_ops)) + return -ENOMEM; + + return 0; +} + +module_init(extfrag_debug_init); +#endif |
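unusable_free_index() is the complementary measure: the share of currently free memory (scaled to 0..1000) that could not service an allocation of the given order. With the patch applied, both metrics are exported per zone through the new debugfs files, visible as extfrag/unusable_index and extfrag/extfrag_index under the debugfs mount point (typically /sys/kernel/debug). A standalone sketch of the arithmetic with invented numbers:

#include <stdio.h>

/* Mirrors unusable_free_index() above: everything free that is not part
 * of a sufficiently large block counts as unusable for this order.
 */
static int unusable_free_index(unsigned int order, unsigned long free_pages,
			       unsigned long free_blocks_suitable)
{
	if (free_pages == 0)
		return 1000;	/* no free memory at all */
	return (free_pages - (free_blocks_suitable << order)) * 1000 / free_pages;
}

int main(void)
{
	/* 800 free pages, of which only 10 order-4 blocks (160 pages) help */
	int idx = unusable_free_index(4, 800, 10);

	printf("unusable index for order-4: %d.%03d\n", idx / 1000, idx % 1000);
	return 0;
}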