diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/bootmem.c | 6 | ||||
-rw-r--r-- | mm/bounce.c | 8 | ||||
-rw-r--r-- | mm/compaction.c | 5 | ||||
-rw-r--r-- | mm/frontswap.c | 150 | ||||
-rw-r--r-- | mm/madvise.c | 18 | ||||
-rw-r--r-- | mm/memblock.c | 115 | ||||
-rw-r--r-- | mm/memcontrol.c | 6 | ||||
-rw-r--r-- | mm/memory-failure.c | 14 | ||||
-rw-r--r-- | mm/memory.c | 12 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/nobootmem.c | 40 | ||||
-rw-r--r-- | mm/oom_kill.c | 17 | ||||
-rw-r--r-- | mm/page_alloc.c | 7 | ||||
-rw-r--r-- | mm/page_cgroup.c | 4 | ||||
-rw-r--r-- | mm/pagewalk.c | 1 | ||||
-rw-r--r-- | mm/percpu-vm.c | 1 | ||||
-rw-r--r-- | mm/shmem.c | 198 | ||||
-rw-r--r-- | mm/sparse.c | 20 | ||||
-rw-r--r-- | mm/swapfile.c | 12 | ||||
-rw-r--r-- | mm/vmscan.c | 17 |
21 files changed, 351 insertions, 304 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c index ec4fcb7..bcb63ac 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -710,6 +710,10 @@ again: if (ptr) return ptr; + /* do not panic in alloc_bootmem_bdata() */ + if (limit && goal + size > limit) + limit = 0; + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); if (ptr) return ptr; diff --git a/mm/bounce.c b/mm/bounce.c index d1be02c..0420867 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -24,23 +24,25 @@ static mempool_t *page_pool, *isa_page_pool; -#ifdef CONFIG_HIGHMEM +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) static __init int init_emergency_pool(void) { -#ifndef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) if (max_pfn <= max_low_pfn) return 0; #endif page_pool = mempool_create_page_pool(POOL_SIZE, 0); BUG_ON(!page_pool); - printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + printk("bounce pool size: %d pages\n", POOL_SIZE); return 0; } __initcall(init_emergency_pool); +#endif +#ifdef CONFIG_HIGHMEM /* * highmem version, map in to vec */ diff --git a/mm/compaction.c b/mm/compaction.c index 7ea259d..2f42d95 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -701,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; + if (err == -ENOMEM) { + ret = COMPACT_PARTIAL; + goto out; + } } - } out: diff --git a/mm/frontswap.c b/mm/frontswap.c index e250255..6b3e71a 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -11,15 +11,11 @@ * This work is licensed under the terms of the GNU GPL, version 2. */ -#include <linux/mm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/swapops.h> -#include <linux/proc_fs.h> #include <linux/security.h> -#include <linux/capability.h> #include <linux/module.h> -#include <linux/uaccess.h> #include <linux/debugfs.h> #include <linux/frontswap.h> #include <linux/swapfile.h> @@ -110,16 +106,21 @@ void __frontswap_init(unsigned type) BUG_ON(sis == NULL); if (sis->frontswap_map == NULL) return; - if (frontswap_enabled) - (*frontswap_ops.init)(type); + frontswap_ops.init(type); } EXPORT_SYMBOL(__frontswap_init); +static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ + frontswap_clear(sis, offset); + atomic_dec(&sis->frontswap_pages); +} + /* * "Store" data from a page to frontswap and associate it with the page's * swaptype and offset. Page must be locked and in the swap cache. * If frontswap already contains a page with matching swaptype and - * offset, the frontswap implmentation may either overwrite the data and + * offset, the frontswap implementation may either overwrite the data and * return success or invalidate the page from frontswap and return failure. */ int __frontswap_store(struct page *page) @@ -134,22 +135,21 @@ int __frontswap_store(struct page *page) BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) dup = 1; - ret = (*frontswap_ops.store)(type, offset, page); + ret = frontswap_ops.store(type, offset, page); if (ret == 0) { frontswap_set(sis, offset); inc_frontswap_succ_stores(); if (!dup) atomic_inc(&sis->frontswap_pages); - } else if (dup) { + } else { /* failed dup always results in automatic invalidate of the (older) page from frontswap */ - frontswap_clear(sis, offset); - atomic_dec(&sis->frontswap_pages); - inc_frontswap_failed_stores(); - } else inc_frontswap_failed_stores(); + if (dup) + __frontswap_clear(sis, offset); + } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ ret = -1; @@ -173,7 +173,7 @@ int __frontswap_load(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) - ret = (*frontswap_ops.load)(type, offset, page); + ret = frontswap_ops.load(type, offset, page); if (ret == 0) inc_frontswap_loads(); return ret; @@ -190,9 +190,8 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) { - (*frontswap_ops.invalidate_page)(type, offset); - atomic_dec(&sis->frontswap_pages); - frontswap_clear(sis, offset); + frontswap_ops.invalidate_page(type, offset); + __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } } @@ -209,67 +208,102 @@ void __frontswap_invalidate_area(unsigned type) BUG_ON(sis == NULL); if (sis->frontswap_map == NULL) return; - (*frontswap_ops.invalidate_area)(type); + frontswap_ops.invalidate_area(type); atomic_set(&sis->frontswap_pages, 0); memset(sis->frontswap_map, 0, sis->max / sizeof(long)); } EXPORT_SYMBOL(__frontswap_invalidate_area); -/* - * Frontswap, like a true swap device, may unnecessarily retain pages - * under certain circumstances; "shrink" frontswap is essentially a - * "partial swapoff" and works by calling try_to_unuse to attempt to - * unuse enough frontswap pages to attempt to -- subject to memory - * constraints -- reduce the number of pages in frontswap to the - * number given in the parameter target_pages. - */ -void frontswap_shrink(unsigned long target_pages) +static unsigned long __frontswap_curr_pages(void) { - struct swap_info_struct *si = NULL; - int si_frontswap_pages; - unsigned long total_pages = 0, total_pages_to_unuse; - unsigned long pages = 0, pages_to_unuse = 0; int type; - bool locked = false; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; - /* - * we don't want to hold swap_lock while doing a very - * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list.head each time - */ - spin_lock(&swap_lock); - locked = true; - total_pages = 0; + assert_spin_locked(&swap_lock); for (type = swap_list.head; type >= 0; type = si->next) { si = swap_info[type]; - total_pages += atomic_read(&si->frontswap_pages); + totalpages += atomic_read(&si->frontswap_pages); } - if (total_pages <= target_pages) - goto out; - total_pages_to_unuse = total_pages - target_pages; + return totalpages; +} + +static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, + int *swapid) +{ + int ret = -EINVAL; + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; + int type; + + assert_spin_locked(&swap_lock); for (type = swap_list.head; type >= 0; type = si->next) { si = swap_info[type]; si_frontswap_pages = atomic_read(&si->frontswap_pages); - if (total_pages_to_unuse < si_frontswap_pages) + if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; - else { + } else { pages = si_frontswap_pages; pages_to_unuse = 0; /* unuse all */ } /* ensure there is enough RAM to fetch pages from frontswap */ - if (security_vm_enough_memory_mm(current->mm, pages)) + if (security_vm_enough_memory_mm(current->mm, pages)) { + ret = -ENOMEM; continue; + } vm_unacct_memory(pages); + *unused = pages_to_unuse; + *swapid = type; + ret = 0; break; } - if (type < 0) - goto out; - locked = false; + + return ret; +} + +static int __frontswap_shrink(unsigned long target_pages, + unsigned long *pages_to_unuse, + int *type) +{ + unsigned long total_pages = 0, total_pages_to_unuse; + + assert_spin_locked(&swap_lock); + + total_pages = __frontswap_curr_pages(); + if (total_pages <= target_pages) { + /* Nothing to do */ + *pages_to_unuse = 0; + return 0; + } + total_pages_to_unuse = total_pages - target_pages; + return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); +} + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + unsigned long pages_to_unuse = 0; + int type, ret; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); spin_unlock(&swap_lock); - try_to_unuse(type, true, pages_to_unuse); -out: - if (locked) - spin_unlock(&swap_lock); + if (ret == 0 && pages_to_unuse) + try_to_unuse(type, true, pages_to_unuse); return; } EXPORT_SYMBOL(frontswap_shrink); @@ -281,16 +315,12 @@ EXPORT_SYMBOL(frontswap_shrink); */ unsigned long frontswap_curr_pages(void) { - int type; unsigned long totalpages = 0; - struct swap_info_struct *si = NULL; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; - totalpages += atomic_read(&si->frontswap_pages); - } + totalpages = __frontswap_curr_pages(); spin_unlock(&swap_lock); + return totalpages; } EXPORT_SYMBOL(frontswap_curr_pages); diff --git a/mm/madvise.c b/mm/madvise.c index deff1b6..14d260f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/ksm.h> #include <linux/fs.h> +#include <linux/file.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma, { loff_t offset; int error; + struct file *f; *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; - if (!vma->vm_file || !vma->vm_file->f_mapping - || !vma->vm_file->f_mapping->host) { + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } @@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma, offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* filesystem's fallocate may need to take i_mutex */ + /* + * Filesystem's fallocate may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); up_read(¤t->mm->mmap_sem); - error = do_fallocate(vma->vm_file, + error = do_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); + fput(f); down_read(¤t->mm->mmap_sem); return error; } diff --git a/mm/memblock.c b/mm/memblock.c index 952123e..5cc6731 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, MAX_NUMNODES); } -/* - * Free memblock.reserved.regions - */ -int __init_memblock memblock_free_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_free(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - -/* - * Reserve memblock.reserved.regions - */ -int __init_memblock memblock_reserve_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_reserve(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { type->total_size -= type->regions[r].size; @@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -static int __init_memblock memblock_double_array(struct memblock_type *type) +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); int *in_slab; @@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* + * We need to allocated new one align to PAGE_SIZE, + * so we can free them completely later. + */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); /* Retrieve the slab flag */ if (type == &memblock.memory) @@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; } else { - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_alloc_size, PAGE_SIZE); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_alloc_size, PAGE_SIZE); + new_array = addr ? __va(addr) : 0; } if (!addr) { @@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_size); + memblock_free(__pa(old_array), old_alloc_size); /* Reserve the new array if that comes from the memblock. * Otherwise, we needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_size)); + BUG_ON(memblock_reserve(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; @@ -399,7 +422,7 @@ repeat: */ if (!insert) { while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; @@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for (i = 0; i < type->cnt; i++) { @@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Find the first free area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of @@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). */ @@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) is a subset of a memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); @@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size memblock.memory.regions[idx].size) >= end; } +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) intersects a reserved memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac35bcc..f72b5e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, { if (root_memcg == memcg) return true; - if (!root_memcg->use_hierarchy) + if (!root_memcg->use_hierarchy || !memcg) return false; return css_is_ancestor(&memcg->css, &root_memcg->css); } @@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup - * @mem: the memory cgroup + * @memcg: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in * pages. @@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, /** * test_mem_cgroup_node_reclaimable - * @mem: the target memcg + * @memcg: the target memcg * @nid: the node ID to be checked. * @noswap : specify true here if the user wants flle only information. * diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ab1e714..de4ce70 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. */ -static void kill_procs(struct list_head *to_kill, int doit, int trapno, +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, int fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; list_for_each_entry_safe (tk, next, to_kill, nd) { - if (doit) { + if (forcekill) { /* * In case something went wrong with munmapping * make sure the process doesn't catch the @@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int kill = 1; + int kill = 1, forcekill; struct page *hpage = compound_head(p); struct page *ppage; @@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(hpage); - if (!PageDirty(hpage) && mapping && + if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && mapping_cap_writeback_dirty(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); @@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if * killing is needed or not. Only kill when the page - * was dirty, otherwise the tokill list is merely + * was dirty or the process is not restartable, + * otherwise the tokill list is merely * freed. When there was a problem unmapping earlier * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs(&tokill, !!PageDirty(ppage), trapno, + forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); return ret; diff --git a/mm/memory.c b/mm/memory.c index 32c9943..91f6945 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1234,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); +#ifdef CONFIG_DEBUG_VM + if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { + pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", + __func__, addr, end, + vma->vm_start, + vma->vm_end); + BUG(); + } +#endif split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; @@ -1375,7 +1383,7 @@ void unmap_vmas(struct mmu_gather *tlb, /** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * @start: starting address of pages to zap * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation * diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0d7e3ec..427bb29 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size) pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; if (!pgdat) - goto out; + goto error; new_pgdat = 1; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f15c1b2..1d771e4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, true); + false, MIGRATE_SYNC); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index d23415c..4055730 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn > end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + unsigned long __init free_low_memory_core_early(int nodeid) { unsigned long count = 0; - phys_addr_t start, end; + phys_addr_t start, end, size; u64 i; - /* free reserved array temporarily so that it's treated as free area */ - memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } - } + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + count += __free_memory_core(start, end); + + /* free range that is used for reserved array if we allocate it */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); - /* put region array back? */ - memblock_reserve_reserved_regions(); return count; } @@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 416637f..ac300c9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -184,6 +184,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; + long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + adj = p->signal->oom_score_adj; + if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } @@ -210,14 +212,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= 30 * totalpages / 1000; + adj -= 30; - /* - * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may - * either completely disable oom killing or always prefer a certain - * task. - */ - points += p->signal->oom_score_adj * totalpages / 1000; + /* Normalize to oom_score_adj units */ + adj *= totalpages / 1000; + points += adj; /* * Never return 0 for an eligible task regardless of the root bonus and @@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, /** * dump_tasks - dump current memory state of all system tasks - * @mem: current's memory controller, if constrained + * @memcg: current's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * Dumps the current memory state of all eligible tasks. Tasks not in the same diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4403009..4a4f921 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5635,7 +5635,12 @@ static struct page * __alloc_contig_migrate_alloc(struct page *page, unsigned long private, int **resultp) { - return alloc_page(GFP_HIGHUSER_MOVABLE); + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); } /* [start, end) must belong to a single zone. */ diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd71..eb750f8 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @end: swap entry to be cmpxchged + * @ent: swap entry to be cmpxchged * @old: old id * @new: new id * @@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, /** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into - * @mem: mem_cgroup to be recorded + * @id: mem_cgroup to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e..6c118d0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, /** * walk_page_range - walk a memory map's page tables with a callback - * @mm: memory map to walk * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331..3707c71 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -360,7 +360,6 @@ err_free: * @chunk: chunk to depopulate * @off: offset to the area to depopulate * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping @@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, } /* + * Sometimes, before we decide whether to proceed or to fail, we must check + * that an entry was not already brought back from swap by a racing thread. + * + * Checking page is not enough: by the time a SwapCache page is locked, it + * might be reused, and again be SwapCache, using the same swap as before. + */ +static bool shmem_confirm_swap(struct address_space *mapping, + pgoff_t index, swp_entry_t swap) +{ + void *item; + + rcu_read_lock(); + item = radix_tree_lookup(&mapping->page_tree, index); + rcu_read_unlock(); + return item == swp_to_radix_entry(swap); +} + +/* * Like add_to_page_cache_locked, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error = 0; + int error; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); if (!expected) - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_insert(&mapping->page_tree, index, page); + else + error = shmem_radix_tree_replace(mapping, index, expected, + page); if (!error) { - page_cache_get(page); - page->mapping = mapping; - page->index = index; - - spin_lock_irq(&mapping->tree_lock); - if (!expected) - error = radix_tree_insert(&mapping->page_tree, - index, page); - else - error = shmem_radix_tree_replace(mapping, index, - expected, page); - if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); - } - if (!expected) - radix_tree_preload_end(); + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - if (error) - mem_cgroup_uncharge_cache_page(page); return error; } @@ -1124,9 +1133,9 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); if (!PageSwapCache(page) || page_private(page) != swap.val || - page->mapping) { + !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; /* try again */ - goto failed; + goto unlock; } if (!PageUptodate(page)) { error = -EIO; @@ -1142,9 +1151,12 @@ repeat: error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) + if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); + /* We already confirmed swap, and make no allocation */ + VM_BUG_ON(error); + } if (error) goto failed; @@ -1181,11 +1193,18 @@ repeat: __set_page_locked(page); error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); if (error) goto decused; + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + radix_tree_preload_end(); + } + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto decused; + } lru_cache_add_anon(page); spin_lock(&info->lock); @@ -1245,14 +1264,10 @@ decused: unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL) { - struct page *test = find_get_page(mapping, index); - if (test && !radix_tree_exceptional_entry(test)) - page_cache_release(test); - /* Have another try if the entry has changed */ - if (test != swp_to_radix_entry(swap)) - error = -EEXIST; - } + if (swap.val && error != -EINVAL && + !shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: if (page) { unlock_page(page); page_cache_release(page); @@ -1264,7 +1279,7 @@ failed: spin_unlock(&info->lock); goto repeat; } - if (error == -EEXIST) + if (error == -EEXIST) /* from above or from radix_tree_insert */ goto repeat; return error; } @@ -1594,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1682,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; @@ -1691,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } -/* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. - */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int origin) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; - - pagevec_init(&pvec, 0); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (origin == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (origin == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && origin == SEEK_DATA) || - (!page && origin == SEEK_HOLE)) { - done = true; - break; - } - } - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; -} - -static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) -{ - struct address_space *mapping; - struct inode *inode; - pgoff_t start, end; - loff_t new_offset; - - if (origin != SEEK_DATA && origin != SEEK_HOLE) - return generic_file_llseek_size(file, offset, origin, - MAX_LFS_FILESIZE); - mapping = file->f_mapping; - inode = mapping->host; - mutex_lock(&inode->i_mutex); - /* We're holding i_mutex so we can access i_size directly */ - - if (offset < 0) - offset = -EINVAL; - else if (offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_CACHE_SHIFT; - end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, origin); - new_offset <<= PAGE_CACHE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (origin == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } - - if (offset >= 0 && offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - mutex_unlock(&inode->i_mutex); - return offset; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -1953,7 +1877,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return shmem_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -2786,7 +2710,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = shmem_file_llseek, + .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, diff --git a/mm/sparse.c b/mm/sparse.c index 6a4bf91..c7bb952 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -275,8 +275,9 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - pg_data_t *host_pgdat; - unsigned long goal; + unsigned long goal, limit; + unsigned long *p; + int nid; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while @@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - goal = __pa(pgdat) & PAGE_SECTION_MASK; - host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); - return __alloc_bootmem_node_nopanic(host_pgdat, size, - SMP_CACHE_BYTES, goal); + goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, goal, limit); + if (!p && limit) { + limit = 0; + goto again; + } + return p; } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) diff --git a/mm/swapfile.c b/mm/swapfile.c index de5bc51..71373d0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1916,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are three limiting factors: 1) the number + * device. There are two limiting factors: 1) the number * of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte as defined by the - * the different architectures, and 3) the number of free bits - * in an exceptional radix_tree entry. In order to find the + * different architectures. In order to find the * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. Then the same is done for a radix_tree entry. + * swap pte. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))); - maxpages = swp_offset(radix_to_swp_entry( - swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; - + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ diff --git a/mm/vmscan.c b/mm/vmscan.c index eeb3bc9..347b3ff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1567,7 +1567,8 @@ static int vmscan_swappiness(struct scan_control *sc) * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * nr[0] = anon pages to scan; nr[1] = file pages to scan + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) @@ -2537,7 +2538,7 @@ loop_again: * consider it to be no longer congested. It's * possible there are dirty pages backed by * congested BDIs but as pressure is relieved, - * spectulatively avoid congestion waits + * speculatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); if (i <= *classzone_idx) @@ -2688,7 +2689,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - schedule(); + + if (!kthread_should_stop()) + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) @@ -2955,14 +2959,17 @@ int kswapd_run(int nid) } /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold lock_memory_hotplug(). */ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; - if (kswapd) + if (kswapd) { kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } } static int __init kswapd_init(void) |