diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 8 | ||||
-rw-r--r-- | mm/bootmem.c | 195 | ||||
-rw-r--r-- | mm/failslab.c | 18 | ||||
-rw-r--r-- | mm/filemap.c | 103 | ||||
-rw-r--r-- | mm/hugetlb.c | 13 | ||||
-rw-r--r-- | mm/maccess.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 11 | ||||
-rw-r--r-- | mm/memory-failure.c | 9 | ||||
-rw-r--r-- | mm/memory.c | 14 | ||||
-rw-r--r-- | mm/migrate.c | 41 | ||||
-rw-r--r-- | mm/mmap.c | 40 | ||||
-rw-r--r-- | mm/mmu_context.c | 3 | ||||
-rw-r--r-- | mm/nommu.c | 144 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 344 | ||||
-rw-r--r-- | mm/percpu.c | 40 | ||||
-rw-r--r-- | mm/slab.c | 17 | ||||
-rw-r--r-- | mm/slub.c | 29 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 76 | ||||
-rw-r--r-- | mm/sparse.c | 196 | ||||
-rw-r--r-- | mm/truncate.c | 30 | ||||
-rw-r--r-- | mm/util.c | 46 | ||||
-rw-r--r-- | mm/vmalloc.c | 114 | ||||
-rw-r--r-- | mm/vmscan.c | 3 | ||||
-rw-r--r-- | mm/vmstat.c | 15 |
25 files changed, 1108 insertions, 414 deletions
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool +config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + def_bool y + depends on SPARSEMEM && X86_64 + config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE @@ -195,7 +199,7 @@ config BOUNCE config NR_QUICK int depends on QUICKLIST - default "2" if SUPERH || AVR32 + default "2" if AVR32 default "1" config VIRT_TO_BUS @@ -253,7 +257,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "HWPoison pages injector" - depends on MEMORY_FAILURE && DEBUG_KERNEL + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS diff --git a/mm/bootmem.c b/mm/bootmem.c index 7d14868..d7c791e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -13,6 +13,7 @@ #include <linux/bootmem.h> #include <linux/module.h> #include <linux/kmemleak.h> +#include <linux/range.h> #include <asm/bug.h> #include <asm/io.h> @@ -32,6 +33,7 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } - +#endif /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } +#ifdef CONFIG_NO_BOOTMEM +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { +#if 1 + printk(KERN_DEBUG " %lx - %lx\n", start, end); +#endif + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + +#if 1 + printk(KERN_DEBUG " %lx %lx - %lx %lx\n", + start, start_aligned, end_aligned, end); +#endif + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} +#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); +#ifdef CONFIG_NO_BOOTMEM + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +#else return free_all_bootmem_core(pgdat->bdata); +#endif } /** @@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { +#ifdef CONFIG_NO_BOOTMEM + return free_all_memory_core_early(NODE_DATA(0)->node_id); +#else return free_all_bootmem_core(NODE_DATA(0)->bdata); +#endif } +#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } +#endif /** * free_bootmem_node - mark a page range as usable @@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(physaddr, physaddr + size); +#if 0 + printk(KERN_DEBUG "free %lx %lx\n", physaddr, size); +#endif +#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +#endif } /** @@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(addr, addr + size); +#if 0 + printk(KERN_DEBUG "free %lx %lx\n", addr, size); +#endif +#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); +#endif } /** @@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +#endif } /** @@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); +#endif } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { @@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } +#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { +#ifdef CONFIG_NO_BOOTMEM + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +#else bootmem_data_t *bdata; void *region; @@ -613,6 +727,7 @@ restart: } return NULL; +#endif } /** @@ -631,7 +746,13 @@ restart: void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem_nopanic(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem_nopanic(size, align, goal, limit); } static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, @@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem(size, align, goal, limit); } +#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } +#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#endif +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); +#else + ptr = alloc_bootmem_core(pgdat->bdata, size, align, + new_goal, 0); +#endif + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + } #ifdef CONFIG_SPARSEMEM @@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { +#ifdef CONFIG_NO_BOOTMEM + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +#endif } #endif @@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); +#endif if (ptr) return ptr; @@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +#endif } diff --git a/mm/failslab.c b/mm/failslab.c index 9339de5..bb41f98 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,18 +1,22 @@ #include <linux/fault-inject.h> #include <linux/gfp.h> +#include <linux/slab.h> static struct { struct fault_attr attr; u32 ignore_gfp_wait; + int cache_filter; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *ignore_gfp_wait_file; + struct dentry *cache_filter_file; #endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, + .cache_filter = 0, }; -bool should_failslab(size_t size, gfp_t gfpflags) +bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) { if (gfpflags & __GFP_NOFAIL) return false; @@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) return false; + if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + return false; + return should_fail(&failslab.attr, size); } @@ -30,7 +37,6 @@ static int __init setup_failslab(char *str) __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - static int __init failslab_debugfs_init(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; @@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void) debugfs_create_bool("ignore-gfp-wait", mode, dir, &failslab.ignore_gfp_wait); - if (!failslab.ignore_gfp_wait_file) { + failslab.cache_filter_file = + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); + + if (!failslab.ignore_gfp_wait_file || + !failslab.cache_filter_file) { err = -ENOMEM; + debugfs_remove(failslab.cache_filter_file); debugfs_remove(failslab.ignore_gfp_wait_file); cleanup_fault_attr_dentries(&failslab.attr); } diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0..698ea80 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -1661,31 +1662,18 @@ repeat: return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1710,8 +1698,67 @@ out: mark_page_accessed(page); return page; } + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} EXPORT_SYMBOL(read_cache_page_async); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); +} +EXPORT_SYMBOL(read_cache_page_gfp); + /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space @@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); @@ -2196,6 +2232,9 @@ again: if (unlikely(status)) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65f38c2..3a5aeb3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, sz); return; } @@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h, - struct kobject *parent, - struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; int hi = h - hstates; @@ -2088,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); } } @@ -2559,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); out_page_table_lock: spin_unlock(&mm->page_table_lock); diff --git a/mm/maccess.c b/mm/maccess.c index 9073695..4e348db 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -14,7 +14,11 @@ * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_read(void *dst, void *src, size_t size) + +long __weak probe_kernel_read(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 488b644..954032b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) if (free_all) goto try_to_free; move_account: - while (mem->res.usage > 0) { + do { ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) goto out; @@ -2614,8 +2614,8 @@ move_account: if (ret == -ENOMEM) goto try_to_free; cond_resched(); - } - ret = 0; + /* "ret" should also be checked to ensure all lists are empty. */ + } while (mem->res.usage > 0 || ret); out: css_put(&mem->css); return ret; @@ -2648,10 +2648,7 @@ try_to_free: } lru_add_drain(); /* try move_account...there may be some *locked* pages. */ - if (mem->res.usage) - goto move_account; - ret = 0; - goto out; + goto move_account; } int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a0466e..17299fd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -52,6 +52,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + u32 hwpoison_filter_enable = 0; u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; @@ -164,6 +166,13 @@ int hwpoison_filter(struct page *p) return 0; } +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + EXPORT_SYMBOL_GPL(hwpoison_filter); /* diff --git a/mm/memory.c b/mm/memory.c index 09e4b1b..72fb5f3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1593,7 +1593,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ entry = pte_mkspecial(pfn_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ retval = 0; out_unlock: @@ -2116,7 +2116,7 @@ reuse: entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); ret |= VM_FAULT_WRITE; goto unlock; } @@ -2185,7 +2185,7 @@ gotten: * new page to be mapped directly into the secondary page table. */ set_pte_at_notify(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); if (old_page) { /* * Only after switching the pte to the new page may @@ -2629,7 +2629,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2694,7 +2694,7 @@ setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); return 0; @@ -2855,7 +2855,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); } else { if (charged) mem_cgroup_uncharge_page(page); @@ -2992,7 +2992,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pte); } else { /* * This is needed only for protection faults but the arch code diff --git a/mm/migrate.c b/mm/migrate.c index efddbf0..edb6101 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, page_add_file_rmap(new); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, pte); + update_mmu_cache(vma, addr, ptep); unlock: pte_unmap_unlock(ptep, ptl); out: @@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, goto out_pm; err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_pm; + if (!node_state(node, N_HIGH_MEMORY)) goto out_pm; @@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; - unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; - int err; - for (i = 0; i < nr_pages; i += chunk_nr) { - if (chunk_nr > nr_pages - i) - chunk_nr = nr_pages - i; + while (nr_pages) { + unsigned long chunk_nr; - err = copy_from_user(chunk_pages, &pages[i], - chunk_nr * sizeof(*chunk_pages)); - if (err) { - err = -EFAULT; - goto out; - } + chunk_nr = nr_pages; + if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) + chunk_nr = DO_PAGES_STAT_CHUNK_NR; + + if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) + break; do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - err = copy_to_user(&status[i], chunk_status, - chunk_nr * sizeof(*chunk_status)); - if (err) { - err = -EFAULT; - goto out; - } - } - err = 0; + if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + break; -out: - return err; + pages += chunk_nr; + status += chunk_nr; + nr_pages -= chunk_nr; + } + return nr_pages ? -EFAULT : 0; } /* @@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + if (unlikely(flags & MAP_HUGETLB)) + return -EINVAL; + file = fget(fd); + if (!file) + goto out; + } else if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * Some shared mappigns will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081..0777654 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/mmu_context.h> +#include <linux/module.h> #include <linux/sched.h> #include <asm/mmu_context.h> @@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); } +EXPORT_SYMBOL_GPL(use_mm); /* * unuse_mm @@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm) enter_lazy_tlb(mm, tsk); task_unlock(tsk); } +EXPORT_SYMBOL_GPL(unuse_mm); @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); + kenter("%p{%d}", region, region->vm_usage); BUG_ON(!nommu_region_tree.rb_node); - if (atomic_dec_and_test(®ion->vm_usage)) { + if (--region->vm_usage == 0) { if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file, if (!vma) goto error_getting_vma; - atomic_set(®ion->vm_usage, 1); + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we've found a region we can share */ - atomic_inc(&pregion->vm_usage); + pregion->vm_usage++; vma->vm_region = pregion; start = pregion->vm_start; start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; @@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = NULL; vma->vm_start = 0; vma->vm_end = 0; - atomic_dec(&pregion->vm_usage); + pregion->vm_usage--; pregion = NULL; goto error_just_free; } @@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; @@ -1398,6 +1403,31 @@ error_getting_region: } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. @@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, kenter(""); - /* we're only permitted to split anonymous regions that have a single - * owner */ - if (vma->vm_file || - atomic_read(&vma->vm_region->vm_usage) != 1) + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) @@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm, /* cut the backing region down to size */ region = vma->vm_region; - BUG_ON(atomic_read(®ion->vm_usage) != 1); + BUG_ON(region->vm_usage != 1); down_write(&nommu_region_sem); delete_nommu_region(region); @@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* - * ask for an unmapped area at which to create a mapping on a file - */ -unsigned long get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - - if (!get_area) - return -ENOSYS; - - return get_area(file, addr, len, pgoff, flags); -} -EXPORT_SYMBOL(get_unmapped_area); - -/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. @@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else { @@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * automatically grant mappings that are too large. + */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + up_write(&nommu_region_sem); + return 0; +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f52481b..2370504 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(c, &p->children, sibling) { if (c->mm == p->mm) continue; + if (mem && !task_in_mem_cgroup(c, mem)) + continue; if (!oom_kill_task(c)) return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74af449..a6b17aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <linux/memory.h> #include <trace/events/kmem.h> #include <asm/tlbflush.h> @@ -555,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, 0, migratetype); - trace_mm_page_pcpu_drain(page, 0, migratetype); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, page_private(page)); } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -1007,10 +1009,10 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); @@ -1094,7 +1096,6 @@ static void free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp; migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); local_irq_save(flags); @@ -1117,6 +1118,7 @@ static void free_hot_cold_page(struct page *page, int cold) migratetype = MIGRATE_MOVABLE; } + pcp = &this_cpu_ptr(zone->pageset)->pcp; if (cold) list_add_tail(&page->lru, &pcp->lists[migratetype]); else @@ -1129,7 +1131,6 @@ static void free_hot_cold_page(struct page *page, int cold) out: local_irq_restore(flags); - put_cpu(); } void free_hot_page(struct page *page) @@ -1179,17 +1180,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; again: - cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; - pcp = &zone_pcp(zone, cpu)->pcp; - list = &pcp->lists[migratetype]; local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, @@ -1221,16 +1220,15 @@ again: } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - put_cpu(); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1239,7 +1237,6 @@ again: failed: local_irq_restore(flags); - put_cpu(); return NULL; } @@ -2178,7 +2175,7 @@ void show_free_areas(void) for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, cpu); + pageset = per_cpu_ptr(zone->pageset, cpu); printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, @@ -2401,13 +2398,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, { char saved_string[NUMA_ZONELIST_ORDER_LEN]; int ret; + static DEFINE_MUTEX(zl_order_mutex); + mutex_lock(&zl_order_mutex); if (write) - strncpy(saved_string, (char*)table->data, - NUMA_ZONELIST_ORDER_LEN); + strcpy(saved_string, (char*)table->data); ret = proc_dostring(table, write, buffer, length, ppos); if (ret) - return ret; + goto out; if (write) { int oldval = user_zonelist_order; if (__parse_numa_zonelist_order((char*)table->data)) { @@ -2420,7 +2418,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } else if (oldval != user_zonelist_order) build_all_zonelists(); } - return 0; +out: + mutex_unlock(&zl_order_mutex); + return ret; } @@ -2740,10 +2740,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); + /* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; + int cpu; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -2754,6 +2773,23 @@ static int __build_all_zonelists(void *dummy) build_zonelists(pgdat); build_zonelist_cache(pgdat); } + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + return 0; } @@ -3091,121 +3127,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } - -#ifdef CONFIG_NUMA -/* - * Boot pageset table. One per cpu which is going to be used for all - * zones and all nodes. The parameters will be set in such a way - * that an item put on a list will immediately be handed over to - * the buddy list. This is safe since pageset manipulation is done - * with interrupts disabled. - * - * Some NUMA counter updates may also be caught by the boot pagesets. - * - * The boot_pagesets must be kept even after bootup is complete for - * unused processors and/or zones. They do play a role for bootstrapping - * hotplugged processors. - * - * zoneinfo_show() and maybe other functions do - * not check if the processor is online before following the pageset pointer. - * Other parts of the kernel may not check if the zone is available. - */ -static struct per_cpu_pageset boot_pageset[NR_CPUS]; - /* - * Dynamically allocate memory for the - * per cpu pageset array in struct zone. + * Allocate per cpu pagesets and initialize them. + * Before this call only boot pagesets were available. + * Boot pagesets will no longer be used by this processorr + * after setup_per_cpu_pageset(). */ -static int __cpuinit process_zones(int cpu) +void __init setup_per_cpu_pageset(void) { - struct zone *zone, *dzone; - int node = cpu_to_node(cpu); - - node_set_state(node, N_CPU); /* this node has a cpu */ + struct zone *zone; + int cpu; for_each_populated_zone(zone) { - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, node); - if (!zone_pcp(zone, cpu)) - goto bad; + zone->pageset = alloc_percpu(struct per_cpu_pageset); - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - if (percpu_pagelist_fraction) - setup_pagelist_highmark(zone_pcp(zone, cpu), - (zone->present_pages / percpu_pagelist_fraction)); - } + setup_pageset(pcp, zone_batchsize(zone)); - return 0; -bad: - for_each_zone(dzone) { - if (!populated_zone(dzone)) - continue; - if (dzone == zone) - break; - kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = &boot_pageset[cpu]; - } - return -ENOMEM; -} - -static inline void free_zone_pagesets(int cpu) -{ - struct zone *zone; - - for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); - - /* Free per_cpu_pageset if it is slab allocated */ - if (pset != &boot_pageset[cpu]) - kfree(pset); - zone_pcp(zone, cpu) = &boot_pageset[cpu]; + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); + } } } -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - int ret = NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - free_zone_pagesets(cpu); - break; - default: - break; - } - return ret; -} - -static struct notifier_block __cpuinitdata pageset_notifier = - { &pageset_cpuup_callback, NULL, 0 }; - -void __init setup_per_cpu_pageset(void) -{ - int err; - - /* Initialize per_cpu_pageset for cpu 0. - * A cpuup callback will do this for every cpu - * as it comes online - */ - err = process_zones(smp_processor_id()); - BUG_ON(err); - register_cpu_notifier(&pageset_notifier); -} - -#endif - static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { @@ -3259,7 +3207,7 @@ static int __zone_pcp_update(void *data) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; local_irq_save(flags); @@ -3277,21 +3225,17 @@ void zone_pcp_update(struct zone *zone) static __meminit void zone_pcp_init(struct zone *zone) { - int cpu; - unsigned long batch = zone_batchsize(zone); + /* + * per cpu subsystem is not up at this point. The following code + * relies on the ability of the linker to provide the + * offset of a (static) per cpu variable into the per cpu area. + */ + zone->pageset = &boot_pageset; - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } if (zone->present_pages) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", + zone->name, zone->present_pages, + zone_batchsize(zone)); } __meminit int init_currently_empty_zone(struct zone *zone, @@ -3430,6 +3374,61 @@ void __init free_bootmem_with_active_regions(int nid, } } +int __init add_from_early_node_map(struct range *range, int az, + int nr_range, int nid) +{ + int i; + u64 start, end; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } + return nr_range; +} + +#ifdef CONFIG_NO_BOOTMEM +void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + void *ptr; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + addr = find_early_area(ei_start, ei_last, + goal, limit, size, align); + + if (addr == -1ULL) + continue; + +#if 0 + printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", + nid, + ei_start, ei_last, goal, limit, size, + align, addr); +#endif + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + reserve_early_without_check(addr, addr + size, "BOOTMEM"); + return ptr; + } + + return NULL; +} +#endif + + void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3579,7 +3578,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -static unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -3994,7 +3993,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].end_pfn && + if (start_pfn < early_node_map[i].start_pfn && end_pfn >= early_node_map[i].start_pfn) { early_node_map[i].start_pfn = start_pfn; return; @@ -4108,7 +4107,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) } /* sort the node_map by start_pfn */ -static void __init sort_node_map(void) +void __init sort_node_map(void) { sort(early_node_map, (size_t)nr_nodemap_entries, sizeof(struct node_active_region), @@ -4462,7 +4461,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { +#ifndef CONFIG_NO_BOOTMEM + .bdata = &bootmem_node_data[0] +#endif + }; EXPORT_SYMBOL(contig_page_data); #endif @@ -4805,10 +4808,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark(zone_pcp(zone, cpu), high); + setup_pagelist_highmark( + per_cpu_ptr(zone->pageset, cpu), high); } } return 0; @@ -5008,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, int set_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; + struct page *curr_page; + unsigned long flags, pfn, iter; + unsigned long immobile = 0; + struct memory_isolate_notify arg; + int notifier_ret; int ret = -EBUSY; int zone_idx; zone = page_zone(page); zone_idx = zone_idx(zone); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || + zone_idx == ZONE_MOVABLE) { + ret = 0; + goto out; + } + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + /* - * In future, more migrate types will be able to be isolation target. + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && - zone_idx != ZONE_MOVABLE) + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret || !arg.pages_found) goto out; - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - ret = 0; + + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { + if (!pfn_valid_within(pfn)) + continue; + + curr_page = pfn_to_page(iter); + if (!page_count(curr_page) || PageLRU(curr_page)) + continue; + + immobile++; + } + + if (arg.pages_found == immobile) + ret = 0; + out: + if (!ret) { + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + } + spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(); diff --git a/mm/percpu.c b/mm/percpu.c index 442010c..768419d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,13 +80,15 @@ /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) + (void __percpu *)((unsigned long)(addr) - \ + (unsigned long)pcpu_base_addr + \ + (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) + (void __force *)((unsigned long)(ptr) + \ + (unsigned long)pcpu_base_addr - \ + (unsigned long)__per_cpu_start) #endif struct pcpu_chunk { @@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) int rs, re; /* quick path, check whether it's empty already */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - if (rs == page_start && re == page_end) - return; - break; - } + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + return; /* immutable chunks can't be depopulated */ WARN_ON(chunk->immutable); @@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int rs, re, rc; /* quick path, check whether all pages are already there */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { - if (rs == page_start && re == page_end) - goto clear; - break; - } + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + goto clear; /* need to allocate and map pages, this chunk can't be immutable */ WARN_ON(chunk->immutable); @@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) { static int warn_limit = 10; struct pcpu_chunk *chunk; @@ -1196,7 +1196,7 @@ fail_unlock_mutex: * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_percpu(size_t size, size_t align) +void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false); } @@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_reserved_percpu(size_t size, size_t align) +void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true); } @@ -1269,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work) * CONTEXT: * Can be called from atomic context. */ -void free_percpu(void *ptr) +void free_percpu(void __percpu *ptr) { - void *addr = __pcpu_ptr_to_addr(ptr); + void *addr; struct pcpu_chunk *chunk; unsigned long flags; int off; @@ -1279,6 +1279,8 @@ void free_percpu(void *ptr) if (!ptr) return; + addr = __pcpu_ptr_to_addr(ptr); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); @@ -654,7 +654,7 @@ static void init_node_lock_keys(int q) l3 = s->cs_cachep->nodelists[q]; if (!l3 || OFF_SLAB(s->cs_cachep)) - return; + continue; lockdep_set_class(&l3->list_lock, &on_slab_l3_key); alc = l3->alien; /* @@ -665,7 +665,7 @@ static void init_node_lock_keys(int q) * for alloc_alien_cache, */ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; + continue; for_each_node(r) { if (alc[r]) lockdep_set_class(&alc[r]->lock, @@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to, from->avail -= nr; to->avail += nr; - to->touched = 1; return nr; } @@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, gfp, node); + ac_ptr = kzalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { - if (i == node || !node_online(i)) { - ac_ptr[i] = NULL; + if (i == node || !node_online(i)) continue; - } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) @@ -2963,8 +2960,10 @@ retry: spin_lock(&l3->list_lock); /* See if we can refill from the shared array */ - if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { + l3->shared->touched = 1; goto alloc_done; + } while (batchcount > 0) { struct list_head *entry; @@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) if (cachep == &cache_cache) return false; - return should_failslab(obj_size(cachep), flags); + return should_failslab(obj_size(cachep), flags, cachep->flags); } static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) @@ -151,7 +151,8 @@ * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA | SLAB_NOTRACK) @@ -1004,6 +1005,9 @@ static int __init setup_slub_debug(char *str) case 't': slub_debug |= SLAB_TRACE; break; + case 'a': + slub_debug |= SLAB_FAILSLAB; + break; default: printk(KERN_ERR "slub_debug option '%c' " "unknown. skipped\n", *str); @@ -1700,7 +1704,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); - if (should_failslab(s->objsize, gfpflags)) + if (should_failslab(s->objsize, gfpflags, s->flags)) return NULL; local_irq_save(flags); @@ -4017,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(trace); +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4313,6 +4334,10 @@ static struct attribute *slab_attrs[] = { &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, #endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + NULL }; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bd..392b9bb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); + return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); } +static void *vmemmap_buf; +static void *vmemmap_buf_end; void * __meminit vmemmap_alloc_block(unsigned long size, int node) { @@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) __pa(MAX_DMA_ADDRESS)); } +/* need to make sure size is all the same during early stage */ +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +{ + void *ptr; + + if (!vmemmap_buf) + return vmemmap_alloc_block(size, node); + + /* take the from buf */ + ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); + if (ptr + size > vmemmap_buf_end) + return vmemmap_alloc_block(size, node); + + vmemmap_buf = ptr + size; + + return ptr; +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); @@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) return map; } + +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + void *vmemmap_buf_start; + + size = ALIGN(size, PMD_SIZE); + vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, + PMD_SIZE, __pa(MAX_DMA_ADDRESS)); + + if (vmemmap_buf_start) { + vmemmap_buf = vmemmap_buf_start; + vmemmap_buf_end = vmemmap_buf_start + size * map_count; + } + + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } + + if (vmemmap_buf_start) { + /* need to free left buf */ +#ifdef CONFIG_NO_BOOTMEM + free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); + if (vmemmap_buf_start < vmemmap_buf) { + char name[15]; + + snprintf(name, sizeof(name), "MEMMAP %d", nodeid); + reserve_early_without_check(__pa(vmemmap_buf_start), + __pa(vmemmap_buf), name); + } +#else + free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); +#endif + vmemmap_buf = NULL; + vmemmap_buf_end = NULL; + } +} diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab..22896d5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { unsigned long section_nr; @@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) * this problem. */ section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size(), section_nr); + return alloc_bootmem_section(usemap_size() * count, section_nr); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #else static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { return NULL; } @@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) +static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long usemap_count, int nodeid) { - unsigned long *usemap; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - - usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); - if (usemap) - return usemap; + void *usemap; + unsigned long pnum; + int size = usemap_size(); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), + usemap_count); if (usemap) { - check_usemap_section_nr(nid, usemap); - return usemap; + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + } + return; } - /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ - nid = 0; + usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); + if (usemap) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); + } + return; + } printk(KERN_WARNING "%s: allocation failed\n", __func__); - return NULL; } #ifndef CONFIG_SPARSEMEM_VMEMMAP @@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); return map; } +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + void *map; + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + + map = alloc_remap(nodeid, size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + size = PAGE_ALIGN(size); + map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + /* fallback */ + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } +} #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER +static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, + map_count, nodeid); +} +#else static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; @@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) ms->section_mem_map = 0; return NULL; } +#endif void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -407,6 +481,14 @@ void __init sparse_init(void) unsigned long *usemap; unsigned long **usemap_map; int size; + int nodeid_begin = 0; + unsigned long pnum_begin = 0; + unsigned long usemap_count; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + unsigned long map_count; + int size2; + struct page **map_map; +#endif /* * map is using big page (aka 2M in x86 64 bit) @@ -425,10 +507,81 @@ void __init sparse_init(void) panic("can not allocate usemap_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + if (!present_section_nr(pnum)) continue; - usemap_map[pnum] = sparse_early_usemap_alloc(pnum); + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; } + usemap_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + usemap_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, + usemap_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + usemap_count = 1; + } + /* ok, last chunk */ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, + usemap_count, nodeid_begin); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = alloc_bootmem(size2); + if (!map_map) + panic("can not allocate map_map\n"); + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; + } + map_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + map_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, + map_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + map_count = 1; + } + /* ok, last chunk */ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +#endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { if (!present_section_nr(pnum)) @@ -438,7 +591,11 @@ void __init sparse_init(void) if (!usemap) continue; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + map = map_map[pnum]; +#else map = sparse_early_mem_map_alloc(pnum); +#endif if (!map) continue; @@ -448,6 +605,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + free_bootmem(__pa(map_map), size2); +#endif free_bootmem(__pa(usemap_map), size); } diff --git a/mm/truncate.c b/mm/truncate.c index 342deee..e87e372 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); */ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) { - if (new < old) { - struct address_space *mapping = inode->i_mapping; - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - } + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -4,10 +4,6 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/sched.h> -#include <linux/hugetlb.h> -#include <linux/syscalls.h> -#include <linux/mman.h> -#include <linux/file.h> #include <asm/uaccess.h> #define CREATE_TRACE_POINTS @@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; @@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) -{ - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; - file = fget(fd); - if (!file) - goto out; - } else if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; - /* - * VM_NORESERVE is used because the reservations will be - * taken when vm_ops->mmap() is called - * A dummy user value is used because we are not locking - * memory so no accounting is necessary - */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); - if (IS_ERR(file)) - return PTR_ERR(file); - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return retval; -} - /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 37e6929..ae00746 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + /* * Purges all lazily-freed vmap areas. * @@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } else spin_lock(&purge_lock); + if (sync) + purge_fragmented_blocks_allcpus(); + rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { @@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } rcu_read_unlock(); - if (nr) { - BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + if (nr) atomic_sub(nr, &vmap_lazy_nr); - } if (nr || force_flush) flush_tlb_kernel_range(*start, *end); @@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false; struct vmap_block_queue { spinlock_t lock; struct list_head free; - struct list_head dirty; - unsigned int nr_dirty; }; struct vmap_block { @@ -680,10 +682,9 @@ struct vmap_block { unsigned long free, dirty; DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); - union { - struct list_head free_list; - struct rcu_head rcu_head; - }; + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); - list_add(&vb->free_list, &vbq->free); + list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); put_cpu_var(vmap_block_queue); @@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - BUG_ON(!list_empty(&vb->free_list)); - vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); @@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) call_rcu(&vb->rcu_head, rcu_free_vb); } +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + continue; + + spin_lock(&vb->lock); + if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { + vb->free = 0; /* prevent further allocs after releasing lock */ + vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ + bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + spin_unlock(&vb->lock); + list_add_tail(&vb->purge, &purge); + } else + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + + list_for_each_entry_safe(vb, n_vb, &purge, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + +static void purge_fragmented_blocks_thiscpu(void) +{ + purge_fragmented_blocks(smp_processor_id()); +} + +static void purge_fragmented_blocks_allcpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + purge_fragmented_blocks(cpu); +} + static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -808,24 +856,38 @@ again: int i; spin_lock(&vb->lock); + if (vb->free < 1UL << order) + goto next; + i = bitmap_find_free_region(vb->alloc_map, VMAP_BBMAP_BITS, order); - if (i >= 0) { - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); - vb->free -= 1UL << order; - if (vb->free == 0) { - spin_lock(&vbq->lock); - list_del_init(&vb->free_list); - spin_unlock(&vbq->lock); + if (i < 0) { + if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { + /* fragmented and no outstanding allocations */ + BUG_ON(vb->dirty != VMAP_BBMAP_BITS); + purge = 1; } - spin_unlock(&vb->lock); - break; + goto next; + } + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); + break; +next: + spin_unlock(&vb->lock); } + + if (purge) + purge_fragmented_blocks_thiscpu(); + put_cpu_var(vmap_block_queue); rcu_read_unlock(); @@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(!vb); spin_lock(&vb->lock); - bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { - BUG_ON(vb->free || !list_empty(&vb->free_list)); + BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else @@ -1035,8 +1097,6 @@ void __init vmalloc_init(void) vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); - INIT_LIST_HEAD(&vbq->dirty); - vbq->nr_dirty = 0; } /* Import existing vmlist entries. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 885207a..c26986c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!populated_zone(zone)) continue; + if (zone_is_all_unreclaimable(zone)) + continue; + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 0, 0)) return 1; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fba..fc5aa18 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void) threshold = calculate_threshold(zone); for_each_online_cpu(cpu) - zone_pcp(zone, cpu)->stat_threshold = threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; } } @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); + s8 *p = pcp->vm_stat_diff + item; long x; @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *p; - p = zone_pcp(zone, cpu); + p = per_cpu_ptr(zone->pageset, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { @@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, i); + pageset = per_cpu_ptr(zone->pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" @@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: |