diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-06-17 12:52:15 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-06-17 12:56:49 +0200 |
commit | eadb8a091b27a840de7450f84ecff5ef13476424 (patch) | |
tree | 58c3782d40def63baa8167f3d31e3048cb4c7660 /mm | |
parent | 73874005cd8800440be4299bd095387fff4b90ac (diff) | |
parent | 65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff) | |
download | op-kernel-dev-eadb8a091b27a840de7450f84ecff5ef13476424.zip op-kernel-dev-eadb8a091b27a840de7450f84ecff5ef13476424.tar.gz |
Merge branch 'linus' into tracing/hw-breakpoints
Conflicts:
arch/x86/Kconfig
arch/x86/kernel/traps.c
arch/x86/power/cpu.c
arch/x86/power/cpu_32.c
kernel/Makefile
Semantic conflict:
arch/x86/kernel/hw_breakpoint.c
Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 37 | ||||
-rw-r--r-- | mm/Kconfig.debug | 1 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 12 | ||||
-rw-r--r-- | mm/bounce.c | 10 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 175 | ||||
-rw-r--r-- | mm/highmem.c | 1 | ||||
-rw-r--r-- | mm/hugetlb.c | 132 | ||||
-rw-r--r-- | mm/init-mm.c | 20 | ||||
-rw-r--r-- | mm/internal.h | 33 | ||||
-rw-r--r-- | mm/kmemcheck.c | 122 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 111 | ||||
-rw-r--r-- | mm/kmemleak.c | 1498 | ||||
-rw-r--r-- | mm/maccess.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 26 | ||||
-rw-r--r-- | mm/memcontrol.c | 25 | ||||
-rw-r--r-- | mm/memory.c | 128 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 145 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/mlock.c | 22 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/mmzone.c | 15 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 3 | ||||
-rw-r--r-- | mm/oom_kill.c | 84 | ||||
-rw-r--r-- | mm/page-writeback.c | 25 | ||||
-rw-r--r-- | mm/page_alloc.c | 852 | ||||
-rw-r--r-- | mm/page_cgroup.c | 17 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 141 | ||||
-rw-r--r-- | mm/readahead.c | 145 | ||||
-rw-r--r-- | mm/rmap.c | 42 | ||||
-rw-r--r-- | mm/shmem.c | 6 | ||||
-rw-r--r-- | mm/slab.c | 269 | ||||
-rw-r--r-- | mm/slob.c | 16 | ||||
-rw-r--r-- | mm/slub.c | 84 | ||||
-rw-r--r-- | mm/swap_state.c | 19 | ||||
-rw-r--r-- | mm/swapfile.c | 276 | ||||
-rw-r--r-- | mm/truncate.c | 40 | ||||
-rw-r--r-- | mm/util.c | 16 | ||||
-rw-r--r-- | mm/vmalloc.c | 33 | ||||
-rw-r--r-- | mm/vmscan.c | 376 | ||||
-rw-r--r-- | mm/vmstat.c | 38 |
45 files changed, 3757 insertions, 1270 deletions
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG + depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64 || SUPERH || S390) comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && HIBERNATION + depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 config MEMORY_HOTPLUG_SPARSE def_bool y @@ -203,29 +203,36 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. - config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool +config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + default 4096 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality would either need additional + permissions from either the LSM or the capabilities module or have + this protection disabled. + + This value can be changed after boot using the + /proc/sys/vm/mmap_min_addr tunable. + + config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" depends on !MMU diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index bb01e29..aa99fd1 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC depends on !HIBERNATION || !PPC && !SPARC + depends on !KMEMCHECK ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/mm/Makefile b/mm/Makefile index ec73c68..5e0bd64 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o @@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/bootmem.c b/mm/bootmem.c index daf9271..282df0a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + #ifdef CONFIG_HAVE_ARCH_BOOTMEM bootmem_data_t *p_bdata; @@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } @@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; @@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c index e590272..a2b76a5 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -13,17 +13,15 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/blktrace_api.h> -#include <trace/block.h> #include <asm/tlbflush.h> +#include <trace/events/block.h> + #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static mempool_t *page_pool, *isa_page_pool; -DEFINE_TRACE(block_bio_bounce); - #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { @@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, /* * is destination page below bounce pfn? */ - if (page_to_pfn(page) <= q->bounce_pfn) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; /* @@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f80..e433592 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = force_page_cache_readahead(mapping, file, start_index, - max_sane_readahead(nrpages)); + nrpages); if (ret > 0) ret = 0; break; diff --git a/mm/filemap.c b/mm/filemap.c index 379ff0b..2239671 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); - mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } - - spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } @@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } @@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } @@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + /* + * mmap read-around + */ + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1555,18 +1584,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1575,7 +1604,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1595,12 +1624,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, diff --git a/mm/highmem.c b/mm/highmem.c index 68eb1d9..25878cc 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,7 +26,6 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/blktrace_api.h> #include <asm/tlbflush.h> /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28c655b..a56e6f3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); return NULL; @@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } @@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, if (vma->vm_flags & VM_NORESERVE) return; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { /* Shared mappings always use reserves */ h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { @@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even @@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. */ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } @@ -924,7 +917,7 @@ static long vma_needs_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -949,7 +942,7 @@ static void vma_commit_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { @@ -1893,7 +1921,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. */ - if (!(vma->vm_flags & VM_SHARED) && + if (!(vma->vm_flags & VM_MAYSHARE) && is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -2000,7 +2028,7 @@ retry: clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { int err; struct inode *inode = mapping->host; @@ -2104,7 +2132,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mutex; } - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, vma, address); } @@ -2289,7 +2317,7 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) chg = region_chg(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); @@ -2330,7 +2358,7 @@ int hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; } diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 0000000..57aba0d --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,20 @@ +#include <linux/mm_types.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/cpumask.h> + +#include <asm/atomic.h> +#include <asm/pgtable.h> + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; diff --git a/mm/internal.h b/mm/internal.h index 987bb03..f290c4d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. @@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* @@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) { - unsigned long flags; - - local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); - local_irq_restore(flags); - } -} - #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -static inline void free_page_mlock(struct page *page) { } #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ @@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas); +#define ZONE_RECLAIM_NOSCAN -2 +#define ZONE_RECLAIM_FULL -1 +#define ZONE_RECLAIM_SOME 0 +#define ZONE_RECLAIM_SUCCESS 1 #endif diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 0000000..fd814fd --- /dev/null +++ b/mm/kmemcheck.c @@ -0,0 +1,122 @@ +#include <linux/gfp.h> +#include <linux/mm_types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/kmemcheck.h> + +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + /* + * With kmemcheck enabled, we need to allocate a memory area for the + * shadow bits as well. + */ + shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + if (!shadow) { + if (printk_ratelimit()) + printk(KERN_ERR "kmemcheck: failed to allocate " + "shadow bitmap\n"); + return; + } + + for(i = 0; i < pages; ++i) + page[i].shadow = page_address(&shadow[i]); + + /* + * Mark it as non-present for the MMU so that our accesses to + * this memory will trigger a page fault and let us analyze + * the memory accesses. + */ + kmemcheck_hide_pages(page, pages); +} + +void kmemcheck_free_shadow(struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + if (!kmemcheck_page_is_tracked(page)) + return; + + pages = 1 << order; + + kmemcheck_show_pages(page, pages); + + shadow = virt_to_page(page[0].shadow); + + for(i = 0; i < pages; ++i) + page[i].shadow = NULL; + + __free_pages(shadow, order); +} + +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ + /* + * Has already been memset(), which initializes the shadow for us + * as well. + */ + if (gfpflags & __GFP_ZERO) + return; + + /* No need to initialize the shadow of a non-tracked slab. */ + if (s->flags & SLAB_NOTRACK) + return; + + if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { + /* + * Allow notracked objects to be allocated from + * tracked caches. Note however that these objects + * will still get page faults on access, they just + * won't ever be flagged as uninitialized. If page + * faults are not acceptable, the slab cache itself + * should be marked NOTRACK. + */ + kmemcheck_mark_initialized(object, size); + } else if (!s->ctor) { + /* + * New objects should be marked uninitialized before + * they're returned to the called. + */ + kmemcheck_mark_uninitialized(object, size); + } +} + +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) +{ + /* TODO: RCU freeing is unsupported for now; hide false positives. */ + if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + kmemcheck_mark_freed(object, size); +} + +void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, + gfp_t gfpflags) +{ + int pages; + + if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) + return; + + pages = 1 << order; + + /* + * NOTE: We choose to track GFP_ZERO pages too; in fact, they + * can become uninitialized by copying uninitialized memory + * into them. + */ + + /* XXX: Can use zone->node for node? */ + kmemcheck_alloc_shadow(page, order, gfpflags, -1); + + if (gfpflags & __GFP_ZERO) + kmemcheck_mark_initialized_pages(page, pages); + else + kmemcheck_mark_uninitialized_pages(page, pages); +} diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c new file mode 100644 index 0000000..d5292fc --- /dev/null +++ b/mm/kmemleak-test.c @@ -0,0 +1,111 @@ +/* + * mm/kmemleak-test.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/fdtable.h> + +#include <linux/kmemleak.h> + +struct test_node { + long header[25]; + struct list_head list; + long footer[25]; +}; + +static LIST_HEAD(test_list); +static DEFINE_PER_CPU(void *, test_pointer); + +/* + * Some very simple testing. This function needs to be extended for + * proper testing. + */ +static int __init kmemleak_test_init(void) +{ + struct test_node *elem; + int i; + + printk(KERN_INFO "Kmemleak testing\n"); + + /* make some orphan objects */ + pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); +#ifndef CONFIG_MODULES + pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); + pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); +#endif + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + + /* + * Add elements to a list. They should only appear as orphan + * after the module is removed. + */ + for (i = 0; i < 10; i++) { + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + if (!elem) + return -ENOMEM; + memset(elem, 0, sizeof(*elem)); + INIT_LIST_HEAD(&elem->list); + + list_add_tail(&elem->list, &test_list); + } + + for_each_possible_cpu(i) { + per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + pr_info("kmemleak: kmalloc(129) = %p\n", + per_cpu(test_pointer, i)); + } + + return 0; +} +module_init(kmemleak_test_init); + +static void __exit kmemleak_test_exit(void) +{ + struct test_node *elem, *tmp; + + /* + * Remove the list elements without actually freeing the + * memory. + */ + list_for_each_entry_safe(elem, tmp, &test_list, list) + list_del(&elem->list); +} +module_exit(kmemleak_test_exit); + +MODULE_LICENSE("GPL"); diff --git a/mm/kmemleak.c b/mm/kmemleak.c new file mode 100644 index 0000000..58ec86c --- /dev/null +++ b/mm/kmemleak.c @@ -0,0 +1,1498 @@ +/* + * mm/kmemleak.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * For more information on the algorithm and kmemleak usage, please see + * Documentation/kmemleak.txt. + * + * Notes on locking + * ---------------- + * + * The following locks and mutexes are used by kmemleak: + * + * - kmemleak_lock (rwlock): protects the object_list modifications and + * accesses to the object_tree_root. The object_list is the main list + * holding the metadata (struct kmemleak_object) for the allocated memory + * blocks. The object_tree_root is a priority search tree used to look-up + * metadata based on a pointer to the corresponding memory block. The + * kmemleak_object structures are added to the object_list and + * object_tree_root in the create_object() function called from the + * kmemleak_alloc() callback and removed in delete_object() called from the + * kmemleak_free() callback + * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to + * the metadata (e.g. count) are protected by this lock. Note that some + * members of this structure may be protected by other means (atomic or + * kmemleak_lock). This lock is also held when scanning the corresponding + * memory block to avoid the kernel freeing it via the kmemleak_free() + * callback. This is less heavyweight than holding a global lock like + * kmemleak_lock during scanning + * - scan_mutex (mutex): ensures that only one thread may scan the memory for + * unreferenced objects at a time. The gray_list contains the objects which + * are already referenced or marked as false positives and need to be + * scanned. This list is only modified during a scanning episode when the + * scan_mutex is held. At the end of a scan, the gray_list is always empty. + * Note that the kmemleak_object.use_count is incremented when an object is + * added to the gray_list and therefore cannot be freed + * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs + * file together with modifications to the memory scanning parameters + * including the scan_thread pointer + * + * The kmemleak_object structures have a use_count incremented or decremented + * using the get_object()/put_object() functions. When the use_count becomes + * 0, this count can no longer be incremented and put_object() schedules the + * kmemleak_object freeing via an RCU callback. All calls to the get_object() + * function must be protected by rcu_read_lock() to avoid accessing a freed + * structure. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/jiffies.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/prio_tree.h> +#include <linux/gfp.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rcupdate.h> +#include <linux/stacktrace.h> +#include <linux/cache.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mmzone.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <linux/err.h> +#include <linux/uaccess.h> +#include <linux/string.h> +#include <linux/nodemask.h> +#include <linux/mm.h> + +#include <asm/sections.h> +#include <asm/processor.h> +#include <asm/atomic.h> + +#include <linux/kmemleak.h> + +/* + * Kmemleak configuration and common defines. + */ +#define MAX_TRACE 16 /* stack trace length */ +#define REPORTS_NR 50 /* maximum number of reported leaks */ +#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ +#define MSECS_SCAN_YIELD 10 /* CPU yielding period */ +#define SECS_FIRST_SCAN 60 /* delay before the first scan */ +#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ + +#define BYTES_PER_POINTER sizeof(void *) + +/* scanning area inside a memory block */ +struct kmemleak_scan_area { + struct hlist_node node; + unsigned long offset; + size_t length; +}; + +/* + * Structure holding the metadata for each allocated memory block. + * Modifications to such objects should be made while holding the + * object->lock. Insertions or deletions from object_list, gray_list or + * tree_node are already protected by the corresponding locks or mutex (see + * the notes on locking above). These objects are reference-counted + * (use_count) and freed using the RCU mechanism. + */ +struct kmemleak_object { + spinlock_t lock; + unsigned long flags; /* object status flags */ + struct list_head object_list; + struct list_head gray_list; + struct prio_tree_node tree_node; + struct rcu_head rcu; /* object_list lockless traversal */ + /* object usage count; object freed when use_count == 0 */ + atomic_t use_count; + unsigned long pointer; + size_t size; + /* minimum number of a pointers found before it is considered leak */ + int min_count; + /* the total number of pointers found pointing to this object */ + int count; + /* memory ranges to be scanned inside an object (empty for all) */ + struct hlist_head area_list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; + unsigned long jiffies; /* creation timestamp */ + pid_t pid; /* pid of the current task */ + char comm[TASK_COMM_LEN]; /* executable name */ +}; + +/* flag representing the memory block allocation status */ +#define OBJECT_ALLOCATED (1 << 0) +/* flag set after the first reporting of an unreference object */ +#define OBJECT_REPORTED (1 << 1) +/* flag set to not scan the object */ +#define OBJECT_NO_SCAN (1 << 2) + +/* the list of all allocated objects */ +static LIST_HEAD(object_list); +/* the list of gray-colored objects (see color_gray comment below) */ +static LIST_HEAD(gray_list); +/* prio search tree for object boundaries */ +static struct prio_tree_root object_tree_root; +/* rw_lock protecting the access to object_list and prio_tree_root */ +static DEFINE_RWLOCK(kmemleak_lock); + +/* allocation caches for kmemleak internal data */ +static struct kmem_cache *object_cache; +static struct kmem_cache *scan_area_cache; + +/* set if tracing memory operations is enabled */ +static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +/* set in the late_initcall if there were no errors */ +static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +/* enables or disables early logging of the memory operations */ +static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +/* set if a fata kmemleak error has occurred */ +static atomic_t kmemleak_error = ATOMIC_INIT(0); + +/* minimum and maximum address that may be valid pointers */ +static unsigned long min_addr = ULONG_MAX; +static unsigned long max_addr; + +/* used for yielding the CPU to other tasks during scanning */ +static unsigned long next_scan_yield; +static struct task_struct *scan_thread; +static unsigned long jiffies_scan_yield; +static unsigned long jiffies_min_age; +/* delay between automatic memory scannings */ +static signed long jiffies_scan_wait; +/* enables or disables the task stacks scanning */ +static int kmemleak_stack_scan; +/* mutex protecting the memory scanning */ +static DEFINE_MUTEX(scan_mutex); +/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ +static DEFINE_MUTEX(kmemleak_mutex); + +/* number of leaks reported (for limitation purposes) */ +static int reported_leaks; + +/* + * Early object allocation/freeing logging. Kkmemleak is initialized after the + * kernel allocator. However, both the kernel allocator and kmemleak may + * allocate memory blocks which need to be tracked. Kkmemleak defines an + * arbitrary buffer to hold the allocation/freeing information before it is + * fully initialized. + */ + +/* kmemleak operation type for early logging */ +enum { + KMEMLEAK_ALLOC, + KMEMLEAK_FREE, + KMEMLEAK_NOT_LEAK, + KMEMLEAK_IGNORE, + KMEMLEAK_SCAN_AREA, + KMEMLEAK_NO_SCAN +}; + +/* + * Structure holding the information passed to kmemleak callbacks during the + * early logging. + */ +struct early_log { + int op_type; /* kmemleak operation type */ + const void *ptr; /* allocated/freed memory block */ + size_t size; /* memory block size */ + int min_count; /* minimum reference count */ + unsigned long offset; /* scan area offset */ + size_t length; /* scan area length */ +}; + +/* early logging buffer and current position */ +static struct early_log early_log[200]; +static int crt_early_log; + +static void kmemleak_disable(void); + +/* + * Print a warning and dump the stack trace. + */ +#define kmemleak_warn(x...) do { \ + pr_warning(x); \ + dump_stack(); \ +} while (0) + +/* + * Macro invoked when a serious kmemleak condition occured and cannot be + * recovered from. Kkmemleak will be disabled and further allocation/freeing + * tracing no longer available. + */ +#define kmemleak_panic(x...) do { \ + kmemleak_warn(x); \ + kmemleak_disable(); \ +} while (0) + +/* + * Object colors, encoded with count and min_count: + * - white - orphan object, not enough references to it (count < min_count) + * - gray - not orphan, not marked as false positive (min_count == 0) or + * sufficient references to it (count >= min_count) + * - black - ignore, it doesn't contain references (e.g. text section) + * (min_count == -1). No function defined for this color. + * Newly created objects don't have any color assigned (object->count == -1) + * before the next memory scan when they become white. + */ +static int color_white(const struct kmemleak_object *object) +{ + return object->count != -1 && object->count < object->min_count; +} + +static int color_gray(const struct kmemleak_object *object) +{ + return object->min_count != -1 && object->count >= object->min_count; +} + +/* + * Objects are considered referenced if their color is gray and they have not + * been deleted. + */ +static int referenced_object(struct kmemleak_object *object) +{ + return (object->flags & OBJECT_ALLOCATED) && color_gray(object); +} + +/* + * Objects are considered unreferenced only if their color is white, they have + * not be deleted and have a minimum age to avoid false positives caused by + * pointers temporarily stored in CPU registers. + */ +static int unreferenced_object(struct kmemleak_object *object) +{ + return (object->flags & OBJECT_ALLOCATED) && color_white(object) && + time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); +} + +/* + * Printing of the (un)referenced objects information, either to the seq file + * or to the kernel log. The print_referenced/print_unreferenced functions + * must be called with the object->lock held. + */ +#define print_helper(seq, x...) do { \ + struct seq_file *s = (seq); \ + if (s) \ + seq_printf(s, x); \ + else \ + pr_info(x); \ +} while (0) + +static void print_referenced(struct kmemleak_object *object) +{ + pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n", + object->pointer, object->size); +} + +static void print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object) +{ + int i; + + print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n", + object->pointer, object->size); + print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + print_helper(seq, " backtrace:\n"); + + for (i = 0; i < object->trace_len; i++) { + void *ptr = (void *)object->trace[i]; + print_helper(seq, " [<%p>] %pS\n", ptr, ptr); + } +} + +/* + * Print the kmemleak_object information. This function is used mainly for + * debugging special cases when kmemleak operations. It must be called with + * the object->lock held. + */ +static void dump_object_info(struct kmemleak_object *object) +{ + struct stack_trace trace; + + trace.nr_entries = object->trace_len; + trace.entries = object->trace; + + pr_notice("kmemleak: Object 0x%08lx (size %zu):\n", + object->tree_node.start, object->size); + pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + pr_notice(" min_count = %d\n", object->min_count); + pr_notice(" count = %d\n", object->count); + pr_notice(" backtrace:\n"); + print_stack_trace(&trace, 4); +} + +/* + * Look-up a memory block metadata (kmemleak_object) in the priority search + * tree based on a pointer value. If alias is 0, only values pointing to the + * beginning of the memory block are allowed. The kmemleak_lock must be held + * when calling this function. + */ +static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) +{ + struct prio_tree_node *node; + struct prio_tree_iter iter; + struct kmemleak_object *object; + + prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); + node = prio_tree_next(&iter); + if (node) { + object = prio_tree_entry(node, struct kmemleak_object, + tree_node); + if (!alias && object->pointer != ptr) { + kmemleak_warn("kmemleak: Found object by alias"); + object = NULL; + } + } else + object = NULL; + + return object; +} + +/* + * Increment the object use_count. Return 1 if successful or 0 otherwise. Note + * that once an object's use_count reached 0, the RCU freeing was already + * registered and the object should no longer be used. This function must be + * called under the protection of rcu_read_lock(). + */ +static int get_object(struct kmemleak_object *object) +{ + return atomic_inc_not_zero(&object->use_count); +} + +/* + * RCU callback to free a kmemleak_object. + */ +static void free_object_rcu(struct rcu_head *rcu) +{ + struct hlist_node *elem, *tmp; + struct kmemleak_scan_area *area; + struct kmemleak_object *object = + container_of(rcu, struct kmemleak_object, rcu); + + /* + * Once use_count is 0 (guaranteed by put_object), there is no other + * code accessing this object, hence no need for locking. + */ + hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { + hlist_del(elem); + kmem_cache_free(scan_area_cache, area); + } + kmem_cache_free(object_cache, object); +} + +/* + * Decrement the object use_count. Once the count is 0, free the object using + * an RCU callback. Since put_object() may be called via the kmemleak_free() -> + * delete_object() path, the delayed RCU freeing ensures that there is no + * recursive call to the kernel allocator. Lock-less RCU object_list traversal + * is also possible. + */ +static void put_object(struct kmemleak_object *object) +{ + if (!atomic_dec_and_test(&object->use_count)) + return; + + /* should only get here after delete_object was called */ + WARN_ON(object->flags & OBJECT_ALLOCATED); + + call_rcu(&object->rcu, free_object_rcu); +} + +/* + * Look up an object in the prio search tree and increase its use_count. + */ +static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object = NULL; + + rcu_read_lock(); + read_lock_irqsave(&kmemleak_lock, flags); + if (ptr >= min_addr && ptr < max_addr) + object = lookup_object(ptr, alias); + read_unlock_irqrestore(&kmemleak_lock, flags); + + /* check whether the object is still available */ + if (object && !get_object(object)) + object = NULL; + rcu_read_unlock(); + + return object; +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root. + */ +static void create_object(unsigned long ptr, size_t size, int min_count, + gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + struct prio_tree_node *node; + struct stack_trace trace; + + object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK); + if (!object) { + kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object " + "structure\n"); + return; + } + + INIT_LIST_HEAD(&object->object_list); + INIT_LIST_HEAD(&object->gray_list); + INIT_HLIST_HEAD(&object->area_list); + spin_lock_init(&object->lock); + atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED; + object->pointer = ptr; + object->size = size; + object->min_count = min_count; + object->count = -1; /* no color initially */ + object->jiffies = jiffies; + + /* task information */ + if (in_irq()) { + object->pid = 0; + strncpy(object->comm, "hardirq", sizeof(object->comm)); + } else if (in_softirq()) { + object->pid = 0; + strncpy(object->comm, "softirq", sizeof(object->comm)); + } else { + object->pid = current->pid; + /* + * There is a small chance of a race with set_task_comm(), + * however using get_task_comm() here may cause locking + * dependency issues with current->alloc_lock. In the worst + * case, the command line is not correct. + */ + strncpy(object->comm, current->comm, sizeof(object->comm)); + } + + /* kernel backtrace */ + trace.max_entries = MAX_TRACE; + trace.nr_entries = 0; + trace.entries = object->trace; + trace.skip = 1; + save_stack_trace(&trace); + object->trace_len = trace.nr_entries; + + INIT_PRIO_TREE_NODE(&object->tree_node); + object->tree_node.start = ptr; + object->tree_node.last = ptr + size - 1; + + write_lock_irqsave(&kmemleak_lock, flags); + min_addr = min(min_addr, ptr); + max_addr = max(max_addr, ptr + size); + node = prio_tree_insert(&object_tree_root, &object->tree_node); + /* + * The code calling the kernel does not yet have the pointer to the + * memory block to be able to free it. However, we still hold the + * kmemleak_lock here in case parts of the kernel started freeing + * random memory blocks. + */ + if (node != &object->tree_node) { + unsigned long flags; + + kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object " + "search tree (already existing)\n", ptr); + object = lookup_object(ptr, 1); + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + goto out; + } + list_add_tail_rcu(&object->object_list, &object_list); +out: + write_unlock_irqrestore(&kmemleak_lock, flags); +} + +/* + * Remove the metadata (struct kmemleak_object) for a memory block from the + * object_list and object_tree_root and decrement its use_count. + */ +static void delete_object(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + write_lock_irqsave(&kmemleak_lock, flags); + object = lookup_object(ptr, 0); + if (!object) { + kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n", + ptr); + write_unlock_irqrestore(&kmemleak_lock, flags); + return; + } + prio_tree_remove(&object_tree_root, &object->tree_node); + list_del_rcu(&object->object_list); + write_unlock_irqrestore(&kmemleak_lock, flags); + + WARN_ON(!(object->flags & OBJECT_ALLOCATED)); + WARN_ON(atomic_read(&object->use_count) < 1); + + /* + * Locking here also ensures that the corresponding memory block + * cannot be freed when it is being scanned. + */ + spin_lock_irqsave(&object->lock, flags); + if (object->flags & OBJECT_REPORTED) + print_referenced(object); + object->flags &= ~OBJECT_ALLOCATED; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Make a object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->min_count = 0; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Mark the object as black-colored so that it is ignored from scans and + * reporting. + */ +static void make_black_object(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->min_count = -1; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Add a scanning area to the object. If at least one such area is added, + * kmemleak will only scan these ranges rather than the whole memory block. + */ +static void add_scan_area(unsigned long ptr, unsigned long offset, + size_t length, gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + struct kmemleak_scan_area *area; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("kmemleak: Adding scan area to unknown " + "object at 0x%08lx\n", ptr); + return; + } + + area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK); + if (!area) { + kmemleak_warn("kmemleak: Cannot allocate a scan area\n"); + goto out; + } + + spin_lock_irqsave(&object->lock, flags); + if (offset + length > object->size) { + kmemleak_warn("kmemleak: Scan area larger than object " + "0x%08lx\n", ptr); + dump_object_info(object); + kmem_cache_free(scan_area_cache, area); + goto out_unlock; + } + + INIT_HLIST_NODE(&area->node); + area->offset = offset; + area->length = length; + + hlist_add_head(&area->node, &object->area_list); +out_unlock: + spin_unlock_irqrestore(&object->lock, flags); +out: + put_object(object); +} + +/* + * Set the OBJECT_NO_SCAN flag for the object corresponding to the give + * pointer. Such object will not be scanned by kmemleak but references to it + * are searched. + */ +static void object_no_scan(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("kmemleak: Not scanning unknown object at " + "0x%08lx\n", ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->flags |= OBJECT_NO_SCAN; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Log an early kmemleak_* call to the early_log buffer. These calls will be + * processed later once kmemleak is fully initialized. + */ +static void log_early(int op_type, const void *ptr, size_t size, + int min_count, unsigned long offset, size_t length) +{ + unsigned long flags; + struct early_log *log; + + if (crt_early_log >= ARRAY_SIZE(early_log)) { + kmemleak_panic("kmemleak: Early log buffer exceeded\n"); + return; + } + + /* + * There is no need for locking since the kernel is still in UP mode + * at this stage. Disabling the IRQs is enough. + */ + local_irq_save(flags); + log = &early_log[crt_early_log]; + log->op_type = op_type; + log->ptr = ptr; + log->size = size; + log->min_count = min_count; + log->offset = offset; + log->length = length; + crt_early_log++; + local_irq_restore(flags); +} + +/* + * Memory allocation function callback. This function is called from the + * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, + * vmalloc etc.). + */ +void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + create_object((unsigned long)ptr, size, min_count, gfp); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_alloc); + +/* + * Memory freeing function callback. This function is called from the kernel + * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). + */ +void kmemleak_free(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free); + +/* + * Mark an already allocated memory block as a false positive. This will cause + * the block to no longer be reported as leak and always be scanned. + */ +void kmemleak_not_leak(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + make_gray_object((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL(kmemleak_not_leak); + +/* + * Ignore a memory block. This is usually done when it is known that the + * corresponding block is not a leak and does not contain any references to + * other allocated memory blocks. + */ +void kmemleak_ignore(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + make_black_object((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL(kmemleak_ignore); + +/* + * Limit the range to be scanned in an allocated memory block. + */ +void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, + gfp_t gfp) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + add_scan_area((unsigned long)ptr, offset, length, gfp); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); +} +EXPORT_SYMBOL(kmemleak_scan_area); + +/* + * Inform kmemleak not to scan the given memory block. + */ +void kmemleak_no_scan(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + object_no_scan((unsigned long)ptr); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); +} +EXPORT_SYMBOL(kmemleak_no_scan); + +/* + * Yield the CPU so that other tasks get a chance to run. The yielding is + * rate-limited to avoid excessive number of calls to the schedule() function + * during memory scanning. + */ +static void scan_yield(void) +{ + might_sleep(); + + if (time_is_before_eq_jiffies(next_scan_yield)) { + schedule(); + next_scan_yield = jiffies + jiffies_scan_yield; + } +} + +/* + * Memory scanning is a long process and it needs to be interruptable. This + * function checks whether such interrupt condition occured. + */ +static int scan_should_stop(void) +{ + if (!atomic_read(&kmemleak_enabled)) + return 1; + + /* + * This function may be called from either process or kthread context, + * hence the need to check for both stop conditions. + */ + if (current->mm) + return signal_pending(current); + else + return kthread_should_stop(); + + return 0; +} + +/* + * Scan a memory block (exclusive range) for valid pointers and add those + * found to the gray list. + */ +static void scan_block(void *_start, void *_end, + struct kmemleak_object *scanned) +{ + unsigned long *ptr; + unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); + unsigned long *end = _end - (BYTES_PER_POINTER - 1); + + for (ptr = start; ptr < end; ptr++) { + unsigned long flags; + unsigned long pointer = *ptr; + struct kmemleak_object *object; + + if (scan_should_stop()) + break; + + /* + * When scanning a memory block with a corresponding + * kmemleak_object, the CPU yielding is handled in the calling + * code since it holds the object->lock to avoid the block + * freeing. + */ + if (!scanned) + scan_yield(); + + object = find_and_get_object(pointer, 1); + if (!object) + continue; + if (object == scanned) { + /* self referenced, ignore */ + put_object(object); + continue; + } + + /* + * Avoid the lockdep recursive warning on object->lock being + * previously acquired in scan_object(). These locks are + * enclosed by scan_mutex. + */ + spin_lock_irqsave_nested(&object->lock, flags, + SINGLE_DEPTH_NESTING); + if (!color_white(object)) { + /* non-orphan, ignored or new */ + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + continue; + } + + /* + * Increase the object's reference count (number of pointers + * to the memory block). If this count reaches the required + * minimum, the object's color will become gray and it will be + * added to the gray_list. + */ + object->count++; + if (color_gray(object)) + list_add_tail(&object->gray_list, &gray_list); + else + put_object(object); + spin_unlock_irqrestore(&object->lock, flags); + } +} + +/* + * Scan a memory block corresponding to a kmemleak_object. A condition is + * that object->use_count >= 1. + */ +static void scan_object(struct kmemleak_object *object) +{ + struct kmemleak_scan_area *area; + struct hlist_node *elem; + unsigned long flags; + + /* + * Once the object->lock is aquired, the corresponding memory block + * cannot be freed (the same lock is aquired in delete_object). + */ + spin_lock_irqsave(&object->lock, flags); + if (object->flags & OBJECT_NO_SCAN) + goto out; + if (!(object->flags & OBJECT_ALLOCATED)) + /* already freed object */ + goto out; + if (hlist_empty(&object->area_list)) + scan_block((void *)object->pointer, + (void *)(object->pointer + object->size), object); + else + hlist_for_each_entry(area, elem, &object->area_list, node) + scan_block((void *)(object->pointer + area->offset), + (void *)(object->pointer + area->offset + + area->length), object); +out: + spin_unlock_irqrestore(&object->lock, flags); +} + +/* + * Scan data sections and all the referenced memory blocks allocated via the + * kernel's standard allocators. This function must be called with the + * scan_mutex held. + */ +static void kmemleak_scan(void) +{ + unsigned long flags; + struct kmemleak_object *object, *tmp; + struct task_struct *task; + int i; + + /* prepare the kmemleak_object's */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); +#ifdef DEBUG + /* + * With a few exceptions there should be a maximum of + * 1 reference to any object at this point. + */ + if (atomic_read(&object->use_count) > 1) { + pr_debug("kmemleak: object->use_count = %d\n", + atomic_read(&object->use_count)); + dump_object_info(object); + } +#endif + /* reset the reference count (whiten the object) */ + object->count = 0; + if (color_gray(object) && get_object(object)) + list_add_tail(&object->gray_list, &gray_list); + + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + /* data/bss scanning */ + scan_block(_sdata, _edata, NULL); + scan_block(__bss_start, __bss_stop, NULL); + +#ifdef CONFIG_SMP + /* per-cpu sections scanning */ + for_each_possible_cpu(i) + scan_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i), NULL); +#endif + + /* + * Struct page scanning for each node. The code below is not yet safe + * with MEMORY_HOTPLUG. + */ + for_each_online_node(i) { + pg_data_t *pgdat = NODE_DATA(i); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* only scan if page is in use */ + if (page_count(page) == 0) + continue; + scan_block(page, page + 1, NULL); + } + } + + /* + * Scanning the task stacks may introduce false negatives and it is + * not enabled by default. + */ + if (kmemleak_stack_scan) { + read_lock(&tasklist_lock); + for_each_process(task) + scan_block(task_stack_page(task), + task_stack_page(task) + THREAD_SIZE, NULL); + read_unlock(&tasklist_lock); + } + + /* + * Scan the objects already referenced from the sections scanned + * above. More objects will be referenced and, if there are no memory + * leaks, all the objects will be scanned. The list traversal is safe + * for both tail additions and removals from inside the loop. The + * kmemleak objects cannot be freed from outside the loop because their + * use_count was increased. + */ + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + scan_yield(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + WARN_ON(!list_empty(&gray_list)); +} + +/* + * Thread function performing automatic memory scanning. Unreferenced objects + * at the end of a memory scan are reported but only the first time. + */ +static int kmemleak_scan_thread(void *arg) +{ + static int first_run = 1; + + pr_info("kmemleak: Automatic memory scanning thread started\n"); + + /* + * Wait before the first scan to allow the system to fully initialize. + */ + if (first_run) { + first_run = 0; + ssleep(SECS_FIRST_SCAN); + } + + while (!kthread_should_stop()) { + struct kmemleak_object *object; + signed long timeout = jiffies_scan_wait; + + mutex_lock(&scan_mutex); + + kmemleak_scan(); + reported_leaks = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + unsigned long flags; + + if (reported_leaks >= REPORTS_NR) + break; + spin_lock_irqsave(&object->lock, flags); + if (!(object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) { + print_unreferenced(NULL, object); + object->flags |= OBJECT_REPORTED; + reported_leaks++; + } else if ((object->flags & OBJECT_REPORTED) && + referenced_object(object)) { + print_referenced(object); + object->flags &= ~OBJECT_REPORTED; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + mutex_unlock(&scan_mutex); + /* wait before the next scan */ + while (timeout && !kthread_should_stop()) + timeout = schedule_timeout_interruptible(timeout); + } + + pr_info("kmemleak: Automatic memory scanning thread ended\n"); + + return 0; +} + +/* + * Start the automatic memory scanning thread. This function must be called + * with the kmemleak_mutex held. + */ +void start_scan_thread(void) +{ + if (scan_thread) + return; + scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); + if (IS_ERR(scan_thread)) { + pr_warning("kmemleak: Failed to create the scan thread\n"); + scan_thread = NULL; + } +} + +/* + * Stop the automatic memory scanning thread. This function must be called + * with the kmemleak_mutex held. + */ +void stop_scan_thread(void) +{ + if (scan_thread) { + kthread_stop(scan_thread); + scan_thread = NULL; + } +} + +/* + * Iterate over the object_list and return the first valid object at or after + * the required position with its use_count incremented. The function triggers + * a memory scanning when the pos argument points to the first position. + */ +static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct kmemleak_object *object; + loff_t n = *pos; + + if (!n) { + kmemleak_scan(); + reported_leaks = 0; + } + if (reported_leaks >= REPORTS_NR) + return NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + if (n-- > 0) + continue; + if (get_object(object)) + goto out; + } + object = NULL; +out: + rcu_read_unlock(); + return object; +} + +/* + * Return the next object in the object_list. The function decrements the + * use_count of the previous object and increases that of the next one. + */ +static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct kmemleak_object *prev_obj = v; + struct kmemleak_object *next_obj = NULL; + struct list_head *n = &prev_obj->object_list; + + ++(*pos); + if (reported_leaks >= REPORTS_NR) + goto out; + + rcu_read_lock(); + list_for_each_continue_rcu(n, &object_list) { + next_obj = list_entry(n, struct kmemleak_object, object_list); + if (get_object(next_obj)) + break; + } + rcu_read_unlock(); +out: + put_object(prev_obj); + return next_obj; +} + +/* + * Decrement the use_count of the last object required, if any. + */ +static void kmemleak_seq_stop(struct seq_file *seq, void *v) +{ + if (v) + put_object(v); +} + +/* + * Print the information for an unreferenced object to the seq file. + */ +static int kmemleak_seq_show(struct seq_file *seq, void *v) +{ + struct kmemleak_object *object = v; + unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + if (!unreferenced_object(object)) + goto out; + print_unreferenced(seq, object); + reported_leaks++; +out: + spin_unlock_irqrestore(&object->lock, flags); + return 0; +} + +static const struct seq_operations kmemleak_seq_ops = { + .start = kmemleak_seq_start, + .next = kmemleak_seq_next, + .stop = kmemleak_seq_stop, + .show = kmemleak_seq_show, +}; + +static int kmemleak_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (!atomic_read(&kmemleak_enabled)) + return -EBUSY; + + ret = mutex_lock_interruptible(&kmemleak_mutex); + if (ret < 0) + goto out; + if (file->f_mode & FMODE_READ) { + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + goto kmemleak_unlock; + ret = seq_open(file, &kmemleak_seq_ops); + if (ret < 0) + goto scan_unlock; + } + return ret; + +scan_unlock: + mutex_unlock(&scan_mutex); +kmemleak_unlock: + mutex_unlock(&kmemleak_mutex); +out: + return ret; +} + +static int kmemleak_release(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (file->f_mode & FMODE_READ) { + seq_release(inode, file); + mutex_unlock(&scan_mutex); + } + mutex_unlock(&kmemleak_mutex); + + return ret; +} + +/* + * File write operation to configure kmemleak at run-time. The following + * commands can be written to the /sys/kernel/debug/kmemleak file: + * off - disable kmemleak (irreversible) + * stack=on - enable the task stacks scanning + * stack=off - disable the tasks stacks scanning + * scan=on - start the automatic memory scanning thread + * scan=off - stop the automatic memory scanning thread + * scan=... - set the automatic memory scanning period in seconds (0 to + * disable it) + */ +static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, + size_t size, loff_t *ppos) +{ + char buf[64]; + int buf_size; + + if (!atomic_read(&kmemleak_enabled)) + return -EBUSY; + + buf_size = min(size, (sizeof(buf) - 1)); + if (strncpy_from_user(buf, user_buf, buf_size) < 0) + return -EFAULT; + buf[buf_size] = 0; + + if (strncmp(buf, "off", 3) == 0) + kmemleak_disable(); + else if (strncmp(buf, "stack=on", 8) == 0) + kmemleak_stack_scan = 1; + else if (strncmp(buf, "stack=off", 9) == 0) + kmemleak_stack_scan = 0; + else if (strncmp(buf, "scan=on", 7) == 0) + start_scan_thread(); + else if (strncmp(buf, "scan=off", 8) == 0) + stop_scan_thread(); + else if (strncmp(buf, "scan=", 5) == 0) { + unsigned long secs; + int err; + + err = strict_strtoul(buf + 5, 0, &secs); + if (err < 0) + return err; + stop_scan_thread(); + if (secs) { + jiffies_scan_wait = msecs_to_jiffies(secs * 1000); + start_scan_thread(); + } + } else + return -EINVAL; + + /* ignore the rest of the buffer, only one command at a time */ + *ppos += size; + return size; +} + +static const struct file_operations kmemleak_fops = { + .owner = THIS_MODULE, + .open = kmemleak_open, + .read = seq_read, + .write = kmemleak_write, + .llseek = seq_lseek, + .release = kmemleak_release, +}; + +/* + * Perform the freeing of the kmemleak internal objects after waiting for any + * current memory scan to complete. + */ +static int kmemleak_cleanup_thread(void *arg) +{ + struct kmemleak_object *object; + + mutex_lock(&kmemleak_mutex); + stop_scan_thread(); + mutex_unlock(&kmemleak_mutex); + + mutex_lock(&scan_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object(object->pointer); + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + + return 0; +} + +/* + * Start the clean-up thread. + */ +static void kmemleak_cleanup(void) +{ + struct task_struct *cleanup_thread; + + cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, + "kmemleak-clean"); + if (IS_ERR(cleanup_thread)) + pr_warning("kmemleak: Failed to create the clean-up thread\n"); +} + +/* + * Disable kmemleak. No memory allocation/freeing will be traced once this + * function is called. Disabling kmemleak is an irreversible operation. + */ +static void kmemleak_disable(void) +{ + /* atomically check whether it was already invoked */ + if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + return; + + /* stop any memory operation tracing */ + atomic_set(&kmemleak_early_log, 0); + atomic_set(&kmemleak_enabled, 0); + + /* check whether it is too early for a kernel thread */ + if (atomic_read(&kmemleak_initialized)) + kmemleak_cleanup(); + + pr_info("Kernel memory leak detector disabled\n"); +} + +/* + * Allow boot-time kmemleak disabling (enabled by default). + */ +static int kmemleak_boot_config(char *str) +{ + if (!str) + return -EINVAL; + if (strcmp(str, "off") == 0) + kmemleak_disable(); + else if (strcmp(str, "on") != 0) + return -EINVAL; + return 0; +} +early_param("kmemleak", kmemleak_boot_config); + +/* + * Kkmemleak initialization. + */ +void __init kmemleak_init(void) +{ + int i; + unsigned long flags; + + jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); + jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); + + object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); + scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); + INIT_PRIO_TREE_ROOT(&object_tree_root); + + /* the kernel is still in UP mode, so disabling the IRQs is enough */ + local_irq_save(flags); + if (!atomic_read(&kmemleak_error)) { + atomic_set(&kmemleak_enabled, 1); + atomic_set(&kmemleak_early_log, 0); + } + local_irq_restore(flags); + + /* + * This is the point where tracking allocations is safe. Automatic + * scanning is started during the late initcall. Add the early logged + * callbacks to the kmemleak infrastructure. + */ + for (i = 0; i < crt_early_log; i++) { + struct early_log *log = &early_log[i]; + + switch (log->op_type) { + case KMEMLEAK_ALLOC: + kmemleak_alloc(log->ptr, log->size, log->min_count, + GFP_KERNEL); + break; + case KMEMLEAK_FREE: + kmemleak_free(log->ptr); + break; + case KMEMLEAK_NOT_LEAK: + kmemleak_not_leak(log->ptr); + break; + case KMEMLEAK_IGNORE: + kmemleak_ignore(log->ptr); + break; + case KMEMLEAK_SCAN_AREA: + kmemleak_scan_area(log->ptr, log->offset, log->length, + GFP_KERNEL); + break; + case KMEMLEAK_NO_SCAN: + kmemleak_no_scan(log->ptr); + break; + default: + WARN_ON(1); + } + } +} + +/* + * Late initialization function. + */ +static int __init kmemleak_late_init(void) +{ + struct dentry *dentry; + + atomic_set(&kmemleak_initialized, 1); + + if (atomic_read(&kmemleak_error)) { + /* + * Some error occured and kmemleak was disabled. There is a + * small chance that kmemleak_disable() was called immediately + * after setting kmemleak_initialized and we may end up with + * two clean-up threads but serialized by scan_mutex. + */ + kmemleak_cleanup(); + return -ENOMEM; + } + + dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, + &kmemleak_fops); + if (!dentry) + pr_warning("kmemleak: Failed to create the debugfs kmemleak " + "file\n"); + mutex_lock(&kmemleak_mutex); + start_scan_thread(); + mutex_unlock(&kmemleak_mutex); + + pr_info("Kernel memory leak detector initialized\n"); + + return 0; +} +late_initcall(kmemleak_late_init); diff --git a/mm/maccess.c b/mm/maccess.c index ac40796..9073695 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_write(void *dst, void *src, size_t size) +long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574..76eb419 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - force_page_cache_readahead(file->f_mapping, - file, start, max_sane_readahead(end - start)); + force_page_cache_readahead(file->f_mapping, file, start, end - start); return 0; } @@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, break; default: - error = -EINVAL; + BUG(); break; } return error; } +static int +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 1; + + default: + return 0; + } +} /* * The madvise(2) system call. * @@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; + if (!madvise_behavior_valid(behavior)) + return error; + write = madvise_need_mmap_write(behavior); if (write) down_write(¤t->mm->mmap_sem); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01c2d8f1..70db6e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) -{ - if (!mem) - return true; - return css_is_removed(&mem->css); -} - - /* * Call callback function against all cgroup under hierarchy tree. */ @@ -578,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) @@ -932,7 +935,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(!mem)) return 0; - VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); + VM_BUG_ON(css_is_removed(&mem->css)); while (1) { int ret; @@ -1488,8 +1491,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +#ifdef CONFIG_SWAP /* - * called from __delete_from_swap_cache() and drop "page" account. + * called after __delete_from_swap_cache() and drop "page" account. * memcg information is recorded to swap_cgroup of "ent" */ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) @@ -1506,6 +1510,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) if (memcg) css_put(&memcg->css); } +#endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* diff --git a/mm/memory.c b/mm/memory.c index 4126dd1..d5d1653 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -#ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) +static int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; + pte_t *ptep; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) @@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma, if (pmd_huge(*pmd)) goto out; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +#ifdef CONFIG_HAVE_IOREMAP_PROT +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) +{ + int ret = -EINVAL; + pte_t *ptep, pte; + spinlock_t *ptl; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + goto out; pte = *ptep; - if (!pte_present(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5..e4412a6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); @@ -832,6 +833,9 @@ repeat: totalram_pages -= offlined_pages; num_physpages -= offlined_pages; + setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6f..e08e2c4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -/* Create a new policy */ +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + nodemask_t cpuset_context_nmask; + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ + if (pol == NULL) + return 0; + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + ret = mpol_ops[pol->mode].create(pol, + nodes ? &cpuset_context_nmask : NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; - nodemask_t cpuset_context_nmask; - int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); @@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); - nodes = NULL; /* flag local alloc */ } } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy->mode = mode; policy->flags = flags; - if (nodes) { - /* - * cpuset related setup doesn't apply to local allocation - */ - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = - cpuset_mems_allowed(current); - } - - ret = mpol_ops[mode].create(policy, - nodes ? &cpuset_context_nmask : NULL); - if (ret < 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(ret); - } return policy; } @@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) @@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { - struct mempolicy *new; + struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + int ret; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) @@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, */ if (mm) down_write(&mm->mmap_sem); - mpol_put(current->mempolicy); + task_lock(current); + ret = mpol_set_nodemask(new, nodes); + if (ret) { + task_unlock(current); + if (mm) + up_write(&mm->mmap_sem); + mpol_put(new); + return ret; + } + old = current->mempolicy; current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + task_unlock(current); if (mm) up_write(&mm->mmap_sem); + mpol_put(old); return 0; } /* * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { @@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; @@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ + task_lock(current); *nmask = cpuset_current_mems_allowed; + task_unlock(current); return 0; } @@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } err = 0; - if (nmask) + if (nmask) { + task_lock(current); get_policy_nodemask(pol, nmask); + task_unlock(current); + } out: mpol_cond_put(pol); @@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask); + task_unlock(current); + if (err) { + up_write(&mm->mmap_sem); + mpol_put(new); + return err; + } vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - cpuset_update_task_memory_state(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; - if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; @@ -1854,6 +1894,8 @@ restart: */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { + int ret; + sp->root = RB_ROOT; /* empty tree == default mempolicy */ spin_lock_init(&sp->lock); @@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (IS_ERR(new)) + if (IS_ERR(new)) { + mpol_put(mpol); /* drop our ref on sb mpol */ return; /* no valid nodemask intersection */ + } + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + task_unlock(current); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (ret) { + mpol_put(new); + return; + } /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); @@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) err = 1; - else if (no_context) - new->w.user_nodemask = nodes; /* save for contextualization */ + else { + int ret; + + task_lock(current); + ret = mpol_set_nodemask(new, &nodes); + task_unlock(current); + if (ret) + err = 1; + else if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } + } out: /* Restore string for error message */ diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8..939888f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } @@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct page_to_node *pp; LIST_HEAD(pagelist); - migrate_prep(); down_read(&mm->mmap_sem); /* @@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); if (!pm) goto out; + + migrate_prep(); + /* * Store a chunk of page_to_node array in a page, * but keep the last one as a marker @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range @@ -28,6 +28,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> +#include <linux/perf_counter.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -87,6 +88,9 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; +/* amount of vm to protect from userspace access */ +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; + /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to @@ -1219,6 +1223,8 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: + perf_counter_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { @@ -2305,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; + perf_counter_mmap(vma); + return 0; } diff --git a/mm/mmzone.c b/mm/mmzone.c index 16ce8b9..f5b7d17 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -6,6 +6,7 @@ #include <linux/stddef.h> +#include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> @@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, *zone = zonelist_zone(z); return z; } + +#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL +int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone) +{ + if (page_to_pfn(page) != pfn) + return 0; + + if (page_zone(page) != zone) + return 0; + + return 1; +} +#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 258197b..d80311b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> +#include <linux/perf_counter.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; + perf_counter_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) @@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; +/* amount of vm to protect from userspace access */ +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; + atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 92bcf1d..175a67a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj; task_lock(p); mm = p->mm; @@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } + oom_adj = mm->oom_adj; + if (oom_adj == OOM_DISABLE) { + task_unlock(p); + return 0; + } /* * The memory size of the process is the basis for the badness. @@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. */ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) - continue; - points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -284,22 +287,27 @@ static void dump_tasks(const struct mem_cgroup *mem) printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); do_each_thread(g, p) { - /* - * total_vm and rss sizes do not exist for tasks with a - * detached mm so there's no need to report them. - */ - if (!p->mm) - continue; + struct mm_struct *mm; + if (mem && !task_in_mem_cgroup(p, mem)) continue; if (!thread_group_leader(p)) continue; task_lock(p); + mm = p->mm; + if (!mm) { + /* + * total_vm and rss sizes do not exist for tasks with no + * mm so there's no need to report them; they can't be + * oom killed anyway. + */ + task_unlock(p); + continue; + } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, - p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), - p->oomkilladj, p->comm); + p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, + get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -317,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill an mm-less task!\n"); + if (!p->mm) return; - } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -343,28 +348,13 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; + task_lock(p); mm = p->mm; - - /* WARNING: mm may not be dereferenced since we did not obtain its - * value from get_task_mm(p). This is OK since all we need to do is - * compare mm to q->mm below. - * - * Furthermore, even if mm contains a non-NULL value, p->mm may - * change to NULL at any time since we do not hold task_lock(p). - * However, this is of no concern to us. - */ - - if (mm == NULL) + if (!mm || mm->oom_adj == OOM_DISABLE) { + task_unlock(p); return 1; - - /* - * Don't kill the process if any threads are set to OOM_DISABLE - */ - do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) - return 1; - } while_each_thread(g, q); - + } + task_unlock(p); __oom_kill_task(p, 1); /* @@ -387,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); task_lock(current); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->mm ? current->mm->oom_adj : OOM_DISABLE); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -403,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly + * if its mm is still attached. */ - if (p->flags & PF_EXITING) { + if (p->mm && (p->flags & PF_EXITING)) { __oom_kill_task(p, 0); return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 30351f0..7b0dcea 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -94,12 +94,12 @@ unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ -unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ +unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ /* * The longest time for which data is allowed to remain dirty */ -unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ +unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, * This avoids exceeding the total dirty_limit when the floating averages * fluctuate too quickly. */ -static void -clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty, unsigned long *pbdi_dirty) { - long avail_dirty; + unsigned long avail_dirty; - avail_dirty = dirty - - (global_page_state(NR_FILE_DIRTY) + + avail_dirty = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP)); + global_page_state(NR_WRITEBACK_TEMP); - if (avail_dirty < 0) + if (avail_dirty < dirty) + avail_dirty = dirty - avail_dirty; + else avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + @@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -static void task_dirty_limit(struct task_struct *tsk, long *pdirty) +static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) { long numerator, denominator; - long dirty = *pdirty; + unsigned long dirty = *pdirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); @@ -770,7 +771,7 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); start_jif = jiffies; next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ec..a5f3c27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/kernel.h> +#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -46,6 +47,7 @@ #include <linux/page-isolation.h> #include <linux/page_cgroup.h> #include <linux/debugobjects.h> +#include <linux/kmemleak.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -149,10 +151,6 @@ static unsigned long __meminitdata dma_reserve; static int __meminitdata nr_nodemap_entries; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; - static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; @@ -164,17 +162,25 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; static void set_pageblock_migratetype(struct page *page, int migratetype) { + + if (unlikely(page_group_by_mobility_disabled)) + migratetype = MIGRATE_UNMOVABLE; + set_pageblock_flags_group(page, (unsigned long)migratetype, PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -297,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -420,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (PageBuddy(buddy) && page_order(buddy) == order) { - BUG_ON(page_count(buddy) != 0); + VM_BUG_ON(page_count(buddy) != 0); return 1; } return 0; @@ -451,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) + struct zone *zone, unsigned int order, + int migratetype) { unsigned long page_idx; - int order_size = 1 << order; - int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; + VM_BUG_ON(migratetype == -1); + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(page_idx & ((1 << order) - 1)); VM_BUG_ON(bad_range(zone, page)); - __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; struct page *buddy; @@ -490,12 +479,27 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + __ClearPageMlocked(page); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); +} +#else +static void free_page_mlock(struct page *page) { } +#endif + static inline int free_pages_check(struct page *page) { - free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); return 1; @@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); while (count--) { struct page *page; @@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, order); + __free_one_page(page, zone, order, page_private(page)); } spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order) +static void free_one_page(struct zone *zone, struct page *page, int order, + int migratetype) { spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __free_one_page(page, zone, order); + + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; + int clearMlocked = PageMlocked(page); + + kmemcheck_free_shadow(page, order); for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); @@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); + if (unlikely(clearMlocked)) + free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order); + free_one_page(page_zone(page), page, order, + get_pageblock_migratetype(page)); local_irq_restore(flags); } @@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); return 1; @@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, +static inline +struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; @@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, list_del(&page->lru); rmv_page_order(page); area->nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); expand(zone, page, order, current_order, area, migratetype); return page; } @@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, } /* Remove an element from the buddy allocator from the fallback list */ -static struct page *__rmqueue_fallback(struct zone *zone, int order, - int start_migratetype) +static inline struct page * +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area * area; int current_order; @@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, - -(1UL << order)); if (current_order == pageblock_order) set_pageblock_migratetype(page, @@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, } } - /* Use MIGRATE_RESERVE rather than fail an allocation */ - return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); + return NULL; } /* @@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) + if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { page = __rmqueue_fallback(zone, order, migratetype); + /* + * Use MIGRATE_RESERVE rather than fail an allocation. goto + * is used because __rmqueue_smallest is an inline function + * and we want just one call site + */ + if (!page) { + migratetype = MIGRATE_RESERVE; + goto retry_reserve; + } + } + return page; } @@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, set_page_private(page, migratetype); list = &page->lru; } + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); return i; } @@ -996,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + int clearMlocked = PageMlocked(page); + + kmemcheck_free_shadow(page, 0); if (PageAnon(page)) page->mapping = NULL; @@ -1010,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); + if (unlikely(clearMlocked)) + free_page_mlock(page); __count_vm_event(PGFREE); + if (cold) list_add_tail(&page->lru, &pcp->list); else list_add(&page->lru, &pcp->list); - set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -1050,6 +1081,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. + */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1059,14 +1100,15 @@ void split_page(struct page *page, unsigned int order) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags) +static inline +struct page *buffered_rmqueue(struct zone *preferred_zone, + struct zone *zone, int order, gfp_t gfp_flags, + int migratetype) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); int cpu; - int migratetype = allocflags_to_migratetype(gfp_flags); again: cpu = get_cpu(); @@ -1103,8 +1145,22 @@ again: list_del(&page->lru); pcp->count--; } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than single-page units with + * __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 0); + } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; @@ -1126,10 +1182,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1387,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) */ static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags) + struct zonelist *zonelist, int high_zoneidx, int alloc_flags, + struct zone *preferred_zone, int migratetype) { struct zoneref *z; struct page *page = NULL; int classzone_idx; - struct zone *zone, *preferred_zone; + struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, - &preferred_zone); - if (!preferred_zone) - return NULL; - classzone_idx = zone_idx(preferred_zone); - zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. @@ -1418,31 +1474,49 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + int ret; + + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } - page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); +try_this_zone: + page = buffered_rmqueue(preferred_zone, zone, order, + gfp_mask, migratetype); if (page) break; this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup) { - /* we do zlc_setup after the first zone is tried */ + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup after the first zone is tried but only + * if there are multiple nodes make it worthwhile + */ allowednodes = zlc_setup(zonelist, alloc_flags); zlc_active = 1; did_zlc_setup = 1; @@ -1457,47 +1531,217 @@ try_next_zone: return page; } +static inline int +should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long pages_reclaimed) +{ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + return 0; + + /* + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return 1; + + /* + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) + return 1; + + /* + * Don't let big-order allocations loop unless the caller + * explicitly requests that. + */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + return 0; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) +{ + struct page *page; + + /* Acquire the OOM killer lock for the zones in zonelist */ + if (!try_set_zone_oom(zonelist, gfp_mask)) { + schedule_timeout_uninterruptible(1); + return NULL; + } + + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. + */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, + preferred_zone, migratetype); + if (page) + goto out; + + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) + goto out; + + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order); + +out: + clear_zonelist_oom(zonelist, gfp_mask); + return page; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (order != 0) + drain_all_pages(); + + if (likely(*did_some_progress)) + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, + alloc_flags, preferred_zone, + migratetype); + return page; +} + /* - * This is the 'heart' of the zoned buddy allocator. + * This is called in the allocator slow-path if the allocation request is of + * sufficient urgency to ignore watermarks and take other desperate measures */ -struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline struct page * +__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) +{ + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); + + if (!page && gfp_mask & __GFP_NOFAIL) + congestion_wait(WRITE, HZ/50); + } while (!page && (gfp_mask & __GFP_NOFAIL)); + + return page; +} + +static inline +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, + enum zone_type high_zoneidx) { - const gfp_t wait = gfp_mask & __GFP_WAIT; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zoneref *z; struct zone *zone; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int do_retry; - int alloc_flags; - unsigned long did_some_progress; - unsigned long pages_reclaimed = 0; - lockdep_trace_alloc(gfp_mask); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); +} - might_sleep_if(wait); +static inline int +gfp_to_alloc_flags(gfp_t gfp_mask) +{ + struct task_struct *p = current; + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + const gfp_t wait = gfp_mask & __GFP_WAIT; - if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ + BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); -restart: - z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + */ + alloc_flags |= (gfp_mask & __GFP_HIGH); - if (unlikely(!z->zone)) { + if (!wait) { + alloc_flags |= ALLOC_HARDER; /* - * Happens if we have an empty zonelist as a result of - * GFP_THISNODE being used on a memoryless node + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - return NULL; + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(p))) + alloc_flags |= ALLOC_HARDER; + + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { + if (!in_interrupt() && + ((p->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + alloc_flags |= ALLOC_NO_WATERMARKS; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); - if (page) - goto got_pg; + return alloc_flags; +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + struct task_struct *p = current; + + /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1510,154 +1754,83 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wake_all_kswapd(order, zonelist, high_zoneidx); /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according * to how we want to proceed. - * - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or if the caller has realtime scheduling - * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags = ALLOC_WMARK_MIN; - if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) - alloc_flags |= ALLOC_HARDER; - if (gfp_mask & __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks go deeper into reserves. - * - * This is the last chance, in general, before the goto nopage. - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. - */ +restart: + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags); + high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) goto got_pg; - /* This allocation should allow future memory freeing. */ - rebalance: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { - if (!(gfp_mask & __GFP_NOMEMALLOC)) { -nofail_alloc: - /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); - if (page) - goto got_pg; - if (gfp_mask & __GFP_NOFAIL) { - congestion_wait(WRITE, HZ/50); - goto nofail_alloc; - } - } - goto nopage; + /* Allocate without watermarks if the context allows */ + if (alloc_flags & ALLOC_NO_WATERMARKS) { + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); + if (page) + goto got_pg; } /* Atomic allocations - we can't balance anything */ if (!wait) goto nopage; - cond_resched(); + /* Avoid recursion of direct reclaim */ + if (p->flags & PF_MEMALLOC) + goto nopage; + + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; - /* We now go into synchronous reclaim */ - cpuset_memory_pressure_bump(); /* - * The task's cpuset might have expanded its set of allowable nodes + * If we failed to make any progress reclaiming, then we are + * running out of options and have to consider going OOM */ - cpuset_update_task_memory_state(); - p->flags |= PF_MEMALLOC; - - lockdep_set_current_reclaim_state(gfp_mask); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - - did_some_progress = try_to_free_pages(zonelist, order, - gfp_mask, nodemask); - - p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; - - cond_resched(); + if (!did_some_progress) { + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; + page = __alloc_pages_may_oom(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, preferred_zone, + migratetype); + if (page) + goto got_pg; - if (order != 0) - drain_all_pages(); + /* + * The OOM killer does not trigger for high-order + * ~__GFP_NOFAIL allocations so if no progress is being + * made, there are no other options and retrying is + * unlikely to help. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; - if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); - if (page) - goto got_pg; - } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist, gfp_mask)) { - schedule_timeout_uninterruptible(1); goto restart; } - - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. - */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); - if (page) { - clear_zonelist_oom(zonelist, gfp_mask); - goto got_pg; - } - - /* The OOM killer will not help higher order allocs so fail */ - if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist, gfp_mask); - goto nopage; - } - - out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist, gfp_mask); - goto restart; } - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - * - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ + /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if (order <= PAGE_ALLOC_COSTLY_ORDER) { - do_retry = 1; - } else { - if (gfp_mask & __GFP_REPEAT && - pages_reclaimed < (1 << order)) - do_retry = 1; - } - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { + if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + /* Wait for some write requests to complete then retry */ congestion_wait(WRITE, HZ/50); goto rebalance; } @@ -1670,10 +1843,58 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; + } -EXPORT_SYMBOL(__alloc_pages_internal); + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zone *preferred_zone; + struct page *page; + int migratetype = allocflags_to_migratetype(gfp_mask); + + lockdep_trace_alloc(gfp_mask); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + + /* + * Check the zones suitable for the gfp_mask contain at least one + * valid zone. It's possible to have an empty zonelist as a result + * of GFP_THISNODE and a memoryless node + */ + if (unlikely(!zonelist->_zonerefs->zone)) + return NULL; + + /* The preferred zone is used for statistics later */ + first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + if (!preferred_zone) + return NULL; + + /* First allocation attempt */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + preferred_zone, migratetype); + if (unlikely(!page)) + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); + + return page; +} +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. @@ -1802,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -1894,19 +2115,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1930,25 +2146,21 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -2106,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2315,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); @@ -2466,7 +2678,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); @@ -2545,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2559,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -3103,64 +3315,6 @@ void __init sparse_memory_present_with_active_regions(int nid) } /** - * push_node_boundaries - Push node boundaries to at least the requested boundary - * @nid: The nid of the node to push the boundary for - * @start_pfn: The start pfn of the node - * @end_pfn: The end pfn of the node - * - * In reserve-based hot-add, mem_map is allocated that is unused until hotadd - * time. Specifically, on x86_64, SRAT will report ranges that can potentially - * be hotplugged even though no physical memory exists. This function allows - * an arch to push out the node boundaries so mem_map is allocated that can - * be used later. - */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering push_node_boundaries(%u, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Initialise the boundary for this node if necessary */ - if (node_boundary_end_pfn[nid] == 0) - node_boundary_start_pfn[nid] = -1UL; - - /* Update the boundaries */ - if (node_boundary_start_pfn[nid] > start_pfn) - node_boundary_start_pfn[nid] = start_pfn; - if (node_boundary_end_pfn[nid] < end_pfn) - node_boundary_end_pfn[nid] = end_pfn; -} - -/* If necessary, push the node boundary out for reserve hotadd */ -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering account_node_boundary(%u, %lu, %lu)\n", - nid, *start_pfn, *end_pfn); - - /* Return if boundary information has not been provided */ - if (node_boundary_end_pfn[nid] == 0) - return; - - /* Check the boundaries and update if necessary */ - if (node_boundary_start_pfn[nid] < *start_pfn) - *start_pfn = node_boundary_start_pfn[nid]; - if (node_boundary_end_pfn[nid] > *end_pfn) - *end_pfn = node_boundary_end_pfn[nid]; -} -#else -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) {} - -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) {} -#endif - - -/** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. * @start_pfn: Passed by reference. On return, it will have the node start_pfn. @@ -3185,9 +3339,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, if (*start_pfn == -1UL) *start_pfn = 0; - - /* Push the node boundaries out if requested */ - account_node_boundary(nid, start_pfn, end_pfn); } /* @@ -3552,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -3793,10 +3944,6 @@ void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); - memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ } /* Compare two active node_active_regions */ @@ -4093,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); + /* + * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init + * that node_mask, clear it at first + */ + nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); @@ -4227,8 +4379,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4278,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4308,7 +4461,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ @@ -4319,17 +4472,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4339,8 +4492,6 @@ void setup_per_zone_pages_min(void) } /** - * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. - * * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4361,21 +4512,26 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) ratio = int_sqrt(10 * gb); - if (!ratio) - ratio = 1; + else + ratio = 1; - zone->inactive_ratio = ratio; - } + zone->inactive_ratio = ratio; +} + +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* @@ -4402,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4413,12 +4569,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4430,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } @@ -4474,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, @@ -4581,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order = get_order(size); - table = (void*) __get_free_pages(GFP_ATOMIC, order); /* * If bucketsize is not a power-of-two, we may free - * some pages at the end of hash table. + * some pages at the end of hash table which + * alloc_pages_exact() automatically does */ - if (table) { - unsigned long alloc_end = (unsigned long)table + - (PAGE_SIZE << order); - unsigned long used = (unsigned long)table + - PAGE_ALIGN(size); - split_page(virt_to_page(table), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } - } + if (get_order(size) < MAX_ORDER) + table = alloc_pages_exact(size, GFP_ATOMIC); } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4615,6 +4761,16 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; + /* + * If hashdist is set, the table allocation is done with __vmalloc() + * which invokes the kmemleak_alloc() callback. This function may also + * be called before the slab and kmemleak are initialised when + * kmemleak simply buffers the request to be executed later + * (GFP_ATOMIC flag ignored in this case). + */ + if (!hashdist) + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + return table; } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 791905c..11a8a10 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid) return 0; } -void __init page_cgroup_init(void) +void __init page_cgroup_init_flatmem(void) { int nid, fail; @@ -113,16 +113,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) if (!section->page_cgroup) { nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - if (slab_is_available()) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - } + VM_BUG_ON(!slab_is_available()); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); + if (!base) + base = vmalloc_node(table_size, nid); } else { /* * We don't have to allocate page_cgroup again, but diff --git a/mm/page_io.c b/mm/page_io.c index 3023c47..c6f3e50 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -120,7 +120,7 @@ out: return ret; } -int swap_readpage(struct file *file, struct page *page) +int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8f..c0b2c1a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -23,7 +23,7 @@ * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. + * percpu base registers pcpu_unit_size apart. * * There are usually many small percpu allocations many of them as * small as 4 bytes. The allocator organizes chunks into lists @@ -38,8 +38,8 @@ * region and negative allocated. Allocation inside a chunk is done * by scanning this map sequentially and serving the first matching * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. + * Chunks can be determined from the address using the index field + * in the page struct. The index field contains a pointer to the chunk. * * To use this allocator, arch code should do the followings. * @@ -61,7 +61,6 @@ #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> -#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> @@ -88,7 +87,6 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ struct vm_struct *vm; /* mapped vmalloc region */ @@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* optional reserved chunk, only accessible for reserved allocations */ +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +static struct pcpu_chunk *pcpu_first_chunk; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. The amount of + * reserved offset is in pcpu_reserved_chunk_limit. When reserved + * area doesn't exist, the following variables contain NULL and 0 + * respectively. + */ static struct pcpu_chunk *pcpu_reserved_chunk; -/* offset limit of the reserved chunk */ static int pcpu_reserved_chunk_limit; /* @@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit; * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former * protects allocation/reclaim paths, chunks and chunk->page arrays. * The latter is a spinlock and protects the index data structures - - * chunk slots, rbtree, chunks and area maps in chunks. + * chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); @@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; } +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } } -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - /** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * CONTEXT: - * pcpu_lock. + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; + void *first_start = pcpu_first_chunk->vm->addr; - /* is it in the reserved chunk? */ - if (pcpu_reserved_chunk) { - void *start = pcpu_reserved_chunk->vm->addr; - - if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + /* is it in the first chunk? */ + if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + /* is it in the reserved area? */ + if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; + return pcpu_first_chunk; } - /* nah... search the regular ones */ - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); + return pcpu_get_page_chunk(vmalloc_to_page(addr)); } /** @@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) alloc_mask, 0); if (!*pagep) goto err; + pcpu_set_page_chunk(*pagep, chunk); } } @@ -879,7 +834,6 @@ restart: spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); goto restart; area_found: @@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work) if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; - rb_erase(&chunk->rb_node, &pcpu_addr_root); list_move(&chunk->list, &todo); } @@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (reserved_size) { schunk->free_size = reserved_size; - pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk_limit = static_size + reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ @@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; - pcpu_reserved_chunk_limit = static_size + schunk->free_size; - /* init dynamic chunk if necessary */ if (dyn_size) { dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); @@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - if (!dchunk) { - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); - } else { - pcpu_chunk_relocate(dchunk, -1); - pcpu_chunk_addr_insert(dchunk); - } + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d5..aa1aa23 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -133,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; + nr_to_read = max_sane_readahead(nr_to_read); while (nr_to_read) { int err; @@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, } /* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - -/* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ @@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) /* * Submit IO for the read-ahead request in file_ra_state. */ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; @@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* + * Count contiguously cached pages from @offset-1 to @offset-@max, + * this count is a conservative estimation of + * - length of the sequential read sequence, or + * - thrashing threshold in memory tight systems + */ +static pgoff_t count_history_pages(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, unsigned long max) +{ + pgoff_t head; + + rcu_read_lock(); + head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + rcu_read_unlock(); + + return offset - 1 - head; +} + +/* + * page cache context based read-ahead + */ +static int try_context_readahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, + unsigned long req_size, + unsigned long max) +{ + pgoff_t size; + + size = count_history_pages(mapping, ra, offset, max); + + /* + * no history pages: + * it could be a random read + */ + if (!size) + return 0; + + /* + * starts from beginning of file: + * it is a strong indication of long-run stream (or whole-file-read) + */ + if (size >= offset) + size *= 2; + + ra->start = offset; + ra->size = get_init_ra_size(size + req_size, max); + ra->async_size = ra->size; + + return 1; +} + +/* * A minimal readahead algorithm for trivial sequential/random reads. */ static unsigned long @@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - int max = ra->ra_pages; /* max readahead pages */ - pgoff_t prev_offset; - int sequential; + unsigned long max = max_sane_readahead(ra->ra_pages); + + /* + * start of file + */ + if (!offset) + goto initial_readahead; /* * It's the expected callback offset, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if (offset && (offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; } - prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; - sequential = offset - prev_offset <= 1UL || req_size > max; - - /* - * Standalone, small read. - * Read as is, and do not pollute the readahead state. - */ - if (!hit_readahead_marker && !sequential) { - return __do_page_cache_readahead(mapping, filp, - offset, req_size, 0); - } - /* * Hit a marked page without valid readahead state. * E.g. interleaved reads. @@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); rcu_read_unlock(); if (!start || start - offset > max) @@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, ra->start = start; ra->size = start - offset; /* old async_size */ + ra->size += req_size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; } /* - * It may be one of - * - first read on start of file - * - sequential cache miss - * - oversize random read - * Start readahead for it. + * oversize read + */ + if (req_size > max) + goto initial_readahead; + + /* + * sequential cache miss + */ + if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + goto initial_readahead; + + /* + * Query the page cache and look for the traces(cached history pages) + * that a sequential stream would leave behind. + */ + if (try_context_readahead(mapping, ra, offset, req_size, max)) + goto readit; + + /* + * standalone, small random read + * Read as is, and do not pollute the readahead state. */ + return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + +initial_readahead: ra->start = offset; ra->size = get_init_ra_size(req_size, max); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: + /* + * Will this read hit the readahead marker made by itself? + * If so, trigger the readahead marker hit now, and merge + * the resulted next readahead window into the current one. + */ + if (offset == ra->start && ra->size == ra->async_size) { + ra->async_size = get_next_ra_size(ra, max); + ra->size += ra->async_size; + } + return ra_submit(ra, mapping, filp); } @@ -14,7 +14,7 @@ * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 - * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 + * Contributions by Hugh Dickins 2003, 2004 */ /* @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. */ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -381,11 +383,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. */ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } @@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; @@ -2659,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (error) goto close_file; #endif + ima_counts_get(file); return file; close_file: @@ -2684,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); - ima_shm_check(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; @@ -107,12 +107,14 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/nodemask.h> +#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/mutex.h> #include <linux/fault-inject.h> #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> +#include <linux/kmemcheck.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -178,13 +180,13 @@ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #endif /* @@ -303,6 +305,12 @@ struct kmem_list3 { }; /* + * The slab allocator is initialized with interrupts disabled. Therefore, make + * sure early boot allocations don't accidentally enable interrupts. + */ +static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; + +/* * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) @@ -315,7 +323,7 @@ static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); -static int enable_cpucache(struct kmem_cache *cachep); +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); /* @@ -373,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -/* - * struct kmem_cache - * - * manages a cache. - */ - -struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int buffer_size; - u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ - - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 4) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t gfpflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ - - /* constructor func */ - void (*ctor)(void *obj); - -/* 5) cache creation/removal */ - const char *name; - struct list_head next; - -/* 6) statistics */ -#if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; -#endif -#if DEBUG - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. buffer_size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. - */ - int obj_offset; - int obj_size; -#endif - /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES - * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. - */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - /* - * Do not add fields after nodelists[] - */ -}; - #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -752,6 +679,7 @@ static enum { NONE, PARTIAL_AC, PARTIAL_L3, + EARLY, FULL } g_cpucache_up; @@ -760,7 +688,7 @@ static enum { */ int slab_is_available(void) { - return g_cpucache_up == FULL; + return g_cpucache_up >= EARLY; } static DEFINE_PER_CPU(struct delayed_work, reap_work); @@ -890,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; -static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -958,12 +885,20 @@ static void __cpuinit start_cpu_timer(int cpu) } static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount) + int batchcount, gfp_t gfp) { int memsize = sizeof(void *) * entries + sizeof(struct array_cache); struct array_cache *nc = NULL; - nc = kmalloc_node(memsize, GFP_KERNEL, node); + nc = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transfered to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(nc); if (nc) { nc->avail = 0; nc->limit = entries; @@ -1003,7 +938,7 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, l3) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit) +static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { return (struct array_cache **)BAD_ALIEN_MAGIC; } @@ -1034,7 +969,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit) +static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct array_cache **ac_ptr; int memsize = sizeof(void *) * nr_node_ids; @@ -1042,14 +977,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); + ac_ptr = kmalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { if (i == node || !node_online(i)) { ac_ptr[i] = NULL; continue; } - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); + ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) kfree(ac_ptr[i]); @@ -1282,20 +1217,20 @@ static int __cpuinit cpuup_prepare(long cpu) struct array_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount, GFP_KERNEL); if (!nc) goto bad; if (cachep->shared) { shared = alloc_arraycache(node, cachep->shared * cachep->batchcount, - 0xbaadf00d); + 0xbaadf00d, GFP_KERNEL); if (!shared) { kfree(nc); goto bad; } } if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit); + alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); if (!alien) { kfree(shared); kfree(nc); @@ -1399,10 +1334,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, { struct kmem_list3 *ptr; - ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); + ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); BUG_ON(!ptr); - local_irq_disable(); memcpy(ptr, list, sizeof(struct kmem_list3)); /* * Do not assume that spinlocks can be initialized via memcpy: @@ -1411,7 +1345,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; - local_irq_enable(); } /* @@ -1443,10 +1376,8 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) { + if (num_possible_nodes() == 1) use_alien_caches = 0; - numa_platform = 0; - } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -1575,9 +1506,8 @@ void __init kmem_cache_init(void) { struct array_cache *ptr; - ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - local_irq_disable(); BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(&cache_cache), sizeof(struct arraycache_init)); @@ -1587,11 +1517,9 @@ void __init kmem_cache_init(void) spin_lock_init(&ptr->lock); cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); - ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - local_irq_disable(); BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), @@ -1603,7 +1531,6 @@ void __init kmem_cache_init(void) malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr; - local_irq_enable(); } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1622,19 +1549,27 @@ void __init kmem_cache_init(void) } } - /* 6) resize the head arrays to their final sizes */ - { - struct kmem_cache *cachep; - mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) - if (enable_cpucache(cachep)) - BUG(); - mutex_unlock(&cache_chain_mutex); - } + g_cpucache_up = EARLY; /* Annotate slab for lockdep -- annotate the malloc caches */ init_lock_keys(); +} + +void __init kmem_cache_init_late(void) +{ + struct kmem_cache *cachep; + /* + * Interrupts are enabled now so all GFP allocations are safe. + */ + slab_gfp_mask = __GFP_BITS_MASK; + + /* 6) resize the head arrays to their final sizes */ + mutex_lock(&cache_chain_mutex); + list_for_each_entry(cachep, &cache_chain, next) + if (enable_cpucache(cachep, GFP_NOWAIT)) + BUG(); + mutex_unlock(&cache_chain_mutex); /* Done! */ g_cpucache_up = FULL; @@ -1689,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) return NULL; @@ -1702,6 +1637,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) NR_SLAB_UNRECLAIMABLE, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); + + if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { + kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); + + if (cachep->ctor) + kmemcheck_mark_uninitialized_pages(page, nr_pages); + else + kmemcheck_mark_unallocated_pages(page, nr_pages); + } + return page_address(page); } @@ -1714,6 +1659,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; + kmemcheck_free_shadow(page, cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), NR_SLAB_RECLAIMABLE, nr_freed); @@ -2064,10 +2011,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } -static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { if (g_cpucache_up == FULL) - return enable_cpucache(cachep); + return enable_cpucache(cachep, gfp); if (g_cpucache_up == NONE) { /* @@ -2089,7 +2036,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) g_cpucache_up = PARTIAL_AC; } else { cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + kmalloc(sizeof(struct arraycache_init), gfp); if (g_cpucache_up == PARTIAL_AC) { set_up_list3s(cachep, SIZE_L3); @@ -2099,7 +2046,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) for_each_online_node(node) { cachep->nodelists[node] = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node); + gfp, node); BUG_ON(!cachep->nodelists[node]); kmem_list3_init(cachep->nodelists[node]); } @@ -2153,6 +2100,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; + gfp_t gfp; /* * Sanity checks... these are all serious usage bugs. @@ -2168,8 +2116,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, * We use cache_chain_mutex to ensure a consistent view of * cpu_online_mask as well. Please see cpuup_callback */ - get_online_cpus(); - mutex_lock(&cache_chain_mutex); + if (slab_is_available()) { + get_online_cpus(); + mutex_lock(&cache_chain_mutex); + } list_for_each_entry(pc, &cache_chain, next) { char tmp; @@ -2278,8 +2228,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ align = ralign; + if (slab_is_available()) + gfp = GFP_KERNEL; + else + gfp = GFP_NOWAIT; + /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, gfp); if (!cachep) goto oops; @@ -2382,7 +2337,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->ctor = ctor; cachep->name = name; - if (setup_cpu_cache(cachep)) { + if (setup_cpu_cache(cachep, gfp)) { __kmem_cache_destroy(cachep); cachep = NULL; goto oops; @@ -2394,8 +2349,10 @@ oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); - mutex_unlock(&cache_chain_mutex); - put_online_cpus(); + if (slab_is_available()) { + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); + } return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2621,6 +2578,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, local_flags, nodeid); + /* + * If the first object in the slab is leaked (it's allocated + * but no one has a reference to it), we want to make sure + * kmemleak does not treat the ->s_mem pointer as a reference + * to the object. Otherwise we will not report the leak. + */ + kmemleak_scan_area(slabp, offsetof(struct slab, list), + sizeof(struct list_head), local_flags); if (!slabp) return NULL; } else { @@ -3141,6 +3106,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) STATS_INC_ALLOCMISS(cachep); objp = cache_alloc_refill(cachep, flags); } + /* + * To avoid a false negative, if an object that is in one of the + * per-CPU caches is leaked, we need to make sure kmemleak doesn't + * treat the array pointers as a reference to the object. + */ + kmemleak_erase(&ac->entry[ac->avail]); return objp; } @@ -3219,7 +3190,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3327,6 +3298,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + flags &= slab_gfp_mask; + lockdep_trace_alloc(flags); if (slab_should_failslab(cachep, flags)) @@ -3360,6 +3333,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, + flags); + + if (likely(ptr)) + kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); if (unlikely((flags & __GFP_ZERO) && ptr)) memset(ptr, 0, obj_size(cachep)); @@ -3405,6 +3383,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + flags &= slab_gfp_mask; + lockdep_trace_alloc(flags); if (slab_should_failslab(cachep, flags)) @@ -3415,8 +3395,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, + flags); prefetchw(objp); + if (likely(objp)) + kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); + if (unlikely((flags & __GFP_ZERO) && objp)) memset(objp, 0, obj_size(cachep)); @@ -3530,8 +3515,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -3539,7 +3527,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) * variable to skip the call, which is mostly likely to be present in * the cache. */ - if (numa_platform && cache_free_alien(cachep, objp)) + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3802,7 +3790,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); /* * This initializes kmem_list3 or resizes various caches for all nodes. */ -static int alloc_kmemlist(struct kmem_cache *cachep) +static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) { int node; struct kmem_list3 *l3; @@ -3812,7 +3800,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) for_each_online_node(node) { if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit); + new_alien = alloc_alien_cache(node, cachep->limit, gfp); if (!new_alien) goto fail; } @@ -3821,7 +3809,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) if (cachep->shared) { new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, - 0xbaadf00d); + 0xbaadf00d, gfp); if (!new_shared) { free_alien_cache(new_alien); goto fail; @@ -3850,7 +3838,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) free_alien_cache(new_alien); continue; } - l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); if (!l3) { free_alien_cache(new_alien); kfree(new_shared); @@ -3906,18 +3894,18 @@ static void do_ccupdate_local(void *info) /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, - int batchcount, int shared) + int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; int i; - new = kzalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc(sizeof(*new), gfp); if (!new) return -ENOMEM; for_each_online_cpu(i) { new->new[i] = alloc_arraycache(cpu_to_node(i), limit, - batchcount); + batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) kfree(new->new[i]); @@ -3944,11 +3932,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep); + return alloc_kmemlist(cachep, gfp); } /* Called with cache_chain_mutex held always */ -static int enable_cpucache(struct kmem_cache *cachep) +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; int limit, shared; @@ -3994,7 +3982,7 @@ static int enable_cpucache(struct kmem_cache *cachep) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); + err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); @@ -4300,7 +4288,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, res = 0; } else { res = do_tune_cpucache(cachep, limit, - batchcount, shared); + batchcount, shared, + GFP_KERNEL); } break; } @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -60,12 +60,14 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/cache.h> #include <linux/init.h> #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/kmemtrace.h> +#include <linux/kmemleak.h> #include <asm/atomic.h> /* @@ -242,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); @@ -255,6 +257,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) static void slob_free_pages(void *b, int order) { + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += 1 << order; free_pages((unsigned long)b, order); } @@ -407,7 +411,7 @@ static void slob_free(void *block, int size) spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); - free_page((unsigned long)b); + slob_free_pages(b, 0); return; } @@ -506,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) size, PAGE_SIZE << order, gfp, node); } + kmemleak_alloc(ret, size, 1, gfp); return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -518,6 +523,7 @@ void kfree(const void *block) if (unlikely(ZERO_OR_NULL_PTR(block))) return; + kmemleak_free(block); sp = slob_page(block); if (is_slob_page(sp)) { @@ -581,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, } else if (flags & SLAB_PANIC) panic("Cannot create slab cache %s\n", name); + kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); return c; } EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { + kmemleak_free(c); slob_free(c, sizeof(struct kmem_cache)); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -610,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->ctor) c->ctor(b); + kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; } EXPORT_SYMBOL(kmem_cache_alloc_node); @@ -632,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { + kmemleak_free_recursive(b, c->flags); if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); @@ -9,6 +9,7 @@ */ #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> @@ -17,8 +18,10 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmemtrace.h> +#include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/ctype.h> #include <linux/debugobjects.h> @@ -142,10 +145,10 @@ * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA) + SLAB_CACHE_DMA | SLAB_NOTRACK) #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) @@ -176,6 +179,12 @@ static enum { SYSFS /* Sysfs up */ } slab_state = DOWN; +/* + * The slab allocator is initialized with interrupts disabled. Therefore, make + * sure early boot allocations don't accidentally enable interrupts. + */ +static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; + /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); @@ -1063,6 +1072,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, { int order = oo_order(oo); + flags |= __GFP_NOTRACK; + if (node == -1) return alloc_pages(flags, order); else @@ -1090,6 +1101,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); } + + if (kmemcheck_enabled + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) + { + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); + } + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? @@ -1163,6 +1192,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlubDebug(page); } + kmemcheck_free_shadow(page, compound_order(page)); + mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1170,6 +1201,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); } @@ -1591,6 +1624,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + gfpflags &= slab_gfp_mask; + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); @@ -1614,6 +1649,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, if (unlikely((gfpflags & __GFP_ZERO) && object)) memset(object, 0, objsize); + kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); + kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + return object; } @@ -1743,8 +1781,10 @@ static __always_inline void slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; + kmemleak_free_recursive(x, s->flags); local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); + kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(object, c->objsize); @@ -1909,7 +1949,7 @@ static inline int calculate_order(int size) * Doh this slab cannot be placed using slub_max_order. */ order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + if (order < MAX_ORDER) return order; return -ENOSYS; } @@ -2522,6 +2562,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } @@ -2553,13 +2594,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; - down_write(&slub_lock); + /* + * This function is called with IRQs disabled during early-boot on + * single CPU so there's no need to take slub_lock here. + */ if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - up_write(&slub_lock); + if (sysfs_slab_add(s)) goto panic; return s; @@ -2615,7 +2659,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) if (!s || !text || !kmem_cache_open(s, flags, text, realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, + NULL)) { kfree(s); kfree(text); goto unlock_out; @@ -2709,9 +2754,10 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); if (page) return page_address(page); else @@ -3017,7 +3063,7 @@ void __init kmem_cache_init(void) * kmem_cache_open for slab_state == DOWN. */ create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_KERNEL); + sizeof(struct kmem_cache_node), GFP_NOWAIT); kmalloc_caches[0].refcount = -1; caches++; @@ -3030,16 +3076,16 @@ void __init kmem_cache_init(void) /* Caches that are not of the two-to-the-power-of size */ if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_KERNEL); + "kmalloc-96", 96, GFP_NOWAIT); caches++; create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_KERNEL); + "kmalloc-192", 192, GFP_NOWAIT); caches++; } for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_KERNEL); + "kmalloc", 1 << i, GFP_NOWAIT); caches++; } @@ -3076,7 +3122,7 @@ void __init kmem_cache_init(void) /* Provide the correct kmalloc names now that the caches are up */ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = - kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -3094,6 +3140,14 @@ void __init kmem_cache_init(void) nr_cpu_ids, nr_node_ids); } +void __init kmem_cache_init_late(void) +{ + /* + * Interrupts are enabled now so all GFP allocations are safe. + */ + slab_gfp_mask = __GFP_BITS_MASK; +} + /* * Find a mergeable slab cache */ @@ -3711,7 +3765,7 @@ static int list_locations(struct kmem_cache *s, char *buf, to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, @@ -4386,6 +4440,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98..42cd38e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t ent = {.val = page_private(page)}; - VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -121,13 +119,11 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page, ent); } /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap - * @gfp_mask: memory allocation flags * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. @@ -165,11 +161,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -191,7 +187,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -295,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + err = swapcache_prepare(entry); + if (err == -EEXIST) /* seems racy */ + continue; + if (err) /* swp entry is obsolete ? */ break; /* @@ -314,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(NULL, new_page); + swap_readpage(new_page); return new_page; } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe..28faa01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +/* For reference count accounting in swap_map */ +/* enum for swap_map[] handling. internal use only */ +enum { + SWAP_MAP = 0, /* ops for reference from swap users */ + SWAP_CACHE, /* ops for reference from swap cache */ +}; + +static inline int swap_count(unsigned short ent) +{ + return ent & SWAP_COUNT_MASK; +} + +static inline bool swap_has_cache(unsigned short ent) +{ + return !!(ent & SWAP_HAS_CACHE); +} + +static inline unsigned short encode_swapmap(int count, bool has_cache) +{ + unsigned short ret = count; + + if (has_cache) + return SWAP_HAS_CACHE | ret; + return ret; +} + +/* returnes 1 if swap entry is freed */ +static int +__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +{ + int type = si - swap_info; + swp_entry_t entry = swp_entry(type, offset); + struct page *page; + int ret = 0; + + page = find_get_page(&swapper_space, entry.val); + if (!page) + return 0; + /* + * This function is called from scan_swap_map() and it's called + * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. + * We have to use trylock for avoiding deadlock. This is a special + * case and you should use try_to_free_swap() with explicit lock_page() + * in usual operations. + */ + if (trylock_page(page)) { + ret = try_to_free_swap(page); + unlock_page(page); + } + page_cache_release(page); + return ret; +} + /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. And swap_lock @@ -167,7 +220,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si, + int cache) { unsigned long offset; unsigned long scan_base; @@ -273,6 +327,19 @@ checks: goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&swap_lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); + spin_lock(&swap_lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed) + goto checks; + goto scan; /* check next one */ + } + if (si->swap_map[offset]) goto scan; @@ -285,7 +352,10 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = 1; + if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ + si->swap_map[offset] = encode_swapmap(0, true); + else /* at suspend */ + si->swap_map[offset] = encode_swapmap(1, false); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -351,6 +421,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -362,6 +436,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - offset = scan_swap_map(si); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -415,6 +494,7 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type) si = swap_info + type; if (si->flags & SWP_WRITEOK) { nr_swap_pages--; - offset = scan_swap_map(si); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, SWAP_MAP); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -471,25 +552,38 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) +static int swap_entry_free(struct swap_info_struct *p, + swp_entry_t ent, int cache) { unsigned long offset = swp_offset(ent); - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; - nr_swap_pages++; - p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); + int count = swap_count(p->swap_map[offset]); + bool has_cache; + + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_MAP) { /* dropping usage count of swap */ + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = encode_swapmap(count, has_cache); } + } else { /* dropping swap cache flag */ + VM_BUG_ON(!has_cache); + p->swap_map[offset] = encode_swapmap(count, false); + + } + /* return code. */ + count = p->swap_map[offset]; + /* free if no reference */ + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } return count; } @@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry); + swap_entry_free(p, entry, SWAP_MAP); + spin_unlock(&swap_lock); + } +} + +/* + * Called after dropping swapcache to decrease refcnt to swap entries. + */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + struct swap_info_struct *p; + + if (page) + mem_cgroup_uncharge_swapcache(page, entry); + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, SWAP_CACHE); spin_unlock(&swap_lock); } + return; } /* @@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page) entry.val = page_private(page); p = swap_info_get(entry); if (p) { - /* Subtract the 1 for the swap cache itself */ - count = p->swap_map[swp_offset(entry)] - 1; + count = swap_count(p->swap_map[swp_offset(entry)]); spin_unlock(&swap_lock); } return count; @@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry) == 1) { + if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, i = 1; } count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) + if (count && swap_count(count) != SWAP_MAP_BAD) break; } return i; @@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type) */ shmem = 0; swcount = *swap_map; - if (swcount > 1) { + if (swap_count(swcount)) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } - if (*swap_map > 1) { + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; @@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && !shmem && + while (swap_count(*swap_map) && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type) cond_resched(); swcount = *swap_map; - if (swcount <= 1) + if (!swap_count(swcount)) /* any usage ? */ ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { + + if (set_start_mm && + swap_count(*swap_map) < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * + * How could swap count reach 0x7ffe ? + * There's no way to repeat a swap page within an mm + * (except in shmem, where it's the shared object which takes + * the reference count)? + * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned + * short is too small....) * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ - if (*swap_map == SWAP_MAP_MAX) { + /* We might release the lock_page() in unuse_mm(). */ + if (!PageSwapCache(page) || page_private(page) != entry.val) + goto retry; + + if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = 1; + *swap_map = encode_swapmap(0, true); spin_unlock(&swap_lock); reset_overflow = 1; } @@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type) * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); +retry: unlock_page(page); page_cache_release(page); @@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val) * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. -> ENOENT */ -int swap_duplicate(swp_entry_t entry) +static int __swap_duplicate(swp_entry_t entry, bool cache) { struct swap_info_struct * p; unsigned long offset, type; - int result = 0; + int result = -EINVAL; + int count; + bool has_cache; if (is_migration_entry(entry)) - return 1; + return -EINVAL; type = swp_type(entry); if (type >= nr_swapfiles) @@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry) offset = swp_offset(entry); spin_lock(&swap_lock); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + + if (unlikely(offset >= p->max)) + goto unlock_out; + + count = swap_count(p->swap_map[offset]); + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) { + p->swap_map[offset] = encode_swapmap(count, true); + result = 0; + } else if (has_cache) /* someone added cache */ + result = -EEXIST; + else if (!count) /* no users */ + result = -ENOENT; + + } else if (count || has_cache) { + if (count < SWAP_MAP_MAX - 1) { + p->swap_map[offset] = encode_swapmap(count + 1, + has_cache); + result = 0; + } else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; + printk(KERN_WARNING + "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, + has_cache); + result = 0; } - } + } else + result = -ENOENT; /* unused swap entry */ +unlock_out: spin_unlock(&swap_lock); out: return result; @@ -1978,6 +2127,27 @@ bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } +/* + * increase reference count of swap entry by 1. + */ +void swap_duplicate(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP); +} + +/* + * @entry: swap entry for which we allocate swap cache. + * + * Called when allocating swap cache for exising swap entry, + * This can return error codes. Returns 0 at success. + * -EBUSY means there is a swap cache. + * Note: return code is different from swap_duplicate(). + */ +int swapcache_prepare(swp_entry_t entry) +{ + return __swap_duplicate(entry, SWAP_CACHE); +} + struct swap_info_struct * get_swap_info_struct(unsigned type) @@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } /* Count contiguous allocated slots below our target */ @@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } spin_unlock(&swap_lock); diff --git a/mm/truncate.c b/mm/truncate.c index 55206fa..ccc3ecf 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* @@ -359,6 +352,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); page_cache_release(page); /* pagecache ref */ return 1; failed: @@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 083716e..f8189a4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -23,8 +23,8 @@ #include <linux/rbtree.h> #include <linux/radix-tree.h> #include <linux/rcupdate.h> -#include <linux/bootmem.h> #include <linux/pfn.h> +#include <linux/kmemleak.h> #include <asm/atomic.h> #include <asm/uaccess.h> @@ -1032,7 +1032,7 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { - va = alloc_bootmem(sizeof(struct vmap_area)); + va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); va->flags = tmp->flags | VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; @@ -1327,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages) void vfree(const void *addr) { BUG_ON(in_interrupt()); + + kmemleak_free(addr); + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -1439,8 +1442,17 @@ fail: void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); + void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, + __builtin_return_address(0)); + + /* + * A ref_count = 3 is needed because the vm_struct and vmap_area + * structures allocated in the __get_vm_area_node() function contain + * references to the virtual address of the vmalloc'ed block. + */ + kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); + + return addr; } /** @@ -1459,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { struct vm_struct *area; + void *addr; + unsigned long real_size = size; size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > num_physpages) @@ -1470,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!area) return NULL; - return __vmalloc_area_node(area, gfp_mask, prot, node, caller); + addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + + /* + * A ref_count = 3 is needed because the vm_struct and vmap_area + * structures allocated in the __get_vm_area_node() function contain + * references to the virtual address of the vmalloc'ed block. + */ + kmemleak_alloc(addr, real_size, 3, gfp_mask); + + return addr; } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fa3eda..4139aa5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,10 +470,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } return 1; @@ -512,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -566,20 +566,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -591,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -641,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); /* In active use or really unfreeable? Activate it. */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced && page_mapping_inuse(page)) @@ -941,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } @@ -1059,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int lumpy_reclaim = 0; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_reclaim = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1071,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = ISOLATE_INACTIVE; - - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - mode = ISOLATE_BOTH; - else if (sc->order && priority < DEF_PRIORITY - 2) - mode = ISOLATE_BOTH; + int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, @@ -1120,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { + lumpy_reclaim) { congestion_wait(WRITE, HZ/10); /* @@ -1215,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + VM_BUG_ON(!PageActive(page)); + if (!is_active_lru(lru)) + ClearPageActive(page); /* we are de-activating */ + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { unsigned long pgmoved; - int pgdeactivate = 0; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); @@ -1243,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[!!file] += pgmoved; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); else __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; + pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1262,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { pgmoved++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + list_add(&page->lru, &l_active); + continue; + } + } list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. */ - pagevec_init(&pvec, 1); - lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - pgdeactivate += pgmoved; - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - pagevec_release(&pvec); } static int inactive_anon_is_low_global(struct zone *zone) @@ -1348,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. + */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } @@ -1382,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - percent[0] = 0; - percent[1] = 100; - return; - } - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + @@ -1398,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1453,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. @@ -1466,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { int file = is_file_lru(l); unsigned long scan; scan = zone_nr_pages(zone, sc, l); - if (priority) { + if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -1519,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) + if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1530,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1740,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1753,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1778,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1823,8 +1883,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1858,8 +1918,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1868,8 +1928,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -2035,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; @@ -2054,7 +2114,7 @@ unsigned long global_lru_pages(void) + global_page_state(NR_INACTIVE_FILE); } -#ifdef CONFIG_PM +#ifdef CONFIG_HIBERNATION /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages * from LRU lists system-wide, for given pass and priority. @@ -2082,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); @@ -2194,7 +2254,7 @@ out: return sc.nr_reclaimed; } -#endif +#endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes @@ -2288,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -2322,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2382,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) - return 0; + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2405,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2663,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 66f6130..138bed5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); -#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES - /* - * Ordinarily, memory holes in flatmem still have a valid - * memmap for the PFN range. However, an architecture for - * embedded systems (e.g. ARM) can free up the memmap backing - * holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even - * though pfn_valid() returns true. Skip the page if the - * linkages are broken. Even if this test passed, the impact - * is that the counters for the movable type are off but - * fragmentation monitoring is likely meaningless on small - * systems. - */ - if (page_zone(page) != zone) + + /* Watch for unexpected holes punched in the memmap */ + if (!memmap_valid_within(pfn, page, zone)) continue; -#endif + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) @@ -640,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -686,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", @@ -698,7 +688,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -708,7 +697,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, @@ -721,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" + "\n scanned %lu" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, zone->spanned_pages, zone->present_pages); |