From 1c1271850494f06b63ae6b485e2e1b9c27ffb2d1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 12 Nov 2008 01:24:41 +0100 Subject: parisc: fix find_extend_vma() breakage The STACK_GROWSUP case of stack expansion was missing a test for 'prev', which got removed by commit cb8f488c33539f096580e202f5438a809195008f ("mmap.c: deinline a few functions") by mistake. I found my original email in "sent" folder. The patch in that mail does NOT remove !prev. That change had beed added by someone else. Ok, I think we are not much interested in who did it, let's fix it for good. [ "It looks like this was caused by me fixing rejects. That was the fancy include-lots-of-context-so-it-wont-apply patch." - akpm ] Reported-and-bisected-by: Helge Deller Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Jiri Kosina Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index de14ac2..d4855a68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1704,7 +1704,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (expand_stack(prev, addr)) + if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) -- cgit v1.1 From 7526674de0c921e7f1e9b6f71a1f9d832557b554 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Wed, 12 Nov 2008 13:24:56 -0800 Subject: hugetlb: make unmap_ref_private multi-size-aware Oops. Part of the hugetlb private reservation code was not fully converted to use hstates. When a huge page must be unmapped from VMAs due to a failed COW, HPAGE_SIZE is used in the call to unmap_hugepage_range() regardless of the page size being used. This works if the VMA is using the default huge page size. Otherwise we might unmap too much, too little, or trigger a BUG_ON. Rare but serious -- fix it. Signed-off-by: Adam Litke Cc: Jon Tollefson Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d143ab6..6058b53 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1796,6 +1796,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, unsigned long address) { + struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; struct prio_tree_iter iter; @@ -1805,7 +1806,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. 
*/ - address = address & huge_page_mask(hstate_vma(vma)); + address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); @@ -1824,7 +1825,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, - address, address + HPAGE_SIZE, + address, address + huge_page_size(h), page); } -- cgit v1.1 From e33c3b5e172e2e45456f42fba47227d48745543f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 12 Nov 2008 13:25:37 -0800 Subject: cpusets: update mems allowed in page allocator If all allowable memory is unreclaimable, it is possible to loop forever in the page allocator for ~__GFP_NORETRY allocations. During this time, it is also possible for a task's cpuset to expand its set of allowable nodes so that it now includes free memory. The cached copy of this set, current->mems_allowed, is stale, however, since there has not been a subsequent call to cpuset_update_task_memory_state(). The cached copy of the set of allowable nodes is now updated in the page allocator's slow path so the additional memory is available to get_page_from_freelist(). [akpm@linux-foundation.org: add comment] Signed-off-by: David Rientjes Cc: Paul Menage Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 54069e6..d8ac014 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1561,6 +1561,10 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.1 From 8891d6da17db0f9bb507d3a017f130b9970c3087 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 12 Nov 2008 13:26:53 -0800 Subject: mm: remove lru_add_drain_all() from the munlock path lockdep warns with the message below at boot time on one of my test machines: schedule_on_each_cpu() shouldn't be called while the task holds mmap_sem. lru_add_drain_all() exists here to keep unevictable pages from staying on the reclaimable LRU lists, but the current unevictable code can rescue unevictable pages even when they remain on a reclaimable list, so removing the call is the better fix. In addition, this patch adds lru_add_drain_all() to sys_mlock() and sys_mlockall(). That is not strictly required, but it reduces the number of pages that fail to move to the unevictable list; such failures can still be rescued by vmscan later, but fewer of them is better. Note that when such a rescue happens, the Mlocked and Unevictable fields in /proc/meminfo can disagree, but this doesn't cause any real trouble. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.28-rc2-mm1 #2 ------------------------------------------------------- lvm/1103 is trying to acquire lock: (&cpu_hotplug.lock){--..}, at: [] get_online_cpus+0x29/0x50 but task is already holding lock: (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0 which lock already depends on the new lock.
the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){----}: [] check_noncircular+0x82/0x110 [] might_fault+0x4a/0xa0 [] validate_chain+0xb11/0x1070 [] might_fault+0x4a/0xa0 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 (*) grab mmap_sem [] might_fault+0x4a/0xa0 [] might_fault+0x7b/0xa0 [] might_fault+0x4a/0xa0 [] copy_to_user+0x30/0x60 [] filldir+0x7c/0xd0 [] sysfs_readdir+0x11a/0x1f0 (*) grab sysfs_mutex [] filldir+0x0/0xd0 [] filldir+0x0/0xd0 [] vfs_readdir+0x86/0xa0 (*) grab i_mutex [] sys_getdents+0x6b/0xc0 [] syscall_call+0x7/0xb [] 0xffffffff -> #2 (sysfs_mutex){--..}: [] check_noncircular+0x82/0x110 [] sysfs_addrm_start+0x2c/0xc0 [] validate_chain+0xb11/0x1070 [] sysfs_addrm_start+0x2c/0xc0 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 (*) grab sysfs_mutex [] sysfs_addrm_start+0x2c/0xc0 [] mutex_lock_nested+0xa5/0x2f0 [] sysfs_addrm_start+0x2c/0xc0 [] sysfs_addrm_start+0x2c/0xc0 [] sysfs_addrm_start+0x2c/0xc0 [] create_dir+0x3f/0x90 [] sysfs_create_dir+0x29/0x50 [] _spin_unlock+0x25/0x40 [] kobject_add_internal+0xcd/0x1a0 [] kobject_set_name_vargs+0x3a/0x50 [] kobject_init_and_add+0x2d/0x40 [] sysfs_slab_add+0xd2/0x180 [] sysfs_add_func+0x0/0x70 [] sysfs_add_func+0x5c/0x70 (*) grab slub_lock [] run_workqueue+0x172/0x200 [] run_workqueue+0x10f/0x200 [] worker_thread+0x0/0xf0 [] worker_thread+0x9c/0xf0 [] autoremove_wake_function+0x0/0x50 [] worker_thread+0x0/0xf0 [] kthread+0x42/0x70 [] kthread+0x0/0x70 [] kernel_thread_helper+0x7/0x1c [] 0xffffffff -> #1 (slub_lock){----}: [] check_noncircular+0xd/0x110 [] slab_cpuup_callback+0x11f/0x1d0 [] validate_chain+0xb11/0x1070 [] slab_cpuup_callback+0x11f/0x1d0 [] mark_lock+0x35d/0xd00 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] slab_cpuup_callback+0x11f/0x1d0 [] down_read+0x43/0x80 [] slab_cpuup_callback+0x11f/0x1d0 (*) grab slub_lock [] slab_cpuup_callback+0x11f/0x1d0 [] notifier_call_chain+0x3c/0x70 [] _cpu_up+0x84/0x110 [] cpu_up+0x4b/0x70 (*) grab cpu_hotplug.lock [] kernel_init+0x0/0x170 [] kernel_init+0xb5/0x170 [] kernel_init+0x0/0x170 [] kernel_thread_helper+0x7/0x1c [] 0xffffffff -> #0 (&cpu_hotplug.lock){--..}: [] validate_chain+0x5af/0x1070 [] dev_status+0x0/0x50 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] get_online_cpus+0x29/0x50 [] mutex_lock_nested+0xa5/0x2f0 [] get_online_cpus+0x29/0x50 [] get_online_cpus+0x29/0x50 [] lru_add_drain_per_cpu+0x0/0x10 [] get_online_cpus+0x29/0x50 (*) grab cpu_hotplug.lock [] schedule_on_each_cpu+0x32/0xe0 [] __mlock_vma_pages_range+0x85/0x2c0 [] __lock_acquire+0x285/0xa10 [] vma_merge+0xa9/0x1d0 [] mlock_fixup+0x180/0x200 [] do_mlockall+0x78/0x90 (*) grab mmap_sem [] sys_mlockall+0x81/0xb0 [] syscall_call+0x7/0xb [] 0xffffffff other info that might help us debug this: 1 lock held by lvm/1103: #0: (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0 stack backtrace: Pid: 1103, comm: lvm Not tainted 2.6.28-rc2-mm1 #2 Call Trace: [] print_circular_bug_tail+0x7c/0xd0 [] validate_chain+0x5af/0x1070 [] dev_status+0x0/0x50 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] get_online_cpus+0x29/0x50 [] mutex_lock_nested+0xa5/0x2f0 [] get_online_cpus+0x29/0x50 [] get_online_cpus+0x29/0x50 [] lru_add_drain_per_cpu+0x0/0x10 [] get_online_cpus+0x29/0x50 [] schedule_on_each_cpu+0x32/0xe0 [] __mlock_vma_pages_range+0x85/0x2c0 [] __lock_acquire+0x285/0xa10 [] vma_merge+0xa9/0x1d0 [] mlock_fixup+0x180/0x200 [] do_mlockall+0x78/0x90 [] sys_mlockall+0x81/0xb0 [] syscall_call+0x7/0xb Signed-off-by: KOSAKI Motohiro Tested-by: Kamalesh 
Babulal Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 008ea70..a6da2ae 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -66,14 +66,10 @@ void __clear_page_mlock(struct page *page) putback_lru_page(page); } else { /* - * Page not on the LRU yet. Flush all pagevecs and retry. + * We lost the race. the page already moved to evictable list. */ - lru_add_drain_all(); - if (!isolate_lru_page(page)) - putback_lru_page(page); - else if (PageUnevictable(page)) + if (PageUnevictable(page)) count_vm_event(UNEVICTABLE_PGSTRANDED); - } } @@ -187,8 +183,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) gup_flags |= GUP_FLAGS_WRITE; - lru_add_drain_all(); /* push cached pages to LRU */ - while (nr_pages > 0) { int i; @@ -251,8 +245,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, ret = 0; } - lru_add_drain_all(); /* to update stats */ - return ret; /* count entire vma as locked_vm */ } @@ -546,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) if (!can_do_mlock()) return -EPERM; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; @@ -612,6 +606,8 @@ asmlinkage long sys_mlockall(int flags) if (!can_do_mlock()) goto out; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; -- cgit v1.1 From 33c5d3d64589c5d379db5a5615735f6d08438369 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 12 Nov 2008 13:27:01 -0800 Subject: memcg: bugfix for memory hotplug The start pfn calculation in page_cgroup's memory hotplug notifier chain is wrong. Tested-by: Badari Pulavarty Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f59d797..1223d92 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -165,7 +165,7 @@ int online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & (PAGES_PER_SECTION - 1); + start = start_pfn & ~(PAGES_PER_SECTION - 1); end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { @@ -188,7 +188,7 @@ int offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & (PAGES_PER_SECTION - 1); + start = start_pfn & ~(PAGES_PER_SECTION - 1); end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) -- cgit v1.1 From 748f1a2ed7a68e15b28a1da3559afbebba121772 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 14 Nov 2008 16:25:01 +0900 Subject: mm: remove unevictable's show_page_path Hugh Dickins reported show_page_path() is buggy and unsafe because - lack dput() against d_find_alias() - don't concern vma->vm_mm->owner == NULL - lack lock_page() it was only for debugging, so rather than trying to fix it, just remove it now. 
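For reference, the dentry reference-counting rule the removed helper got wrong can be sketched as below. The fragment is purely illustrative and is not part of the patch; d_find_alias() returns its dentry with an elevated reference count, which is why the missing dput() leaked a reference on every call.

struct address_space *mapping = page->mapping;	/* file-backed page assumed */
struct dentry *dentry;
char buf[256];

dentry = d_find_alias(mapping->host);
if (dentry) {
	/* dentry_path() builds the path at the tail of buf and returns a
	 * pointer into it. */
	printk(KERN_INFO "page path: %s\n",
	       dentry_path(dentry, buf, sizeof(buf)));
	dput(dentry);	/* the release show_page_path() was missing */
}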
Reported-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: KOSAKI Motohiro CC: Lee Schermerhorn CC: Rik van Riel Signed-off-by: Linus Torvalds --- mm/vmscan.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b58602..c141b3e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2368,39 +2368,6 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } -static void show_page_path(struct page *page) -{ - char buf[256]; - if (page_is_file_cache(page)) { - struct address_space *mapping = page->mapping; - struct dentry *dentry; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - spin_lock(&mapping->i_mmap_lock); - dentry = d_find_alias(mapping->host); - printk(KERN_INFO "rescued: %s %lu\n", - dentry_path(dentry, buf, 256), pgoff); - spin_unlock(&mapping->i_mmap_lock); - } else { -#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) - return; - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - printk(KERN_INFO "rescued: anon %s\n", - vma->vm_mm->owner->comm); - break; - } - page_unlock_anon_vma(anon_vma); -#endif - } -} - - /** * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list * @page: page to check evictability and move to appropriate lru list @@ -2421,8 +2388,6 @@ retry: if (page_evictable(page, NULL)) { enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); - show_page_path(page); - __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); __inc_zone_state(zone, NR_INACTIVE_ANON + l); -- cgit v1.1 From 72eb8c6747b49e41fd2b042510f03ac7c13426fc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 17 Nov 2008 00:30:57 +0100 Subject: unitialized return value in mm/mlock.c: __mlock_vma_pages_range() Fix an unitialized return value when compiling on parisc (with CONFIG_UNEVICTABLE_LRU=y): mm/mlock.c: In function `__mlock_vma_pages_range': mm/mlock.c:165: warning: `ret' might be used uninitialized in this function Signed-off-by: Helge Deller [ It isn't ever really used uninitialized, since no caller should ever call this function with an empty range. But the compiler is correct that from a local analysis standpoint that is impossible to see, and fixing the warning is appropriate. ] Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index a6da2ae..1ada366 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret; + int ret = 0; int gup_flags = 0; VM_BUG_ON(start & ~PAGE_MASK); -- cgit v1.1 From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Nov 2008 15:36:30 -0800 Subject: cpuset: update top cpuset's mems after adding a node After adding a node into the machine, top cpuset's mems isn't updated. By reviewing the code, we found that the update function cpuset_track_online_nodes() was invoked after node_states[N_ONLINE] changes. It is wrong because N_ONLINE just means node has pgdat, and if node has/added memory, we use N_HIGH_MEMORY. 
So, We should invoke the update function after node_states[N_HIGH_MEMORY] changes, just like its commit says. This patch fixes it. And we use notifier of memory hotplug instead of direct calling of cpuset_track_online_nodes(). Signed-off-by: Miao Xie Acked-by: Yasunori Goto Cc: David Rientjes Cc: Paul Menage Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6837a10..b5b2b15 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -498,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); - cpuset_track_online_nodes(); - if (new_pgdat) { ret = register_one_node(nid); /* -- cgit v1.1 From f011c2dae6cffc50ef67d9bd937b488ba5db8913 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 19 Nov 2008 15:36:32 -0800 Subject: mm: vmalloc allocator off by one Fix off by one bug in the KVA allocator that can leave gaps in the address space. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ba6b0f5..46aab4d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -362,7 +362,7 @@ retry: goto found; } - while (addr + size >= first->va_start && addr + size <= vend) { + while (addr + size > first->va_start && addr + size <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align); n = rb_next(&first->rb_node); -- cgit v1.1 From 496850e5f5a372029ceb2b35c811770a9bb073b6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 19 Nov 2008 15:36:33 -0800 Subject: mm: vmalloc failure flush fix An initial vmalloc failure should start off a synchronous flush of lazy areas, in case someone is in progress flushing them already, which could cause us to return an allocation failure even if there is plenty of KVA free. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 46aab4d..04f5e320 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -522,13 +522,24 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } /* + * Kick off a purge of the outstanding lazy areas. Don't bother if somebody + * is already purging. + */ +static void try_purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + +/* * Kick off a purge of the outstanding lazy areas. 
*/ static void purge_vmap_area_lazy(void) { unsigned long start = ULONG_MAX, end = 0; - __purge_vmap_area_lazy(&start, &end, 0, 0); + __purge_vmap_area_lazy(&start, &end, 1, 0); } /* @@ -539,7 +550,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) - purge_vmap_area_lazy(); + try_purge_vmap_area_lazy(); } static struct vmap_area *find_vmap_area(unsigned long addr) -- cgit v1.1 From 0ae15132a4f5c758a6ffcde74495641dc3f62ba1 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 19 Nov 2008 15:36:33 -0800 Subject: mm: vmalloc search restart fix Current vmalloc restart search for a free area in case we can't find one. The reason is there are areas which are lazily freed, and could be possibly freed now. However, current implementation start searching the tree from the last failing address, which is pretty much by definition at the end of address space. So, we fail. The proposal of this patch is to restart the search from the beginning of the requested vstart address. This fixes the regression in running KVM virtual machines for me, described in http://lkml.org/lkml/2008/10/28/349, caused by commit db64fe02258f1507e13fe5212a989922323685ce. Signed-off-by: Glauber Costa Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04f5e320..30f826d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -324,14 +324,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(size & ~PAGE_MASK); - addr = ALIGN(vstart, align); - va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); retry: + addr = ALIGN(vstart, align); + spin_lock(&vmap_area_lock); /* XXX: could have a last_hole cache */ n = vmap_area_root.rb_node; -- cgit v1.1 From bda8550deed96687f29992d711a88ea21cff4d26 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:36 -0800 Subject: migration: fix writepage error Page migration's writeout() has got understandably confused by the nasty AOP_WRITEPAGE_ACTIVATE case: as in normal success, a writepage() error has unlocked the page, so writeout() then needs to relock it. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 385db89..1e0d6b2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -522,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page) remove_migration_ptes(page, page); rc = mapping->a_ops->writepage(page, &wbc); - if (rc < 0) - /* I/O Error writing */ - return -EIO; if (rc != AOP_WRITEPAGE_ACTIVATE) /* unlocked. Relock */ lock_page(page); - return -EAGAIN; + return (rc < 0) ? -EIO : -EAGAIN; } /* -- cgit v1.1 From 63eb6b93ce725e4c5f38fc85dd703d49465b03cb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:37 -0800 Subject: vmscan: let GFP_NOFS go to swap again In the past, GFP_NOFS (but of course not GFP_NOIO) was allowed to reclaim by writing to swap. 
That got partially broken in 2.6.23, when may_enter_fs initialization was moved up before the allocation of swap, so its PageSwapCache test was failing the first time around, Fix it by setting may_enter_fs when add_to_swap() succeeds with __GFP_IO. In fact, check __GFP_IO before calling add_to_swap(): allocating swap we're not ready to use just increases disk seeking. Signed-off-by: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c141b3e..f83a7ed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -623,6 +623,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; switch (try_to_munlock(page)) { case SWAP_FAIL: /* shouldn't happen */ case SWAP_AGAIN: @@ -634,6 +636,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; + may_enter_fs = 1; } #endif /* CONFIG_SWAP */ -- cgit v1.1 From 00d8089c54867053a5aae062b765f257ca419e27 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 19 Nov 2008 15:36:44 -0800 Subject: vmscan: fix get_scan_ratio() comment Fix the old comment on the scan ratio calculations. Signed-off-by: Rik van Riel Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f83a7ed..7ea1440 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1389,9 +1389,9 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, file_prio = 200 - sc->swappiness; /* - * anon recent_rotated[0] - * %anon = 100 * ----------- / ----------------- * IO cost - * anon + file rotate_sum + * The amount of pressure on anon vs file pages is inversely + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. 
*/ ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); ap /= zone->recent_rotated[0] + 1; -- cgit v1.1 From 31168481c32c8a485e1003af9433124dede57f8d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:33:24 +0000 Subject: meminit section warnings Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 +++++---- mm/page_cgroup.c | 13 +++++++------ mm/sparse.c | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b5b2b15..b1737118 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -189,7 +189,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, pgdat->node_start_pfn; } -static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; @@ -216,7 +216,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __add_section(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; int ret; @@ -273,7 +273,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) * call this function after deciding the zone to which to * add the new pages. */ -int __add_pages(struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; @@ -470,7 +470,8 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -int add_memory(int nid, u64 start, u64 size) +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +int __ref add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1223d92..436c002 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -21,7 +21,7 @@ static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { pgdat->node_page_cgroup = NULL; } @@ -97,7 +97,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -int __meminit init_section_page_cgroup(unsigned long pfn) +/* __alloc_bootmem...() is protected by !slab_available() */ +int __init_refok init_section_page_cgroup(unsigned long pfn) { struct mem_section *section; struct page_cgroup *base, *pc; @@ -158,7 +159,7 @@ void __free_page_cgroup(unsigned long pfn) } } -int online_page_cgroup(unsigned long start_pfn, +int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long nr_pages, int nid) { @@ -183,7 +184,7 @@ int online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int offline_page_cgroup(unsigned long start_pfn, +int __meminit offline_page_cgroup(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; @@ -197,7 +198,7 @@ int offline_page_cgroup(unsigned long start_pfn, } -static int page_cgroup_callback(struct notifier_block *self, +static int __meminit page_cgroup_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; @@ -248,7 +249,7 @@ void __init page_cgroup_init(void) " want\n"); } -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +void 
__meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { return; } diff --git a/mm/sparse.c b/mm/sparse.c index 39db301..083f5b6 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -570,7 +570,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages) { unsigned long section_nr = pfn_to_section_nr(start_pfn); -- cgit v1.1 From 2a1dc509747fdcfdf3a2df818a14908aed86c3d4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 1 Dec 2008 03:00:35 +0100 Subject: vmscan: protect zone rotation stats by lru lock The zone's rotation statistics must not be accessed without the corresponding LRU lock held. Fix an unprotected write in shrink_active_list(). Acked-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Signed-off-by: Johannes Weiner Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ea1440..62e7f62 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1248,6 +1248,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, list_add(&page->lru, &l_inactive); } + spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as * rotated, even though they are moved to the inactive list. @@ -1263,7 +1264,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, pgmoved = 0; lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); -- cgit v1.1 From b29acbdcf877009af3f1fc0750bcac314c51e055 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 1 Dec 2008 13:13:47 -0800 Subject: mm: vmalloc fix lazy unmapping cache aliasing Jim Radford has reported that the vmap subsystem rewrite was sometimes causing his VIVT ARM system to behave strangely (seemed like going into infinite loops trying to fault in pages to userspace). We determined that the problem was most likely due to a cache aliasing issue. flush_cache_vunmap was only being called at the moment the page tables were to be taken down, however with lazy unmapping, this can happen after the page has subsequently been freed and allocated for something else. The dangling alias may still have dirty data attached to it. The fix for this problem is to do the cache flushing when the caller has called vunmap -- it would be a bug for them to write anything else to the mapping at that point. That appeared to solve Jim's problems. 
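To spell out the ordering the fix relies on, here is the new free_unmap_vmap_area() wrapper introduced in the diff below, annotated with explanatory comments; the comments are added here for illustration and are not part of the patch.

static void free_unmap_vmap_area(struct vmap_area *va)
{
	/*
	 * Flush the virtual alias while the caller still owns the mapping;
	 * on a virtually indexed, virtually tagged cache the alias may
	 * still hold dirty data.
	 */
	flush_cache_vunmap(va->va_start, va->va_end);
	/*
	 * The page tables are torn down lazily, possibly long after the
	 * backing pages have been freed and reused, so flushing at purge
	 * time would be too late.
	 */
	free_unmap_vmap_area_noflush(va);
}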
Reported-by: Jim Radford Signed-off-by: Nick Piggin Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 30f826d..f3f6e07 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -77,7 +77,6 @@ static void vunmap_page_range(unsigned long addr, unsigned long end) BUG_ON(addr >= end); pgd = pgd_offset_k(addr); - flush_cache_vunmap(addr, end); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -543,9 +542,10 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. */ -static void free_unmap_vmap_area(struct vmap_area *va) +static void free_unmap_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -553,6 +553,15 @@ static void free_unmap_vmap_area(struct vmap_area *va) try_purge_vmap_area_lazy(); } +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + flush_cache_vunmap(va->va_start, va->va_end); + free_unmap_vmap_area_noflush(va); +} + static struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; @@ -734,7 +743,7 @@ static void free_vmap_block(struct vmap_block *vb) spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area(vb->va); + free_unmap_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } @@ -796,6 +805,9 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + + flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + order = get_order(size); offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); -- cgit v1.1 From dc19f9db38295f811d9041bd89b113beccbd763a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 1 Dec 2008 13:13:48 -0800 Subject: memcg: memory hotplug fix for notifier callback Fixes for memcg/memory hotplug. While memory hotplug allocates and frees the memmap, page_cgroup does not free page_cgroup at OFFLINE when it was allocated from bootmem (because freeing bootmem requires special care). So if page_cgroup was allocated from bootmem and the memmap is later freed and reallocated by memory hotplug, page_cgroup->page == page is no longer true. The current MEM_ONLINE handler doesn't check for this, and doesn't update page_cgroup->page when a new page_cgroup doesn't need to be allocated. (This went unnoticed because the memmap is not freed when SPARSEMEM_VMEMMAP is enabled.) I also noticed that MEM_ONLINE can be called against part of a section, so freeing page_cgroup at CANCEL_ONLINE would cause trouble by freeing page_cgroup that is still in use. Don't roll back at CANCEL. One more thing: the memory hotplug notifier chain is currently stopped by slub because it sets NOTIFY_STOP_MASK in its return value, so page_cgroup's callback (lower priority than slub's) is never called. I think this slub behaviour is not intentional (a bug), and this fixes it as well. Another option to consider for page_cgroup allocation: free page_cgroup at OFFLINE even when it came from bootmem and remove the special handler. But that requires more changes.
Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12041 Signed-off-by: KAMEZAWA Hiruyoki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Tested-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 43 +++++++++++++++++++++++++++++-------------- mm/slub.c | 6 ++++-- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 436c002..0b3cbf0 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -107,19 +107,29 @@ int __init_refok init_section_page_cgroup(unsigned long pfn) section = __pfn_to_section(pfn); - if (section->page_cgroup) - return 0; - - nid = page_to_nid(pfn_to_page(pfn)); - - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size, + if (!section->page_cgroup) { + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + if (slab_is_available()) { + base = kmalloc_node(table_size, GFP_KERNEL, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + } + } else { + /* + * We don't have to allocate page_cgroup again, but + * address of memmap may be changed. So, we have to initialize + * again. + */ + base = section->page_cgroup + pfn; + table_size = 0; + /* check address of memmap is changed or not. */ + if (base->page == pfn_to_page(pfn)) + return 0; } if (!base) { @@ -208,18 +218,23 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, ret = online_page_cgroup(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; - case MEM_CANCEL_ONLINE: case MEM_OFFLINE: offline_page_cgroup(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; + case MEM_CANCEL_ONLINE: case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } - ret = notifier_from_errno(ret); + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + return ret; } diff --git a/mm/slub.c b/mm/slub.c index 7ad489a..749588a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2931,8 +2931,10 @@ static int slab_memory_callback(struct notifier_block *self, case MEM_CANCEL_OFFLINE: break; } - - ret = notifier_from_errno(ret); + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; return ret; } -- cgit v1.1
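For clarity, the return-value convention that both the page_cgroup and slub memory-hotplug callbacks are converted to in the two hunks above can be sketched as follows. The callback name and the prepare_for_online() helper are illustrative placeholders; struct notifier_block, MEM_GOING_ONLINE, notifier_from_errno() and NOTIFY_OK come from the kernel's notifier and memory-hotplug headers, and the switch/return structure mirrors the fixed callbacks.

static int example_memory_callback(struct notifier_block *self,
				   unsigned long action, void *arg)
{
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = prepare_for_online(arg);	/* illustrative helper */
		break;
	default:
		break;
	}

	/*
	 * Only translate a real errno into a notifier status.  A plain
	 * success must be reported as NOTIFY_OK so that lower-priority
	 * callbacks in the chain (such as page_cgroup's) still run.
	 */
	if (ret)
		return notifier_from_errno(ret);
	return NOTIFY_OK;
}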