From 5ca3957510b9fc2a14d3647db518014842f9a2b4 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Fri, 8 Mar 2013 12:43:28 -0800 Subject: mm/mempolicy.c: fix wrong sp_node insertion n->end is accessed in sp_insert(). Thus it should be update before calling sp_insert(). This mistake may make kernel panic. Signed-off-by: Hillf Danton Signed-off-by: KOSAKI Motohiro Cc: Sasha Levin Cc: Hugh Dickins Cc: Mel Gorman Cc: Dave Jones Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 31d2663..868d08f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2391,8 +2391,8 @@ restart: *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, n->end, end, mpol_new); - sp_insert(sp, n_new); n->end = start; + sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; -- cgit v1.1 From 7880639c3e4fde5953ff243ee52204ddc5af641b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 8 Mar 2013 12:43:29 -0800 Subject: mm/mempolicy.c: fix sp_node_init() argument ordering Currently, n_new is wrongly initialized. start and end parameter are inverted. Let's fix it. Signed-off-by: KOSAKI Motohiro Cc: Hillf Danton Cc: Sasha Levin Cc: Hugh Dickins Cc: Mel Gorman Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 868d08f..7431001 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2390,7 +2390,7 @@ restart: *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); - sp_node_init(n_new, n->end, end, mpol_new); + sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; -- cgit v1.1 From d8fc16a825eb7780db71268a8502fb3e6af95753 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 8 Mar 2013 12:43:34 -0800 Subject: ksm: fix m68k build: only NUMA needs pfn_to_nid A CONFIG_DISCONTIGMEM=y m68k config gave mm/ksm.c: In function `get_kpfn_nid': mm/ksm.c:492: error: implicit declaration of function `pfn_to_nid' linux/mmzone.h declares it for CONFIG_SPARSEMEM and CONFIG_FLATMEM, but expects the arch's asm/mmzone.h to declare it for CONFIG_DISCONTIGMEM (see arch/mips/include/asm/mmzone.h for example). Or perhaps it is only expected when CONFIG_NUMA=y: too much of a maze, and m68k got away without it so far, so fix the build in mm/ksm.c. Signed-off-by: Hugh Dickins Reported-by: Geert Uytterhoeven Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 85bfd4c..b6afe0c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -489,7 +489,7 @@ out: page = NULL; */ static inline int get_kpfn_nid(unsigned long kpfn) { - return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn); + return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static void remove_node_from_stable_tree(struct stable_node *stable_node) -- cgit v1.1 From 15cf17d26e08ee95c2e392a3a71f55d32e99e971 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 8 Mar 2013 12:43:36 -0800 Subject: memcg: initialize kmem-cache destroying work earlier Fix a warning from lockdep caused by calling cancel_work_sync() for uninitialized struct work. This path has been triggered by destructon kmem-cache hierarchy via destroying its root kmem-cache. cache ffff88003c072d80 obj ffff88003b410000 cache ffff88003c072d80 obj ffff88003b924000 cache ffff88003c20bd40 INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. Pid: 2825, comm: insmod Tainted: G O 3.9.0-rc1-next-20130307+ #611 Call Trace: __lock_acquire+0x16a2/0x1cb0 lock_acquire+0x8a/0x120 flush_work+0x38/0x2a0 __cancel_work_timer+0x89/0xf0 cancel_work_sync+0xb/0x10 kmem_cache_destroy_memcg_children+0x81/0xb0 kmem_cache_destroy+0xf/0xe0 init_module+0xcb/0x1000 [kmem_test] do_one_initcall+0x11a/0x170 load_module+0x19b0/0x2320 SyS_init_module+0xc6/0xf0 system_call_fastpath+0x16/0x1b Example module to demonstrate: #include #include #include #include int __init mod_init(void) { int size = 256; struct kmem_cache *cache; void *obj; struct page *page; cache = kmem_cache_create("kmem_cache_test", size, size, 0, NULL); if (!cache) return -ENOMEM; printk("cache %p\n", cache); obj = kmem_cache_alloc(cache, GFP_KERNEL); if (obj) { page = virt_to_head_page(obj); printk("obj %p cache %p\n", obj, page->slab_cache); kmem_cache_free(cache, obj); } flush_scheduled_work(); obj = kmem_cache_alloc(cache, GFP_KERNEL); if (obj) { page = virt_to_head_page(obj); printk("obj %p cache %p\n", obj, page->slab_cache); kmem_cache_free(cache, obj); } kmem_cache_destroy(cache); return -EBUSY; } module_init(mod_init); MODULE_LICENSE("GPL"); Signed-off-by: Konstantin Khlebnikov Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53b8201..2b55222 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3012,6 +3012,8 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } +static void kmem_cache_destroy_work_func(struct work_struct *w); + int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3031,6 +3033,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return -ENOMEM; } + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); s->memcg_params->is_root_cache = true; /* @@ -3078,6 +3082,8 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, if (!s->memcg_params) return -ENOMEM; + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; @@ -3358,8 +3364,6 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) list_for_each_entry(params, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); cachep->memcg_params->dead = true; - INIT_WORK(&cachep->memcg_params->destroy, - kmem_cache_destroy_work_func); schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); -- cgit v1.1 From 8aec0f5d4137532de14e6554fd5dd201ff3a3c49 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 25 Feb 2013 10:20:36 -0500 Subject: Fix: compat_rw_copy_check_uvector() misuse in aio, readv, writev, and security keys Looking at mm/process_vm_access.c:process_vm_rw() and comparing it to compat_process_vm_rw() shows that the compatibility code requires an explicit "access_ok()" check before calling compat_rw_copy_check_uvector(). The same difference seems to appear when we compare fs/read_write.c:do_readv_writev() to fs/compat.c:compat_do_readv_writev(). This subtle difference between the compat and non-compat requirements should probably be debated, as it seems to be error-prone. In fact, there are two others sites that use this function in the Linux kernel, and they both seem to get it wrong: Now shifting our attention to fs/aio.c, we see that aio_setup_iocb() also ends up calling compat_rw_copy_check_uvector() through aio_setup_vectored_rw(). Unfortunately, the access_ok() check appears to be missing. Same situation for security/keys/compat.c:compat_keyctl_instantiate_key_iov(). I propose that we add the access_ok() check directly into compat_rw_copy_check_uvector(), so callers don't have to worry about it, and it therefore makes the compat call code similar to its non-compat counterpart. Place the access_ok() check in the same location where copy_from_user() can trigger a -EFAULT error in the non-compat code, so the ABI behaviors are alike on both compat and non-compat. While we are here, fix compat_do_readv_writev() so it checks for compat_rw_copy_check_uvector() negative return values. And also, fix a memory leak in compat_keyctl_instantiate_key_iov() error handling. Acked-by: Linus Torvalds Acked-by: Al Viro Signed-off-by: Mathieu Desnoyers Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 926b466..fd26d04 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid, if (flags != 0) return -EINVAL; - if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) - goto out; - - if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) - goto out; - if (vm_write) rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, iovstack_l, @@ -459,8 +453,6 @@ free_iovecs: kfree(iov_r); if (iov_l != iovstack_l) kfree(iov_l); - -out: return rc; } -- cgit v1.1 From 4febd95a8a85dd38b1a71fcf9726e19c7fd20039 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 7 Mar 2013 15:48:16 +1100 Subject: Select VIRT_TO_BUS directly where needed In commit 887cbce0adea ("arch Kconfig: centralise ARCH_NO_VIRT_TO_BUS") I introduced the config sybmol HAVE_VIRT_TO_BUS and selected that where needed. I am not sure what I was thinking. Instead, just directly select VIRT_TO_BUS where it is needed. Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- mm/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index ae55c1e..3bea74f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -286,8 +286,12 @@ config NR_QUICK default "1" config VIRT_TO_BUS - def_bool y - depends on HAVE_VIRT_TO_BUS + bool + help + An architecture should select this if it implements the + deprecated interface virt_to_bus(). All new architectures + should probably not select this. + config MMU_NOTIFIER bool -- cgit v1.1 From f8749452adcddd62e3707709ec2ae4856e70a3f2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 13 Mar 2013 14:59:31 -0700 Subject: mm: remove_memory(): fix end_pfn setting remove_memory() calls walk_memory_range() with [start_pfn, end_pfn), where end_pfn is exclusive in this range. Therefore, end_pfn needs to be set to the next page of the end address. Signed-off-by: Toshi Kani Cc: Wen Congyang Cc: Tang Chen Cc: Kamezawa Hiroyuki Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b81a367b..9597eec 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1801,7 +1801,7 @@ int __ref remove_memory(int nid, u64 start, u64 size) int retry = 1; start_pfn = PFN_DOWN(start); - end_pfn = start_pfn + PFN_DOWN(size); + end_pfn = PFN_UP(start + size - 1); /* * When CONFIG_MEMCG is on, one memory block may be used by other -- cgit v1.1 From 6d7825b10dbeafd60627cd04291fb10ec2b5b973 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 13 Mar 2013 14:59:43 -0700 Subject: mm/fremap.c: fix oops on error path If find_vma() fails, sys_remap_file_pages() will dereference `vma', which contains NULL. Fix it by checking the pointer. (We could alternatively check for err==0, but this seems more direct) (The vm_flags change is to squish a bogus used-uninitialised warning without adding extra code). Reported-by: Tommi Rantala Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 0cd4c1148..6a8da7e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -163,7 +163,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, * and that the remapped range is valid and fully within * the single existing vma. */ - if (!vma || !(vma->vm_flags & VM_SHARED)) + vm_flags = vma->vm_flags; + if (!vma || !(vm_flags & VM_SHARED)) goto out; if (!vma->vm_ops || !vma->vm_ops->remap_pages) @@ -254,7 +255,8 @@ get_write_lock: */ out: - vm_flags = vma->vm_flags; + if (vma) + vm_flags = vma->vm_flags; if (likely(!has_write_lock)) up_read(&mm->mmap_sem); else -- cgit v1.1 From a2362d24764a4e9a3187fc46b14e1d2cd0657700 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 14 Mar 2013 16:50:02 -0700 Subject: mm/fremap.c: fix possible oops on error path The vm_flags introduced in 6d7825b10dbe ("mm/fremap.c: fix oops on error path") is supposed to avoid a compiler warning about unitialized vm_flags without changing the generated code. However I am concerned that this is going to be very brittle, and fail with some compiler versions. The failure could be either of: - compiler could actually load vma->vm_flags before checking for the !vma condition, thus reintroducing the oops - compiler could optimize out the !vma check, since the pointer just got dereferenced shortly before (so the compiler knows it can't be NULL!) I propose reversing this part of the change and initializing vm_flags to 0 just to avoid the bogus uninitialized use warning. Signed-off-by: Michel Lespinasse Cc: Tommi Rantala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 6a8da7e..4723ac8 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -129,7 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; - vm_flags_t vm_flags; + vm_flags_t vm_flags = 0; if (prot) return err; @@ -163,8 +163,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, * and that the remapped range is valid and fully within * the single existing vma. */ - vm_flags = vma->vm_flags; - if (!vma || !(vm_flags & VM_SHARED)) + if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; if (!vma->vm_ops || !vma->vm_ops->remap_pages) -- cgit v1.1 From d00285884c0892bb1310df96bce6056e9ce9b9d9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 22 Mar 2013 15:04:40 -0700 Subject: mm/hugetlb: fix total hugetlbfs pages count when using memory overcommit accouting hugetlb_total_pages is used for overcommit calculations but the current implementation considers only the default hugetlb page size (which is either the first defined hugepage size or the one specified by default_hugepagesz kernel boot parameter). If the system is configured for more than one hugepage size, which is possible since commit a137e1cc6d6e ("hugetlbfs: per mount huge page sizes") then the overcommit estimation done by __vm_enough_memory() (resp. shown by meminfo_proc_show) is not precise - there is an impression of more available/allowed memory. This can lead to an unexpected ENOMEM/EFAULT resp. SIGSEGV when memory is accounted. Testcase: boot: hugepagesz=1G hugepages=1 the default overcommit ratio is 50 before patch: egrep 'CommitLimit' /proc/meminfo CommitLimit: 55434168 kB after patch: egrep 'CommitLimit' /proc/meminfo CommitLimit: 54909880 kB [akpm@linux-foundation.org: coding-style tweak] Signed-off-by: Wanpeng Li Acked-by: Michal Hocko Cc: "Aneesh Kumar K.V" Cc: Hillf Danton Cc: KAMEZAWA Hiroyuki Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a0be33..ca9a7c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2124,8 +2124,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf) /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) -- cgit v1.1 From ca4b3f302c90de5e516296e581c31c80125cd24b Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Fri, 22 Mar 2013 15:04:50 -0700 Subject: mm/hotplug: only free wait_table if it's allocated by vmalloc zone->wait_table may be allocated from bootmem, it can not be freed. Signed-off-by: Jianguo Wu Reviewed-by: Tang Chen Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9597eec..ee37657 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1779,7 +1779,11 @@ void try_offline_node(int nid) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - if (zone->wait_table) + /* + * wait_table may be allocated from boot memory, + * here only free if it's allocated by vmalloc. + */ + if (is_vmalloc_addr(zone->wait_table)) vfree(zone->wait_table); } -- cgit v1.1 From 09a9f1d27892255cfb9c91203f19476765e2d8d1 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 28 Mar 2013 16:26:23 -0700 Subject: Revert "mm: introduce VM_POPULATE flag to better deal with racy userspace programs" This reverts commit 186930500985 ("mm: introduce VM_POPULATE flag to better deal with racy userspace programs"). VM_POPULATE only has any effect when userspace plays racy games with vmas by trying to unmap and remap memory regions that mmap or mlock are operating on. Also, the only effect of VM_POPULATE when userspace plays such games is that it avoids populating new memory regions that get remapped into the address range that was being operated on by the original mmap or mlock calls. Let's remove VM_POPULATE as there isn't any strong argument to mandate a new vm_flag. Signed-off-by: Michel Lespinasse Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/fremap.c | 12 ++---------- mm/mlock.c | 11 +++++------ mm/mmap.c | 4 +++- 3 files changed, 10 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 4723ac8..87da359 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -204,10 +204,8 @@ get_write_lock: unsigned long addr; struct file *file = get_file(vma->vm_file); - vm_flags = vma->vm_flags; - if (!(flags & MAP_NONBLOCK)) - vm_flags |= VM_POPULATE; - addr = mmap_region(file, start, size, vm_flags, pgoff); + addr = mmap_region(file, start, size, + vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -226,12 +224,6 @@ get_write_lock: mutex_unlock(&mapping->i_mmap_mutex); } - if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { - if (!has_write_lock) - goto get_write_lock; - vma->vm_flags |= VM_POPULATE; - } - if (vma->vm_flags & VM_LOCKED) { /* * drop PG_Mlocked flag for over-mapped range diff --git a/mm/mlock.c b/mm/mlock.c index 1c5e33f..79b7cf7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on) newflags = vma->vm_flags & ~VM_LOCKED; if (on) - newflags |= VM_LOCKED | VM_POPULATE; + newflags |= VM_LOCKED; tmp = vma->vm_end; if (tmp > end) @@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); - if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != - VM_POPULATE) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; @@ -492,9 +491,9 @@ static int do_mlockall(int flags) struct vm_area_struct * vma, * prev = NULL; if (flags & MCL_FUTURE) - current->mm->def_flags |= VM_LOCKED | VM_POPULATE; + current->mm->def_flags |= VM_LOCKED; else - current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); + current->mm->def_flags &= ~VM_LOCKED; if (flags == MCL_FUTURE) goto out; @@ -503,7 +502,7 @@ static int do_mlockall(int flags) newflags = vma->vm_flags & ~VM_LOCKED; if (flags & MCL_CURRENT) - newflags |= VM_LOCKED | VM_POPULATE; + newflags |= VM_LOCKED; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); diff --git a/mm/mmap.c b/mm/mmap.c index 2664a47..6466699 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1306,7 +1306,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } addr = mmap_region(file, addr, len, vm_flags, pgoff); - if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } -- cgit v1.1 From 5853ff23c2f0f6c87a859e7f882eac3300b329a0 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Mon, 25 Mar 2013 15:47:38 -0700 Subject: mm: export split_page() This symbol will be used in the Hyper-V balloon driver to support 2M allocations. Signed-off-by: K. Y. Srinivasan Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fcced7..7ff1536 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1397,6 +1397,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } +EXPORT_SYMBOL_GPL(split_page); static int __isolate_free_page(struct page *page, unsigned int order) { -- cgit v1.1 From b6a9b7f6b1f21735a7456d534dc0e68e61359d2c Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Thu, 4 Apr 2013 11:35:10 -0700 Subject: mm: prevent mmap_cache race in find_vma() find_vma() can be called by multiple threads with read lock held on mm->mmap_sem and any of them can update mm->mmap_cache. Prevent compiler from re-fetching mm->mmap_cache, because other readers could update it in the meantime: thread 1 thread 2 | find_vma() | find_vma() struct vm_area_struct *vma = NULL; | vma = mm->mmap_cache; | if (!(vma && vma->vm_end > addr | && vma->vm_start <= addr)) { | | mm->mmap_cache = vma; return vma; | ^^ compiler may optimize this | local variable out and re-read | mm->mmap_cache | This issue can be reproduced with gcc-4.8.0-1 on s390x by running mallocstress testcase from LTP, which triggers: kernel BUG at mm/rmap.c:1088! Call Trace: ([<000003d100c57000>] 0x3d100c57000) [<000000000023a1c0>] do_wp_page+0x2fc/0xa88 [<000000000023baae>] handle_pte_fault+0x41a/0xac8 [<000000000023d832>] handle_mm_fault+0x17a/0x268 [<000000000060507a>] do_protection_exception+0x1e2/0x394 [<0000000000603a04>] pgm_check_handler+0x138/0x13c [<000003fffcf1f07a>] 0x3fffcf1f07a Last Breaking-Event-Address: [<000000000024755e>] page_add_new_anon_rmap+0xc2/0x168 Thanks to Jakub Jelinek for his insight on gcc and helping to track this down. Signed-off-by: Jan Stancek Acked-by: David Rientjes Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- mm/nommu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 6466699..0db0de1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1940,7 +1940,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) /* Check the cache first. */ /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; + vma = ACCESS_ONCE(mm->mmap_cache); if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { struct rb_node *rb_node; diff --git a/mm/nommu.c b/mm/nommu.c index e193280..2f3ea74 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -821,7 +821,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = mm->mmap_cache; + vma = ACCESS_ONCE(mm->mmap_cache); if (vma && vma->vm_start <= addr && vma->vm_end > addr) return vma; -- cgit v1.1 From d9c10ddddc98db0a316243cd266c466875975a94 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Mar 2013 08:48:14 +0100 Subject: memcg: fix memcg_cache_name() to use cgroup_name() As cgroup supports rename, it's unsafe to dereference dentry->d_name without proper vfs locks. Fix this by using cgroup_name() rather than dentry directly. Also open code memcg_cache_name because it is called only from kmem_cache_dup which frees the returned name right after kmem_cache_create_memcg makes a copy of it. Such a short-lived allocation doesn't make too much sense. So replace it by a static buffer as kmem_cache_dup is called with memcg_cache_mutex. Signed-off-by: Li Zefan Signed-off-by: Michal Hocko Acked-by: Glauber Costa Signed-off-by: Tejun Heo --- mm/memcontrol.c | 63 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53b8201..9715c0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3214,52 +3214,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) -{ - char *name; - struct dentry *dentry; - - rcu_read_lock(); - dentry = rcu_dereference(memcg->css.cgroup->dentry); - rcu_read_unlock(); - - BUG_ON(dentry == NULL); - - name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), dentry->d_name.name); - - return name; -} +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +/* + * Called with memcg_cache_mutex held + */ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, struct kmem_cache *s) { - char *name; struct kmem_cache *new; + static char *tmp_name = NULL; - name = memcg_cache_name(memcg, s); - if (!name) - return NULL; + lockdep_assert_held(&memcg_cache_mutex); + + /* + * kmem_cache_create_memcg duplicates the given name and + * cgroup_name for this name requires RCU context. + * This static temporary buffer is used to prevent from + * pointless shortliving allocation. + */ + if (!tmp_name) { + tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!tmp_name) + return NULL; + } + + rcu_read_lock(); + snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); + rcu_read_unlock(); - new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; - kfree(name); return new; } -/* - * This lock protects updaters, not readers. We want readers to be as fast as - * they can, and they will either see NULL or a valid cache value. Our model - * allow them to see NULL, in which case the root memcg will be selected. - * - * We need this lock because multiple allocations to the same cache from a non - * will span more than one worker. Only one of them can create the cache. - */ -static DEFINE_MUTEX(memcg_cache_mutex); static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *cachep) { -- cgit v1.1 From 1de14c3c5cbc9bb17e9dcc648cda51c0c85d54b9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Apr 2013 16:23:54 -0700 Subject: x86-32: Fix possible incomplete TLB invalidate with PAE pagetables This patch attempts to fix: https://bugzilla.kernel.org/show_bug.cgi?id=56461 The symptom is a crash and messages like this: chrome: Corrupted page table at address 34a03000 *pdpt = 0000000000000000 *pde = 0000000000000000 Bad pagetable: 000f [#1] PREEMPT SMP Ingo guesses this got introduced by commit 611ae8e3f520 ("x86/tlb: enable tlb flush range support for x86") since that code started to free unused pagetables. On x86-32 PAE kernels, that new code has the potential to free an entire PMD page and will clear one of the four page-directory-pointer-table (aka pgd_t entries). The hardware aggressively "caches" these top-level entries and invlpg does not actually affect the CPU's copy. If we clear one we *HAVE* to do a full TLB flush, otherwise we might continue using a freed pmd page. (note, we do this properly on the population side in pud_populate()). This patch tracks whenever we clear one of these entries in the 'struct mmu_gather', and ensures that we follow up with a full tlb flush. BTW, I disassembled and checked that: if (tlb->fullmm == 0) and if (!tlb->fullmm && !tlb->need_flush_all) generate essentially the same code, so there should be zero impact there to the !PAE case. Signed-off-by: Dave Hansen Cc: Peter Anvin Cc: Ingo Molnar Cc: Artem S Tashkinov Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 494526a..13cbc42 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -216,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->mm = mm; tlb->fullmm = fullmm; + tlb->need_flush_all = 0; tlb->start = -1UL; tlb->end = 0; tlb->need_flush = 0; -- cgit v1.1 From f00baae7ad6c5f1503528efa852f0be8e9513f0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 Apr 2013 13:41:15 -0700 Subject: memcg: force use_hierarchy if sane_behavior Turn on use_hierarchy by default if sane_behavior is specified and don't create .use_hierarchy file. It is debatable whether to remove .use_hierarchy file or make it ro as the former could make transition easier in certain cases; however, the behavior changes which will be gated by sane_behavior are intensive including changing basic meaning of certain control knobs in a few controllers and I don't really think keeping this piece would make things easier in any noticeable way, so let's remove it. v2: Explain that mem_cgroup_bind() doesn't have to worry about children as suggested by Michal Hocko. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki --- mm/memcontrol.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9715c0c..df87bdd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5814,6 +5814,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "use_hierarchy", + .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, @@ -6784,6 +6785,21 @@ static void mem_cgroup_move_task(struct cgroup *cont, } #endif +/* + * Cgroup retains root cgroups across [un]mount cycles making it necessary + * to verify sane_behavior flag on each mount attempt. + */ +static void mem_cgroup_bind(struct cgroup *root) +{ + /* + * use_hierarchy is forced with sane_behavior. cgroup core + * guarantees that @root doesn't have any children, so turning it + * on for the root memcg is enough. + */ + if (cgroup_sane_behavior(root)) + mem_cgroup_from_cont(root)->use_hierarchy = true; +} + struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", .subsys_id = mem_cgroup_subsys_id, @@ -6794,6 +6810,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, + .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, -- cgit v1.1 From b4cbb197c7e7a68dbad0d491242e3ca67420c13e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Apr 2013 13:45:37 -0700 Subject: vm: add vm_iomap_memory() helper function Various drivers end up replicating the code to mmap() their memory buffers into user space, and our core memory remapping function may be very flexible but it is unnecessarily complicated for the common cases to use. Our internal VM uses pfn's ("page frame numbers") which simplifies things for the VM, and allows us to pass physical addresses around in a denser and more efficient format than passing a "phys_addr_t" around, and having to shift it up and down by the page size. But it just means that drivers end up doing that shifting instead at the interface level. It also means that drivers end up mucking around with internal VM things like the vma details (vm_pgoff, vm_start/end) way more than they really need to. So this just exports a function to map a certain physical memory range into user space (using a phys_addr_t based interface that is much more natural for a driver) and hides all the complexity from the driver. Some drivers will still end up tweaking the vm_page_prot details for things like prefetching or cacheability etc, but that's actually relevant to the driver, rather than caring about what the page offset of the mapping is into the particular IO memory region. Acked-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- mm/memory.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 13cbc42..ba94dec 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2393,6 +2393,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? */ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) -- cgit v1.1 From 9cc3a5bd40067b9a0fbd49199d0780463fc2140f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 17 Apr 2013 15:58:30 -0700 Subject: hugetlbfs: add swap entry check in follow_hugetlb_page() With applying the previous patch "hugetlbfs: stop setting VM_DONTDUMP in initializing vma(VM_HUGETLB)" to reenable hugepage coredump, if a memory error happens on a hugepage and the affected processes try to access the error hugepage, we hit VM_BUG_ON(atomic_read(&page->_count) <= 0) in get_page(). The reason for this bug is that coredump-related code doesn't recognise "hugepage hwpoison entry" with which a pmd entry is replaced when a memory error occurs on a hugepage. In other words, physical address information is stored in different bit layout between hugepage hwpoison entry and pmd entry, so follow_hugetlb_page() which is called in get_dump_page() returns a wrong page from a given address. The expected behavior is like this: absent is_swap_pte FOLL_DUMP Expected behavior ------------------------------------------------------------------- true false false hugetlb_fault false true false hugetlb_fault false false false return page true false true skip page (to avoid allocation) false true true hugetlb_fault false false true return page With this patch, we can call hugetlb_fault() and take proper actions (we wait for migration entries, fail with VM_FAULT_HWPOISON_LARGE for hwpoisoned entries,) and as the result we can dump all hugepages except for hwpoisoned ones. Signed-off-by: Naoya Horiguchi Cc: Rik van Riel Acked-by: Michal Hocko Cc: HATAYAMA Daisuke Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Cc: [2.6.34+?] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ca9a7c6..1a12f5b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2961,7 +2961,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (absent || + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; -- cgit v1.1 From d72515b85a6583db131ec6032978e3c9d4291d95 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 17 Apr 2013 15:58:34 -0700 Subject: mm/vmscan: fix error return in kswapd_run() Fix the error return value in kswapd_run(). The bug was introduced by commit d5dc0ad928fb ("mm/vmscan: fix error number for failed kthread"). Signed-off-by: Xishi Qiu Reviewed-by: Wanpeng Li Reviewed-by: Rik van Riel Reported-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 88c5fed..669fba3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3188,9 +3188,9 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - pgdat->kswapd = NULL; pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); + pgdat->kswapd = NULL; } return ret; } -- cgit v1.1 From 3c0b9de6d37a481673e81001c57ca0e410c72346 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 27 Apr 2013 13:25:38 -0700 Subject: vm: add no-mmu vm_iomap_memory() stub I think we could just move the full vm_iomap_memory() function into util.h or similar, but I didn't get any reply from anybody actually using nommu even to this trivial patch, so I'm not going to touch it any more than required. Here's the fairly minimal stub to make the nommu case at least potentially work. It doesn't seem like anybody cares, though. Signed-off-by: Linus Torvalds --- mm/nommu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 2f3ea74..e001768 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1838,6 +1838,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long pfn = start >> PAGE_SHIFT; + unsigned long vm_len = vma->vm_end - vma->vm_start; + + pfn += vma->vm_pgoff; + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { -- cgit v1.1 From e39862958d54e4cccec01f5cdef3ae298e7386b8 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 29 Apr 2013 15:06:08 -0700 Subject: HWPOISON: check dirty flag to match against clean page Currently page_action() does not check dirty flag to determine whether the error page is "clean mlocked/unevictable LRU" page. This doesn't cause any misjudgement because we do matching against "dirty mlocked/unevictable LRU" just before the check. But in order to make code consistent and/or to avoid potential regression, we had better check dirty flag explicitly. Signed-off-by: Naoya Horiguchi Suggested-by: Chen Gong Cc: Andi Kleen Cc: Tony Luck Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index df0694c..ceb0c7f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -785,10 +785,10 @@ static struct page_state { { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, - { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, + { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, -- cgit v1.1 From fe0bfaaff84429a35e4447d4ccd646aecf5999fb Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 29 Apr 2013 15:06:10 -0700 Subject: mm: trace filemap add and del Use the events API to trace filemap loading and unloading of file pieces into the page cache. This patch aims at tracing the eviction reload cycle of executable and shared libraries pages in a memory constrained environment. The typical usage is to spot a specific device and inode (for example /lib/libc.so) to see the eviction cycles, and find out if frequently used code is rather spread across many pages (bad) or coallesced (good). Signed-off-by: Robert Jarzmik Cc: Dave Chinner Cc: Hugh Dickins Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index e1979fd..2581826 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ @@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -464,6 +468,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); } else { page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ -- cgit v1.1 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ff1536..da7a2fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2003,6 +2003,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) return; /* + * Walking all memory to count page types is very expensive and should + * be inhibited in non-blockable contexts. + */ + if (!(gfp_mask & __GFP_WAIT)) + filter |= SHOW_MEM_FILTER_PAGE_COUNT; + + /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set * of allowed nodes. -- cgit v1.1 From 250297edf83292c831fbf4504df54953c2aacfe4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:06:12 -0700 Subject: mm/shmem.c: remove an ifdef Create a CONFIG_MMU=y stub for ramfs_nommu_expand_for_mapping() in the usual fashion. Cc: Al Viro Cc: Wolfram Sang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 1c44af7..39b2a0b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -2830,8 +2831,6 @@ out4: * effectively equivalent, but much lighter weight. */ -#include - static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, @@ -2931,11 +2930,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ -#ifndef CONFIG_MMU res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) goto put_dentry; -#endif res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); -- cgit v1.1 From 369a713e9678227e203b53931ad1a10cd8eac811 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Mon, 29 Apr 2013 15:06:14 -0700 Subject: rmap: recompute pgoff for unmapping huge page We have to recompute pgoff if the given page is huge, since result based on HPAGE_SIZE is not approapriate for scanning the vma interval tree, as shown by commit 36e4f20af833 ("hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach"). Signed-off-by: Hillf Danton Cc: Michal Hocko Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 807c96b..6280da8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1513,6 +1513,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; + if (PageHuge(page)) + pgoff = page->index << compound_order(page); + mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); -- cgit v1.1 From 94f3d3afb65f27d4e7b8251e323c6418982cb9c7 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 29 Apr 2013 15:06:15 -0700 Subject: memblock: add assertion for zero allocation alignment This came to light when calling memblock allocator from arc port (for copying flattended DT). If a "0" alignment is passed, the allocator round_up() call incorrectly rounds up the size to 0. round_up(num, alignto) => ((num - 1) | (alignto -1)) + 1 While the obvious allocation failure causes kernel to panic, it is better to warn the caller to fix the code. Tejun suggested that instead of BUG_ON(!align) - which might be ineffective due to pending console init and such, it is better to WARN_ON, and continue the boot with a reasonable default align. Caller passing @size need not be handled similarly as the subsequent panic will indicate that anyhow. Signed-off-by: Vineet Gupta Cc: Yinghai Lu Cc: Wanpeng Li Cc: Ingo Molnar Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index b8d9147..2cce8b3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -771,6 +771,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; + if (WARN_ON(!align)) + align = __alignof__(long long); + /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); -- cgit v1.1 From e05c4bbfaedb7585a5b7936b3b84e68928c6474f Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:06:16 -0700 Subject: mm: walk_memory_range(): fix typo in comment Fix a typo "end_pft" in the comment of walk_memory_range(). Signed-off-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ee37657..57decb2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1613,7 +1613,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) /** * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) * @start_pfn: start pfn of the memory range - * @end_pfn: end pft of the memory range + * @end_pfn: end pfn of the memory range * @arg: argument passed to func * @func: callback for each memory section walked * -- cgit v1.1 From 2d42a40d592fadbd01cb639ac21a761fa31423f8 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Mon, 29 Apr 2013 15:06:19 -0700 Subject: mm/vmscan.c: minor cleanup for kswapd Local variable total_scanned is no longer used. Signed-off-by: Hillf Danton Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 669fba3..e03a00b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2619,7 +2619,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; @@ -2639,7 +2638,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, .gfp_mask = sc.gfp_mask, }; loop_again: - total_scanned = 0; sc.priority = DEF_PRIORITY; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; @@ -2730,7 +2728,6 @@ loop_again: order, sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; - total_scanned += nr_soft_scanned; /* * We put equal pressure on every zone, unless @@ -2765,7 +2762,6 @@ loop_again: reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; -- cgit v1.1 From 69afade72a3e13e96a065f757891d384d466123f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:06:21 -0700 Subject: mm: introduce common help functions to deal with reserved/managed pages The original goal of this patchset is to fix the bug reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501 Now it has also been expanded to reduce common code used by memory initializion. This is the first part, which applies to v3.9-rc1. It introduces following common helper functions to simplify free_initmem() and free_initrd_mem() on different architectures: adjust_managed_page_count(): will be used to adjust totalram_pages, totalhigh_pages, zone->managed_pages when reserving/unresering a page. __free_reserved_page(): free a reserved page into the buddy system without adjusting page statistics info free_reserved_page(): free a reserved page into the buddy system and adjust page statistics info mark_page_reserved(): mark a page as reserved and adjust page statistics info free_reserved_area(): free a continous ranges of pages by calling free_reserved_page() free_initmem_default(): default method to free __init pages. We have only tested these patchset on x86 platforms, and have done basic compliation tests using cross-compilers from ftp.kernel.org. That means some code may not pass compilation on some architectures. So any help to test this patchset are welcomed! There are several other parts still under development: Part2: introduce free_highmem_page() to simplify freeing highmem pages Part3: refine code to manage totalram_pages, totalhigh_pages and zone->managed_pages Part4: introduce helper functions to simplify mem_init() and remove the global variable num_physpages. This patch: Code to deal with reserved/managed pages are duplicated by many architectures, so introduce common help functions to reduce duplicated code. These common help functions will also be used to concentrate code to modify totalram_pages and zone->managed_pages, which makes the code much more clear. Signed-off-by: Jiang Liu Acked-by: Geert Uytterhoeven Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Anatolij Gustschin Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Zankel Cc: David Howells Cc: David S. Miller Cc: Eric Biederman Cc: Fenghua Yu Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Hirokazu Takata Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jiang Liu Cc: Jiang Liu Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Mikael Starvik Cc: Mike Frysinger Cc: Paul Mackerras Cc: Paul Mundt Cc: Ralf Baechle Cc: Richard Henderson Cc: Russell King Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index da7a2fe..5c660f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5121,6 +5121,26 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +unsigned long free_reserved_area(unsigned long start, unsigned long end, + int poison, char *s) +{ + unsigned long pages, pos; + + pos = start = PAGE_ALIGN(start); + end &= PAGE_MASK; + for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { + if (poison) + memset((void *)pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); + } + + if (pages && s) + pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", + s, pages << (PAGE_SHIFT - 10), start, end); + + return pages; +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved -- cgit v1.1 From cfa11e08ed39eb28a9eff9a907b20913020c69b5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:07:00 -0700 Subject: mm: introduce free_highmem_page() helper to free highmem pages into buddy system The original goal of this patchset is to fix the bug reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501 Now it has also been expanded to reduce common code used by memory initializion. This is the second part, which applies to the previous part at: http://marc.info/?l=linux-mm&m=136289696323825&w=2 It introduces a helper function free_highmem_page() to free highmem pages into the buddy system when initializing mm subsystem. Introduction of free_highmem_page() is one step forward to clean up accesses and modificaitons of totalhigh_pages, totalram_pages and zone->managed_pages etc. I hope we could remove all references to totalhigh_pages from the arch/ subdirectory. We have only tested these patchset on x86 platforms, and have done basic compliation tests using cross-compilers from ftp.kernel.org. That means some code may not pass compilation on some architectures. So any help to test this patchset are welcomed! There are several other parts still under development: Part3: refine code to manage totalram_pages, totalhigh_pages and zone->managed_pages Part4: introduce helper functions to simplify mem_init() and remove the global variable num_physpages. This patch: Introduce helper function free_highmem_page(), which will be used by architectures with HIGHMEM enabled to free highmem pages into the buddy system. Signed-off-by: Jiang Liu Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "Suzuki K. Poulose" Cc: Alexander Graf Cc: Arnd Bergmann Cc: Attilio Rao Cc: Benjamin Herrenschmidt Cc: Cong Wang Cc: David Daney Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: James Hogan Cc: Jeff Dike Cc: Jiang Liu Cc: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Konstantin Khlebnikov Cc: Linus Walleij Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Michal Simek Cc: Michel Lespinasse Cc: Minchan Kim Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Rik van Riel Cc: Russell King Cc: Sam Ravnborg Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c660f5..72da11c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5141,6 +5141,15 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end, return pages; } +#ifdef CONFIG_HIGHMEM +void free_highmem_page(struct page *page) +{ + __free_reserved_page(page); + totalram_pages++; + totalhigh_pages++; +} +#endif + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved -- cgit v1.1 From c40046f3ad5e877b18cc721aaa7906b98077bc2d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:14 -0700 Subject: memcg: keep prev's css alive for the whole mem_cgroup_iter The patchset tries to make mem_cgroup_iter saner in the way how it walks hierarchies. css->id based traversal is far from being ideal as it is not deterministic because it depends on the creation ordering. Additional to that css_id is considered a burden for cgroup maintainers because it is quite some code and memcg is the last user of it. After this series only the swap accounting uses css_id but that one will follow up later. Diffstat (if we exclude removed/added comments) looks quite promising. We got rid of some code: $ git diff mmotm... | grep -v "^[+-][[:space:]]*[/ ]\*" | diffstat b/include/linux/cgroup.h | 3 --- kernel/cgroup.c | 33 --------------------------------- mm/memcontrol.c | 4 +++- 3 files changed, 3 insertions(+), 37 deletions(-) The first patch is just preparatory and it changes when we release css of the previously returned memcg. Nothing controlversial. The second patch is the core of the patchset and it replaces css_get_next based on css_id by the generic cgroup pre-order. This brings some chalanges for the last visited group caching during the reclaim (mem_cgroup_per_zone::reclaim_iter). We have to use memcg pointers directly now which means that we have to keep a reference to those groups' css to keep them alive. I also folded iter_lock introduced by https://lkml.org/lkml/2013/1/3/295 in the previous version into this patch. Johannes felt the race I was describing should be mostly harmless and I haven't been able to trigger it so the lock doesn't deserve its own patch. It is still needed temporarily, though, because the reference counting on iter->last_visited depends on it. It will go away with the next patch. The next patch fixups an unbounded cgroup removal holdoff caused by the elevated css refcount. The issue has been observed by Ying Han. Johannes wasn't impressed by the previous version of the fix (https://lkml.org/lkml/2013/2/8/379) which cleaned up pending references during mem_cgroup_css_offline when a group is removed. He has suggested a different way when the iterator checks whether a cached memcg is still valid or no. More on that in the patch but the basic idea is that every memcg tracks the number removed subgroups and iterator records this number when a group is cached. These numbers are checked before iter->last_visited is about to be used and the iteration is restarted if it is invalid. The fourth and fifth patches are an attempt for simplification of the mem_cgroup_iter. css juggling is removed and the iteration logic is moved to a helper so that the reference counting and iteration are separated. The last patch just removes css_get_next as there is no user for it any longer. My testing looked as follows: A (use_hierarchy=1, limit_in_bytes=150M) /|\ 1 2 3 Children groups were created so that the number is never higher than 3 and their limits were random between 50-100M. Each group hosts a kernel build (starting with tar -xf so the tree is not shared and make -jNUM_CPUs/3) and terminated after random time - up to 5 minutes) and then it is removed. This should exercise both leaf and hierarchical reclaim as well as races with cgroup removals and debugging messages I added on top proved that. 100 groups were created during the test. This patch: css reference counting keeps the cgroup alive even though it has been already removed. mem_cgroup_iter relies on this fact and takes a reference to the returned group. The reference is then released on the next iteration or mem_cgroup_iter_break. mem_cgroup_iter currently releases the reference right after it gets the last css_id. This is correct because neither prev's memcg nor cgroup are accessed after then. This will change in the next patch so we need to hold the group alive a bit longer so let's move the css_put at the end of the function. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b55222..661a2c6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1100,12 +1100,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) id = css_id(&prev->css); - if (prev && prev != root) - css_put(&prev->css); - if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - return NULL; + goto out_css_put; return root; } @@ -1121,7 +1118,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) - return NULL; + goto out_css_put; id = iter->position; } @@ -1143,8 +1140,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } if (prev && !css) - return NULL; + goto out_css_put; } +out_css_put: + if (prev && prev != root) + css_put(&prev->css); + return memcg; } -- cgit v1.1 From 542f85f9ae4acd17dca06f4390f54d411a387efd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:15 -0700 Subject: memcg: rework mem_cgroup_iter to use cgroup iterators mem_cgroup_iter curently relies on css->id when walking down a group hierarchy tree. This is really awkward because the tree walk depends on the groups creation ordering. The only guarantee is that a parent node is visited before its children. Example: 1) mkdir -p a a/d a/b/c 2) mkdir -a a/b/c a/d Will create the same trees but the tree walks will be different: 1) a, d, b, c 2) a, b, c, d Commit 574bd9f7c7c1 ("cgroup: implement generic child / descendant walk macros") has introduced generic cgroup tree walkers which provide either pre-order or post-order tree walk. This patch converts css->id based iteration to pre-order tree walk to keep the semantic with the original iterator where parent is always visited before its subtree. cgroup_for_each_descendant_pre suggests using post_create and pre_destroy for proper synchronization with groups addidition resp. removal. This implementation doesn't use those because a new memory cgroup is initialized sufficiently for iteration in mem_cgroup_css_alloc already and css reference counting enforces that the group is alive for both the last seen cgroup and the found one resp. it signals that the group is dead and it should be skipped. If the reclaim cookie is used we need to store the last visited group into the iterator so we have to be careful that it doesn't disappear in the mean time. Elevated reference count on the css keeps it alive even though the group have been removed (parked waiting for the last dput so that it can be freed). Per node-zone-prio iter_lock has been introduced to ensure that css_tryget and iter->last_visited is set atomically. Otherwise two racing walkers could both take a references and only one release it leading to a css leak (which pins cgroup dentry). Signed-off-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 661a2c6..26a38b7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -152,10 +152,12 @@ struct mem_cgroup_stat_cpu { }; struct mem_cgroup_reclaim_iter { - /* css_id of the last scanned hierarchy member */ - int position; + /* last scanned hierarchy member with elevated css ref count */ + struct mem_cgroup *last_visited; /* scan generation, increased every round-trip */ unsigned int generation; + /* lock to protect the position and generation */ + spinlock_t iter_lock; }; /* @@ -1089,7 +1091,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *last_visited = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1098,7 +1100,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - id = css_id(&prev->css); + last_visited = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) @@ -1106,9 +1108,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, return root; } + rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *css = NULL; if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1117,31 +1120,74 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) - goto out_css_put; - id = iter->position; + spin_lock(&iter->iter_lock); + last_visited = iter->last_visited; + if (prev && reclaim->generation != iter->generation) { + if (last_visited) { + css_put(&last_visited->css); + iter->last_visited = NULL; + } + spin_unlock(&iter->iter_lock); + goto out_unlock; + } } - rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); - if (css) { - if (css == &root->css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* + * Root is not visited by cgroup iterators so it needs an + * explicit visit. + */ + if (!last_visited) { + css = &root->css; + } else { + struct cgroup *prev_cgroup, *next_cgroup; + + prev_cgroup = (last_visited == root) ? NULL + : last_visited->css.cgroup; + next_cgroup = cgroup_next_descendant_pre(prev_cgroup, + root->css.cgroup); + if (next_cgroup) + css = cgroup_subsys_state(next_cgroup, + mem_cgroup_subsys_id); + } + + /* + * Even if we found a group we have to make sure it is alive. + * css && !memcg means that the groups should be skipped and + * we should continue the tree walk. + * last_visited css is safe to use because it is protected by + * css_get and the tree walk is rcu safe. + */ + if (css == &root->css || (css && css_tryget(css))) + memcg = mem_cgroup_from_css(css); if (reclaim) { - iter->position = id; + struct mem_cgroup *curr = memcg; + + if (last_visited) + css_put(&last_visited->css); + + if (css && !memcg) + curr = mem_cgroup_from_css(css); + + /* make sure that the cached memcg is not removed */ + if (curr) + css_get(&curr->css); + iter->last_visited = curr; + if (!css) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; + spin_unlock(&iter->iter_lock); + } else if (css && !memcg) { + last_visited = mem_cgroup_from_css(css); } if (prev && !css) - goto out_css_put; + goto out_unlock; } +out_unlock: + rcu_read_unlock(); out_css_put: if (prev && prev != root) css_put(&prev->css); @@ -5929,8 +5975,12 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) return 1; for (zone = 0; zone < MAX_NR_ZONES; zone++) { + int prio; + mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); + for (prio = 0; prio < DEF_PRIORITY + 1; prio++) + spin_lock_init(&mz->reclaim_iter[prio].iter_lock); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; -- cgit v1.1 From 5f57816197186378449b848e99ca9cf41a0055f1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:17 -0700 Subject: memcg: relax memcg iter caching Now that the per-node-zone-priority iterator caches memory cgroups rather than their css ids we have to be careful and remove them from the iterator when they are on the way out otherwise they might live for unbounded amount of time even though their group is already gone (until the global/targeted reclaim triggers the zone under priority to find out the group is dead and let it to find the final rest). We can fix this issue by relaxing rules for the last_visited memcg. Instead of taking a reference to the css before it is stored into iter->last_visited we can just store its pointer and track the number of removed groups from each memcg's subhierarchy. This number would be stored into iterator everytime when a memcg is cached. If the iter count doesn't match the curent walker root's one we will start from the root again. The group counter is incremented upwards the hierarchy every time a group is removed. The iter_lock can be dropped because racing iterators cannot leak the reference anymore as the reference count is not elevated for last_visited when it is cached. Locking rules got a bit complicated by this change though. The iterator primarily relies on rcu read lock which makes sure that once we see a valid last_visited pointer then it will be valid for the whole RCU walk. smp_rmb makes sure that dead_count is read before last_visited and last_dead_count while smp_wmb makes sure that last_visited is updated before last_dead_count so the up-to-date last_dead_count cannot point to an outdated last_visited. css_tryget then makes sure that the last_visited is still alive in case the iteration races with the cached group removal (css is invalidated before mem_cgroup_css_offline increments dead_count). In short: mem_cgroup_iter rcu_read_lock() dead_count = atomic_read(parent->dead_count) smp_rmb() if (dead_count != iter->last_dead_count) last_visited POSSIBLY INVALID -> last_visited = NULL if (!css_tryget(iter->last_visited)) last_visited DEAD -> last_visited = NULL next = find_next(last_visited) css_tryget(next) css_put(last_visited) // css would be invalidated and parent->dead_count // incremented if this was the last reference iter->last_visited = next smp_wmb() iter->last_dead_count = dead_count rcu_read_unlock() cgroup_rmdir cgroup_destroy_locked atomic_add(CSS_DEACT_BIAS, &css->refcnt) // subsequent css_tryget fail mem_cgroup_css_offline mem_cgroup_invalidate_reclaim_iterators while(parent = parent_mem_cgroup) atomic_inc(parent->dead_count) css_put(css) // last reference held by cgroup core Spotted by Ying Han. Original idea from Johannes Weiner. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Ying Han Cc: Li Zefan Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26a38b7..408a5c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -152,12 +152,15 @@ struct mem_cgroup_stat_cpu { }; struct mem_cgroup_reclaim_iter { - /* last scanned hierarchy member with elevated css ref count */ + /* + * last scanned hierarchy member. Valid only if last_dead_count + * matches memcg->dead_count of the hierarchy root group. + */ struct mem_cgroup *last_visited; + unsigned long last_dead_count; + /* scan generation, increased every round-trip */ unsigned int generation; - /* lock to protect the position and generation */ - spinlock_t iter_lock; }; /* @@ -337,6 +340,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif @@ -1092,6 +1096,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; + unsigned long uninitialized_var(dead_count); if (mem_cgroup_disabled()) return NULL; @@ -1120,16 +1125,33 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - spin_lock(&iter->iter_lock); last_visited = iter->last_visited; if (prev && reclaim->generation != iter->generation) { - if (last_visited) { - css_put(&last_visited->css); - iter->last_visited = NULL; - } - spin_unlock(&iter->iter_lock); + iter->last_visited = NULL; goto out_unlock; } + + /* + * If the dead_count mismatches, a destruction + * has happened or is happening concurrently. + * If the dead_count matches, a destruction + * might still happen concurrently, but since + * we checked under RCU, that destruction + * won't free the object until we release the + * RCU reader lock. Thus, the dead_count + * check verifies the pointer is still valid, + * css_tryget() verifies the cgroup pointed to + * is alive. + */ + dead_count = atomic_read(&root->dead_count); + smp_rmb(); + last_visited = iter->last_visited; + if (last_visited) { + if ((dead_count != iter->last_dead_count) || + !css_tryget(&last_visited->css)) { + last_visited = NULL; + } + } } /* @@ -1169,16 +1191,14 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css && !memcg) curr = mem_cgroup_from_css(css); - /* make sure that the cached memcg is not removed */ - if (curr) - css_get(&curr->css); iter->last_visited = curr; + smp_wmb(); + iter->last_dead_count = dead_count; if (!css) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; - spin_unlock(&iter->iter_lock); } else if (css && !memcg) { last_visited = mem_cgroup_from_css(css); } @@ -5975,12 +5995,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) return 1; for (zone = 0; zone < MAX_NR_ZONES; zone++) { - int prio; - mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); - for (prio = 0; prio < DEF_PRIORITY + 1; prio++) - spin_lock_init(&mz->reclaim_iter[prio].iter_lock); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; @@ -6235,10 +6251,29 @@ mem_cgroup_css_online(struct cgroup *cont) return error; } +/* + * Announce all parents that a group from their hierarchy is gone. + */ +static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = memcg; + + while ((parent = parent_mem_cgroup(parent))) + atomic_inc(&parent->dead_count); + + /* + * if the root memcg is not hierarchical we have to check it + * explicitely. + */ + if (!root_mem_cgroup->use_hierarchy) + atomic_inc(&root_mem_cgroup->dead_count); +} + static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); } -- cgit v1.1 From 19f39402864ea3935b36ab1066c6283d6ea53f44 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:18 -0700 Subject: memcg: simplify mem_cgroup_iter The current implementation of mem_cgroup_iter has to consider both css and memcg to find out whether no group has been found (css==NULL - aka the loop is completed) and that no memcg is associated with the found node (!memcg - aka css_tryget failed because the group is no longer alive). This leads to awkward tweaks like tests for css && !memcg to skip the current node. It will be much easier if we got rid off css variable altogether and only rely on memcg. In order to do that the iteration part has to skip dead nodes. This sounds natural to me and as a nice side effect we will get a simple invariant that memcg is always alive when non-NULL and all nodes have been visited otherwise. We could get rid of the surrounding while loop but keep it in for now to make review easier. It will go away in the following patch. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 408a5c7..614d0d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1116,7 +1116,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css = NULL; if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1159,51 +1158,50 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, * explicit visit. */ if (!last_visited) { - css = &root->css; + memcg = root; } else { struct cgroup *prev_cgroup, *next_cgroup; prev_cgroup = (last_visited == root) ? NULL : last_visited->css.cgroup; - next_cgroup = cgroup_next_descendant_pre(prev_cgroup, - root->css.cgroup); - if (next_cgroup) - css = cgroup_subsys_state(next_cgroup, - mem_cgroup_subsys_id); - } +skip_node: + next_cgroup = cgroup_next_descendant_pre( + prev_cgroup, root->css.cgroup); - /* - * Even if we found a group we have to make sure it is alive. - * css && !memcg means that the groups should be skipped and - * we should continue the tree walk. - * last_visited css is safe to use because it is protected by - * css_get and the tree walk is rcu safe. - */ - if (css == &root->css || (css && css_tryget(css))) - memcg = mem_cgroup_from_css(css); + /* + * Even if we found a group we have to make sure it is + * alive. css && !memcg means that the groups should be + * skipped and we should continue the tree walk. + * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe. + */ + if (next_cgroup) { + struct mem_cgroup *mem = mem_cgroup_from_cont( + next_cgroup); + if (css_tryget(&mem->css)) + memcg = mem; + else { + prev_cgroup = next_cgroup; + goto skip_node; + } + } + } if (reclaim) { - struct mem_cgroup *curr = memcg; - if (last_visited) css_put(&last_visited->css); - if (css && !memcg) - curr = mem_cgroup_from_css(css); - - iter->last_visited = curr; + iter->last_visited = memcg; smp_wmb(); iter->last_dead_count = dead_count; - if (!css) + if (!memcg) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; - } else if (css && !memcg) { - last_visited = mem_cgroup_from_css(css); } - if (prev && !css) + if (prev && !memcg) goto out_unlock; } out_unlock: -- cgit v1.1 From 16248d8fe65ea116cc30b915e05a1c29496da120 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:19 -0700 Subject: memcg: further simplify mem_cgroup_iter mem_cgroup_iter basically does two things currently. It takes care of the house keeping (reference counting, raclaim cookie) and it iterates through a hierarchy tree (by using cgroup generic tree walk). The code would be much more easier to follow if we move the iteration outside of the function (to __mem_cgrou_iter_next) so the distinction is more clear. This patch doesn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 79 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 614d0d9..2bdac3e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1073,6 +1073,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +/* + * Returns a next (in a pre-order walk) alive memcg (with elevated css + * ref. count) or NULL if the whole root's subtree has been visited. + * + * helper function to be used by mem_cgroup_iter + */ +static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, + struct mem_cgroup *last_visited) +{ + struct cgroup *prev_cgroup, *next_cgroup; + + /* + * Root is not visited by cgroup iterators so it needs an + * explicit visit. + */ + if (!last_visited) + return root; + + prev_cgroup = (last_visited == root) ? NULL + : last_visited->css.cgroup; +skip_node: + next_cgroup = cgroup_next_descendant_pre( + prev_cgroup, root->css.cgroup); + + /* + * Even if we found a group we have to make sure it is + * alive. css && !memcg means that the groups should be + * skipped and we should continue the tree walk. + * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe. + */ + if (next_cgroup) { + struct mem_cgroup *mem = mem_cgroup_from_cont( + next_cgroup); + if (css_tryget(&mem->css)) + return mem; + else { + prev_cgroup = next_cgroup; + goto skip_node; + } + } + + return NULL; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1153,39 +1198,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } } - /* - * Root is not visited by cgroup iterators so it needs an - * explicit visit. - */ - if (!last_visited) { - memcg = root; - } else { - struct cgroup *prev_cgroup, *next_cgroup; - - prev_cgroup = (last_visited == root) ? NULL - : last_visited->css.cgroup; -skip_node: - next_cgroup = cgroup_next_descendant_pre( - prev_cgroup, root->css.cgroup); - - /* - * Even if we found a group we have to make sure it is - * alive. css && !memcg means that the groups should be - * skipped and we should continue the tree walk. - * last_visited css is safe to use because it is - * protected by css_get and the tree walk is rcu safe. - */ - if (next_cgroup) { - struct mem_cgroup *mem = mem_cgroup_from_cont( - next_cgroup); - if (css_tryget(&mem->css)) - memcg = mem; - else { - prev_cgroup = next_cgroup; - goto skip_node; - } - } - } + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { if (last_visited) -- cgit v1.1 From 106c992a5ebef28193cf5958e49ceff5e4aebb04 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 29 Apr 2013 15:07:23 -0700 Subject: mm/hugetlb: add more arch-defined huge_pte functions Commit abf09bed3cce ("s390/mm: implement software dirty bits") introduced another difference in the pte layout vs. the pmd layout on s390, thoroughly breaking the s390 support for hugetlbfs. This requires replacing some more pte_xxx functions in mm/hugetlbfs.c with a huge_pte_xxx version. This patch introduces those huge_pte_xxx functions and their generic implementation in asm-generic/hugetlb.h, which will now be included on all architectures supporting hugetlbfs apart from s390. This change will be a no-op for those architectures. [akpm@linux-foundation.org: fix warning] Signed-off-by: Gerald Schaefer Cc: Mel Gorman Cc: Hugh Dickins Cc: Hillf Danton Acked-by: Michal Hocko [for !s390 parts] Cc: Tony Luck Cc: Fenghua Yu Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a12f5b..73b864a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2247,10 +2247,11 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, pte_t entry; if (writable) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, + vma->vm_page_prot))); } else { - entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = huge_pte_wrprotect(mk_huge_pte(page, + vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); @@ -2264,7 +2265,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } @@ -2379,7 +2380,7 @@ again: * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { - pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep); continue; } @@ -2403,7 +2404,7 @@ again: pte = huge_ptep_get_and_clear(mm, address, ptep); tlb_remove_tlb_entry(tlb, ptep, address); - if (pte_dirty(pte)) + if (huge_pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page); @@ -2856,7 +2857,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. */ - if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -2886,12 +2887,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { - if (!pte_write(entry)) { + if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); goto out_page_table_lock; } - entry = pte_mkdirty(entry); + entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, @@ -2972,7 +2973,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * directly from any kind of swap entries. */ if (absent || is_swap_pte(huge_ptep_get(pte)) || - ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { + ((flags & FOLL_WRITE) && + !huge_pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); @@ -3042,7 +3044,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(pte_modify(pte, newprot)); + pte = pte_mkhuge(huge_pte_modify(pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); set_huge_pte_at(mm, address, ptep, pte); pages++; -- cgit v1.1 From 7136851117744f1d291bed6d307432699d405109 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Apr 2013 15:07:25 -0700 Subject: mm: make snapshotting pages for stable writes a per-bio operation Walking a bio's page mappings has proved problematic, so create a new bio flag to indicate that a bio's data needs to be snapshotted in order to guarantee stable pages during writeback. Next, for the one user (ext3/jbd) of snapshotting, hook all the places where writes can be initiated without PG_writeback set, and set BIO_SNAP_STABLE there. We must also flag journal "metadata" bios for stable writeout, since file data can be written through the journal. Finally, the MS_SNAP_STABLE mount flag (only used by ext3) is now superfluous, so get rid of it. [akpm@linux-foundation.org: rename _submit_bh()'s `flags' to `bio_flags', delobotomize the _submit_bh declaration] [akpm@linux-foundation.org: teeny cleanup] Signed-off-by: Darrick J. Wong Cc: Andy Lutomirski Cc: Adrian Hunter Cc: Artem Bityutskiy Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bounce.c | 21 +-------------------- mm/page-writeback.c | 4 ---- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/bounce.c b/mm/bounce.c index 5f89017..a5c2ec3 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -181,32 +181,13 @@ static void bounce_end_io_read_isa(struct bio *bio, int err) #ifdef CONFIG_NEED_BOUNCE_POOL static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) { - struct page *page; - struct backing_dev_info *bdi; - struct address_space *mapping; - struct bio_vec *from; - int i; - if (bio_data_dir(bio) != WRITE) return 0; if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) return 0; - /* - * Based on the first page that has a valid mapping, decide whether or - * not we have to employ bounce buffering to guarantee stable pages. - */ - bio_for_each_segment(from, bio, i) { - page = from->bv_page; - mapping = page_mapping(page); - if (!mapping) - continue; - bdi = mapping->backing_dev_info; - return mapping->host->i_sb->s_flags & MS_SNAP_STABLE; - } - - return 0; + return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); } #else static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index efe6814..4514ad7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2311,10 +2311,6 @@ void wait_for_stable_page(struct page *page) if (!bdi_cap_stable_pages_required(bdi)) return; -#ifdef CONFIG_NEED_BOUNCE_POOL - if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE) - return; -#endif /* CONFIG_NEED_BOUNCE_POOL */ wait_on_page_writeback(page); } -- cgit v1.1 From db3808c1bac64740b9d830fda92801ae65f1c851 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:28 -0700 Subject: mm, vmalloc: move get_vmalloc_info() to vmalloc.c Now get_vmalloc_info() is in fs/proc/mmu.c. There is no reason that this code must be here and it's implementation needs vmlist_lock and it iterate a vmlist which may be internal data structure for vmalloc. It is preferable that vmlist_lock and vmlist is only used in vmalloc.c for maintainability. So move the code to vmalloc.c Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f751f2..1d9878b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2645,5 +2645,49 @@ static int __init proc_vmalloc_init(void) return 0; } module_init(proc_vmalloc_init); + +void get_vmalloc_info(struct vmalloc_info *vmi) +{ + struct vm_struct *vma; + unsigned long free_area_size; + unsigned long prev_end; + + vmi->used = 0; + + if (!vmlist) { + vmi->largest_chunk = VMALLOC_TOTAL; + } else { + vmi->largest_chunk = 0; + + prev_end = VMALLOC_START; + + read_lock(&vmlist_lock); + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long addr = (unsigned long) vma->addr; + + /* + * Some archs keep another range for modules in vmlist + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; + + vmi->used += vma->size; + + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; + + prev_end = vma->size + addr; + } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + + read_unlock(&vmlist_lock); + } +} #endif -- cgit v1.1 From c69480adeea15883d9459a8adc3da3f6e8cb7a8c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:30 -0700 Subject: mm, vmalloc: protect va->vm by vmap_area_lock Inserting and removing an entry to vmlist is linear time complexity, so it is inefficient. Following patches will try to remove vmlist entirely. This patch is preparing step for it. For removing vmlist, iterating vmlist codes should be changed to iterating a vmap_area_list. Before implementing that, we should make sure that when we iterate a vmap_area_list, accessing to va->vm doesn't cause a race condition. This patch ensure that when iterating a vmap_area_list, there is no race condition for accessing to vm_struct. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d9878b..1bf94ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1290,12 +1290,14 @@ struct vm_struct *vmlist; static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { + spin_lock(&vmap_area_lock); vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; va->flags |= VM_VM_AREA; + spin_unlock(&vmap_area_lock); } static void insert_vmalloc_vmlist(struct vm_struct *vm) @@ -1447,6 +1449,11 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; + spin_lock(&vmap_area_lock); + va->vm = NULL; + va->flags &= ~VM_VM_AREA; + spin_unlock(&vmap_area_lock); + if (!(vm->flags & VM_UNLIST)) { struct vm_struct *tmp, **p; /* -- cgit v1.1 From e81ce85f960c2e26efb5d0802d56c34533edb1bd Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:32 -0700 Subject: mm, vmalloc: iterate vmap_area_list, instead of vmlist in vread/vwrite() Now, when we hold a vmap_area_lock, va->vm can't be discarded. So we can safely access to va->vm when iterating a vmap_area_list with holding a vmap_area_lock. With this property, change iterating vmlist codes in vread/vwrite() to iterating vmap_area_list. There is a little difference relate to lock, because vmlist_lock is mutex, but, vmap_area_lock is spin_lock. It may introduce a spinning overhead during vread/vwrite() is executing. But, these are debug-oriented functions, so this overhead is not real problem for common case. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1bf94ad..59aa328 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2012,7 +2012,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) long vread(char *buf, char *addr, unsigned long count) { - struct vm_struct *tmp; + struct vmap_area *va; + struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; unsigned long n; @@ -2021,10 +2022,17 @@ long vread(char *buf, char *addr, unsigned long count) if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - read_lock(&vmlist_lock); - for (tmp = vmlist; count && tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + vm->size - PAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -2034,10 +2042,10 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + vm->size - PAGE_SIZE - addr; if (n > count) n = count; - if (!(tmp->flags & VM_IOREMAP)) + if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); @@ -2046,7 +2054,7 @@ long vread(char *buf, char *addr, unsigned long count) count -= n; } finished: - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); if (buf == buf_start) return 0; @@ -2085,7 +2093,8 @@ finished: long vwrite(char *buf, char *addr, unsigned long count) { - struct vm_struct *tmp; + struct vmap_area *va; + struct vm_struct *vm; char *vaddr; unsigned long n, buflen; int copied = 0; @@ -2095,10 +2104,17 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; buflen = count; - read_lock(&vmlist_lock); - for (tmp = vmlist; count && tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + vm->size - PAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -2107,10 +2123,10 @@ long vwrite(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + vm->size - PAGE_SIZE - addr; if (n > count) n = count; - if (!(tmp->flags & VM_IOREMAP)) { + if (!(vm->flags & VM_IOREMAP)) { aligned_vwrite(buf, addr, n); copied++; } @@ -2119,7 +2135,7 @@ long vwrite(char *buf, char *addr, unsigned long count) count -= n; } finished: - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); if (!copied) return 0; return buflen; -- cgit v1.1 From f98782ddd31ac6f938386b79d8bd7aa7c8a78c50 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:34 -0700 Subject: mm, vmalloc: iterate vmap_area_list in get_vmalloc_info() This patch is a preparatory step for removing vmlist entirely. For above purpose, we change iterating a vmap_list codes to iterating a vmap_area_list. It is somewhat trivial change, but just one thing should be noticed. vmlist is lack of information about some areas in vmalloc address space. For example, vm_map_ram() allocate area in vmalloc address space, but it doesn't make a link with vmlist. To provide full information about vmalloc address space is better idea, so we don't use va->vm and use vmap_area directly. This makes get_vmalloc_info() more precise. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 56 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 59aa328..aee1f61 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2671,46 +2671,50 @@ module_init(proc_vmalloc_init); void get_vmalloc_info(struct vmalloc_info *vmi) { - struct vm_struct *vma; + struct vmap_area *va; unsigned long free_area_size; unsigned long prev_end; vmi->used = 0; + vmi->largest_chunk = 0; - if (!vmlist) { - vmi->largest_chunk = VMALLOC_TOTAL; - } else { - vmi->largest_chunk = 0; + prev_end = VMALLOC_START; - prev_end = VMALLOC_START; - - read_lock(&vmlist_lock); + spin_lock(&vmap_area_lock); - for (vma = vmlist; vma; vma = vma->next) { - unsigned long addr = (unsigned long) vma->addr; + if (list_empty(&vmap_area_list)) { + vmi->largest_chunk = VMALLOC_TOTAL; + goto out; + } - /* - * Some archs keep another range for modules in vmlist - */ - if (addr < VMALLOC_START) - continue; - if (addr >= VMALLOC_END) - break; + list_for_each_entry(va, &vmap_area_list, list) { + unsigned long addr = va->va_start; - vmi->used += vma->size; + /* + * Some archs keep another range for modules in vmalloc space + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; - free_area_size = addr - prev_end; - if (vmi->largest_chunk < free_area_size) - vmi->largest_chunk = free_area_size; + if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) + continue; - prev_end = vma->size + addr; - } + vmi->used += (va->va_end - va->va_start); - if (VMALLOC_END - prev_end > vmi->largest_chunk) - vmi->largest_chunk = VMALLOC_END - prev_end; + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; - read_unlock(&vmlist_lock); + prev_end = va->va_end; } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + +out: + spin_unlock(&vmap_area_lock); } #endif -- cgit v1.1 From d4033afdf8282802ad28b0ed854393454115a071 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:35 -0700 Subject: mm, vmalloc: iterate vmap_area_list, instead of vmlist, in vmallocinfo() This patch is a preparatory step for removing vmlist entirely. For above purpose, we change iterating a vmap_list codes to iterating a vmap_area_list. It is somewhat trivial change, but just one thing should be noticed. Using vmap_area_list in vmallocinfo() introduce ordering problem in SMP system. In s_show(), we retrieve some values from vm_struct. vm_struct's values is not fully setup when va->vm is assigned. Full setup is notified by removing VM_UNLIST flag without holding a lock. When we see that VM_UNLIST is removed, it is not ensured that vm_struct has proper values in view of other CPUs. So we need smp_[rw]mb for ensuring that proper values is assigned when we see that VM_UNLIST is removed. Therefore, this patch not only change a iteration list, but also add a appropriate smp_[rw]mb to right places. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 55 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index aee1f61..bda6cef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1304,7 +1304,14 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) { struct vm_struct *tmp, **p; + /* + * Before removing VM_UNLIST, + * we should make sure that vm has proper values. + * Pair with smp_rmb() in show_numa_info(). + */ + smp_wmb(); vm->flags &= ~VM_UNLIST; + write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) @@ -2542,19 +2549,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmlist_lock) + __acquires(&vmap_area_lock) { loff_t n = *pos; - struct vm_struct *v; + struct vmap_area *va; - read_lock(&vmlist_lock); - v = vmlist; - while (n > 0 && v) { + spin_lock(&vmap_area_lock); + va = list_entry((&vmap_area_list)->next, typeof(*va), list); + while (n > 0 && &va->list != &vmap_area_list) { n--; - v = v->next; + va = list_entry(va->list.next, typeof(*va), list); } - if (!n) - return v; + if (!n && &va->list != &vmap_area_list) + return va; return NULL; @@ -2562,16 +2569,20 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vm_struct *v = p; + struct vmap_area *va = p, *next; ++*pos; - return v->next; + next = list_entry(va->list.next, typeof(*va), list); + if (&next->list != &vmap_area_list) + return next; + + return NULL; } static void s_stop(struct seq_file *m, void *p) - __releases(&vmlist_lock) + __releases(&vmap_area_lock) { - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) @@ -2582,6 +2593,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; + /* Pair with smp_wmb() in insert_vmalloc_vmlist() */ + smp_rmb(); + if (v->flags & VM_UNLIST) + return; + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2595,7 +2611,20 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vm_struct *v = p; + struct vmap_area *va = p; + struct vm_struct *v; + + if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) + return 0; + + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + return 0; + } + + v = va->vm; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); -- cgit v1.1 From f1c4069e1dc128dc8a851174cba2e273652e9216 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:37 -0700 Subject: mm, vmalloc: export vmap_area_list, instead of vmlist Although our intention is to unexport internal structure entirely, but there is one exception for kexec. kexec dumps address of vmlist and makedumpfile uses this information. We are about to remove vmlist, then another way to retrieve information of vmalloc layer is needed for makedumpfile. For this purpose, we export vmap_area_list, instead of vmlist. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Eric Biederman Cc: Dave Anderson Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Chris Metcalf Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 3 +-- mm/vmalloc.c | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index e001768..2f1c75e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -228,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; +LIST_HEAD(vmap_area_list); void vfree(const void *addr) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bda6cef..7e63984 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -261,7 +261,8 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static LIST_HEAD(vmap_area_list); +/* Export for kexec only */ +LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -272,6 +273,10 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; +/*** Old vmalloc interfaces ***/ +static DEFINE_RWLOCK(vmlist_lock); +static struct vm_struct *vmlist; + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -1283,10 +1288,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); -/*** Old vmalloc interfaces ***/ -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { -- cgit v1.1 From 4341fa454796b8a37efd5db98112524e85e7114e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:39 -0700 Subject: mm, vmalloc: remove list management of vmlist after initializing vmalloc Now, there is no need to maintain vmlist after initializing vmalloc. So remove related code and data structure. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 52 ++++++++++++---------------------------------------- 1 file changed, 12 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7e63984..151da8a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -273,10 +273,6 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; -/*** Old vmalloc interfaces ***/ -static DEFINE_RWLOCK(vmlist_lock); -static struct vm_struct *vmlist; - static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -318,7 +314,7 @@ static void __insert_vmap_area(struct vmap_area *va) rb_link_node(&va->rb_node, parent, p); rb_insert_color(&va->rb_node, &vmap_area_root); - /* address-sort this list so it is usable like the vmlist */ + /* address-sort this list */ tmp = rb_prev(&va->rb_node); if (tmp) { struct vmap_area *prev; @@ -1130,6 +1126,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +static struct vm_struct *vmlist __initdata; /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1301,10 +1298,8 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, spin_unlock(&vmap_area_lock); } -static void insert_vmalloc_vmlist(struct vm_struct *vm) +static void clear_vm_unlist(struct vm_struct *vm) { - struct vm_struct *tmp, **p; - /* * Before removing VM_UNLIST, * we should make sure that vm has proper values. @@ -1312,22 +1307,13 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) */ smp_wmb(); vm->flags &= ~VM_UNLIST; - - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= vm->addr) - break; - } - vm->next = *p; - *p = vm; - write_unlock(&vmlist_lock); } static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { setup_vmalloc_vm(vm, va, flags, caller); - insert_vmalloc_vmlist(vm); + clear_vm_unlist(vm); } static struct vm_struct *__get_vm_area_node(unsigned long size, @@ -1370,10 +1356,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, /* * When this function is called from __vmalloc_node_range, - * we do not add vm_struct to vmlist here to avoid - * accessing uninitialized members of vm_struct such as - * pages and nr_pages fields. They will be set later. - * To distinguish it from others, we use a VM_UNLIST flag. + * we add VM_UNLIST flag to avoid accessing uninitialized + * members of vm_struct such as pages and nr_pages fields. + * They will be set later. */ if (flags & VM_UNLIST) setup_vmalloc_vm(area, va, flags, caller); @@ -1462,20 +1447,6 @@ struct vm_struct *remove_vm_area(const void *addr) va->flags &= ~VM_VM_AREA; spin_unlock(&vmap_area_lock); - if (!(vm->flags & VM_UNLIST)) { - struct vm_struct *tmp, **p; - /* - * remove from list and disallow access to - * this vm_struct before unmap. (address range - * confliction is maintained by vmap.) - */ - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) - ; - *p = tmp->next; - write_unlock(&vmlist_lock); - } - vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; @@ -1695,10 +1666,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return NULL; /* - * In this function, newly allocated vm_struct is not added - * to vmlist at __get_vm_area_node(). so, it is added here. + * In this function, newly allocated vm_struct has VM_UNLIST flag. + * It means that vm_struct is not fully initialized. + * Now, it is fully initialized, so remove this flag here. */ - insert_vmalloc_vmlist(area); + clear_vm_unlist(area); /* * A ref_count = 3 is needed because the vm_struct and vmap_area @@ -2594,7 +2566,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in insert_vmalloc_vmlist() */ + /* Pair with smp_wmb() in clear_vm_unlist() */ smp_rmb(); if (v->flags & VM_UNLIST) return; -- cgit v1.1 From 13ba3fcbbe31068b1ee7c39a0b58ecbed03c4d72 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Mon, 29 Apr 2013 15:07:40 -0700 Subject: kexec, vmalloc: export additional vmalloc layer information Now, vmap_area_list is exported as VMCOREINFO for makedumpfile to get the start address of vmalloc region (vmalloc_start). The address which contains vmalloc_start value is represented as below: vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start) However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) aren't exported as VMCOREINFO. So this patch exports them externally with small cleanup. [akpm@linux-foundation.org: vmalloc.h should include list.h for list_head] Signed-off-by: Atsushi Kumagai Cc: Joonsoo Kim Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 151da8a..72043d6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -249,17 +249,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 -struct vmap_area { - unsigned long va_start; - unsigned long va_end; - unsigned long flags; - struct rb_node rb_node; /* address sorted rbtree */ - struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ - struct vm_struct *vm; - struct rcu_head rcu_head; -}; - static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -- cgit v1.1 From ee5df0570c8af2610b28ab79bd8f8f8195687773 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 29 Apr 2013 15:07:42 -0700 Subject: mmap: find_vma: remove the WARN_ON_ONCE(!mm) check Remove the WARN_ON_ONCE(!mm) check as the comment suggested. Kernel code calls find_vma only when it is absolutely sure that the mm_struct arg to it is non-NULL. Signed-off-by: Zhang Yanfei Cc: k80c Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 0db0de1..b2c363f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1935,9 +1935,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; - if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ - return NULL; - /* Check the cache first. */ /* (Cache hit rate is typically around 35%.) */ vma = ACCESS_ONCE(mm->mmap_cache); -- cgit v1.1 From acb6d558f4c8fbacc8e774c9d7737220a3777882 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:43 -0700 Subject: memcg: do not check for do_swap_account in mem_cgroup_{read,write,reset} Since commit 2d11085e404f ("memcg: do not create memsw files if swap accounting is disabled") memsw files are created only if memcg swap accounting is enabled so it doesn't make any sense to check for it explicitly in mem_cgroup_read(), mem_cgroup_write() and mem_cgroup_reset(). Signed-off-by: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Tejun Heo Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bdac3e..d280016 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5025,9 +5025,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (type) { case _MEM: if (name == RES_USAGE) @@ -5162,9 +5159,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (name) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ @@ -5241,9 +5235,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (name) { case RES_MAX_USAGE: if (type == _MEM) -- cgit v1.1 From 6ee8630e02be6dd89926ca0fbc21af68b23dc087 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 29 Apr 2013 15:07:44 -0700 Subject: mm: allow arch code to control the user page table ceiling On architectures where a pgd entry may be shared between user and kernel (e.g. ARM+LPAE), freeing page tables needs a ceiling other than 0. This patch introduces a generic USER_PGTABLES_CEILING that arch code can override. It is the responsibility of the arch code setting the ceiling to ensure the complete freeing of the page tables (usually in pgd_free()). [catalin.marinas@arm.com: commit log; shift_arg_pages(), asm-generic/pgtables.h changes] Signed-off-by: Hugh Dickins Signed-off-by: Catalin Marinas Cc: Russell King Cc: [3.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index b2c363f..288958f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2302,7 +2302,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : 0); + next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); } @@ -2682,7 +2682,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); /* -- cgit v1.1 From 1444f92c84984dd13f3e8e121115783ae5b22c55 Mon Sep 17 00:00:00 2001 From: "Hampson, Steven T" Date: Mon, 29 Apr 2013 15:07:47 -0700 Subject: mm: merging memory blocks resets mempolicy Using mbind to change the mempolicy to MPOL_BIND on several adjacent mmapped blocks may result in a reset of the mempolicy to MPOL_DEFAULT in vma_adjust. Test code. Correct result is three lines containing "OK". #include #include #include #include #include /* gcc mbind_test.c -lnuma -o mbind_test -Wall */ #define MAXNODE 4096 void allocate() { int ret; int len; int policy = -1; unsigned char *p; unsigned long mask[MAXNODE] = { 0 }; unsigned long retmask[MAXNODE] = { 0 }; len = getpagesize() * 0x2fc00; p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) printf("mbind err: %d\n", errno); mask[0] = 1; ret = mbind(p, len, MPOL_BIND, mask, MAXNODE, 0); if (ret < 0) printf("mbind err: %d %d\n", ret, errno); ret = get_mempolicy(&policy, retmask, MAXNODE, p, MPOL_F_ADDR); if (ret < 0) printf("get_mempolicy err: %d %d\n", ret, errno); if (policy == MPOL_BIND) printf("OK\n"); else printf("ERROR: policy is %d\n", policy); } int main() { allocate(); allocate(); allocate(); return 0; } Signed-off-by: Steven T Hampson Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 288958f..081e6da 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -829,7 +829,7 @@ again: remove_next = 1 + (end > next->vm_end); if (next->anon_vma) anon_vma_merge(vma, next); mm->map_count--; - mpol_put(vma_policy(next)); + vma_set_policy(vma, vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), -- cgit v1.1 From 949f7ec5760b021da3cccc1eaeb0671270e4238f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:07:48 -0700 Subject: mm, hugetlb: include hugepages in meminfo Particularly in oom conditions, it's troublesome that hugetlb memory is not displayed. All other meminfo that is emitted will not add up to what is expected, and there is no artifact left in the kernel log to show that a potentially significant amount of memory is actually allocated as hugepages which are not available to be reclaimed. Booting with hugepages=8192 on the command line, this memory is now shown in oom conditions. For example, with echo m > /proc/sysrq-trigger: Node 0 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 1 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 2 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB Node 3 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: David Rientjes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++++++ mm/page_alloc.c | 3 +++ 2 files changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 73b864a..9b9aeef 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2121,6 +2121,21 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, h->surplus_huge_pages_node[nid]); } +void hugetlb_show_meminfo(void) +{ + struct hstate *h; + int nid; + + for_each_node_state(nid, N_MEMORY) + for_each_hstate(h) + pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 72da11c..7350986 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -3113,6 +3114,8 @@ void show_free_areas(unsigned int filter) printk("= %lukB\n", K(total)); } + hugetlb_show_meminfo(); + printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); show_swap_cache_info(); -- cgit v1.1 From 055e4fd96e95b0eee0d92fd54a26be7f0d3bcad0 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 29 Apr 2013 15:07:49 -0700 Subject: mm: try harder to allocate vmemmap blocks Hot-adding memory on x86_64 normally requires huge page allocation. When this is done to a VM guest, it's usually because the system is already tight on memory, so the request tends to fail. Try to avoid this by adding __GFP_REPEAT to the allocation flags. Addresses http://bugs.debian.org/699913 Signed-off-by: Ben Hutchings Signed-off-by: Johannes Weiner Reported-by: Bernhard Schmidt Tested-by: Bernhard Schmidt Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: "Luck, Tony" Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22a..22b7e18 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,10 +53,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) struct page *page; if (node_state(node, N_HIGH_MEMORY)) - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, get_order(size)); + page = alloc_pages_node( + node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + get_order(size)); else - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + page = alloc_pages( + GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, get_order(size)); if (page) return page_address(page); -- cgit v1.1 From 0aad818b2de455f1bfd7ef87c28cdbbaaed9a699 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Apr 2013 15:07:50 -0700 Subject: sparse-vmemmap: specify vmemmap population range in bytes The sparse code, when asking the architecture to populate the vmemmap, specifies the section range as a starting page and a number of pages. This is an awkward interface, because none of the arch-specific code actually thinks of the range in terms of 'struct page' units and always translates it to bytes first. In addition, later patches mix huge page and regular page backing for the vmemmap. For this, they need to call vmemmap_populate_basepages() on sub-section ranges with PAGE_SIZE and PMD_SIZE in mind. But these are not necessarily multiples of the 'struct page' size and so this unit is too coarse. Just translate the section range into bytes once in the generic sparse code, then pass byte ranges down the stack. Signed-off-by: Johannes Weiner Cc: Ben Hutchings Cc: Bernhard Schmidt Cc: Johannes Weiner Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: "Luck, Tony" Cc: Heiko Carstens Acked-by: David S. Miller Tested-by: David S. Miller Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 19 ++++++++++++------- mm/sparse.c | 10 ++++++++-- 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 22b7e18..27eeab3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -147,11 +147,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) return pgd; } -int __meminit vmemmap_populate_basepages(struct page *start_page, - unsigned long size, int node) +int __meminit vmemmap_populate_basepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr = start; pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -178,9 +177,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page, struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) { - struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); - int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); - if (error) + unsigned long start; + unsigned long end; + struct page *map; + + map = pfn_to_page(pnum * PAGES_PER_SECTION); + start = (unsigned long)map; + end = (unsigned long)(map + PAGES_PER_SECTION); + + if (vmemmap_populate(start, end, nid)) return NULL; return map; diff --git a/mm/sparse.c b/mm/sparse.c index 7ca6dc8..a37be5f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -615,11 +615,17 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, } static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - vmemmap_free(memmap, nr_pages); + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + vmemmap_free(start, end); } static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { - vmemmap_free(memmap, nr_pages); + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + vmemmap_free(start, end); } #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) -- cgit v1.1 From fed2719e7a8612471bd17113ed326d38df434f17 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 29 Apr 2013 15:07:57 -0700 Subject: mm: page_alloc: avoid marking zones full prematurely after zone_reclaim() The following problem was reported against a distribution kernel when zone_reclaim was enabled but the same problem applies to the mainline kernel. The reproduction case was as follows 1. Run numactl -m +0 dd if=largefile of=/dev/null This allocates a large number of clean pages in node 0 2. numactl -N +0 memhog 0.5*Mg This start a memory-using application in node 0. The expected behaviour is that the clean pages get reclaimed and the application uses node 0 for its memory. The observed behaviour was that the memory for the memhog application was allocated off-node since commits cd38b115d5ad ("mm: page allocator: initialise ZLC for first zone eligible for zone_reclaim") and commit 76d3fbf8fbf6 ("mm: page allocator: reconsider zones for allocation after direct reclaim"). The assumption of those patches was that it was always preferable to allocate quickly than stall for long periods of time and they were meant to take care that the zone was only marked full when necessary but an important case was missed. In the allocator fast path, only the low watermarks are checked. If the zones free pages are between the low and min watermark then allocations from the allocators slow path will succeed. However, zone_reclaim will only reclaim SWAP_CLUSTER_MAX or 1< Reported-by: Hedi Berriche Tested-by: Hedi Berriche Reviewed-by: Michal Hocko Reviewed-by: Wanpeng Li Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7350986..b54c5cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1942,9 +1942,24 @@ zonelist_scan: continue; default: /* did we reclaim enough */ - if (!zone_watermark_ok(zone, order, mark, + if (zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) + goto try_this_zone; + + /* + * Failed to reclaim enough to meet watermark. + * Only mark the zone full if checking the min + * watermark or if we failed to reclaim just + * 1< Date: Mon, 29 Apr 2013 15:07:58 -0700 Subject: mm/migrate: fix comment typo syncronous->synchronous Signed-off-by: Jianguo Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 3bbaf5d..c87ef92 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -736,7 +736,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (PageWriteback(page)) { /* - * Only in the case of a full syncronous migration is it + * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much -- cgit v1.1 From 7c243c7168dcc1bc2081fc0494923cd7cc808fb6 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Mon, 29 Apr 2013 15:07:59 -0700 Subject: mm: speedup in __early_pfn_to_nid When booting on a large memory system, the kernel spends considerable time in memmap_init_zone() setting up memory zones. Analysis shows significant time spent in __early_pfn_to_nid(). The routine memmap_init_zone() checks each PFN to verify the nid is valid. __early_pfn_to_nid() sequentially scans the list of pfn ranges to find the right range and returns the nid. This does not scale well. On a 4 TB (single rack) system there are 308 memory ranges to scan. The higher the PFN the more time spent sequentially spinning through memory ranges. Since memmap_init_zone() increments pfn, it will almost always be looking for the same range as the previous pfn, so check that range first. If it is in the same range, return that nid. If not, scan the list as before. A 4 TB (single rack) UV1 system takes 512 seconds to get through the zone code. This performance optimization reduces the time by 189 seconds, a 36% improvement. A 2 TB (single rack) UV2 system goes from 212.7 seconds to 99.8 seconds, a 112.9 second (53%) reduction. [akpm@linux-foundation.org: make the statics __meminitdata] [akpm@linux-foundation.org: fix comment formatting] [akpm@linux-foundation.org: fix ia64, per yinghai] [akpm@linux-foundation.org: add missing semicolon, per Tony] Signed-off-by: Russ Anderson Cc: David Rientjes Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Tested-by: "Luck, Tony" Cc: Yinghai Lu Cc: Lin Feng Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b54c5cb..5a234b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4187,10 +4187,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; int i, nid; + /* + * NOTE: The following SMP-unsafe globals are only used early in boot + * when the kernel is running single-threaded. + */ + static unsigned long __meminitdata last_start_pfn, last_end_pfn; + static int __meminitdata last_nid; + + if (last_start_pfn <= pfn && pfn < last_end_pfn) + return last_nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - if (start_pfn <= pfn && pfn < end_pfn) + if (start_pfn <= pfn && pfn < end_pfn) { + last_start_pfn = start_pfn; + last_end_pfn = end_pfn; + last_nid = nid; return nid; + } /* This is a memory hole */ return -1; } -- cgit v1.1 From f9872caf07c1c774034b8bddde7d4a3a7f4a6484 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Mon, 29 Apr 2013 15:08:01 -0700 Subject: page_alloc: make setup_nr_node_ids() usable for arch init code powerpc and x86 were opencoding copies of setup_nr_node_ids(), which page_alloc provides but makes static. Make it avaliable to the archs in linux/mm.h. Signed-off-by: Cody P Schafer Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a234b6..98cbdf6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4749,7 +4749,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* * Figure out the number of possible node ids. */ -static void __init setup_nr_node_ids(void) +void __init setup_nr_node_ids(void) { unsigned int node; unsigned int highest = 0; @@ -4758,10 +4758,6 @@ static void __init setup_nr_node_ids(void) highest = node; nr_node_ids = highest + 1; } -#else -static inline void setup_nr_node_ids(void) -{ -} #endif /** -- cgit v1.1 From 3ac38faa1f09d7eb01497eb50eb83e4f2d132667 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:08:06 -0700 Subject: mm/slub.c: use register_hotmemory_notifier() Squishes a statement-with-no-effect warning, removes some ifdefs and shrinks .text by 2 bytes. Note that this code fails to check for blocking_notifier_chain_register() failures. Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4aec537..a0206df 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -18,6 +18,7 @@ #include #include "slab.h" #include +#include #include #include #include @@ -3483,7 +3484,6 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -3598,7 +3598,10 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG */ +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; /******************************************************************** * Basic setup of slabs @@ -3651,7 +3654,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + register_hotmemory_notifier(&slab_memory_callback_nb); /* Able to allocate the per node structures */ slab_state = PARTIAL; -- cgit v1.1 From c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:10 -0700 Subject: mm: limit growth of 3% hardcoded other user reserve Add user_reserve_kbytes knob. Limit the growth of the memory reserved for other user processes to min(3% current process size, user_reserve_pages). Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. user_reserve_pages defaults to min(3% free pages, 128MB) I arrived at 128MB by taking the max VSZ of sshd, login, bash, and top ... then adding the RSS of each. This only affects OVERCOMMIT_NEVER mode. Background 1. user reserve __vm_enough_memory reserves a hardcoded 3% of the current process size for other applications when overcommit is disabled. This was done so that a user could recover if they launched a memory hogging process. Without the reserve, a user would easily run into a message such as: bash: fork: Cannot allocate memory 2. admin reserve Additionally, a hardcoded 3% of free memory is reserved for root in both overcommit 'guess' and 'never' modes. This was intended to prevent a scenario where root-cant-log-in and perform recovery operations. Note that this reserve shrinks, and doesn't guarantee a useful reserve. Motivation The two hardcoded memory reserves should be updated to account for current memory sizes. Also, the admin reserve would be more useful if it didn't shrink too much. When the current code was originally written, 1GB was considered "enterprise". Now the 3% reserve can grow to multiple GB on large memory systems, and it only needs to be a few hundred MB at most to enable a user or admin to recover a system with an unwanted memory hogging process. I've found that reducing these reserves is especially beneficial for a specific type of application load: * single application system * one or few processes (e.g. one per core) * allocating all available memory * not initializing every page immediately * long running I've run scientific clusters with this sort of load. A long running job sometimes failed many hours (weeks of CPU time) into a calculation. They weren't initializing all of their memory immediately, and they weren't using calloc, so I put systems into overcommit 'never' mode. These clusters run diskless and have no swap. However, with the current reserves, a user wishing to allocate as much memory as possible to one process may be prevented from using, for example, almost 2GB out of 32GB. The effect is less, but still significant when a user starts a job with one process per core. I have repeatedly seen a set of processes requesting the same amount of memory fail because one of them could not allocate the amount of memory a user would expect to be able to allocate. For example, Message Passing Interfce (MPI) processes, one per core. And it is similar for other parallel programming frameworks. Changing this reserve code will make the overcommit never mode more useful by allowing applications to allocate nearly all of the available memory. Also, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory. Risks * "bash: fork: Cannot allocate memory" The downside of the first patch-- which creates a tunable user reserve that is only used in overcommit 'never' mode--is that an admin can set it so low that a user may not be able to kill their process, even if they already have a shell prompt. Of course, a user can get in the same predicament with the current 3% reserve--they just have to launch processes until 3% becomes negligible. * root-cant-log-in problem The second patch, adding the tunable rootuser_reserve_pages, allows the admin to shoot themselves in the foot by setting it too small. They can easily get the system into a state where root-can't-log-in. However, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory. Alternatives * Memory cgroups provide a more flexible way to limit application memory. Not everyone wants to set up cgroups or deal with their overhead. * We could create a fourth overcommit mode which provides smaller reserves. The size of useful reserves may be drastically different depending on the whether the system is embedded or enterprise. * Force users to initialize all of their memory or use calloc. Some users don't want/expect the system to overcommit when they malloc. Overcommit 'never' mode is for this scenario, and it should work well. The new user and admin reserve tunables are simple to use, with low overhead compared to cgroups. The patches preserve current behavior where 3% of memory is less than 128MB, except that the admin reserve doesn't shrink to an unusable size under pressure. The code allows admins to tune for embedded and enterprise usage. FAQ * How is the root-cant-login problem addressed? What happens if admin_reserve_pages is set to 0? Root is free to shoot themselves in the foot by setting admin_reserve_kbytes too low. On x86_64, the minimum useful reserve is: 8MB for overcommit 'guess' 128MB for overcommit 'never' admin_reserve_pages defaults to min(3% free memory, 8MB) So, anyone switching to 'never' mode needs to adjust admin_reserve_pages. * How do you calculate a minimum useful reserve? A user or the admin needs enough memory to login and perform recovery operations, which includes, at a minimum: sshd or login + bash (or some other shell) + top (or ps, kill, etc.) For overcommit 'guess', we can sum resident set sizes (RSS) because we only need enough memory to handle what the recovery programs will typically use. On x86_64 this is about 8MB. For overcommit 'never', we can take the max of their virtual sizes (VSZ) and add the sum of their RSS. We use VSZ instead of RSS because mode forces us to ensure we can fulfill all of the requested memory allocations-- even if the programs only use a fraction of what they ask for. On x86_64 this is about 128MB. When swap is enabled, reserves are useful even when they are as small as 10MB, regardless of overcommit mode. When both swap and overcommit are disabled, then the admin should tune the reserves higher to be absolutley safe. Over 230MB each was safest in my testing. * What happens if user_reserve_pages is set to 0? Note, this only affects overcomitt 'never' mode. Then a user will be able to allocate all available memory minus admin_reserve_kbytes. However, they will easily see a message such as: "bash: fork: Cannot allocate memory" And they won't be able to recover/kill their application. The admin should be able to recover the system if admin_reserve_kbytes is set appropriately. * What's the difference between overcommit 'guess' and 'never'? "Guess" allows an allocation if there are enough free + reclaimable pages. It has a hardcoded 3% of free pages reserved for root. "Never" allows an allocation if there is enough swap + a configurable percentage (default is 50) of physical RAM. It has a hardcoded 3% of free pages reserved for root, like "Guess" mode. It also has a hardcoded 3% of the current process size reserved for additional applications. * Why is overcommit 'guess' not suitable even when an app eventually writes to every page? It takes free pages, file pages, available swap pages, reclaimable slab pages into consideration. In other words, these are all pages available, then why isn't overcommit suitable? Because it only looks at the present state of the system. It does not take into account the memory that other applications have malloced, but haven't initialized yet. It overcommits the system. Test Summary There was little change in behavior in the default overcommit 'guess' mode with swap enabled before and after the patch. This was expected. Systems run most predictably (i.e. no oom kills) in overcommit 'never' mode with swap enabled. This also allowed the most memory to be allocated to a user application. Overcommit 'guess' mode without swap is a bad idea. It is easy to crash the system. None of the other tested combinations crashed. This matches my experience on the Roadrunner supercomputer. Without the tunable user reserve, a system in overcommit 'never' mode and without swap does not allow the admin to recover, although the admin can. With the new tunable reserves, a system in overcommit 'never' mode and without swap can be configured to: 1. maximize user-allocatable memory, running close to the edge of recoverability 2. maximize recoverability, sacrificing allocatable memory to ensure that a user cannot take down a system Test Description Fedora 18 VM - 4 x86_64 cores, 5725MB RAM, 4GB Swap System is booted into multiuser console mode, with unnecessary services turned off. Caches were dropped before each test. Hogs are user memtester processes that attempt to allocate all free memory as reported by /proc/meminfo In overcommit 'never' mode, memory_ratio=100 Test Results 3.9.0-rc1-mm1 Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery ---------- ---- ---- ------------- ---- ------------- -------------- guess yes 1 5432/5432 no yes yes guess yes 4 5444/5444 1 yes yes guess no 1 5302/5449 no yes yes guess no 4 - crash no no never yes 1 5460/5460 1 yes yes never yes 4 5460/5460 1 yes yes never no 1 5218/5432 no no yes never no 4 5203/5448 no no yes 3.9.0-rc1-mm1-tunablereserves User and Admin Recovery show their respective reserves, if applicable. Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery ---------- ---- ---- ------------- ---- ------------- -------------- guess yes 1 5419/5419 no - yes 8MB yes guess yes 4 5436/5436 1 - yes 8MB yes guess no 1 5440/5440 * - yes 8MB yes guess no 4 - crash - no 8MB no * process would successfully mlock, then the oom killer would pick it never yes 1 5446/5446 no 10MB yes 20MB yes never yes 4 5456/5456 no 10MB yes 20MB yes never no 1 5387/5429 no 128MB no 8MB barely never no 1 5323/5428 no 226MB barely 8MB barely never no 1 5323/5428 no 226MB barely 8MB barely never no 1 5359/5448 no 10MB no 10MB barely never no 1 5323/5428 no 0MB no 10MB barely never no 1 5332/5428 no 0MB no 50MB yes never no 1 5293/5429 no 0MB no 90MB yes never no 1 5001/5427 no 230MB yes 338MB yes never no 4* 4998/5424 no 230MB yes 338MB yes * more memtesters were launched, able to allocate approximately another 100MB Future Work - Test larger memory systems. - Test an embedded image. - Test other architectures. - Time malloc microbenchmarks. - Would it be useful to be able to set overcommit policy for each memory cgroup? - Some lines are slightly above 80 chars. Perhaps define a macro to convert between pages and kb? Other places in the kernel do this. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_user_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 35 ++++++++++++++++++++++++++++++----- mm/nommu.c | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 081e6da..80a965f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -122,7 +123,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -183,10 +184,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -3094,3 +3098,24 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0); VM_BUG_ON(ret); } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 2f1c75e..58e4a0a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -63,6 +63,7 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1897,7 +1898,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -1957,10 +1958,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -2122,3 +2126,24 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, up_write(&nommu_region_sem); return 0; } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) -- cgit v1.1 From 4eeab4f5580d11bffedc697684b91b0bca0d5009 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:11 -0700 Subject: mm: replace hardcoded 3% with admin_reserve_pages knob Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% free pages, 8MB) I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 30 ++++++++++++++++++++++++++---- mm/nommu.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 80a965f..5485f18 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = (totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 58e4a0a..fbe3e2f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = totalram_pages * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some 3% for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) -- cgit v1.1 From 1640879afe0065caf276e98fff059c4dc01c97ae Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:12 -0700 Subject: mm: reinititalise user and admin reserves if memory is added or removed Alter the admin and user reserves of the previous patches in this series when memory is added or removed. If memory is added and the reserves have been eliminated or increased above the default max, then we'll trust the admin. If memory is removed and there isn't enough free memory, then we need to reset the reserves. Otherwise keep the reserve set by the admin. The reserve reset code is the same as the reserve initialization code. I tested hot addition and removal by triggering it via sysfs. The reserves shrunk when they were set high and memory was removed. They were reset higher when memory was added again. [akpm@linux-foundation.org: use register_hotmemory_notifier()] [akpm@linux-foundation.org: init_user_reserve() and init_admin_reserve can no longer be __meminit] [fengguang.wu@intel.com: make init_reserve_notifier() static] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew Shewmaker Signed-off-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5485f18..43c4955 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -3110,7 +3112,7 @@ void __init mmap_init(void) * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ -static int __meminit init_user_reserve(void) +static int init_user_reserve(void) { unsigned long free_kbytes; @@ -3131,7 +3133,7 @@ module_init(init_user_reserve) * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ -static int __meminit init_admin_reserve(void) +static int init_admin_reserve(void) { unsigned long free_kbytes; @@ -3141,3 +3143,73 @@ static int __meminit init_admin_reserve(void) return 0; } module_init(init_admin_reserve) + +/* + * Reinititalise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long tmp, free_kbytes; + + switch (action) { + case MEM_ONLINE: + /* Default max is 128MB. Leave alone if modified by operator. */ + tmp = sysctl_user_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 17)) + init_user_reserve(); + + /* Default max is 8MB. Leave alone if modified by operator. */ + tmp = sysctl_admin_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 13)) + init_admin_reserve(); + + break; + case MEM_OFFLINE: + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + if (sysctl_user_reserve_kbytes > free_kbytes) { + init_user_reserve(); + pr_info("vm.user_reserve_kbytes reset to %lu\n", + sysctl_user_reserve_kbytes); + } + + if (sysctl_admin_reserve_kbytes > free_kbytes) { + init_admin_reserve(); + pr_info("vm.admin_reserve_kbytes reset to %lu\n", + sysctl_admin_reserve_kbytes); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { + .notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ + if (register_hotmemory_notifier(&reserve_mem_nb)) + printk("Failed registering memory add/remove notifier for admin reserve"); + + return 0; +} +module_init(init_reserve_notifier) -- cgit v1.1 From 573b400d01b3411c2436e73d8e63fc1186b94535 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 29 Apr 2013 15:08:13 -0700 Subject: mm/memcontrol.c: remove unnecessary ; Just a trivial issue I stumbled on while doing something else... Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d280016..7e5bc43 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5813,7 +5813,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return ret; return mem_cgroup_sockets_init(memcg, ss); -}; +} static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { -- cgit v1.1 From f1cb08798e2497238b28f377bd131426f0b9835d Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Mon, 29 Apr 2013 15:08:14 -0700 Subject: mm: remove CONFIG_HOTPLUG ifdefs CONFIG_HOTPLUG is going away as an option, cleanup CONFIG_HOTPLUG ifdefs in mm files. Signed-off-by: Yijing Wang Acked-by: Greg Kroah-Hartman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index e1d8ed1..c823776 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -52,7 +52,6 @@ void all_vm_events(unsigned long *ret) } EXPORT_SYMBOL_GPL(all_vm_events); -#ifdef CONFIG_HOTPLUG /* * Fold the foreign cpu events into our own. * @@ -69,7 +68,6 @@ void vm_events_fold_cpu(int cpu) fold_state->event[i] = 0; } } -#endif /* CONFIG_HOTPLUG */ #endif /* CONFIG_VM_EVENT_COUNTERS */ -- cgit v1.1 From 52f37629fd3c7b24e1e6c125e665454cd7ac1acb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 29 Apr 2013 15:08:15 -0700 Subject: THP: fix comment about memory barrier Currently the memory barrier in __do_huge_pmd_anonymous_page doesn't work. Because lru_cache_add_lru uses pagevec so it could miss spinlock easily so above rule was broken so user might see inconsistent data. I was not first person who pointed out the problem. Mel and Peter pointed out a few months ago and Peter pointed out further that even spin_lock/unlock can't make sure of it: http://marc.info/?t=134333512700004 In particular: *A = a; LOCK UNLOCK *B = b; may occur as: LOCK, STORE *B, STORE *A, UNLOCK At last, Hugh pointed out that even we don't need memory barrier in there because __SetPageUpdate already have done it from Nick's commit 0ed361dec369 ("mm: fix PageUptodate data race") explicitly. So this patch fixes comment on THP and adds same comment for do_anonymous_page, too because everybody except Hugh was missing that. It means we need a comment about that. Signed-off-by: Minchan Kim Acked-by: Andrea Arcangeli Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Hugh Dickins Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 +++++------ mm/memory.c | 5 +++++ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2f7f5aa..45eaae0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -713,6 +713,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return VM_FAULT_OOM; clear_huge_page(page, haddr, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. + */ __SetPageUptodate(page); spin_lock(&mm->page_table_lock); @@ -724,12 +729,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, } else { pmd_t entry; entry = mk_huge_pmd(page, vma); - /* - * The spinlocking to take the lru_lock inside - * page_add_new_anon_rmap() acts as a full memory - * barrier to be sure clear_huge_page writes become - * visible after the set_pmd_at() write. - */ page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pgtable); diff --git a/mm/memory.c b/mm/memory.c index ba94dec..f7a1fba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3244,6 +3244,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. + */ __SetPageUptodate(page); if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) -- cgit v1.1 From c73e5c9c59a0f7ba30b3e5f7bd2d8097d4c89c6d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 29 Apr 2013 15:08:16 -0700 Subject: mm: rewrite the comment over migrate_pages() more comprehensibly The comment over migrate_pages() looks quite weird, and makes it hard to grasp what it is trying to say. Rewrite it more comprehensibly. Signed-off-by: Srivatsa S. Bhat Acked-by: Christoph Lameter Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index c87ef92..27ed225 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -973,19 +973,23 @@ out: } /* - * migrate_pages + * migrate_pages - migrate the pages specified in a list, to the free pages + * supplied as the target for the page migration * - * The function takes one list of pages to migrate and a function - * that determines from the page to be migrated and the private data - * the target of the move and allocates the page. + * @from: The list of pages to be migrated. + * @get_new_page: The function used to allocate free pages to be used + * as the target of the page migration. + * @private: Private data to be passed on to get_new_page() + * @mode: The migration mode that specifies the constraints for + * page migration, if any. + * @reason: The reason for page migration. * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * Caller should call putback_lru_pages to return pages to the LRU + * The function returns after 10 attempts or if no pages are movable any more + * because the list has become empty or no retryable pages exist any more. + * The caller should call putback_lru_pages() to return pages to the LRU * or free list only if ret != 0. * - * Return: Number of pages not migrated or error code. + * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, enum migrate_mode mode, int reason) -- cgit v1.1 From fe74ebb106a5950e82222c8ea258a9c0d7c65f04 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:20 -0700 Subject: mm: change __remove_pages() to call release_mem_region_adjustable() Change __remove_pages() to call release_mem_region_adjustable(). This allows a requested memory range to be released from the iomem_resource table even if it does not match exactly to an resource entry but still fits into. The resource entries initialized at bootup usually cover the whole contiguous memory ranges and may not necessarily match with the size of memory hot-delete requests. If release_mem_region_adjustable() failed, __remove_pages() emits a warning message and continues to proceed as it was the case with release_mem_region(). release_mem_region(), which is defined to __release_region(), emits a warning message and returns no error since a void function. Signed-off-by: Toshi Kani Reviewed-by : Yasuaki Ishimatsu Acked-by: David Rientjes Cc: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 57decb2..c916582 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -705,8 +705,10 @@ EXPORT_SYMBOL_GPL(__add_pages); int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { - unsigned long i, ret = 0; + unsigned long i; int sections_to_remove; + resource_size_t start, size; + int ret = 0; /* * We can only remove entire sections @@ -714,7 +716,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) + pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n", + start, start + size - 1, ret); sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { -- cgit v1.1 From 4edd7ceff0662afde195da6f6c43e7cbe1ed2dc4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:08:22 -0700 Subject: mm, hotplug: avoid compiling memory hotremove functions when disabled __remove_pages() is only necessary for CONFIG_MEMORY_HOTREMOVE. PowerPC pseries will return -EOPNOTSUPP if unsupported. Adding an #ifdef causes several other functions it depends on to also become unnecessary, which saves in .text when disabled (it's disabled in most defconfigs besides powerpc, including x86). remove_memory_block() becomes static since it is not referenced outside of drivers/base/memory.c. Build tested on x86 and powerpc with CONFIG_MEMORY_HOTREMOVE both enabled and disabled. Signed-off-by: David Rientjes Acked-by: Toshi Kani Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Greg Kroah-Hartman Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 68 ++++++++++++++++++++++++++------------------------ mm/sparse.c | 72 +++++++++++++++++++++++++++++------------------------ 2 files changed, 74 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c916582..60f6daa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -436,6 +436,40 @@ static int __meminit __add_section(int nid, struct zone *zone, return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); + + /* + * EEXIST is finally dealt with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. + */ + if (err && (err != -EEXIST)) + break; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(__add_pages); + +#ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static int find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, @@ -658,39 +692,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) return 0; } -/* - * Reasonably generic function for adding memory. It is - * expected that archs that support memory hotplug will - * call this function after deciding the zone to which to - * add the new pages. - */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) -{ - unsigned long i; - int err = 0; - int start_sec, end_sec; - /* during initialize mem_map, align hot-added range to section */ - start_sec = pfn_to_section_nr(phys_start_pfn); - end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - - for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); - - /* - * EEXIST is finally dealt with by ioresource collision - * check. see add_memory() => register_memory_resource() - * Warning will be printed if there is collision. - */ - if (err && (err != -EEXIST)) - break; - err = 0; - } - - return err; -} -EXPORT_SYMBOL_GPL(__add_pages); - /** * __remove_pages() - remove sections of pages from a zone * @zone: zone from which pages need to be removed @@ -733,6 +734,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, return ret; } EXPORT_SYMBOL_GPL(__remove_pages); +#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { diff --git a/mm/sparse.c b/mm/sparse.c index a37be5f..1c91f0d3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -620,6 +620,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) vmemmap_free(start, end); } +#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long start = (unsigned long)memmap; @@ -627,6 +628,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) vmemmap_free(start, end); } +#endif /* CONFIG_MEMORY_HOTREMOVE */ #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { @@ -664,6 +666,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) get_order(sizeof(struct page) * nr_pages)); } +#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; @@ -690,40 +693,9 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) put_page_bootmem(page); } } +#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static void free_section_usemap(struct page *memmap, unsigned long *usemap) -{ - struct page *usemap_page; - unsigned long nr_pages; - - if (!usemap) - return; - - usemap_page = virt_to_page(usemap); - /* - * Check to see if allocation came from hot-plug-add - */ - if (PageSlab(usemap_page) || PageCompound(usemap_page)) { - kfree(usemap); - if (memmap) - __kfree_section_memmap(memmap, PAGES_PER_SECTION); - return; - } - - /* - * The usemap came from bootmem. This is packed with other usemaps - * on the section which has pgdat at boot time. Just keep it as is now. - */ - - if (memmap) { - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - free_map_bootmem(memmap, nr_pages); - } -} - /* * returns the number of sections whose mem_maps were properly * set. If this is <=0, then that means that the passed-in @@ -800,6 +772,39 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif +#ifdef CONFIG_MEMORY_HOTREMOVE +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + struct page *usemap_page; + unsigned long nr_pages; + + if (!usemap) + return; + + usemap_page = virt_to_page(usemap); + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(usemap_page) || PageCompound(usemap_page)) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap, PAGES_PER_SECTION); + return; + } + + /* + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. + */ + + if (memmap) { + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + free_map_bootmem(memmap, nr_pages); + } +} + void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) { struct page *memmap = NULL; @@ -819,4 +824,5 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); free_section_usemap(memmap, usemap); } -#endif +#endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.1 From 84d96d897671cfb386e722acbefdb3a79e115a8a Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Apr 2013 15:08:23 -0700 Subject: mm: madvise: complete input validation before taking lock In madvise(), there doesn't seem to be any reason for taking the ¤t->mm->mmap_sem before start and len_in have been validated. Incidentally, this removes the need for the out: label. [akpm@linux-foundation.org: s/out_plug/out/, per David] Signed-off-by: Rasmus Villemoes Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index c58c94b..7055883 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -473,27 +473,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (!madvise_behavior_valid(behavior)) return error; - write = madvise_need_mmap_write(behavior); - if (write) - down_write(¤t->mm->mmap_sem); - else - down_read(¤t->mm->mmap_sem); - if (start & ~PAGE_MASK) - goto out; + return error; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - goto out; + return error; end = start + len; if (end < start) - goto out; + return error; error = 0; if (end == start) - goto out; + return error; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(¤t->mm->mmap_sem); + else + down_read(¤t->mm->mmap_sem); /* * If the interval [start,end) covers some unmapped address @@ -509,14 +509,14 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) /* Still start < end. */ error = -ENOMEM; if (!vma) - goto out_plug; + goto out; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) - goto out_plug; + goto out; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ @@ -527,21 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = madvise_vma(vma, &prev, start, tmp, behavior); if (error) - goto out_plug; + goto out; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; error = unmapped_error; if (start >= end) - goto out_plug; + goto out; if (prev) vma = prev->vm_next; else /* madvise_remove dropped mmap_sem */ vma = find_vma(current->mm, start); } -out_plug: - blk_finish_plug(&plug); out: + blk_finish_plug(&plug); if (write) up_write(¤t->mm->mmap_sem); else -- cgit v1.1 From 70ddf637eebe47e61fb2be08a59315581b6d2f38 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 29 Apr 2013 15:08:31 -0700 Subject: memcg: add memory.pressure_level events With this patch userland applications that want to maintain the interactivity/memory allocation cost can use the pressure level notifications. The levels are defined like this: The "low" level means that the system is reclaiming memory for new allocations. Monitoring this reclaiming activity might be useful for maintaining cache level. Upon notification, the program (typically "Activity Manager") might analyze vmstat and act in advance (i.e. prematurely shutdown unimportant services). The "medium" level means that the system is experiencing medium memory pressure, the system might be making swap, paging out active file caches, etc. Upon this event applications may decide to further analyze vmstat/zoneinfo/memcg or internal memory usage statistics and free any resources that can be easily reconstructed or re-read from a disk. The "critical" level means that the system is actively thrashing, it is about to out of memory (OOM) or even the in-kernel OOM killer is on its way to trigger. Applications should do whatever they can to help the system. It might be too late to consult with vmstat or any other statistics, so it's advisable to take an immediate action. The events are propagated upward until the event is handled, i.e. the events are not pass-through. Here is what this means: for example you have three cgroups: A->B->C. Now you set up an event listener on cgroups A, B and C, and suppose group C experiences some pressure. In this situation, only group C will receive the notification, i.e. groups A and B will not receive it. This is done to avoid excessive "broadcasting" of messages, which disturbs the system and which is especially bad if we are low on memory or thrashing. So, organize the cgroups wisely, or propagate the events manually (or, ask us to implement the pass-through events, explaining why would you need them.) Performance wise, the memory pressure notifications feature itself is lightweight and does not require much of bookkeeping, in contrast to the rest of memcg features. Unfortunately, as of current memcg implementation, pages accounting is an inseparable part and cannot be turned off. The good news is that there are some efforts[1] to improve the situation; plus, implementing the same, fully API-compatible[2] interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable option, so it will not require any changes on the userland side. [1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291 [2] http://lkml.org/lkml/2013/2/21/454 [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix CONFIG_CGROPUPS=n warnings] Signed-off-by: Anton Vorontsov Acked-by: Kirill A. Shutemov Acked-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: David Rientjes Cc: Pekka Enberg Cc: Mel Gorman Cc: Glauber Costa Cc: Michal Hocko Cc: Luiz Capitulino Cc: Greg Thelen Cc: Leonid Moiseichuk Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/memcontrol.c | 29 +++++ mm/vmpressure.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 8 ++ 4 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 mm/vmpressure.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 3a46287..72c5acb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e5bc43..360464f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,9 @@ struct mem_cgroup { */ struct res_counter res; + /* vmpressure notifications */ + struct vmpressure vmpressure; + union { /* * the counter to account for mem+swap usage. @@ -359,6 +363,7 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +/* Some nice accessors for the vmpressure. */ +struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + if (!memcg) + memcg = root_mem_cgroup; + return &memcg->vmpressure; +} + +struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +{ + return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; +} + +struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) +{ + return &mem_cgroup_from_css(css)->vmpressure; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, + { + .name = "pressure_level", + .register_event = vmpressure_register_event, + .unregister_event = vmpressure_unregister_event, + }, #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); + vmpressure_init(&memcg->vmpressure); return &memcg->css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 0000000..736a601 --- /dev/null +++ b/mm/vmpressure.c @@ -0,0 +1,374 @@ +/* + * Linux VM pressure + * + * Copyright 2012 Linaro Ltd. + * Anton Vorontsov + * + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The window size (vmpressure_win) is the number of scanned pages before + * we try to analyze scanned/reclaimed ratio. So the window is used as a + * rate-limit tunable for the "low" level notification, and also for + * averaging the ratio for medium/critical levels. Using small window + * sizes can cause lot of false positives, but too big window size will + * delay the notifications. + * + * As the vmscan reclaimer logic works with chunks which are multiple of + * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. + * + * TODO: Make the window size depend on machine size, as we do for vmstat + * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). + */ +static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + +/* + * These thresholds are used when we account memory pressure through + * scanned/reclaimed ratio. The current values were chosen empirically. In + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ +static const unsigned int vmpressure_level_med = 60; +static const unsigned int vmpressure_level_critical = 95; + +/* + * When there are too little pages left to scan, vmpressure() may miss the + * critical pressure as number of pages will be less than "window size". + * However, in that case the vmscan priority will raise fast as the + * reclaimer will try to scan LRUs more deeply. + * + * The vmscan logic considers these special priorities: + * + * prio == DEF_PRIORITY (12): reclaimer starts with that value + * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed + * prio == 0 : close to OOM, kernel scans every page in an lru + * + * Any value in this range is acceptable for this tunable (i.e. from 12 to + * 0). Current value for the vmpressure_level_critical_prio is chosen + * empirically, but the number, in essence, means that we consider + * critical level when scanning depth is ~10% of the lru size (vmscan + * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one + * eights). + */ +static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); + +static struct vmpressure *work_to_vmpressure(struct work_struct *work) +{ + return container_of(work, struct vmpressure, work); +} + +static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) +{ + return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); +} + +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); + + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return memcg_to_vmpressure(memcg); +} + +enum vmpressure_levels { + VMPRESSURE_LOW = 0, + VMPRESSURE_MEDIUM, + VMPRESSURE_CRITICAL, + VMPRESSURE_NUM_LEVELS, +}; + +static const char * const vmpressure_str_levels[] = { + [VMPRESSURE_LOW] = "low", + [VMPRESSURE_MEDIUM] = "medium", + [VMPRESSURE_CRITICAL] = "critical", +}; + +static enum vmpressure_levels vmpressure_level(unsigned long pressure) +{ + if (pressure >= vmpressure_level_critical) + return VMPRESSURE_CRITICAL; + else if (pressure >= vmpressure_level_med) + return VMPRESSURE_MEDIUM; + return VMPRESSURE_LOW; +} + +static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, + unsigned long reclaimed) +{ + unsigned long scale = scanned + reclaimed; + unsigned long pressure; + + /* + * We calculate the ratio (in percents) of how many pages were + * scanned vs. reclaimed in a given time frame (window). Note that + * time is in VM reclaimer's "ticks", i.e. number of pages + * scanned. This makes it possible to set desired reaction time + * and serves as a ratelimit. + */ + pressure = scale - (reclaimed * scale / scanned); + pressure = pressure * 100 / scale; + + pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, + scanned, reclaimed); + + return vmpressure_level(pressure); +} + +struct vmpressure_event { + struct eventfd_ctx *efd; + enum vmpressure_levels level; + struct list_head node; +}; + +static bool vmpressure_event(struct vmpressure *vmpr, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure_event *ev; + enum vmpressure_levels level; + bool signalled = false; + + level = vmpressure_calc_level(scanned, reclaimed); + + mutex_lock(&vmpr->events_lock); + + list_for_each_entry(ev, &vmpr->events, node) { + if (level >= ev->level) { + eventfd_signal(ev->efd, 1); + signalled = true; + } + } + + mutex_unlock(&vmpr->events_lock); + + return signalled; +} + +static void vmpressure_work_fn(struct work_struct *work) +{ + struct vmpressure *vmpr = work_to_vmpressure(work); + unsigned long scanned; + unsigned long reclaimed; + + /* + * Several contexts might be calling vmpressure(), so it is + * possible that the work was rescheduled again before the old + * work context cleared the counters. In that case we will run + * just after the old work returns, but then scanned might be zero + * here. No need for any locks here since we don't care if + * vmpr->reclaimed is in sync. + */ + if (!vmpr->scanned) + return; + + mutex_lock(&vmpr->sr_lock); + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + vmpr->scanned = 0; + vmpr->reclaimed = 0; + mutex_unlock(&vmpr->sr_lock); + + do { + if (vmpressure_event(vmpr, scanned, reclaimed)) + break; + /* + * If not handled, propagate the event upward into the + * hierarchy. + */ + } while ((vmpr = vmpressure_parent(vmpr))); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * This function does not return any value. + */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + + /* + * Here we only want to account pressure that userland is able to + * help us with. For example, suppose that DMA zone is under + * pressure; if we notify userland about that kind of pressure, + * then it will be mostly a waste as it will trigger unnecessary + * freeing of memory by userland (since userland is more likely to + * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That + * is why we include only movable, highmem and FS/IO pages. + * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so + * we account it too. + */ + if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + /* + * If we got here with no pages scanned, then that is an indicator + * that reclaimer was unable to find any shrinkable LRUs at the + * current scanning depth. But it does not mean that we should + * report the critical pressure, yet. If the scanning priority + * (scanning depth) goes too high (deep), we will be notified + * through vmpressure_prio(). But so far, keep calm. + */ + if (!scanned) + return; + + mutex_lock(&vmpr->sr_lock); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + scanned = vmpr->scanned; + mutex_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win || work_pending(&vmpr->work)) + return; + schedule_work(&vmpr->work); +} + +/** + * vmpressure_prio() - Account memory pressure through reclaimer priority level + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @prio: reclaimer's priority + * + * This function should be called from the reclaim path every time when + * the vmscan's reclaiming priority (scanning depth) changes. + * + * This function does not return any value. + */ +void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) +{ + /* + * We only use prio for accounting critical level. For more info + * see comment for vmpressure_level_critical_prio variable above. + */ + if (prio > vmpressure_level_critical_prio) + return; + + /* + * OK, the prio is below the threshold, updating vmpressure + * information before shrinker dives into long shrinking of long + * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 + * to the vmpressure() basically means that we signal 'critical' + * level. + */ + vmpressure(gfp, memcg, vmpressure_win, 0); +} + +/** + * vmpressure_register_event() - Bind vmpressure notifications to an eventfd + * @cg: cgroup that is interested in vmpressure notifications + * @cft: cgroup control files handle + * @eventfd: eventfd context to link notifications with + * @args: event arguments (used to set up a pressure level threshold) + * + * This function associates eventfd context with the vmpressure + * infrastructure, so that the notifications will be delivered to the + * @eventfd. The @args parameter is a string that denotes pressure level + * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or + * "critical"). + * + * This function should not be used directly, just pass it to (struct + * cftype).register_event, and then cgroup core will handle everything by + * itself. + */ +int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd, const char *args) +{ + struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure_event *ev; + int level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { + if (!strcmp(vmpressure_str_levels[level], args)) + break; + } + + if (level >= VMPRESSURE_NUM_LEVELS) + return -EINVAL; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->efd = eventfd; + ev->level = level; + + mutex_lock(&vmpr->events_lock); + list_add(&ev->node, &vmpr->events); + mutex_unlock(&vmpr->events_lock); + + return 0; +} + +/** + * vmpressure_unregister_event() - Unbind eventfd from vmpressure + * @cg: cgroup handle + * @cft: cgroup control files handle + * @eventfd: eventfd context that was used to link vmpressure with the @cg + * + * This function does internal manipulations to detach the @eventfd from + * the vmpressure notifications, and then frees internal resources + * associated with the @eventfd (but the @eventfd itself is not freed). + * + * This function should not be used directly, just pass it to (struct + * cftype).unregister_event, and then cgroup core will handle everything + * by itself. + */ +void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd) +{ + struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure_event *ev; + + mutex_lock(&vmpr->events_lock); + list_for_each_entry(ev, &vmpr->events, node) { + if (ev->efd != eventfd) + continue; + list_del(&ev->node); + kfree(ev); + break; + } + mutex_unlock(&vmpr->events_lock); +} + +/** + * vmpressure_init() - Initialize vmpressure control structure + * @vmpr: Structure to be initialized + * + * This function should be called on every allocated vmpressure structure + * before any usage. + */ +void vmpressure_init(struct vmpressure *vmpr) +{ + mutex_init(&vmpr->sr_lock); + mutex_init(&vmpr->events_lock); + INIT_LIST_HEAD(&vmpr->events); + INIT_WORK(&vmpr->work, vmpressure_work_fn); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index e03a00b..e53e495 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } memcg = mem_cgroup_iter(root, memcg, &reclaim); } while (memcg); + + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); } @@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, count_vm_event(ALLOCSTALL); do { + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; aborted_reclaim = shrink_zones(zonelist, sc); -- cgit v1.1 From e8420a8ece80b3fe810415ecf061d54ca7fab266 Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Mon, 29 Apr 2013 15:08:33 -0700 Subject: mm/mmap: check for RLIMIT_AS before unmapping Fix a corner case for MAP_FIXED when requested mapping length is larger than rlimit for virtual memory. In such case any overlapping mappings are unmapped before we check for the limit and return ENOMEM. The check is moved before the loop that unmaps overlapping parts of existing mappings. When we are about to hit the limit (currently mapped pages + len > limit) we scan for overlapping pages and check again accounting for them. This fixes situation when userspace program expects that the previous mappings are preserved after the mmap() syscall has returned with error. (POSIX clearly states that successfull mapping shall replace any previous mappings.) This corner case was found and can be tested with LTP testcase: testcases/open_posix_testsuite/conformance/interfaces/mmap/24-2.c In this case the mmap, which is clearly over current limit, unmaps dynamic libraries and the testcase segfaults right after returning into userspace. I've also looked at the second instance of the unmapping loop in the do_brk(). The do_brk() is called from brk() syscall and from vm_brk(). The brk() syscall checks for overlapping mappings and bails out when there are any (so it can't be triggered from the brk syscall). The vm_brk() is called only from binmft handlers so it shouldn't be triggered unless binmft handler created overlapping mappings. Signed-off-by: Cyril Hrubis Reviewed-by: Mel Gorman Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 43c4955..da3e9c0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,6 +6,7 @@ * Address space accounting code */ +#include #include #include #include @@ -550,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, return 0; } +static unsigned long count_vma_pages_range(struct mm_struct *mm, + unsigned long addr, unsigned long end) +{ + unsigned long nr_pages = 0; + struct vm_area_struct *vma; + + /* Find first overlaping mapping */ + vma = find_vma_intersection(mm, addr, end); + if (!vma) + return 0; + + nr_pages = (min(end, vma->vm_end) - + max(addr, vma->vm_start)) >> PAGE_SHIFT; + + /* Iterate over the rest of the overlaps */ + for (vma = vma->vm_next; vma; vma = vma->vm_next) { + unsigned long overlap_len; + + if (vma->vm_start > end) + break; + + overlap_len = min(end, vma->vm_end) - vma->vm_start; + nr_pages += overlap_len >> PAGE_SHIFT; + } + + return nr_pages; +} + void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { @@ -1442,6 +1471,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; struct inode *inode = file ? file_inode(file) : NULL; + /* Check against address space limit. */ + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. + */ + if (!(vm_flags & MAP_FIXED)) + return -ENOMEM; + + nr_pages = count_vma_pages_range(mm, addr, addr + len); + + if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } + /* Clear old maps */ error = -ENOMEM; munmap_back: @@ -1451,10 +1497,6 @@ munmap_back: goto munmap_back; } - /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - /* * Private writable mapping: check memory availability */ -- cgit v1.1 From 2f772e6cadf8ad8fca38927b17e6be028be669f5 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 29 Apr 2013 15:08:34 -0700 Subject: mm: break up swap_writepage() for frontswap backends swap_writepage() is currently where frontswap hooks into the swap write path to capture pages with the frontswap_store() function. However, if a frontswap backend wants to "resume" the writeback of a page to the swap device, it can't call swap_writepage() as the page will simply reenter the backend. This patch separates swap_writepage() into a top and bottom half, the bottom half named __swap_writepage() to allow a frontswap backend, like zswap, to resume writeback beyond the frontswap_store() hook. __add_to_swap_cache() is also made non-static so that the page for which writeback is to be resumed can be added to the swap cache. Signed-off-by: Seth Jennings Signed-off-by: Bob Liu Acked-by: Minchan Kim Reviewed-by: Dan Magenheimer Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 14 +++++++++++--- mm/swap_state.c | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 78eee32..8e6bcf1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -185,9 +185,7 @@ bad_bmap: */ int swap_writepage(struct page *page, struct writeback_control *wbc) { - struct bio *bio; - int ret = 0, rw = WRITE; - struct swap_info_struct *sis = page_swap_info(page); + int ret = 0; if (try_to_free_swap(page)) { unlock_page(page); @@ -199,6 +197,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) end_page_writeback(page); goto out; } + ret = __swap_writepage(page, wbc); +out: + return ret; +} + +int __swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio; + int ret = 0, rw = WRITE; + struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; diff --git a/mm/swap_state.c b/mm/swap_state.c index 7efcf15..fe43fd5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -78,7 +78,7 @@ void show_swap_cache_info(void) * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; struct address_space *address_space; -- cgit v1.1 From 1eec6702a80e04416d528846a5ff2122484d95ec Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 29 Apr 2013 15:08:35 -0700 Subject: mm: allow for outstanding swap writeback accounting To prevent flooding the swap device with writebacks, frontswap backends need to count and limit the number of outstanding writebacks. The incrementing of the counter can be done before the call to __swap_writepage(). However, the caller must receive a notification when the writeback completes in order to decrement the counter. To achieve this functionality, this patch modifies __swap_writepage() to take the bio completion callback function as an argument. end_swap_bio_write(), the normal bio completion function, is also made non-static so that code doing the accounting can call it after the accounting is done. There should be no behavioural change to existing code. Signed-off-by: Seth Jennings Signed-off-by: Bob Liu Acked-by: Minchan Kim Reviewed-by: Dan Magenheimer Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 8e6bcf1..8e0e5c0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -42,7 +42,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, return bio; } -static void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -197,12 +197,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) end_page_writeback(page); goto out; } - ret = __swap_writepage(page, wbc); + ret = __swap_writepage(page, wbc, end_swap_bio_write); out: return ret; } -int __swap_writepage(struct page *page, struct writeback_control *wbc) +int __swap_writepage(struct page *page, struct writeback_control *wbc, + void (*end_write_func)(struct bio *, int)) { struct bio *bio; int ret = 0, rw = WRITE; @@ -234,7 +235,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) return ret; } - bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); unlock_page(page); -- cgit v1.1 From 5bc7b8aca942d03bf2716ddcfcb4e0b57e43a1b8 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 29 Apr 2013 15:08:36 -0700 Subject: mm: thp: add split tail pages to shrink page list in page reclaim In page reclaim, huge page is split. split_huge_page() adds tail pages to LRU list. Since we are reclaiming a huge page, it's better we reclaim all subpages of the huge page instead of just the head page. This patch adds split tail pages to shrink page list so the tail pages can be reclaimed soon. Before this patch, run a swap workload: thp_fault_alloc 3492 thp_fault_fallback 608 thp_collapse_alloc 6 thp_collapse_alloc_failed 0 thp_split 916 With this patch: thp_fault_alloc 4085 thp_fault_fallback 16 thp_collapse_alloc 90 thp_collapse_alloc_failed 0 thp_split 1272 fallback allocation is reduced a lot. [akpm@linux-foundation.org: fix CONFIG_SWAP=n build] Signed-off-by: Shaohua Li Acked-by: Rik van Riel Acked-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: Wanpeng Li Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 21 +++++++++++++++------ mm/swap.c | 11 ++++++++--- mm/swap_state.c | 4 ++-- mm/vmscan.c | 2 +- 4 files changed, 26 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 45eaae0..2ed1a16 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1559,7 +1559,8 @@ static int __split_huge_page_splitting(struct page *page, return ret; } -static void __split_huge_page_refcount(struct page *page) +static void __split_huge_page_refcount(struct page *page, + struct list_head *list) { int i; struct zone *zone = page_zone(page); @@ -1645,7 +1646,7 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - lru_add_page_tail(page, page_tail, lruvec); + lru_add_page_tail(page, page_tail, lruvec, list); } atomic_sub(tail_count, &page->_count); BUG_ON(atomic_read(&page->_count) <= 0); @@ -1752,7 +1753,8 @@ static int __split_huge_page_map(struct page *page, /* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma) + struct anon_vma *anon_vma, + struct list_head *list) { int mapcount, mapcount2; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1783,7 +1785,7 @@ static void __split_huge_page(struct page *page, mapcount, page_mapcount(page)); BUG_ON(mapcount != page_mapcount(page)); - __split_huge_page_refcount(page); + __split_huge_page_refcount(page, list); mapcount2 = 0; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { @@ -1798,7 +1800,14 @@ static void __split_huge_page(struct page *page, BUG_ON(mapcount != mapcount2); } -int split_huge_page(struct page *page) +/* + * Split a hugepage into normal pages. This doesn't change the position of head + * page. If @list is null, tail pages will be added to LRU list, otherwise, to + * @list. Both head page and tail pages will inherit mapping, flags, and so on + * from the hugepage. + * Return 0 if the hugepage is split successfully otherwise return 1. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) { struct anon_vma *anon_vma; int ret = 1; @@ -1823,7 +1832,7 @@ int split_huge_page(struct page *page) goto out_unlock; BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma); + __split_huge_page(page, anon_vma, list); count_vm_event(THP_SPLIT); BUG_ON(PageCompound(page)); diff --git a/mm/swap.c b/mm/swap.c index 8a529a0..acd40bf 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -737,7 +737,7 @@ EXPORT_SYMBOL(__pagevec_release); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ void lru_add_page_tail(struct page *page, struct page *page_tail, - struct lruvec *lruvec) + struct lruvec *lruvec, struct list_head *list) { int uninitialized_var(active); enum lru_list lru; @@ -749,7 +749,8 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); - SetPageLRU(page_tail); + if (!list) + SetPageLRU(page_tail); if (page_evictable(page_tail)) { if (PageActive(page)) { @@ -767,7 +768,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, if (likely(PageLRU(page))) list_add_tail(&page_tail->lru, &page->lru); - else { + else if (list) { + /* page reclaim is reclaiming a huge page */ + get_page(page_tail); + list_add_tail(&page_tail->lru, list); + } else { struct list_head *list_head; /* * Head page has not yet been counted, as an hpage, diff --git a/mm/swap_state.c b/mm/swap_state.c index fe43fd5..b3d40dc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -160,7 +160,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page) +int add_to_swap(struct page *page, struct list_head *list) { swp_entry_t entry; int err; @@ -173,7 +173,7 @@ int add_to_swap(struct page *page) return 0; if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) { + if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry, NULL); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e53e495..fa6a853 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -781,7 +781,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageAnon(page) && !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page)) + if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; } -- cgit v1.1 From 40f4b1ead0e7a31bd1acf9b683f9ddcf52b0dfe4 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Mon, 29 Apr 2013 15:08:38 -0700 Subject: mm/vmstat: add note on safety of drain_zonestat Signed-off-by: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index c823776..f42745e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -493,6 +493,10 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +/* + * this is only called if !populated_zone(zone), which implies no other users of + * pset->vm_stat_diff[] exsist. + */ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { int i; -- cgit v1.1 From 209ff86d61d6b50979cffb47878fc77ca2f6c8a2 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 29 Apr 2013 15:08:41 -0700 Subject: memblock: fix missing comment of memblock_insert_region() There is no comment for parameter nid of memblock_insert_region(). This patch adds comment for it. Signed-off-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 2cce8b3..c5fad93 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -322,10 +322,11 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) /** * memblock_insert_region - insert new memblock region - * @type: memblock type to insert into - * @idx: index for the insertion point - * @base: base address of the new region - * @size: size of the new region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * @nid: node id of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accomodate the new region. -- cgit v1.1 From 865ffef3797da2cac85b3354b5b6050dc9660978 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 29 Apr 2013 15:08:42 -0700 Subject: fs: fix fsync() error reporting There are two convenient ways to report errors to userspace 1) retun error to original syscall for example write(2) 2) mark mapping with error flag and return it on later fsync(2) Second one is broken if (mapping->nrpages == 0) This is real-life situation because after error pages are likey to be truncated or invalidated. We have to return an error regardless to number of pages in the mapping. #Original testcase: git@github.com:dmonakhov/xfstests.git MOUNT_OPTIONS="-b1024" ./check shared/305 Signed-off-by: Dmitry Monakhov Reviewed-by: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2581826..e989fb1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -188,6 +188,17 @@ static int sleep_on_page_killable(void *word) return fatal_signal_pending(current) ? -EINTR : 0; } +static int filemap_check_errors(struct address_space *mapping) +{ + int ret = 0; + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -269,10 +280,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret = 0; + int ret2, ret = 0; if (end_byte < start_byte) - return 0; + goto out; pagevec_init(&pvec, 0); while ((index <= end) && @@ -295,12 +306,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, pagevec_release(&pvec); cond_resched(); } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; +out: + ret2 = filemap_check_errors(mapping); + if (!ret) + ret = ret2; return ret; } @@ -341,6 +350,8 @@ int filemap_write_and_wait(struct address_space *mapping) if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } @@ -372,6 +383,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } -- cgit v1.1 From fd0ccaf2bd04e54d2a6979fbfdcad856694e3877 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Apr 2013 15:08:43 -0700 Subject: memcg: avoid accessing memcg after releasing reference This might cause a use-after-free bug. Signed-off-by: Li Zefan Cc: Glauber Costa Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 360464f..c92bcfc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3215,12 +3215,12 @@ void memcg_release_cache(struct kmem_cache *s) root = s->memcg_params->root_cache; root->memcg_params->memcg_caches[id] = NULL; - mem_cgroup_put(memcg); mutex_lock(&memcg->slab_caches_mutex); list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); + mem_cgroup_put(memcg); out: kfree(s->memcg_params); } -- cgit v1.1 From 5918d10a4bb1081920a04e2c17197a02ff06e651 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 29 Apr 2013 15:08:44 -0700 Subject: thp: fix huge zero page logic for page with pfn == 0 Current implementation of huge zero page uses pfn value 0 to indicate that the page hasn't allocated yet. It assumes that buddy page allocator can't return page with pfn == 0. Let's rework the code to store 'struct page *' of huge zero page, not its pfn. This way we can avoid the weak assumption. [akpm@linux-foundation.org: fix sparse warning] Signed-off-by: Kirill A. Shutemov Reported-by: Minchan Kim Acked-by: Minchan Kim Reviewed-by: Andrea Arcangeli Acked-by: Johannes Weiner Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2ed1a16..03a89a2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -163,35 +163,34 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static unsigned long huge_zero_pfn __read_mostly; +static struct page *huge_zero_page __read_mostly; -static inline bool is_huge_zero_pfn(unsigned long pfn) +static inline bool is_huge_zero_page(struct page *page) { - unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); - return zero_pfn && pfn == zero_pfn; + return ACCESS_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { - return is_huge_zero_pfn(pmd_pfn(pmd)); + return is_huge_zero_page(pmd_page(pmd)); } -static unsigned long get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return ACCESS_ONCE(huge_zero_pfn); + return ACCESS_ONCE(huge_zero_page); zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); - return 0; + return NULL; } count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); - if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { + if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); __free_page(zero_page); goto retry; @@ -200,7 +199,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return ACCESS_ONCE(huge_zero_pfn); + return ACCESS_ONCE(huge_zero_page); } static void put_huge_zero_page(void) @@ -220,9 +219,9 @@ static int shrink_huge_zero_page(struct shrinker *shrink, return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { - unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); - BUG_ON(zero_pfn == 0); - __free_page(__pfn_to_page(zero_pfn)); + struct page *zero_page = xchg(&huge_zero_page, NULL); + BUG_ON(zero_page == NULL); + __free_page(zero_page); } return 0; @@ -764,12 +763,12 @@ static inline struct page *alloc_hugepage(int defrag) static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - unsigned long zero_pfn) + struct page *zero_page) { pmd_t entry; if (!pmd_none(*pmd)) return false; - entry = pfn_pmd(zero_pfn, vma->vm_page_prot); + entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); set_pmd_at(mm, haddr, pmd, entry); @@ -794,20 +793,20 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; - unsigned long zero_pfn; + struct page *zero_page; bool set; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_pfn = get_huge_zero_page(); - if (unlikely(!zero_pfn)) { + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { pte_free(mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); goto out; } spin_lock(&mm->page_table_lock); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_pfn); + zero_page); spin_unlock(&mm->page_table_lock); if (!set) { pte_free(mm, pgtable); @@ -886,16 +885,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * a page table. */ if (is_huge_zero_pmd(pmd)) { - unsigned long zero_pfn; + struct page *zero_page; bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ - zero_pfn = get_huge_zero_page(); + zero_page = get_huge_zero_page(); set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, - zero_pfn); + zero_page); BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; @@ -1812,7 +1811,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma; int ret = 1; - BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); + BUG_ON(is_huge_zero_page(page)); BUG_ON(!PageAnon(page)); /* -- cgit v1.1 From 465adcf1ea7b2e49b2e0899366624f5532b64012 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:08:45 -0700 Subject: mm, memcg: give exiting processes access to memory reserves A memcg may livelock when oom if the process that grabs the hierarchy's oom lock is never the first process with PF_EXITING set in the memcg's task iteration. The oom killer, both global and memcg, will defer if it finds an eligible process that is in the process of exiting and it is not being ptraced. The idea is to allow it to exit without using memory reserves before needlessly killing another process. This normally works fine except in the memcg case with a large number of threads attached to the oom memcg. In this case, the memcg oom killer only gets called for the process that grabs the hierarchy's oom lock; all others end up blocked on the memcg's oom waitqueue. Thus, if the process that grabs the hierarchy's oom lock is never the first PF_EXITING process in the memcg's task iteration, the oom killer is constantly deferred without anything making progress. The fix is to give PF_EXITING processes access to memory reserves so that we've marked them as oom killed without any iteration. This allows __mem_cgroup_try_charge() to succeed so that the process may exit. This makes the memcg oom killer exemption for TIF_MEMDIE tasks, now immediately granted for processes with pending SIGKILLs and those in the exit path, to be equivalent to what is done for the global oom killer. Signed-off-by: David Rientjes Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c92bcfc..47b36fe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1787,11 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct task_struct *chosen = NULL; /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. */ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } -- cgit v1.1 From 2d30d31ea3c5be426ce25607b9bd1835acb85e0a Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Mon, 29 Apr 2013 15:08:47 -0700 Subject: swap: redirty page if page write fails on swap file Since commit 62c230bc1790 ("mm: add support for a filesystem to activate swap files and use direct_IO for writing swap pages"), swap_writepage() calls direct_IO on swap files. However, in that case the page isn't redirtied if I/O fails, and is therefore handled afterwards as if it has been successfully written to the swap file, leading to memory corruption when the page is eventually swapped back in. This patch sets the page dirty when direct_IO() fails. It fixes a memory corruption that happened while using swap-over-NFS. Signed-off-by: Jerome Marchand Acked-by: Johannes Weiner Acked-by: Mel Gorman Cc: Hugh Dickins Cc: [3.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 8e0e5c0..eb3300f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -231,6 +231,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; + } else { + set_page_dirty(page); } return ret; } -- cgit v1.1 From 0cdc444a67ccdbd58bfbcba865cb17a9f17a7691 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 29 Apr 2013 15:08:48 -0700 Subject: mm: swap: mark swap pages writeback before queueing for direct IO As pointed out by Andrew Morton, the swap-over-NFS writeback is not setting PageWriteback before it is queued for direct IO. While swap pages do not participate in BDI or process dirty accounting and the IO is synchronous, the writeback bit is still required and not setting it in this case was an oversight. swapoff depends on the page writeback to synchronoise all pending writes on a swap page before it is reused. Swapcache freeing and reuse depend on checking the PageWriteback under lock to ensure the page is safe to reuse. Direct IO handlers and the direct IO handler for NFS do not deal with PageWriteback as they are synchronous writes. In the case of NFS, it schedules pages (or a page in the case of swap) for IO and then waits synchronously for IO to complete in nfs_direct_write(). It is recognised that this is a slowdown from normal swap handling which is asynchronous and uses a completion handler. Shoving PageWriteback handling down into direct IO handlers looks like a bad fit to handle the swap case although it may have to be dealt with some day if swap is converted to use direct IO in general and bmap is finally done away with. At that point it will be necessary to refit asynchronous direct IO with completion handlers onto the swap subsystem. As swapcache currently depends on PageWriteback to protect against races, this patch sets PageWriteback under the page lock before queueing it for direct IO. It is cleared when the direct IO handler returns. IO errors are treated similarly to the direct-to-bio case except PageError is not set as in the case of swap-over-NFS, it is likely to be a transient error. It was asked what prevents such a page being reclaimed in parallel. With this patch applied, such a page will now be skipped (most of the time) or blocked until the writeback completes. Reclaim checks PageWriteback under the page lock before calling try_to_free_swap and the page lock should prevent the page being requeued for IO before it is freed. This and Jerome's related patch should considered for -stable as far back as 3.6 when swap-over-NFS was introduced. [akpm@linux-foundation.org: use pr_err_ratelimited()] [akpm@linux-foundation.org: remove hopefully-unneeded cast in printk] Signed-off-by: Mel Gorman Cc: Jerome Marchand Cc: Hugh Dickins Cc: [3.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index eb3300f..bb5d752 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -223,6 +223,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, kiocb.ki_left = PAGE_SIZE; kiocb.ki_nbytes = PAGE_SIZE; + set_page_writeback(page); unlock_page(page); ret = mapping->a_ops->direct_IO(KERNEL_WRITE, &kiocb, &iov, @@ -232,8 +233,22 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, count_vm_event(PSWPOUT); ret = 0; } else { + /* + * In the case of swap-over-nfs, this can be a + * temporary failure if the system has limited + * memory for allocating transmit buffers. + * Mark the page dirty and avoid + * rotate_reclaimable_page but rate-limit the + * messages but do not flag PageError like + * the normal direct-to-bio case as it could + * be temporary. + */ set_page_dirty(page); + ClearPageReclaim(page); + pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", + page_file_offset(page)); } + end_page_writeback(page); return ret; } -- cgit v1.1 From 349daa0f93177958c4618542a61926674169a698 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 29 Apr 2013 15:08:49 -0700 Subject: mm: fix memory_hotplug.c printk format warning PFN_PHYS() is a phys_addr_t, which can be u32 or u64. Fix the build warning when phys_addr_t is u32. mm/memory_hotplug.c: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 2 has type 'unsigned int' [-Wformat]: => 1685:3 mm/memory_hotplug.c: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 3 has type 'unsigned int' [-Wformat]: => 1685:3 Signed-off-by: Randy Dunlap Reported-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 60f6daa..a221fac 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1690,11 +1690,15 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); - if (unlikely(ret)) + if (unlikely(ret)) { + phys_addr_t beginpa, endpa; + + beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); + endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; pr_warn("removing memory fails, because memory " - "[%#010llx-%#010llx] is onlined\n", - PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), - PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); + "[%pa-%pa] is onlined\n", + &beginpa, &endpa); + } return ret; } -- cgit v1.1 From b4def3509d18c1db9198f92d4c35065e029a09a1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:08:52 -0700 Subject: mm, nobootmem: clean-up of free_low_memory_core_early() Remove unused argument and make function static, because there is no user outside of nobootmem.c Signed-off-by: Joonsoo Kim Cc: Yinghai Lu Acked-by: Johannes Weiner Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 5e07d36..a31be7a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -120,7 +120,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return end_pfn - start_pfn; } -unsigned long __init free_low_memory_core_early(int nodeid) +static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; phys_addr_t start, end, size; @@ -170,7 +170,7 @@ unsigned long __init free_all_bootmem(void) * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ - return free_low_memory_core_early(MAX_NUMNODES); + return free_low_memory_core_early(); } /** -- cgit v1.1 From b476e2951fcf7a25574b7b193944b041687f3ed4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:08:53 -0700 Subject: mm, nobootmem: do memset() after memblock_reserve() Currently, we do memset() before reserving the area. This may not cause any problem, but it is somewhat weird. So change execution order. Signed-off-by: Joonsoo Kim Cc: Yinghai Lu Acked-by: Johannes Weiner Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index a31be7a..bdd3fa2 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -45,9 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (!addr) return NULL; + memblock_reserve(addr, size); ptr = phys_to_virt(addr); memset(ptr, 0, size); - memblock_reserve(addr, size); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks. -- cgit v1.1 From 9ca24e2e19325edc3ed0b437657d26219b7a768a Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Mon, 29 Apr 2013 15:08:55 -0700 Subject: mmKconfig: add an option to disable bounce There are times when HIGHMEM is enabled, but we don't prefer CONFIG_BOUNCE to be enabled. CONFIG_BOUNCE can reduce the block device throughput, and this is not ideal for machines where we don't gain much by enabling it. So provide an option to deselect CONFIG_BOUNCE. The observation was made while measuring eMMC throughput using iozone on an ARM device with 1GB RAM. Signed-off-by: Vinayak Menon Cc: David Rientjes Cc: Jens Axboe Cc: Randy Dunlap Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3bea74f..e742d06 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -263,8 +263,14 @@ config ZONE_DMA_FLAG default "1" config BOUNCE - def_bool y + bool "Enable bounce buffers" + default y depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + help + Enable bounce buffers for devices that cannot access + the full range of memory available to the CPU. Enabled + by default when ZONE_DMA or HIGHMEM is selected, but you + may say n to override this. # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often # have more than 4GB of memory, but we don't currently use the IOTLB to present -- cgit v1.1 From ca0dde97178e75ed1370b8616326f5496a803d65 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Apr 2013 15:08:57 -0700 Subject: memcg: take reference before releasing rcu_read_lock The memcg is not referenced, so it can be destroyed at anytime right after we exit rcu read section, so it's not safe to access it. To fix this, we call css_tryget() to get a reference while we're still in rcu read section. This also removes a bogus comment above __memcg_create_cache_enqueue(). Signed-off-by: Li Zefan Acked-by: Glauber Costa Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47b36fe..b8dc8e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3483,7 +3483,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. - * Called with rcu_read_lock. */ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -3491,12 +3490,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, struct create_work *cw; cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); - if (cw == NULL) - return; - - /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) { - kfree(cw); + if (cw == NULL) { + css_put(&memcg->css); return; } @@ -3552,10 +3547,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - rcu_read_unlock(); if (!memcg_can_account_kmem(memcg)) - return cachep; + goto out; idx = memcg_cache_id(memcg); @@ -3564,29 +3558,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { - /* - * If we are in a safe context (can wait, and not in interrupt - * context), we could be be predictable and return right away. - * This would guarantee that the allocation being performed - * already belongs in the new cache. - * - * However, there are some clashes that can arrive from locking. - * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. - */ - memcg_create_cache_enqueue(memcg, cachep); - return cachep; + if (likely(cachep->memcg_params->memcg_caches[idx])) { + cachep = cachep->memcg_params->memcg_caches[idx]; + goto out; } - return cachep->memcg_params->memcg_caches[idx]; + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) + goto out; + rcu_read_unlock(); + + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; +out: + rcu_read_unlock(); + return cachep; } EXPORT_SYMBOL(__memcg_kmem_get_cache); -- cgit v1.1 From d3d30417d32adb5bec041f2c3988b5f91c4e7eec Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:29 -0700 Subject: mm/: rename random32() to prandom_u32() Use preferable function name which implies using a pseudo-random number generator. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index a1f7772..d417efd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2120,7 +2120,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); } if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) p->flags |= SWP_DISCARDABLE; -- cgit v1.1