From fda901241fb89449244537db4fb27b06e491b74f Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 5 Nov 2015 18:44:59 -0800 Subject: slab: convert slab_is_available() to boolean A good candidate to return a boolean result. Signed-off-by: Denis Kirjanov Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 5ce4fae..113a6fd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -692,7 +692,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -int slab_is_available(void) +bool slab_is_available(void) { return slab_state >= UP; } -- cgit v1.1 From c9a77a792003ce9d70df8937c8c87aee6e177149 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:08 -0800 Subject: mm/slab_common.c: rename cache create/destroy helpers do_kmem_cache_create(), do_kmem_cache_shutdown(), and do_kmem_cache_release() sound awkward for static helper functions that are not supposed to be used outside slab_common.c. Rename them to create_cache(), shutdown_cache(), and release_caches(), respectively. This patch is a pure cleanup and does not introduce any functional changes. Signed-off-by: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 113a6fd..c8d2ed7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static struct kmem_cache * -do_kmem_cache_create(const char *name, size_t object_size, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct mem_cgroup *memcg, struct kmem_cache *root_cache) +static struct kmem_cache *create_cache(const char *name, + size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; int err; @@ -418,9 +418,9 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } - s = do_kmem_cache_create(cache_name, size, size, - calculate_alignment(flags, align, size), - flags, ctor, NULL, NULL); + s = create_cache(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -448,7 +448,7 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int do_kmem_cache_shutdown(struct kmem_cache *s, +static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { if (__kmem_cache_shutdown(s) != 0) { @@ -469,8 +469,7 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, return 0; } -static void do_kmem_cache_release(struct list_head *release, - bool need_rcu_barrier) +static void release_caches(struct list_head *release, bool need_rcu_barrier) { struct kmem_cache *s, *s2; @@ -536,10 +535,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, if (!cache_name) goto out_unlock; - s = do_kmem_cache_create(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, - root_cache->flags, 
root_cache->ctor, - memcg, root_cache); + s = create_cache(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -615,14 +614,14 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + BUG_ON(shutdown_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ @@ -655,12 +654,12 @@ void kmem_cache_destroy(struct kmem_cache *s) goto out_unlock; for_each_memcg_cache_safe(c, c2, s) { - if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + if (shutdown_cache(c, &release, &need_rcu_barrier)) busy = true; } if (!busy) - do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); + shutdown_cache(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); @@ -668,7 +667,7 @@ out_unlock: put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.1 From d60fdcc9e3febde2ebd49fe517e13f428bc12843 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:11 -0800 Subject: mm/slab_common.c: clear pointers to per memcg caches on destroy Currently, we do not clear pointers to per memcg caches in the memcg_params.memcg_caches array when a global cache is destroyed with kmem_cache_destroy. This is fine if the global cache does get destroyed. However, a cache can be left on the list if it still has active objects when kmem_cache_destroy is called (due to a memory leak). If this happens, the entries in the array will point to already freed areas, which is likely to result in data corruption when the cache is reused (via slab merging). 
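The essence of the fix is to clear the slot in the array as soon as the corresponding per memcg cache is shut down, so a root cache that leaks and stays around never follows a stale pointer. As a rough userspace sketch of that pattern (the names and the array are illustrative, not the kernel data structures):

  #include <stdio.h>
  #include <stdlib.h>

  #define NR_CACHES 4

  struct cache { int objects; };

  static struct cache *child_caches[NR_CACHES];

  /* Shut one child cache down and clear its slot in the lookup array. */
  static int shutdown_child(int idx)
  {
          struct cache *c = child_caches[idx];

          if (!c)
                  return 0;
          if (c->objects)                 /* still busy: keep the pointer */
                  return -1;
          free(c);
          child_caches[idx] = NULL;       /* never dereferenced again */
          return 0;
  }

  int main(void)
  {
          child_caches[0] = calloc(1, sizeof(struct cache));
          shutdown_child(0);
          /* A later lookup now sees NULL instead of freed memory. */
          printf("slot 0: %p\n", (void *)child_caches[0]);
          return 0;
  }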
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 6 ---- mm/slab_common.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index a3a967d..bf51a8d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); list_for_each_entry(iter, &(root)->memcg_params.list, \ memcg_params.list) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ - memcg_params.list) - static inline bool is_root_cache(struct kmem_cache *s) { return s->memcg_params.is_root_cache; @@ -265,8 +261,6 @@ extern void slab_init_memcg_params(struct kmem_cache *); #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - for ((void)(iter), (void)(tmp), (void)(root); 0; ) static inline bool is_root_cache(struct kmem_cache *s) { diff --git a/mm/slab_common.c b/mm/slab_common.c index c8d2ed7..ab1f20e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -461,10 +461,6 @@ static int shutdown_cache(struct kmem_cache *s, if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - list_del(&s->memcg_params.list); -#endif list_move(&s->list, release); return 0; } @@ -597,6 +593,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } +static int __shutdown_memcg_cache(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + BUG_ON(is_root_cache(s)); + + if (shutdown_cache(s, release, need_rcu_barrier)) + return -EBUSY; + + list_del(&s->memcg_params.list); + return 0; +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); @@ -614,7 +622,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(shutdown_cache(s, &release, &need_rcu_barrier)); + BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); @@ -623,6 +631,68 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) release_caches(&release, need_rcu_barrier); } + +static int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + struct memcg_cache_array *arr; + struct kmem_cache *c, *c2; + LIST_HEAD(busy); + int i; + + BUG_ON(!is_root_cache(s)); + + /* + * First, shutdown active caches, i.e. caches that belong to online + * memory cgroups. + */ + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = arr->entries[i]; + if (!c) + continue; + if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + /* + * The cache still has objects. Move it to a temporary + * list so as not to try to destroy it for a second + * time while iterating over inactive caches below. + */ + list_move(&c->memcg_params.list, &busy); + else + /* + * The cache is empty and will be destroyed soon. Clear + * the pointer to it in the memcg_caches array so that + * it will never be accessed even if the root cache + * stays alive. 
+ */ + arr->entries[i] = NULL; + } + + /* + * Second, shutdown all caches left from memory cgroups that are now + * offline. + */ + list_for_each_entry_safe(c, c2, &s->memcg_params.list, + memcg_params.list) + __shutdown_memcg_cache(c, release, need_rcu_barrier); + + list_splice(&busy, &s->memcg_params.list); + + /* + * A cache being destroyed must be empty. In particular, this means + * that all per memcg caches attached to it must be empty too. + */ + if (!list_empty(&s->memcg_params.list)) + return -EBUSY; + return 0; +} +#else +static inline int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + return 0; +} #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) @@ -634,16 +704,13 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; - bool busy = false; + int err; if (unlikely(!s)) return; - BUG_ON(!is_root_cache(s)); - get_online_cpus(); get_online_mems(); @@ -653,12 +720,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_safe(c, c2, s) { - if (shutdown_cache(c, &release, &need_rcu_barrier)) - busy = true; - } - - if (!busy) + err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + if (!err) shutdown_cache(s, &release, &need_rcu_barrier); out_unlock: -- cgit v1.1 From cd918c557439c8f0750f64883367aeff264b5fd8 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:14 -0800 Subject: mm/slab_common.c: do not warn that cache is busy on destroy more than once Currently, when kmem_cache_destroy() is called for a global cache, we print a warning for each per memcg cache attached to it that has active objects (see shutdown_cache). This is redundant, because it gives no new information and only clutters the log. If a cache being destroyed has active objects, there must be a memory leak in the module that created the cache, and it does not matter if the cache was used by users in memory cgroups or not. This patch moves the warning from shutdown_cache(), which is called for shutting down both global and per memcg caches, to kmem_cache_destroy(), so that the warning is only printed once if there are objects left in the cache being destroyed. 
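The resulting control flow is simply: the helpers fail quietly with -EBUSY and only the top-level destroy path reports. A condensed userspace sketch of that shape (function names and return values are illustrative, not the kernel code):

  #include <stdio.h>

  /* Helper: fail quietly; the caller decides how, and how often, to report. */
  static int shutdown_cache(int busy)
  {
          return busy ? -1 : 0;
  }

  static void cache_destroy(const char *name, int busy_children, int busy_self)
  {
          int err;

          err = shutdown_cache(busy_children);            /* per memcg caches */
          if (!err)
                  err = shutdown_cache(busy_self);        /* the root cache */
          if (err)        /* one warning, however many caches were busy */
                  fprintf(stderr, "%s: Slab cache still has objects\n", name);
  }

  int main(void)
  {
          cache_destroy("demo-cache", 1, 1);
          return 0;
  }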
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index ab1f20e..fba78e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -451,12 +451,8 @@ EXPORT_SYMBOL(kmem_cache_create); static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; - } if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; @@ -722,8 +718,13 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); if (!err) - shutdown_cache(s, &release, &need_rcu_barrier); + err = shutdown_cache(s, &release, &need_rcu_barrier); + if (err) { + pr_err("kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + } out_unlock: mutex_unlock(&slab_mutex); -- cgit v1.1 From 40911a798b5abbbec6b2e271a42addd6b26228a0 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 5 Nov 2015 18:45:43 -0800 Subject: mm/slab_common.c: initialize kmem_cache pointer to NULL The assignment to NULL within the error condition was written in a 2014 patch to suppress a compiler warning. However it would be cleaner to just initialize the kmem_cache to NULL and just return it in case of an error condition. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index fba78e4..d88e97c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -384,7 +384,7 @@ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; const char *cache_name; int err; @@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align, err = kmem_cache_sanity_check(name, size); if (err) { - s = NULL; /* suppress uninit var warning */ goto out_unlock; } -- cgit v1.1 From 422ff4d70c1b3b2deed431dc095432dc691f4269 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:46 -0800 Subject: mm/slub: correct the comment in calculate_order() In calculate_order(), it tries to calculate the best order by adjusting the fraction and min_objects. On each iteration on min_objects, fraction iterates on 16, 8, 4. Which means the acceptable waste increases with 1/16, 1/8, 1/4. This patch corrects the comment according to the code. 
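To make the direction of the loop concrete, the snippet below prints the waste threshold for an assumed 4KB slab as the divisor steps through the same values the slub code uses (pure userspace illustration):

  #include <stdio.h>

  int main(void)
  {
          unsigned long slab_size = 4096;         /* one 4KB page, assumed */
          int fraction;

          /* Same divisors calculate_order() steps through: 16, 8, 4. */
          for (fraction = 16; fraction >= 4; fraction /= 2)
                  printf("fraction %2d: up to %4lu bytes (1/%d) of waste accepted\n",
                         fraction, slab_size / fraction, fraction);
          return 0;
  }

This prints 256, 512 and 1024 bytes, i.e. the acceptable waste grows as the loop progresses, which is what the corrected comment states.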
Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f614b5d..a94b9f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2943,7 +2943,7 @@ static inline int calculate_order(int size, int reserved) * works by first attempting to generate a layout with * the best configuration and backing off gradually. * - * First we reduce the acceptable waste in a slab. Then + * First we increase the acceptable waste in a slab. Then * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; -- cgit v1.1 From 033fd1bd3c50fdda267d27d02f9bc656f0b9ddb8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:48 -0800 Subject: mm/slub: use get_order() instead of fls() get_order() is more easy to understand. This patch just replaces it. Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a94b9f4..e309ed1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2912,8 +2912,7 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); + for (order = max(min_order, get_order(min_objects * size)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; -- cgit v1.1 From 9f835703ea67633617ca82bc150f6ee70831b40a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:51 -0800 Subject: mm/slub: calculate start order with reserved in consideration In slub_order(), the order starts from max(min_order, get_order(min_objects * size)). When (min_objects * size) has different order from (min_objects * size + reserved), it will skip this order via a check in the loop. This patch optimizes this a little by calculating the start order with `reserved' in consideration and removing the check in loop. 
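The two starting points differ exactly when min_objects * size fills the last page and the reserved bytes push it over. A userspace sketch with assumed numbers and a simplified stand-in for get_order(), not the kernel implementation:

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)

  /* Rough userspace equivalent of the kernel's get_order(). */
  static int get_order(unsigned long size)
  {
          int order = 0;

          size = (size - 1) >> PAGE_SHIFT;
          while (size) {
                  order++;
                  size >>= 1;
          }
          return order;
  }

  int main(void)
  {
          unsigned long min_objects = 16, size = 256, reserved = 64;

          printf("start order without reserved: %d\n",
                 get_order(min_objects * size));
          printf("start order with reserved:    %d\n",
                 get_order(min_objects * size + reserved));
          return 0;
  }

Here 16 * 256 fills a 4KB page exactly, so the order starts at 0 without reserved but at 1 with the 64 reserved bytes counted in; the old code would have started at 0 and skipped that order inside the loop.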
Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e309ed1..e1bb147 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2912,19 +2912,15 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, get_order(min_objects * size)); + for (order = max(min_order, get_order(min_objects * size + reserved)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size + reserved) - continue; - rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; - } return order; -- cgit v1.1 From d4322d88f5fdf92729dd40f923013414fbb2184d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 Nov 2015 18:45:54 -0800 Subject: mm: slab: only move management objects off-slab for sizes larger than KMALLOC_MIN_SIZE On systems with a KMALLOC_MIN_SIZE of 128 (arm64, some mips and powerpc configurations defining ARCH_DMA_MINALIGN to 128), the first kmalloc_caches[] entry to be initialised after slab_early_init = 0 is "kmalloc-128" with index 7. Depending on the debug kernel configuration, sizeof(struct kmem_cache) can be larger than 128 resulting in an INDEX_NODE of 8. Commit 8fc9cf420b36 ("slab: make more slab management structure off the slab") enables off-slab management objects for sizes starting with PAGE_SIZE >> 5 (128 bytes for a 4KB page configuration) and the creation of the "kmalloc-128" cache would try to place the management objects off-slab. However, since KMALLOC_MIN_SIZE is already 128 and freelist_size == 32 in __kmem_cache_create(), kmalloc_slab(freelist_size) returns NULL (kmalloc_caches[7] not populated yet). This triggers the following bug on arm64: kernel BUG at /work/Linux/linux-2.6-aarch64/mm/slab.c:2283! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc4+ #540 Hardware name: Juno (DT) PC is at __kmem_cache_create+0x21c/0x280 LR is at __kmem_cache_create+0x210/0x280 [...] Call trace: __kmem_cache_create+0x21c/0x280 create_boot_cache+0x48/0x80 create_kmalloc_cache+0x50/0x88 create_kmalloc_caches+0x4c/0xf4 kmem_cache_init+0x100/0x118 start_kernel+0x214/0x33c This patch introduces an OFF_SLAB_MIN_SIZE definition to avoid off-slab management objects for sizes equal to or smaller than KMALLOC_MIN_SIZE. Fixes: 8fc9cf420b36 ("slab: make more slab management structure off the slab") Signed-off-by: Catalin Marinas Reported-by: Geert Uytterhoeven Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4fcc5dd..461935b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. 
Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && + if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) /* * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ -- cgit v1.1 From 9fbed25407ccc87a7bb47ea3f411e1ca34a95f8b Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 5 Nov 2015 18:45:57 -0800 Subject: mm/kmemleak.c: remove unneeded initialization of object to NULL Few lines below object is reinitialized by lookup_object() so we don't need to init it by NULL in the beginning of find_and_get_object(). Signed-off-by: Alexey Klimov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 77191ec..19423a4 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object) static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { unsigned long flags; - struct kmemleak_object *object = NULL; + struct kmemleak_object *object; rcu_read_lock(); read_lock_irqsave(&kmemleak_lock, flags); -- cgit v1.1 From 86d2adccfbe7d5a1f050fa08db9638c9168736d9 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 5 Nov 2015 18:46:00 -0800 Subject: mm/mlock.c: reorganize mlockall() return values and remove goto-out label In mlockall syscall wrapper after out-label for goto code just doing return. Remove goto out statements and return error values directly. Also instead of rewriting ret variable before every if-check move returns to 'error'-like path under if-check. Objdump asm listing showed me reducing by few asm lines. Object file size descreased from 220592 bytes to 220528 bytes for me (for aarch64). Signed-off-by: Alexey Klimov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 25936680..7e6ad9c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -684,14 +684,13 @@ out: SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; - int ret = -EINVAL; + int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) - goto out; + return -EINVAL; - ret = -EPERM; if (!can_do_mlock()) - goto out; + return -EPERM; if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ @@ -708,7 +707,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) up_write(¤t->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); -out: + return ret; } -- cgit v1.1 From 0ab32b6f1b88444524e52429fab334ff96683a3f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:46:03 -0800 Subject: uaccess: reimplement probe_kernel_address() using probe_kernel_read() probe_kernel_address() is basically the same as the (later added) probe_kernel_read(). 
The return value on EFAULT is a bit different: probe_kernel_address() returns number-of-bytes-not-copied whereas probe_kernel_read() returns -EFAULT. All callers have been checked, none cared. probe_kernel_read() can be overridden by the architecture whereas probe_kernel_address() cannot. parisc, blackfin and um do this, to insert additional checking. Hence this patch possibly fixes obscure bugs, although there are only two probe_kernel_address() callsites outside arch/. My first attempt involved removing probe_kernel_address() entirely and converting all callsites to use probe_kernel_read() directly, but that got tiresome. This patch shrinks mm/slab_common.o by 218 bytes. For a single probe_kernel_address() callsite. Cc: Steven Miao Cc: Jeff Dike Cc: Richard Weinberger Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/maccess.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index 34fe247..1b13638 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -13,6 +13,11 @@ * * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. + * + * We ensure that the copy_from_user is executed in atomic context so that + * do_page_fault() doesn't attempt to take mmap_sem. This makes + * probe_kernel_read() suitable for use within regions where the caller + * already holds mmap_sem, or other locks which nest inside mmap_sem. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) -- cgit v1.1 From 55e1ceaf2586ab11aafba798a6b9499dd7c14441 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:46:06 -0800 Subject: mm/mmap.c: remove useless statement "vma = NULL" in find_vma() Before the main loop, vma is already is NULL. There is no need to set it to NULL again. Signed-off-by: Chen Gang Reviewed-by: Oleg Nesterov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 79bcc9f..bd932c1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2047,7 +2047,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) return vma; rb_node = mm->mm_rb.rb_node; - vma = NULL; while (rb_node) { struct vm_area_struct *tmp; -- cgit v1.1 From 626ebc4100285be56fe3546f29b6afeb36b6871a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:09 -0800 Subject: memcg: flatten task_struct->memcg_oom task_struct->memcg_oom is a sub-struct containing fields which are used for async memcg oom handling. Most task_struct fields aren't packaged this way and it can lead to unnecessary alignment paddings. This patch flattens it. * task.memcg_oom.memcg -> task.memcg_in_oom * task.memcg_oom.gfp_mask -> task.memcg_oom_gfp_mask * task.memcg_oom.order -> task.memcg_oom_order * task.memcg_oom.may_oom -> task.memcg_may_oom In addition, task.memcg_may_oom is relocated to where other bitfields are which reduces the size of task_struct. 
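The saving is purely an alignment effect: a nested struct carries its own padding, while flattened fields (with the flag packed next to existing bitfields) can reuse slack that is already there. A simplified userspace illustration; these structs are stand-ins, not the real task_struct:

  #include <stdio.h>

  struct task_nested {                    /* old layout: sub-struct */
          unsigned int some_flag:1;       /* stands in for existing bitfields */
          struct {
                  void *memcg;
                  unsigned int gfp_mask;
                  int order;
                  unsigned int may_oom:1;
          } memcg_oom;
  };

  struct task_flat {                      /* new layout: flattened */
          unsigned int some_flag:1;
          unsigned int memcg_may_oom:1;   /* packed with the other bitfields */
          void *memcg_in_oom;
          unsigned int memcg_oom_gfp_mask;
          int memcg_oom_order;
  };

  int main(void)
  {
          printf("nested:    %zu bytes\n", sizeof(struct task_nested));
          printf("flattened: %zu bytes\n", sizeof(struct task_flat));
          return 0;
  }

On a typical LP64 ABI this prints 32 vs 24 bytes; the exact numbers are architecture dependent, but the direction of the change is the same.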
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c57c442..47bd7f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1661,7 +1661,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_oom.may_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -1678,9 +1678,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * and when we know whether the fault was overall successful. */ css_get(&memcg->css); - current->memcg_oom.memcg = memcg; - current->memcg_oom.gfp_mask = mask; - current->memcg_oom.order = order; + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; } /** @@ -1702,7 +1702,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) */ bool mem_cgroup_oom_synchronize(bool handle) { - struct mem_cgroup *memcg = current->memcg_oom.memcg; + struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked; @@ -1730,8 +1730,8 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, - current->memcg_oom.order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, + current->memcg_oom_order); } else { schedule(); mem_cgroup_unmark_under_oom(memcg); @@ -1748,7 +1748,7 @@ bool mem_cgroup_oom_synchronize(bool handle) memcg_oom_recover(memcg); } cleanup: - current->memcg_oom.memcg = NULL; + current->memcg_in_oom = NULL; css_put(&memcg->css); return true; } -- cgit v1.1 From b23afb93d317c65cef553b804f08dec8a7a0f7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:11 -0800 Subject: memcg: punt high overage reclaim to return-to-userland path Currently, try_charge() tries to reclaim memory synchronously when the high limit is breached; however, if the allocation doesn't have __GFP_WAIT, synchronous reclaim is skipped. If a process performs only speculative allocations, it can blow way past the high limit. This is actually easily reproducible by simply doing "find /". slab/slub allocator tries speculative allocations first, so as long as there's memory which can be consumed without blocking, it can keep allocating memory regardless of the high limit. This patch makes try_charge() always punt the over-high reclaim to the return-to-userland path. If try_charge() detects that high limit is breached, it adds the overage to current->memcg_nr_pages_over_high and schedules execution of mem_cgroup_handle_over_high() which performs synchronous reclaim from the return-to-userland path. As long as kernel doesn't have a run-away allocation spree, this should provide enough protection while making kmemcg behave more consistently. It also has the following benefits. - All over-high reclaims can use GFP_KERNEL regardless of the specific gfp mask in use, e.g. GFP_NOFS, when the limit was breached. - It copes with prio inversion. Previously, a low-prio task with small memory.high might perform over-high reclaim with a bunch of locks held. 
If a higher prio task needed any of these locks, it would have to wait until the low prio task finished reclaim and released the locks. By handing over-high reclaim to the task exit path this issue can be avoided. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47bd7f1..327dcda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -2080,17 +2106,22 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_WAIT but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += nr_pages; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; -- cgit v1.1 From 10d53c748bc9531f47e13f98e32ef28be4399862 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:17 -0800 Subject: memcg: ratify and consolidate over-charge handling try_charge() is the main charging logic of memcg. When it hits the limit but either can't fail the allocation due to __GFP_NOFAIL or the task is likely to free memory very soon, being OOM killed, has SIGKILL pending or exiting, it "bypasses" the charge to the root memcg and returns -EINTR. While this is one approach which can be taken for these situations, it has several issues. * It unnecessarily lies about the reality. The number itself doesn't go over the limit but the actual usage does. 
memcg is either forced to or actively chooses to go over the limit because that is the right behavior under the circumstances, which is completely fine, but, if at all avoidable, it shouldn't be misrepresenting what's happening by sneaking the charges into the root memcg. * Despite trying, we already do over-charge. kmemcg can't deal with switching over to the root memcg by the point try_charge() returns -EINTR, so it open-codes over-charing. * It complicates the callers. Each try_charge() user has to handle the weird -EINTR exception. memcg_charge_kmem() does the manual over-charging. mem_cgroup_do_precharge() performs unnecessary uncharging of root memcg, which BTW is inconsistent with what memcg_charge_kmem() does but not broken as [un]charging are noops on root memcg. mem_cgroup_try_charge() needs to switch the returned cgroup to the root one. The reality is that in memcg there are cases where we are forced and/or willing to go over the limit. Each such case needs to be scrutinized and justified but there definitely are situations where that is the right thing to do. We alredy do this but with a superficial and inconsistent disguise which leads to unnecessary complications. This patch updates try_charge() so that it over-charges and returns 0 when deemed necessary. -EINTR return is removed along with all special case handling in the callers. While at it, remove the local variable @ret, which was initialized to zero and never changed, along with done: label which just returned the always zero @ret. Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 69 +++++++++++++++++---------------------------------------- 1 file changed, 20 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 327dcda..b952abe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2008,13 +2008,12 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - int ret = 0; if (mem_cgroup_is_root(memcg)) - goto done; + return 0; retry: if (consume_stock(memcg, nr_pages)) - goto done; + return 0; if (!do_swap_account || !page_counter_try_charge(&memcg->memsw, batch, &counter)) { @@ -2042,7 +2041,7 @@ retry: if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current) || current->flags & PF_EXITING)) - goto bypass; + goto force; if (unlikely(task_in_memcg_oom(current))) goto nomem; @@ -2088,10 +2087,10 @@ retry: goto retry; if (gfp_mask & __GFP_NOFAIL) - goto bypass; + goto force; if (fatal_signal_pending(current)) - goto bypass; + goto force; mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); @@ -2099,8 +2098,18 @@ retry: nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -bypass: - return -EINTR; +force: + /* + * The allocation either can't fail or will lead to more memory + * being freed very soon. Allow memory usage go over the limit + * temporarily by force charging it. 
+ */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + return 0; done_restock: css_get_many(&memcg->css, batch); @@ -2123,8 +2132,8 @@ done_restock: break; } } while ((memcg = parent_mem_cgroup(memcg))); -done: - return ret; + + return 0; } static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2216,28 +2225,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; ret = try_charge(memcg, gfp, nr_pages); - if (ret == -EINTR) { - /* - * try_charge() chose to bypass to root due to OOM kill or - * fatal signal. Since our only options are to either fail - * the allocation or charge it to this cgroup, do it as a - * temporary condition. But we can't fail. From a kmem/slab - * perspective, the cache has already been selected, by - * mem_cgroup_kmem_get_cache(), so it is too late to change - * our minds. - * - * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed - * during try_charge() above. Tasks that were already dying - * when the allocation triggers should have been already - * directed to the root cgroup in memcontrol.h - */ - page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); - ret = 0; - } else if (ret) + if (ret) page_counter_uncharge(&memcg->kmem, nr_pages); return ret; @@ -4438,22 +4426,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } - if (ret == -EINTR) { - cancel_charge(root_mem_cgroup, count); - return ret; - } /* Try charges one by one with reclaim */ while (count--) { ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); - /* - * In case of failure, any residual charges against - * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. However, cancel any charges that are - * bypassed to root right away or they'll be lost. - */ - if (ret == -EINTR) - cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -5358,11 +5334,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, ret = try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); - - if (ret == -EINTR) { - memcg = root_mem_cgroup; - ret = 0; - } out: *memcgp = memcg; return ret; -- cgit v1.1 From 61f9ec1d8e97131ce55159647fcdfeccc0f40647 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 5 Nov 2015 18:46:23 -0800 Subject: mm: fix docbook comment for get_vaddr_frames() get_vaddr_frames() has a comment that's *almost* a docbook comment; add the missing star so that the tools will find it properly. 
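For reference, the kernel-doc tooling only picks up comments that open with the two-star marker and follow the kernel-doc layout; a minimal example of the expected shape (the function itself is made up, not part of this patch):

  /**
   * add_two() - add two integers
   * @a: first addend
   * @b: second addend
   *
   * Return: the sum of @a and @b.
   */
  static int add_two(int a, int b)
  {
          return a + b;
  }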
Signed-off-by: Jonathan Corbet Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/frame_vector.c b/mm/frame_vector.c index cdabcb9..7cf2b71 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -7,7 +7,7 @@ #include #include -/* +/** * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map -- cgit v1.1 From 145949a1387ba7a4fd0df15181e09345ec7b0492 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 5 Nov 2015 18:46:26 -0800 Subject: mm/list_lru.c: replace nr_node_ids for loop with for_each_node() The functions used in the patch are in slowpath, which gets called whenever alloc_super is called during mounts. Though this should not make difference for the architectures with sequential numa node ids, for the powerpc which can potentially have sparse node ids (for e.g., 4 node system having numa ids, 0,1,16,17 is common), this patch saves some unnecessary allocations for non existing numa nodes. Even without that saving, perhaps patch makes code more readable. [vdavydov@parallels.com: take memcg_aware check outside for_each loop] Signed-off-by: Raghavendra K T Reviewed-by: Vladimir Davydov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Anton Blanchard Cc: Nishanth Aravamudan Cc: Greg Kurz Cc: Grant Likely Cc: Nikunj A Dadhania Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index e1da19f..2823747 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru) #ifdef CONFIG_MEMCG_KMEM static inline bool list_lru_memcg_aware(struct list_lru *lru) { + /* + * This needs node 0 to be always present, even + * in the systems supporting sparse numa ids. 
+ */ return !!lru->node[0].memcg_lrus; } @@ -377,16 +381,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { int i; - for (i = 0; i < nr_node_ids; i++) { - if (!memcg_aware) - lru->node[i].memcg_lrus = NULL; - else if (memcg_init_list_lru_node(&lru->node[i])) + if (!memcg_aware) + return 0; + + for_each_node(i) { + if (memcg_init_list_lru_node(&lru->node[i])) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; memcg_destroy_list_lru_node(&lru->node[i]); + } return -ENOMEM; } @@ -397,7 +405,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_destroy_list_lru_node(&lru->node[i]); } @@ -409,16 +417,20 @@ static int memcg_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return 0; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { if (memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; + memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); + } return -ENOMEM; } @@ -430,7 +442,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); } @@ -485,7 +497,7 @@ static void memcg_drain_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); } @@ -522,7 +534,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, if (!lru->node) goto out; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); -- cgit v1.1 From b0d61c7e56815b0b881c81f6779a65f4fdae4bc0 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:32 -0800 Subject: mm/msync: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index bb04d53..24e612f 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; -- cgit v1.1 From 1824cb753354e026ab898cd472bddd540b50b00b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:35 -0800 Subject: mm/nommu: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
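The conversion is purely mechanical; the snippet below checks that the two spellings agree, with a 4KB page size assumed and the macro redefined locally for the demo (essentially the definition linux/mm.h uses):

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)
  #define PAGE_MASK  (~(PAGE_SIZE - 1))

  #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

  int main(void)
  {
          unsigned long addr = 0x12345678;

          printf("open-coded: %#lx\n", addr & ~PAGE_MASK);
          printf("macro:      %#lx\n", offset_in_page(addr));
          return 0;
  }

Both lines print 0x678.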
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index ab14a20..1e0f168 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) goto erase_whole_vma; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) return -EINVAL; - if (end != vma->vm_end && end & ~PAGE_MASK) + if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { ret = split_vma(mm, vma, start, 1); @@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr, if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; if (flags & MREMAP_FIXED && new_addr != addr) -- cgit v1.1 From e7bbdd071314b52507e6c615e2cec90d46f82c57 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:38 -0800 Subject: mm/mincore: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index be25efd..14bb9fb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, /* This also avoids any overflows on PAGE_CACHE_ALIGN */ pages = len >> PAGE_SHIFT; - pages += (len & ~PAGE_MASK) != 0; + pages += (offset_in_page(len)) != 0; if (!access_ok(VERIFY_WRITE, vec, pages)) return -EFAULT; -- cgit v1.1 From 5d57b0146aa942b939bbd77e09130270dc9b97d2 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:40 -0800 Subject: mm/early_ioremap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/early_ioremap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 17ae14b..6d5717b 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; + offset = offset_in_page(phys_addr); phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr + 1) - phys_addr; @@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) return; - offset = virt_addr & ~PAGE_MASK; + offset = offset_in_page(virt_addr); nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; @@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) char *p; while (size) { - slop = src & ~PAGE_MASK; + slop = offset_in_page(src); clen = size; if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; -- cgit v1.1 From f09f1243ca2d5d297881bf2c2148d9ab35314314 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:43 -0800 Subject: mm/percpu: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index a63b4d8..8a943b9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); - PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); - PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); - PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); @@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; @@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { -- cgit v1.1 From ea53cde089e07cfd7996c2072f770ebb984ce8db Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:46 -0800 Subject: mm/util: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 68ff8a5..9af1c12 100644 --- a/mm/util.c +++ b/mm/util.c @@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) + if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -- cgit v1.1 From 8fd9e4883a2b08c52ec00f3c214b45d096fc697a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:49 -0800 Subject: mm/mlock: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 7e6ad9c..550228d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -560,7 +560,7 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) @@ -616,7 +616,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); @@ -645,7 +645,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; down_write(¤t->mm->mmap_sem); -- cgit v1.1 From 891c49abfb097bbd7024b4072dd1c8e1c995d3ec Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:51 -0800 Subject: mm/vmalloc: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af3a519..9db9ef5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -358,7 +358,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *first; BUG_ON(!size); - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), @@ -936,7 +936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) void *vaddr = NULL; unsigned int order; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* @@ -989,7 +989,7 @@ static void vb_free(const void *addr, unsigned long size) unsigned int order; struct vmap_block *vb; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); @@ -1902,7 +1902,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -1941,7 +1941,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -2392,7 +2392,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, bool purged = false; /* verify parameters and allocate data structures */ - BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; -- cgit v1.1 From de1741a1333ea37694dddf7c94aa4cf2d0e58912 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:54 -0800 Subject: mm/mmap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bd932c1..3ec19b6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; /* Do simple checking here so the lower-level routines won't have @@ -1473,7 +1473,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1989,7 +1989,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. 
*/ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; @@ -2025,7 +2025,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; addr = arch_rebalance_pgtables(addr, len); @@ -2535,7 +2535,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; struct vm_area_struct *vma, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; len = PAGE_ALIGN(len); @@ -2733,7 +2733,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (error & ~PAGE_MASK) + if (offset_in_page(error)) return error; error = mlock_future_check(mm, mm->def_flags, len); -- cgit v1.1 From f19cb115a25f3f25752fdc56340e7433462157ba Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:57 -0800 Subject: mm/mremap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 5a71cce..c25bc62 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -401,7 +401,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long charged = 0; unsigned long map_flags; - if (new_addr & ~PAGE_MASK) + if (offset_in_page(new_addr)) goto out; if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) @@ -435,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (ret & ~PAGE_MASK) + if (offset_in_page(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); - if (!(ret & ~PAGE_MASK)) + if (!(offset_in_page(ret))) goto out; out1: vm_unacct_memory(charged); @@ -484,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return ret; old_len = PAGE_ALIGN(old_len); @@ -566,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (new_addr & ~PAGE_MASK) { + if (offset_in_page(new_addr)) { ret = new_addr; goto out; } @@ -574,7 +574,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) { + if (offset_in_page(ret)) { vm_unacct_memory(charged); locked = 0; } -- cgit v1.1 From 35bd16a227534cb6ffc9b26a33061c2dcf91934b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:47:00 -0800 Subject: mm/memblock: make memblock_remove_range() static memblock_remove_range() is only used in the mm/memblock.c, so we can make it static. 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 1c7b647..d300f13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -int __init_memblock memblock_remove_range(struct memblock_type *type, +static int __init_memblock memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; -- cgit v1.1 From f2f81fb2b72b83b661b11da6f1b0bd3526706278 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:03 -0800 Subject: mm, migrate: count pages failing all retries in vmstat and tracepoint Migration tries up to 10 times to migrate pages that return -EAGAIN until it gives up. If some pages fail all retries, they are counted towards the number of failed pages that migrate_pages() returns. They should also be counted in the /proc/vmstat pgmigrate_fail and in the mm_migrate_pages tracepoint. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Naoya Horiguchi Cc: "Kirill A. Shutemov" Cc: "Aneesh Kumar K.V" Cc: Konstantin Khlebnikov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 842ecd7..94961f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1169,7 +1169,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, } } } - rc = nr_failed + retry; + nr_failed += retry; + rc = nr_failed; out: if (nr_succeeded) count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); -- cgit v1.1 From b171e4093017d4d6e411f5e97823e5e4a21266a2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:06 -0800 Subject: mm/page_alloc: remove unused parameter in init_currently_empty_zone() Commit a2f3aa025766 ("[PATCH] Fix sparsemem on Cell") fixed an oops experienced on the Cell architecture when init-time functions, early_*(), are called at runtime by introducing an 'enum memmap_context' parameter to memmap_init_zone() and init_currently_empty_zone(). This parameter is intended to be used to tell whether the call of these two functions is being made on behalf of a hotplug event, or happening at boot-time. However, init_currently_empty_zone() does not use this parameter at all, so remove it. 
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- mm/page_alloc.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0780d11..67d488a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages, - MEMMAP_HOTPLUG); + return init_currently_empty_zone(zone, start_pfn, num_pages); + return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 805bbad..c60605d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4900,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone) int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size, - enum memmap_context context) + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -5413,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, - size, MEMMAP_EARLY); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; -- cgit v1.1 From 600e19afc5f8a6c18ea49cee9511c5797db02391 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 5 Nov 2015 18:47:08 -0800 Subject: mm: use only per-device readahead limit Maximal readahead size is limited now by two values: 1) by global 2Mb constant (MAX_READAHEAD in max_sane_readahead()) 2) by configurable per-device value* (bdi->ra_pages) There are devices, which require custom readahead limit. For instance, for RAIDs it's calculated as number of devices multiplied by chunk size times 2. Readahead size can never be larger than bdi->ra_pages * 2 value (POSIX_FADV_SEQUNTIAL doubles readahead size). If so, why do we need two limits? I suggest to completely remove this max_sane_readahead() stuff and use per-device readahead limit everywhere. Also, using right readahead size for RAID disks can significantly increase i/o performance: before: dd if=/dev/md2 of=/dev/null bs=100M count=100 100+0 records in 100+0 records out 10485760000 bytes (10 GB) copied, 12.9741 s, 808 MB/s after: $ dd if=/dev/md2 of=/dev/null bs=100M count=100 100+0 records in 100+0 records out 10485760000 bytes (10 GB) copied, 8.91317 s, 1.2 GB/s (It's an 8-disks RAID5 storage). This patch doesn't change sys_readahead and madvise(MADV_WILLNEED) behavior introduced by 6d2be915e589b58 ("mm/readahead.c: fix readahead failure for memoryless NUMA nodes and limit readahead pages"). 
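With max_sane_readahead() gone, the mmap read-around window is derived from the per-device ra_pages alone. A small userspace model of that arithmetic (the read_around() helper and the struct are made up for illustration), assuming a device with ra_pages = 512, i.e. a 2 MiB window with 4 KiB pages:

	#include <stdio.h>

	struct window { long start, size, async_size; };

	static struct window read_around(long offset, long ra_pages)
	{
		struct window w;

		w.start = offset - ra_pages / 2;	/* centre the window on the faulting page */
		if (w.start < 0)
			w.start = 0;
		w.size = ra_pages;			/* read the full per-device window */
		w.async_size = ra_pages / 4;		/* start the next batch early */
		return w;
	}

	int main(void)
	{
		struct window w = read_around(1000, 512);

		/* prints: start=744 size=512 async=128 */
		printf("start=%ld size=%ld async=%ld\n", w.start, w.size, w.async_size);
		return 0;
	}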
Signed-off-by: Roman Gushchin Cc: Raghavendra K T Cc: Jan Kara Cc: Wu Fengguang Cc: David Rientjes Cc: onstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 +++----- mm/readahead.c | 14 ++------------ 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 327910c..1fe962b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1807,7 +1807,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct file *file, pgoff_t offset) { - unsigned long ra_pages; struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ @@ -1836,10 +1835,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, /* * mmap read-around */ - ra_pages = max_sane_readahead(ra->ra_pages); - ra->start = max_t(long, 0, offset - ra_pages / 2); - ra->size = ra_pages; - ra->async_size = ra_pages / 4; + ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); } diff --git a/mm/readahead.c b/mm/readahead.c index 24682f6..998ad59 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = max_sane_readahead(nr_to_read); + nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); while (nr_to_read) { int err; @@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } -#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) -/* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a - * sensible upper limit. - */ -unsigned long max_sane_readahead(unsigned long nr) -{ - return min(nr, MAX_READAHEAD); -} - /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large @@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = max_sane_readahead(ra->ra_pages); + unsigned long max = ra->ra_pages; pgoff_t prev_offset; /* -- cgit v1.1 From 5d317b2b6536592a9b51fe65faed43d65ca9158e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:14 -0800 Subject: mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status Currently there's no easy way to get per-process usage of hugetlb pages, which is inconvenient because userspace applications which use hugetlb typically want to control their processes on the basis of how much memory (including hugetlb) they use. So this patch simply provides easy access to the info via /proc/PID/status. 
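After this change the per-process hugetlb footprint appears as a "HugetlbPages:" line in /proc/PID/status, reported in kB like the other memory fields. A minimal userspace sketch of reading it, assuming the field name stays exactly "HugetlbPages":

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/self/status", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* e.g. "HugetlbPages:       2048 kB" */
			if (!strncmp(line, "HugetlbPages:", 13))
				fputs(line, stdout);
		}
		fclose(f);
		return 0;
	}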
Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ mm/rmap.c | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9cc7734..abfbe8c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2790,6 +2790,12 @@ void hugetlb_show_meminfo(void) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "HugetlbPages:\t%8lu kB\n", + atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { @@ -3025,6 +3031,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, get_page(ptepage); page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); + hugetlb_count_add(pages_per_huge_page(h), dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -3105,6 +3112,7 @@ again: if (huge_pte_dirty(pte)) set_page_dirty(page); + hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { @@ -3509,6 +3517,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); + hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); diff --git a/mm/rmap.c b/mm/rmap.c index f5b5c1f..d40e7ae 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1352,7 +1352,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (!PageHuge(page)) { + if (PageHuge(page)) { + hugetlb_count_sub(1 << compound_order(page), mm); + } else { if (PageAnon(page)) dec_mm_counter(mm, MM_ANONPAGES); else -- cgit v1.1 From 29d06bbb41595f82db309a5516426ef8bd0f27b7 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:17 -0800 Subject: mm/vmscan: make inactive_anon_is_low_global return directly Delete unnecessary if to let inactive_anon_is_low_global return directly. No functional changes. Signed-off-by: Yaowei Bai Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f63a93..773fc8d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1866,10 +1866,7 @@ static int inactive_anon_is_low_global(struct zone *zone) active = zone_page_state(zone, NR_ACTIVE_ANON); inactive = zone_page_state(zone, NR_INACTIVE_ANON); - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; + return inactive * zone->inactive_ratio < active; } /** -- cgit v1.1 From 21c527a3cba07f9a9ce17b3a445f110a847793e2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:20 -0800 Subject: mm/compaction.c: add an is_via_compact_memory() helper Introduce is_via_compact_memory() helper indicating compacting via /proc/sys/vm/compact_memory to improve readability. To catch this situation in __compaction_suitable, use order as parameter directly instead of using struct compact_control. This patch has no functional changes. 
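The order == -1 convention that is_via_compact_memory() wraps comes from the sysctl path: writing to /proc/sys/vm/compact_memory compacts every zone with cc->order set to -1. A minimal sketch of triggering that from userspace (needs root; error handling trimmed):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

		if (fd < 0)
			return 1;
		/* any write triggers a full, order == -1 compaction pass */
		write(fd, "1", 1);
		close(fd);
		return 0;
	}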
Signed-off-by: Yaowei Bai Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index c5c627a..a8e6593 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1197,6 +1197,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } +/* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { @@ -1223,11 +1232,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_COMPLETE; } - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ @@ -1290,11 +1295,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, int fragindex; unsigned long watermark; - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (order == -1) + if (is_via_compact_memory(order)) return COMPACT_CONTINUE; watermark = low_wmark_pages(zone); @@ -1658,10 +1659,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) * this makes sure we compact the whole zone regardless of * cached scanner positions. */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) __reset_isolation_suitable(zone); - if (cc->order == -1 || !compaction_deferred(zone, cc->order)) + if (is_via_compact_memory(cc->order) || + !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); if (cc->order > 0) { -- cgit v1.1 From aa750fd71c242dba02ee2034e15fbd7d0cdb2461 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Thu, 5 Nov 2015 18:47:23 -0800 Subject: mm/filemap.c: make global sync not clear error status of individual inodes filemap_fdatawait() is a function to wait for on-going writeback to complete but also consume and clear error status of the mapping set during writeback. The latter functionality is critical for applications to detect writeback error with system calls like fsync(2)/fdatasync(2). However filemap_fdatawait() is also used by sync(2) or FIFREEZE ioctl, which don't check error status of individual mappings. As a result, fsync() may not be able to detect writeback error if events happen in the following order: Application System admin ---------------------------------------------------------- write data on page cache Run sync command writeback completes with error filemap_fdatawait() clears error fsync returns success (but the data is not on disk) This patch adds filemap_fdatawait_keep_errors() for call sites where writeback error is not handled so that they don't clear error status. 
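The intended split is: callers that report errors to userspace (fsync/fdatasync paths) keep using filemap_fdatawait(), which consumes the error bits, while global flushers that ignore the result switch to the new helper. A rough kernel-context sketch of such a caller after the conversion (flush_one_mapping() is a hypothetical name, not a function added by this patch):

	static void flush_one_mapping(struct address_space *mapping)
	{
		/*
		 * filemap_fdatawait() would clear AS_EIO/AS_ENOSPC here, and a
		 * later fsync() on the file could then falsely report success;
		 * the _keep_errors variant waits without consuming the error bits.
		 */
		filemap_fdatawait_keep_errors(mapping);
	}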
Signed-off-by: Jun'ichi Nomura Acked-by: Andi Kleen Reviewed-by: Tejun Heo Cc: Fengguang Wu Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1fe962b..884766d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait_range - wait for writeback to complete - * @mapping: address space structure to wait for - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. - */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, - loff_t end_byte) +static int __filemap_fdatawait_range(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret2, ret = 0; + int ret = 0; if (end_byte < start_byte) goto out; @@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, cond_resched(); } out: + return ret; +} + +/** + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. Check error status of + * the address space and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) +{ + int ret, ret2; + + ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); ret2 = filemap_check_errors(mapping); if (!ret) ret = ret2; @@ -383,11 +397,38 @@ out: EXPORT_SYMBOL(filemap_fdatawait_range); /** + * filemap_fdatawait_keep_errors - wait for writeback without clearing errors + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. Unlike filemap_fdatawait(), this function + * does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), + * fsfreeze(8) + */ +void filemap_fdatawait_keep_errors(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return; + + __filemap_fdatawait_range(mapping, 0, i_size - 1); +} + +/** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space - * and wait for all of them. + * and wait for all of them. Check error status of the address space + * and return it. 
+ * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. */ int filemap_fdatawait(struct address_space *mapping) { -- cgit v1.1 From a5f65109026b35b654b94fdcd26a971185a53adc Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:26 -0800 Subject: mm: hwpoison: ratelimit messages from unpoison_memory() Currently kernel prints out results of every single unpoison event, which i= s not necessary because unpoison is purely a testing feature and testers can = get little or no information from lots of lines of unpoison log storm. So this patch ratelimits printk in unpoison_memory(). This patch introduces a file local ratelimit_state, which adds 64 bytes to memory-failure.o. If we apply pr_info_ratelimited() for 8 callsite below, 2= 56 bytes is added, so it's a win. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9588269..16a0ec3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void) } core_initcall(memory_failure_init); +#define unpoison_pr_info(fmt, pfn, rs) \ +({ \ + if (__ratelimit(rs)) \ + pr_info(fmt, pfn); \ +}) + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn) struct page *p; int freeit = 0; unsigned int nr_pages; + static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (!pfn_valid(pfn)) return -ENXIO; @@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_count(page) > 1) { - pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); + unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_mapped(page)) { - pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); + unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_mapping(page)) { - pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", - pfn); + unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn) * In such case, we yield to memory_failure() and make unpoison fail. */ if (!PageHuge(page) && PageTransHuge(page)) { - pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn) * to the end. 
*/ if (PageHuge(page)) { - pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", + pfn, &unpoison_rs); return 0; } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", + pfn, &unpoison_rs); num_poisoned_pages_sub(nr_pages); freeit = 1; if (PageHuge(page)) -- cgit v1.1 From 3608de0787e51d3d826656e105524b48ade7b16f Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 5 Nov 2015 18:47:29 -0800 Subject: mm/memcontrol.c: fix order calculation in try_charge() Since commit 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()"), the order to pass to mem_cgroup_oom() is calculated by passing the number of pages to get_order() instead of the expected size in bytes. AFAICT, it only affects the value displayed in the oom warning message. This patch fix this. Michal said: : We haven't noticed that just because the OOM is enabled only for page : faults of order-0 (single page) and get_order work just fine. Thanks for : noticing this. If we ever start triggering OOM on different orders this : would be broken. Signed-off-by: Jerome Marchand Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b952abe..a1c05ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2094,7 +2094,8 @@ retry: mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -- cgit v1.1 From 42e2e45777a8c2ec32b6a3c3d81a7d454f6afb6d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:36 -0800 Subject: mm/vmscan: make inactive_anon/file_is_low return bool Make inactive_anon/file_is_low return bool due to these particular functions only using either one or zero as their return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 773fc8d..38d0481 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1859,7 +1859,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } #ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static bool inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1876,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct lruvec *lruvec) +static bool inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation * is pointless. 
*/ if (!total_swap_pages) - return 0; + return false; if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(lruvec); @@ -1891,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec) return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct lruvec *lruvec) +static inline bool inactive_anon_is_low(struct lruvec *lruvec) { - return 0; + return false; } #endif @@ -1911,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_file_is_low(struct lruvec *lruvec) { unsigned long inactive; unsigned long active; @@ -1922,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec) return active > inactive; } -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { if (is_file_lru(lru)) return inactive_file_is_low(lruvec); -- cgit v1.1 From 426fb5e72d92b868912e47a1e3ca2df6eabc3872 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:44 -0800 Subject: mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL It was confirmed that a local unprivileged user can consume all memory reserves and hang up that system using time lag between the OOM killer sets TIF_MEMDIE on an OOM victim and sends SIGKILL to that victim, for printk() inside for_each_process() loop at oom_kill_process() can consume many seconds when there are many thread groups sharing the same memory. Before starting oom-depleter process: Node 0 DMA: 3*4kB (UM) 6*8kB (U) 4*16kB (UEM) 0*32kB 0*64kB 1*128kB (M) 2*256kB (EM) 2*512kB (UE) 2*1024kB (EM) 1*2048kB (E) 1*4096kB (M) = 9980kB Node 0 DMA32: 31*4kB (UEM) 27*8kB (UE) 32*16kB (UE) 13*32kB (UE) 14*64kB (UM) 7*128kB (UM) 8*256kB (UM) 8*512kB (UM) 3*1024kB (U) 4*2048kB (UM) 362*4096kB (UM) = 1503220kB As of invoking the OOM killer: Node 0 DMA: 11*4kB (UE) 8*8kB (UEM) 6*16kB (UE) 2*32kB (EM) 0*64kB 1*128kB (U) 3*256kB (UEM) 2*512kB (UE) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7308kB Node 0 DMA32: 1049*4kB (UEM) 507*8kB (UE) 151*16kB (UE) 53*32kB (UEM) 83*64kB (UEM) 52*128kB (EM) 25*256kB (UEM) 11*512kB (M) 6*1024kB (UM) 1*2048kB (M) 0*4096kB = 44556kB Between the thread group leader got TIF_MEMDIE and receives SIGKILL: Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB The oom-depleter's thread group leader which got TIF_MEMDIE started memset() in user space after the OOM killer set TIF_MEMDIE, and it was free to abuse ALLOC_NO_WATERMARKS by TIF_MEMDIE for memset() in user space until SIGKILL is delivered. If SIGKILL is delivered before TIF_MEMDIE is set, the oom-depleter can terminate without touching memory reserves. Although the possibility of hitting this time lag is very small for 3.19 and earlier kernels because TIF_MEMDIE is set immediately before sending SIGKILL, preemption or long interrupts (an extreme example is SysRq-t) can step between and allow memory allocations which are not needed for terminating the OOM victim. 
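In code terms the fix is only an ordering change inside oom_kill_process(); a condensed sketch of the before/after sequence (surrounding context trimmed):

	/* before: TIF_MEMDIE first, SIGKILL later - the window being exploited */
	mark_oom_victim(victim);
	/* ... potentially long printk loop over every process ... */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);

	/* after: kill first, only then hand out the memory-reserve privilege */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
	mark_oom_victim(victim);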
Fixes: 83363b917a29 ("oom: make sure that TIF_MEMDIE is set under task_lock") Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ecc0bc..8ad35aa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -554,6 +554,12 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; + /* + * We should send SIGKILL before setting TIF_MEMDIE in order to prevent + * the OOM victim from depleting the memory reserves from the user + * space under its control. + */ + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -585,7 +591,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, } rcu_read_unlock(); - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); put_task_struct(victim); } #undef K -- cgit v1.1 From 880b768937e90c433c0c8254a22b1eb63df005a4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:51 -0800 Subject: mm/oom_kill.c: fix potentially killing unrelated process At the for_each_process() loop in oom_kill_process(), we are comparing address of OOM victim's mm without holding a reference to that mm. If there are a lot of processes to compare or a lot of "Kill process %d (%s) sharing same memory" messages to print, for_each_process() loop could take very long time. It is possible that meanwhile the OOM victim exits and releases its mm, and then mm is allocated with the same address and assigned to some unrelated process. When we hit such race, the unrelated process will be killed by error. To make sure that the OOM victim's mm does not go away until for_each_process() loop finishes, get a reference on the OOM victim's mm before calling task_unlock(victim). [oleg@redhat.com: several fixes] Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ad35aa..5ba743a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -552,8 +552,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, victim = p; } - /* mm cannot safely be dereferenced after task_unlock(victim) */ + /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; + atomic_inc(&mm->mm_count); /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user @@ -591,6 +592,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, } rcu_read_unlock(); + mmdrop(mm); put_task_struct(victim); } #undef K -- cgit v1.1 From 840807a8f40bb25a8df5b6412bba6bc156643be5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:54 -0800 Subject: mm/oom_kill.c: suppress unnecessary "sharing same memory" message oom_kill_process() sends SIGKILL to other thread groups sharing victim's mm. But printing "Kill process %d (%s) sharing same memory\n" lines makes no sense if they already have pending SIGKILL. 
This patch reduces the "Kill process" lines by printing that line with info level only if SIGKILL is not pending. Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5ba743a..c170d9f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -583,9 +583,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, !(p->flags & PF_KTHREAD)) { if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; + if (fatal_signal_pending(p)) + continue; task_lock(p); /* Protect ->comm from prctl() */ - pr_err("Kill process %d (%s) sharing same memory\n", + pr_info("Kill process %d (%s) sharing same memory\n", task_pid_nr(p), p->comm); task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); -- cgit v1.1 From fa6c7b46aaa0cc00846703e8c0ec1e1636ff25ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:56 -0800 Subject: mm, compaction: export tracepoints status strings to userspace Some compaction tracepoints convert the integer return values to strings using the compaction_status_string array. This works for in-kernel printing, but not userspace trace printing of raw captured trace such as via trace-cmd report. This patch converts the private array to appropriate tracepoint macros that result in proper userspace support. trace-cmd output before: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret= after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret=partial Signed-off-by: Vlastimil Babka Reviewed-by: Steven Rostedt Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index a8e6593..a5849c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#ifdef CONFIG_TRACEPOINTS -static const char *const compaction_status_string[] = { - "deferred", - "skipped", - "continue", - "partial", - "complete", - "no_suitable_page", - "not_suitable_zone", -}; -#endif #define CREATE_TRACE_POINTS #include -- cgit v1.1 From 2d1e10412c2388ff9b6afc60536eaa195a419289 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:48:02 -0800 Subject: mm, compaction: distinguish contended status in tracepoints Compaction returns prematurely with COMPACT_PARTIAL when contended or has fatal signal pending. This is ok for the callers, but might be misleading in the traces, as the usual reason to return COMPACT_PARTIAL is that we think the allocation should succeed. After this patch we distinguish the premature ending condition in the mm_compaction_finished and mm_compaction_end tracepoints. The contended status covers the following reasons: - lock contention or need_resched() detected in async compaction - fatal signal pending - too many pages isolated in the zone (only for async compaction) Further distinguishing the exact reason seems unnecessary for now. 
Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index a5849c4..de3e1e7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1202,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) - return COMPACT_PARTIAL; + return COMPACT_CONTENDED; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { @@ -1393,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: - ret = COMPACT_PARTIAL; + ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; @@ -1424,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * and we want compact_finished() to detect it */ if (err == -ENOMEM && !compact_scanners_met(cc)) { - ret = COMPACT_PARTIAL; + ret = COMPACT_CONTENDED; goto out; } } @@ -1477,6 +1477,9 @@ out: trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); + if (ret == COMPACT_CONTENDED) + ret = COMPACT_PARTIAL; + return ret; } -- cgit v1.1 From da39da3a54fed88e29024f2f1f6cd7357cd03a44 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:48:05 -0800 Subject: mm, oom: remove task_lock protecting comm printing The oom killer takes task_lock() in a couple of places solely to protect printing the task's comm. A process's comm, including current's comm, may change due to /proc/pid/comm or PR_SET_NAME. The comm will always be NULL-terminated, so the worst race scenario would only be during update. We can tolerate a comm being printed that is in the middle of an update to avoid taking the lock. Other locations in the kernel have already dropped task_lock() when printing comm, so this is consistent. 
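The races being tolerated here come from comm updates such as prctl(PR_SET_NAME) or writes to /proc/pid/comm; the buffer stays NUL-terminated throughout, so an unlocked reader can at worst observe a half-updated name. A minimal userspace example of one writer the OOM killer might race with:

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		char name[16];	/* comm is TASK_COMM_LEN (16) bytes, including the NUL */

		prctl(PR_SET_NAME, "oom-test-worker");
		prctl(PR_GET_NAME, name);
		printf("comm is now: %s\n", name);
		return 0;
	}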
Signed-off-by: David Rientjes Suggested-by: Oleg Nesterov Cc: Michal Hocko Cc: Vladimir Davydov Cc: Sergey Senozhatsky Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c170d9f..58f3d27 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -377,13 +377,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " "oom_score_adj=%hd\n", current->comm, oc->gfp_mask, oc->order, current->signal->oom_score_adj); - cpuset_print_task_mems_allowed(current); - task_unlock(current); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) mem_cgroup_print_oom_info(memcg, p); @@ -509,10 +507,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, if (__ratelimit(&oom_rs)) dump_header(oc, p, memcg); - task_lock(p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); - task_unlock(p); /* * If any of p's children has a different mm and is eligible for kill, @@ -586,10 +582,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, if (fatal_signal_pending(p)) continue; - task_lock(p); /* Protect ->comm from prctl() */ pr_info("Kill process %d (%s) sharing same memory\n", task_pid_nr(p), p->comm); - task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v1.1 From d031a157915e0508ffa1ab9f1bbf977257529cb4 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 5 Nov 2015 18:48:08 -0800 Subject: mm/vmscan.c: fix types of some locals In zone_reclaimable_pages(), `nr' is returned by a function which is declared as returning "unsigned long", so declare it such. Negative values are meaningless here. In zone_pagecache_reclaimable() we should also declare `delta' and `nr_pagecache_reclaimable' as being unsigned longs because they're used to store the values returned by zone_page_state() and zone_unmapped_file_pages() which also happen to return unsigned integers. 
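The underlying concern is plain truncation and sign problems when a value that is "unsigned long" in the VM counters is squeezed into an int. A tiny illustration of what can go wrong on an LP64 machine, using a deliberately exaggerated page count:

	#include <stdio.h>

	int main(void)
	{
		unsigned long pages = 3UL * 1024 * 1024 * 1024;	/* ~3 billion pages */
		int nr = pages;		/* the old declaration in zone_reclaimable_pages() */

		/* prints something like: as unsigned long: 3221225472, as int: -1073741824 */
		printf("as unsigned long: %lu, as int: %d\n", pages, nr);
		return 0;
	}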
[akpm@linux-foundation.org: make zone_pagecache_reclaimable() return ulong rather than long] Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 38d0481..fdd89978 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc) static unsigned long zone_reclaimable_pages(struct zone *zone) { - int nr; + unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); @@ -3693,10 +3693,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long zone_pagecache_reclaimable(struct zone *zone) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered -- cgit v1.1 From 9fd745d450e7e2b0d2f1b386b886e7d568b64404 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:48:11 -0800 Subject: mm: fix overflow in find_zone_movable_pfns_for_nodes() If the user set "movablecore=xx" to a large number, corepages will overflow. Fix the problem. Signed-off-by: Xishi Qiu Reviewed-by: Yasuaki Ishimatsu Acked-by: Tang Chen Acked-by: David Rientjes Cc: Mel Gorman Cc: Tang Chen Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c60605d..4aed338 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5666,6 +5666,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ required_movablecore = roundup(required_movablecore, MAX_ORDER_NR_PAGES); + required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; required_kernelcore = max(required_kernelcore, corepages); -- cgit v1.1 From 87e8827b37c0c391d9915d0dc6a06c9b5f9cac65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:14 -0800 Subject: mm: fix the racy mm->locked_vm change in "mm->locked_vm += grow" and vm_stat_account() in acct_stack_growth() are not safe; multiple threads using the same ->mm can do this at the same time trying to expans different vma's under down_read(mmap_sem). This means that one of the "locked_vm += grow" changes can be lost and we can miss munlock_vma_pages_all() later. Move this code into the caller(s) under mm->page_table_lock. All other updates to ->locked_vm hold mmap_sem for writing. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Cc: Andrey Konovalov Cc: Davidlohr Bueso Cc: "Kirill A. 
Shutemov" Cc: Sasha Levin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3ec19b6..d1ac224 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2138,10 +2138,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; - /* Ok, everything looks good - let it rip */ - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } @@ -2202,6 +2198,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * against concurrent vma expansions. */ spin_lock(&vma->vm_mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2273,6 +2273,10 @@ int expand_downwards(struct vm_area_struct *vma, * against concurrent vma expansions. */ spin_lock(&vma->vm_mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; -- cgit v1.1 From 09357814778a38a5ab2d031cba6c9e9fe090c849 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:17 -0800 Subject: mm: add the "struct mm_struct *mm" local into Cosmetic, but expand_upwards() and expand_downwards() overuse vma->vm_mm, a local variable makes sense imho. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Cc: Andrey Konovalov Cc: Davidlohr Bueso Cc: "Kirill A. Shutemov" Cc: Sasha Levin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d1ac224..3204a7e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2148,6 +2148,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; if (!(vma->vm_flags & VM_GROWSUP)) @@ -2197,10 +2198,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. 
*/ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; @@ -2208,8 +2209,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - vma->vm_mm->highest_vm_end = address; - spin_unlock(&vma->vm_mm->page_table_lock); + mm->highest_vm_end = address; + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -2217,7 +2218,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2228,6 +2229,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; /* @@ -2272,17 +2274,17 @@ int expand_downwards(struct vm_area_struct *vma, * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. */ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -2290,7 +2292,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } -- cgit v1.1 From 0c1b2d783cf3432490bf1e532c742fffeadc0bf3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:20 -0800 Subject: mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process() The fatal_signal_pending() was added to suppress unnecessary "sharing same memory" message, but it can't 100% help anyway because it can be false-negative; SIGKILL can be already dequeued. And worse, it can be false-positive due to exec or coredump. exec is mostly fine, but coredump is not. It is possible that the group leader has the pending SIGKILL because its sub-thread originated the coredump, in this case we must not skip this process. We could probably add the additional ->group_exit_task check but this patch just removes the wrong check along with pr_info(). 
Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 58f3d27..c837d06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -579,11 +579,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, !(p->flags & PF_KTHREAD)) { if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; - if (fatal_signal_pending(p)) - continue; - pr_info("Kill process %d (%s) sharing same memory\n", - task_pid_nr(p), p->comm); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v1.1 From c319025a6c79e532d862e3a0b9506ba316a4d13a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:23 -0800 Subject: mm/oom_kill: cleanup the "kill sharing same memory" loop Purely cosmetic, but the complex "if" condition looks annoying to me. Especially because it is not consistent with OOM_SCORE_ADJ_MIN check which adds another if/continue. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Tetsuo Handa Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c837d06..2b6e880 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -574,14 +574,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * pending fatal signal. */ rcu_read_lock(); - for_each_process(p) - if (p->mm == mm && !same_thread_group(p, victim) && - !(p->flags & PF_KTHREAD)) { - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - continue; + for_each_process(p) { + if (p->mm != mm) + continue; + if (same_thread_group(p, victim)) + continue; + if (unlikely(p->flags & PF_KTHREAD)) + continue; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); - } + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + } rcu_read_unlock(); mmdrop(mm); -- cgit v1.1 From 4d7b3394f76ed72cfdec23ca5571dbab6ec41793 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:26 -0800 Subject: mm/oom_kill: fix the wrong task->mm == mm checks in oom_kill_process() Both "child->mm == mm" and "p->mm != mm" checks in oom_kill_process() are wrong. task->mm can be NULL if the task is the exited group leader. This means in particular that "kill sharing same memory" loop can miss a process with a zombie leader which uses the same ->mm. Note: the process_has_mm(child, p->mm) check is still not 100% correct, p->mm can be NULL too. This is minor, but probably deserves a fix or a comment anyway. [akpm@linux-foundation.org: document process_shares_mm() a bit] Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2b6e880..e477828 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -474,6 +474,24 @@ void oom_killer_enable(void) oom_killer_disabled = false; } +/* + * task->mm can be NULL if the task is the exited group leader. 
So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -521,7 +539,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; - if (child->mm == p->mm) + if (process_shares_mm(child, p->mm)) continue; /* * oom_badness() returns 0 if the thread is unkillable @@ -575,7 +593,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ rcu_read_lock(); for_each_process(p) { - if (p->mm != mm) + if (!process_shares_mm(p, mm)) continue; if (same_thread_group(p, victim)) continue; -- cgit v1.1 From e6ee219fdd69c87ceaeb421bcd753a63937f8f31 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:32 -0800 Subject: mm/mmap.c: remove redundant statement "error = -ENOMEM" It is still a little better to remove it, although it should be skipped by "-O2". Signed-off-by: Chen Gang =0A= Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3204a7e..28d1b35 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1562,7 +1562,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Clear old maps */ - error = -ENOMEM; while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) -- cgit v1.1 From 1e3ee14b9355a688ffe24725fa746ab120c42881 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:35 -0800 Subject: mm/mmap.c: do not initialize retval in mmap_pgoff() When fget() fails we can return -EBADF directly. Signed-off-by: Chen Gang Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 28d1b35..7e69f30 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, pgoff) { struct file *file = NULL; - unsigned long retval = -EBADF; + unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) - goto out; + return -EBADF; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); retval = -EINVAL; @@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, out_fput: if (file) fput(file); -out: return retval; } -- cgit v1.1 From c9427bc043da23de03d142c3c87ce4a57297c471 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 5 Nov 2015 18:48:38 -0800 Subject: mm/nommu.c: drop unlikely inside BUG_ON() (1) For !CONFIG_BUG cases, the bug call is a no-op, so we couldn't care less and the change is ok. (2) ppc and mips, which HAVE_ARCH_BUG_ON, do not rely on branch predictions as it seems to be pointless[1] and thus callers should not be trying to push an optimization in the first place. 
(3) For CONFIG_BUG and !HAVE_ARCH_BUG_ON cases, BUG_ON() contains an unlikely compiler flag already. Hence, we can drop unlikely behind BUG_ON(). [1] http://lkml.iu.edu/hypermail/linux/kernel/1101.3/02289.html Signed-off-by: Geliang Tang Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 1e0f168..92be862 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(last->vm_end <= last->vm_start)); - BUG_ON(unlikely(last->vm_top < last->vm_end)); + BUG_ON(last->vm_end <= last->vm_start); + BUG_ON(last->vm_top < last->vm_end); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(region->vm_end <= region->vm_start)); - BUG_ON(unlikely(region->vm_top < region->vm_end)); - BUG_ON(unlikely(region->vm_start < last->vm_top)); + BUG_ON(region->vm_end <= region->vm_start); + BUG_ON(region->vm_top < region->vm_end); + BUG_ON(region->vm_start < last->vm_top); lastp = p; } -- cgit v1.1 From 27f28b972e12a4080e5f5e4eb36b8224705652d4 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:41 -0800 Subject: mm/mmap.c: change __install_special_mapping() args order Make __install_special_mapping() args order match the caller, so the caller can pass their register args directly to callee with no touch. For most of architectures, args (at least the first 5th args) are in registers, so this change will have effect on most of architectures. For -O2, __install_special_mapping() may be inlined under most of architectures, but for -Os, it should not. So this change can get a little better performance for -Os, at least. 
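The win from matching the argument order is easiest to see with a thin wrapper that forwards its arguments: when wrapper and helper take them in the same order, the values are already sitting in the argument registers (rdi/rsi/rdx/rcx/r8/r9 on x86-64) and the compiler does not have to shuffle them before the call. A small standalone illustration (all names are made up); compile with "gcc -Os -S" and compare the two wrappers:

	__attribute__((noinline))
	static long helper(long a, long b, long c, long d, long e, long f)
	{
		return a + b + c + d + e + f;
	}

	long wrapper_same_order(long a, long b, long c, long d, long e, long f)
	{
		/* arguments can be passed through untouched, often as a tail call */
		return helper(a, b, c, d, e, f);
	}

	long wrapper_swapped(long a, long b, long c, long d, long e, long f)
	{
		/* the last two arguments must be moved between registers first */
		return helper(a, b, c, d, f, e);
	}

When the helper is not inlined, the swapped variant typically needs extra register moves before the call, which is the effect the reordering of __install_special_mapping() avoids.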
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 7e69f30..220effd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3052,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma, static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_operations_struct *ops, - void *priv) + unsigned long vm_flags, void *priv, + const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; @@ -3102,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping( unsigned long addr, unsigned long len, unsigned long vm_flags, const struct vm_special_mapping *spec) { - return __install_special_mapping(mm, addr, len, vm_flags, - &special_mapping_vmops, (void *)spec); + return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, + &special_mapping_vmops); } int install_special_mapping(struct mm_struct *mm, @@ -3111,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm, unsigned long vm_flags, struct page **pages) { struct vm_area_struct *vma = __install_special_mapping( - mm, addr, len, vm_flags, &legacy_special_mapping_vmops, - (void *)pages); + mm, addr, len, vm_flags, (void *)pages, + &legacy_special_mapping_vmops); return PTR_ERR_OR_ZERO(vma); } -- cgit v1.1 From c2d42c16ad83006a706d83e51a7268db04af733a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:48:43 -0800 Subject: mm/vmstat.c: uninline node_page_state() With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of stack. Uninlining node_page_state() reduces this to 440 bytes. The stack consumption issue is fixed by newer gcc (4.8.4) however with that compiler this patch reduces the node.o text size from 7314 bytes to 4578. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index fbf1448..ffcb4f5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) else __inc_zone_state(z, NUMA_OTHER); } + +/* + * Determine the per node value of a stat item. 
+ */ +unsigned long node_page_state(int node, enum zone_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + + return +#ifdef CONFIG_ZONE_DMA + zone_page_state(&zones[ZONE_DMA], item) + +#endif +#ifdef CONFIG_ZONE_DMA32 + zone_page_state(&zones[ZONE_DMA32], item) + +#endif +#ifdef CONFIG_HIGHMEM + zone_page_state(&zones[ZONE_HIGHMEM], item) + +#endif + zone_page_state(&zones[ZONE_NORMAL], item) + + zone_page_state(&zones[ZONE_MOVABLE], item); +} + #endif #ifdef CONFIG_COMPACTION -- cgit v1.1 From a1c34a3bf00af2cede839879502e12dc68491ad5 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 5 Nov 2015 18:48:46 -0800 Subject: mm: Don't offset memmap for flatmem Srinivas Kandagatla reported bad page messages when trying to remove the bottom 2MB on an ARM based IFC6410 board BUG: Bad page state in process swapper pfn:fffa8 page:ef7fb500 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x96640253(locked|error|dirty|active|arch_1|reclaim|mlocked) page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: flags: 0x200041(locked|active|mlocked) Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 3.19.0-rc3-00007-g412f9ba-dirty #816 Hardware name: Qualcomm (Flattened Device Tree) unwind_backtrace show_stack dump_stack bad_page free_pages_prepare free_hot_cold_page __free_pages free_highmem_page mem_init start_kernel Disabling lock debugging due to kernel taint Removing the lower 2MB made the start of the lowmem zone to no longer be page block aligned. IFC6410 uses CONFIG_FLATMEM where alloc_node_mem_map allocates memory for the mem_map. alloc_node_mem_map will offset for unaligned nodes with the assumption the pfn/page translation functions will account for the offset. The functions for CONFIG_FLATMEM do not offset however, resulting in overrunning the memmap array. Just use the allocated memmap without any offset when running with CONFIG_FLATMEM to avoid the overrun. Signed-off-by: Laura Abbott Signed-off-by: Laura Abbott Reported-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Acked-by: Vlastimil Babka Tested-by: Bjorn Andersson Cc: Santosh Shilimkar Cc: Russell King Cc: Kevin Hilman Cc: Arnd Bergman Cc: Stephen Boyd Cc: Andy Gross Cc: Mel Gorman Cc: Steven Rostedt Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aed338..86f7d95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5421,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { + unsigned long __maybe_unused offset = 0; + /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; @@ -5437,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. 
*/ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + offset = pgdat->node_start_pfn - start; end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); @@ -5444,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) if (!map) map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); - pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + pgdat->node_mem_map = map + offset; } #ifndef CONFIG_NEED_MULTIPLE_NODES /* @@ -5452,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); + mem_map -= offset; #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -- cgit v1.1 From a2c1aad3b5fccbb948878b75f9b8f13248666fd6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 5 Nov 2015 18:48:52 -0800 Subject: mm/vmacache: inline vmacache_valid_mm() This function incurs in very hot paths and merely does a few loads for validity check. Lets inline it, such that we can save the function call overhead. (akpm: this is cosmetic - the compiler already inlines vmacache_valid_mm()) Signed-off-by: Davidlohr Bueso Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmacache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmacache.c b/mm/vmacache.c index b6e3662..fd09dc9 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm) * Also handle the case where a kernel thread has adopted this mm via use_mm(). * That kernel thread's vmacache is not applicable to this mm. */ -static bool vmacache_valid_mm(struct mm_struct *mm) +static inline bool vmacache_valid_mm(struct mm_struct *mm) { return current->mm == mm && !(current->flags & PF_KTHREAD); } -- cgit v1.1 From bde304bdf4ec4a5f58cc1e90fe2d9cd2d96304c4 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:48:56 -0800 Subject: mm/page_alloc.c: skip ZONE_MOVABLE if required_kernelcore is larger than totalpages If kernelcore was not specified, or the kernelcore size is zero (required_movablecore >= totalpages), or the kernelcore size is larger than totalpages, there is no ZONE_MOVABLE. We should fill the zone with both kernel memory and movable memory. Signed-off-by: Xishi Qiu Reviewed-by: Yasuaki Ishimatsu Cc: Mel Gorman Cc: David Rientjes Cc: Tang Chen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86f7d95..06e6230 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5675,8 +5675,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) required_kernelcore = max(required_kernelcore, corepages); } - /* If kernelcore was not specified, there is no ZONE_MOVABLE */ - if (!required_kernelcore) + /* + * If kernelcore was not specified or kernelcore size is larger + * than totalpages, there is no ZONE_MOVABLE. 
+ */ + if (!required_kernelcore || required_kernelcore >= totalpages) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ -- cgit v1.1 From d05e83a6f861ad02c2fcba75d4c4cfe49e3bc90f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:48:59 -0800 Subject: memcg: simplify charging kmem pages Charging kmem pages proceeds in two steps. First, we try to charge the allocation size to the memcg the current task belongs to, then we allocate a page and "commit" the charge storing the pointer to the memcg in the page struct. Such a design looks overcomplicated, because there is not much sense in trying charging the allocation before actually allocating a page: we won't be able to consume much memory over the limit even if we charge after doing the actual allocation, besides we already charge user pages post factum, so being pedantic with kmem pages just looks pointless. So this patch simplifies the design by merging the "charge" and the "commit" steps into the same function, which takes the allocated page. Also, rename the charge and uncharge methods to memcg_kmem_charge and memcg_kmem_uncharge and make the charge method return error code instead of bool to conform to mem_cgroup_try_charge. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 39 ++++----------------------------------- mm/page_alloc.c | 18 +++++++++--------- 2 files changed, 13 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1c05ff..e249279 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2404,57 +2404,26 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. - */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret; - *_memcg = NULL; - memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) { css_put(&memcg->css); - return true; + return 0; } ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; css_put(&memcg->css); - return (ret == 0); -} - -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) -{ - VM_BUG_ON(mem_cgroup_is_root(memcg)); - - /* The page allocation failed. 
Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } page->mem_cgroup = memcg; + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06e6230..446bb36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag); struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages(gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages_node(nid, gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) */ void __free_kmem_pages(struct page *page, unsigned int order) { - memcg_kmem_uncharge_pages(page, order); + memcg_kmem_uncharge(page, order); __free_pages(page, order); } -- cgit v1.1 From f3ccb2c42297757d2e9b820ad37960462df7b7c1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:01 -0800 Subject: memcg: unify slab and other kmem pages charging We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and uncharging kmem pages to memcg, but currently they are not used for charging slab pages (i.e. they are only used for charging pages allocated with alloc_kmem_pages). The only reason why the slab subsystem uses special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it needs to charge to the memcg of kmem cache while memcg_charge_kmem charges to the memcg that the current task belongs to. To remove this diversity, this patch adds an extra argument to __memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is not NULL, the function tries to charge to the memcg it points to, otherwise it charge to the current context. Next, it makes the slab subsystem use this function to charge slab pages. Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since __memcg_kmem_charge stores a pointer to the memcg in the page struct, we don't need memcg_uncharge_slab anymore and can use free_kmem_pages. Besides, one can now detect which memcg a slab page belongs to by reading /proc/kpagecgroup. Note, this patch switches slab to charge-after-alloc design. Since this design is already used for all other memcg charges, it should not make any difference. 
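For readers skimming the series, a simplified sketch of the charge-after-alloc pattern that this patch and the previous one converge on (paraphrased from the hunks below; the wrapper name is made up and error handling is trimmed):

static struct page *alloc_charged_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/* allocate first ... */
	page = alloc_pages(gfp_mask, order);
	/* ... then try to charge the memcg, undoing the allocation on failure */
	if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
		__free_pages(page, order);
		page = NULL;
	}
	return page;
}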
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 71 +++++++++++++++++++++++++++------------------------------ mm/slab.c | 12 +++++----- mm/slab.h | 24 ++++++------------- mm/slub.c | 12 ++++------ 4 files changed, 51 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e249279..9575cff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2215,34 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2404,36 +2376,59 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(current->mm); + unsigned int nr_pages = 1 << order; + struct page_counter *counter; + int ret = 0; - if (!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); + if (!memcg_kmem_is_active(memcg)) return 0; + + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret) + return ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); + page->mem_cgroup = memcg; + return 0; +} + +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); - page->mem_cgroup = memcg; return ret; } void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) diff --git a/mm/slab.c b/mm/slab.c index 461935b..272e809 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1593,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - if (memcg_charge_slab(cachep, flags, cachep->gfporder)) - 
return NULL; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } + if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + __free_pages(page, cachep->gfporder); + return NULL; + } + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (page_is_pfmemalloc(page)) pfmemalloc_active = true; @@ -1654,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_pages(page, cachep->gfporder); - memcg_uncharge_slab(cachep, cachep->gfporder); + __free_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index bf51a8d..27492eb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -236,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } -static __always_inline int memcg_charge_slab(struct kmem_cache *s, - gfp_t gfp, int order) +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) { if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); -} - -static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ - if (!memcg_kmem_enabled()) - return; - if (is_root_cache(s)) - return; - memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); + return __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -289,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } -static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) { return 0; } -static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ -} - static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index e1bb147..423dbe7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1328,16 +1328,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, flags |= __GFP_NOTRACK; - if (memcg_charge_slab(s, flags, order)) - return NULL; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else page = __alloc_pages_node(node, flags, order); - if (!page) - memcg_uncharge_slab(s, order); + if (page && memcg_charge_slab(page, flags, order, s)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1476,8 +1475,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); - memcg_uncharge_slab(s, order); + __free_kmem_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.1 From df4065516b0dbfa35ac0e9b8124d441221c0a285 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:04 -0800 Subject: memcg: simplify and inline __mem_cgroup_from_kmem Before the previous patch ("memcg: unify slab and other kmem pages charging"), __mem_cgroup_from_kmem had to handle two types of kmem - slab pages and pages allocated with alloc_kmem_pages - memcg in the page struct. Now we can unify it. 
Since after it, this function becomes tiny we can fold it into mem_cgroup_from_kmem. [hughd@google.com: move mem_cgroup_from_kmem into list_lru.c] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 10 ++++++++++ mm/memcontrol.c | 18 ------------------ 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index 2823747..afc71ea 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -63,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) return &nlru->lru; } +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + struct page *page; + + if (!memcg_kmem_enabled()) + return NULL; + page = virt_to_head_page(ptr); + return page->mem_cgroup; +} + static inline struct list_lru_one * list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9575cff..c0111cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2430,24 +2430,6 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; - - return memcg; -} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.1 From ad12695f177c3403a64348b42718faf9727fe358 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:07 -0800 Subject: ksm: add cond_resched() to the rmap_walks While at it add it to the file and anon walks too. 
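The shape of the change is simple enough to sketch (an illustrative fragment, not copied from the patch; the entry/long_walk names are made up, and the real hunks touch the ksm stable-tree walk and rmap_walk_anon()/rmap_walk_file()): a cond_resched() at the top of each iteration keeps a very long walk over huge mappings from monopolising the CPU on non-preemptible kernels. cond_resched() may sleep, so this is only valid where no spinlock is held, which appears to be the case for these rwsem-protected walks.

struct entry {
	struct list_head link;
};

static void long_walk(struct list_head *head)
{
	struct entry *e;

	list_for_each_entry(e, head, link) {
		cond_resched();	/* may reschedule; no spinlocks held here */
		/* ... per-entry work ... */
	}
}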
Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 ++ mm/rmap.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 7ee101e..e87dec7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1914,9 +1914,11 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; + cond_resched(); anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + cond_resched(); vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/rmap.c b/mm/rmap.c index d40e7ae..78a6928 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1609,6 +1609,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; @@ -1658,6 +1660,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; -- cgit v1.1 From f2e5ff85edea30a59b96cf9e20e8886991b0d097 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:10 -0800 Subject: ksm: don't fail stable tree lookups if walking over stale stable_nodes The stable_nodes can become stale at any time if the underlying pages gets freed. The stable_node gets collected and removed from the stable rbtree if that is detected during the rbtree lookups. Don't fail the lookup if running into stale stable_nodes, just restart the lookup after collecting the stale stable_nodes. Otherwise the CPU spent in the preparation stage is wasted and the lookup must be repeated at the next loop potentially failing a second time in a second stale stable_node. If we don't prune aggressively we delay the merging of the unstable node candidates and at the same time we delay the freeing of the stale stable_nodes. Keeping stale stable_nodes around wastes memory and it can't provide any benefit. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index e87dec7..9f182f9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,8 +1177,18 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. 
+ */ + goto again; + } ret = memcmp_pages(page, tree_page); put_page(tree_page); @@ -1254,12 +1264,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage) unsigned long kpfn; struct rb_root *root; struct rb_node **new; - struct rb_node *parent = NULL; + struct rb_node *parent; struct stable_node *stable_node; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; +again: + parent = NULL; new = &root->rb_node; while (*new) { @@ -1269,8 +1281,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } ret = memcmp_pages(kpage, tree_page); put_page(tree_page); -- cgit v1.1 From 98666f8a2576b12f5f3ebcef61a8cdbefede1be3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:13 -0800 Subject: ksm: use the helper method to do the hlist_empty check This just uses the helper function to cleanup the assumption on the hlist_node internals. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 9f182f9..d4ee159 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -625,7 +625,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) unlock_page(page); put_page(page); - if (stable_node->hlist.first) + if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; -- cgit v1.1 From 85c6e8dd23c6aa9ff299bf5256fe5f0a6c44f100 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:16 -0800 Subject: ksm: use find_mergeable_vma in try_to_merge_with_ksm_page Doing the VM_MERGEABLE check after the page == kpage check won't provide any meaningful benefit. The !vma->anon_vma check of find_mergeable_vma is the only superfluous bit in using find_mergeable_vma because the !PageAnon check of try_to_merge_one_page() implicitly checks for that, but it still looks cleaner to share the same find_mergeable_vma(). 
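For context, find_mergeable_vma() is understood to bundle exactly the checks this caller had been open-coding; roughly (reconstructed from the commit message and the hunks below rather than copied from ksm.c, so treat the details as an approximation):

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}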
Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index d4ee159..0183083 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1021,8 +1021,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (!(vma->vm_flags & VM_MERGEABLE)) - goto out; if (PageTransCompound(page) && page_trans_compound_anon_split(page)) goto out; BUG_ON(PageTransCompound(page)); @@ -1087,10 +1085,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, int err = -EFAULT; down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto out; - vma = find_vma(mm, rmap_item->address); - if (!vma || vma->vm_start > rmap_item->address) + vma = find_mergeable_vma(mm, rmap_item->address); + if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); -- cgit v1.1 From c8f95ed1a9ce373d20d3b09a6a9fb91c8beef27e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:19 -0800 Subject: ksm: unstable_tree_search_insert error checking cleanup get_mergeable_page() can only return NULL (also in case of errors) or the pinned mergeable page. It can't return an error different than NULL. This optimizes away the unnecessary error check. Add a return after the "out:" label in the callee to make it more readable. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 0183083..b5cd647 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) flush_dcache_page(page); } else { put_page(page); -out: page = NULL; +out: + page = NULL; } up_read(&mm->mmap_sem); return page; @@ -1358,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); - if (IS_ERR_OR_NULL(tree_page)) + if (!tree_page) return NULL; /* -- cgit v1.1 From b87537d9e2feb30f6a962f27eb32768682698d3b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:33 -0800 Subject: mm: rmap use pte lock not mmap_sem to set PageMlocked KernelThreadSanitizer (ktsan) has shown that the down_read_trylock() of mmap_sem in try_to_unmap_one() (when going to set PageMlocked on a page found mapped in a VM_LOCKED vma) is ineffective against races with exit_mmap()'s munlock_vma_pages_all(), because mmap_sem is not held when tearing down an mm. But that's okay, those races are benign; and although we've believed for years in that ugly down_read_trylock(), it's unsuitable for the job, and frustrates the good intention of setting PageMlocked when it fails. It just doesn't matter if here we read vm_flags an instant before or after a racing mlock() or munlock() or exit_mmap() sets or clears VM_LOCKED: the syscalls (or exit) work their way up the address space (taking pt locks after updating vm_flags) to establish the final state. We do still need to be careful never to mark a page Mlocked (hence unevictable) by any race that will not be corrected shortly after. The page lock protects from many of the races, but not all (a page is not necessarily locked when it's unmapped). 
But the pte lock we just dropped is good to cover the rest (and serializes even with munlock_vma_pages_all(), so no special barriers required): now hold on to the pte lock while calling mlock_vma_page(). Is that lock ordering safe? Yes, that's how follow_page_pte() calls it, and how page_remove_rmap() calls the complementary clear_page_mlock(). This fixes the following case (though not a case which anyone has complained of), which mmap_sem did not: truncation's preliminary unmap_mapping_range() is supposed to remove even the anonymous COWs of filecache pages, and that might race with try_to_unmap_one() on a VM_LOCKED vma, so that mlock_vma_page() sets PageMlocked just after zap_pte_range() unmaps the page, causing "Bad page state (mlocked)" when freed. The pte lock protects against this. You could say that it also protects against the more ordinary case, racing with the preliminary unmapping of a filecache page itself: but in our current tree, that's independently protected by i_mmap_rwsem; and that race would be why "Bad page state (mlocked)" was seen before commit 48ec833b7851 ("Revert mm/memory.c: share the i_mmap_rwsem"). Vlastimil Babka points out another race which this patch protects against. try_to_unmap_one() might reach its mlock_vma_page() TestSetPageMlocked a moment after munlock_vma_pages_all() did its Phase 1 TestClearPageMlocked: leaving PageMlocked and unevictable when it should be evictable. mmap_sem is ineffective because exit_mmap() does not hold it; page lock ineffective because __munlock_pagevec() only takes it afterwards, in Phase 2; pte lock is effective because __munlock_pagevec_fill() takes it to get the page, after VM_LOCKED was cleared from vm_flags, so visible to try_to_unmap_one. Kirill Shutemov points out that if the compiler chooses to implement a "vma->vm_flags &= VM_WHATEVER" or "vma->vm_flags |= VM_WHATEVER" operation with an intermediate store of unrelated bits set, since I'm here foregoing its usual protection by mmap_sem, try_to_unmap_one() might catch sight of a spurious VM_LOCKED in vm_flags, and make the wrong decision. This does not appear to be an immediate problem, but we may want to define vm_flags accessors in future, to guard against such a possibility. While we're here, make a related optimization in try_to_munmap_one(): if it's doing TTU_MUNLOCK, then there's no point at all in descending the page tables and getting the pt lock, unless the vma is VM_LOCKED. Yes, that can change racily, but it can change racily even without the optimization: it's not critical. Far better not to waste time here. Stopped short of separating try_to_munlock_one() from try_to_munmap_one() on this occasion, but that's probably the sensible next step - with a rename, given that try_to_munlock()'s business is to try to set Mlocked. Updated the unevictable-lru Documentation, to remove its reference to mmap semaphore, but found a few more updates needed in just that area. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 78a6928..b93fb54 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1304,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + /* munlock has nothing to gain from examining un-locked vmas */ + if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) + goto out; + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1314,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) - goto out_mlock; - + if (vma->vm_flags & VM_LOCKED) { + /* Holding pte lock, we do *not* need mmap_sem here */ + mlock_vma_page(page); + ret = SWAP_MLOCK; + goto out_unmap; + } if (flags & TTU_MUNLOCK) goto out_unmap; } @@ -1421,31 +1428,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) + if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; - -out_mlock: - pte_unmap_unlock(pte, ptl); - - - /* - * We need mmap_sem locking, Otherwise VM_LOCKED check makes - * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. - * if trylock failed, the page remain in evictable lru and later - * vmscan could retry to move the page to unevictable lru if the - * page is actually mlocked. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - ret = SWAP_MLOCK; - } - up_read(&vma->vm_mm->mmap_sem); - } - return ret; } bool is_vma_temporary_stack(struct vm_area_struct *vma) -- cgit v1.1 From 51afb12ba809db664682a31154c11e720e2c363c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:37 -0800 Subject: mm: page migration fix PageMlocked on migrated pages Commit e6c509f85455 ("mm: use clear_page_mlock() in page_remove_rmap()") in v3.7 inadvertently made mlock_migrate_page() impotent: page migration unmaps the page from userspace before migrating, and that commit clears PageMlocked on the final unmap, leaving mlock_migrate_page() with nothing to do. Not a serious bug, the next attempt at reclaiming the page would fix it up; but a betrayal of page migration's intent - the new page ought to emerge as PageMlocked. I don't see how to fix it for mlock_migrate_page() itself; but easily fixed in remove_migration_pte(), by calling mlock_vma_page() when the vma is VM_LOCKED - under pte lock as in try_to_unmap_one(). Delete mlock_migrate_page()? Not quite, it does still serve a purpose for migrate_misplaced_transhuge_page(): where we could replace it by a test, clear_page_mlock(), mlock_vma_page() sequence; but would that be an improvement? mlock_migrate_page() is fairly lean, and let's make it leaner by skipping the irq save/restore now clearly not needed. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 9 ++++----- mm/migrate.c | 6 ++++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index bc0fa9a..d4b807d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page); extern void clear_page_mlock(struct page *page); /* - * mlock_migrate_page - called only from migrate_page_copy() to - * migrate the Mlocked page flag; update statistics. + * mlock_migrate_page - called only from migrate_misplaced_transhuge_page() + * (because that does not go through the full procedure of migration ptes): + * to migrate the Mlocked page flag; update statistics. */ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { - unsigned long flags; int nr_pages = hpage_nr_pages(page); - local_irq_save(flags); + /* Holding pmd lock, no change in irq context: __mod is safe */ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); - local_irq_restore(flags); } } diff --git a/mm/migrate.c b/mm/migrate.c index 94961f4..ed72c49 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, ptep); unlock: @@ -537,7 +540,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) cpupid = page_cpupid_xchg_last(page, -1); page_cpupid_xchg_last(newpage, cpupid); - mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* * Please do not reorder this without considering how mm/ksm.c's @@ -1787,7 +1789,6 @@ fail_putback: SetPageActive(page); if (TestClearPageUnevictable(new_page)) SetPageUnevictable(page); - mlock_migrate_page(page, new_page); unlock_page(new_page); put_page(new_page); /* Free it */ @@ -1829,6 +1830,7 @@ fail_putback: goto fail_putback; } + mlock_migrate_page(new_page, page); mem_cgroup_migrate(page, new_page, false); page_remove_rmap(page); -- cgit v1.1 From 45637bab30d6e7651737f51aa99417baef4d114a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:40 -0800 Subject: mm: rename mem_cgroup_migrate to mem_cgroup_replace_page After v4.3's commit 0610c25daa3e ("memcg: fix dirty page migration") mem_cgroup_migrate() doesn't have much to offer in page migration: convert migrate_misplaced_transhuge_page() to set_page_memcg() instead. Then rename mem_cgroup_migrate() to mem_cgroup_replace_page(), since its remaining callers are replace_page_cache_page() and shmem_replace_page(): both of whom passed lrucare true, so just eliminate that argument. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/memcontrol.c | 29 ++++++++--------------------- mm/migrate.c | 5 ++--- mm/shmem.c | 2 +- 4 files changed, 12 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 884766d..58e04e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -551,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); - mem_cgroup_migrate(old, new, true); + mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) freepage(old); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0111cb..a44494f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4531,9 +4531,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. */ if (!trylock_page(page)) goto out; @@ -5495,7 +5494,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to * @lrucare: either or both pages might be on the LRU already @@ -5504,16 +5503,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * * Both pages must be locked, @newpage->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5525,25 +5521,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. 
- */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* diff --git a/mm/migrate.c b/mm/migrate.c index ed72c49..836e410 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1831,8 +1830,8 @@ fail_putback: } mlock_migrate_page(new_page, page); - mem_cgroup_migrate(page, new_page, false); - + set_page_memcg(new_page, page_memcg(page)); + set_page_memcg(page, NULL); page_remove_rmap(page); spin_unlock(ptl); diff --git a/mm/shmem.c b/mm/shmem.c index 48ce829..6529226 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1023,7 +1023,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage, true); + mem_cgroup_replace_page(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v1.1 From 14e0f9bcc95f1aef26a9f860cceda35faee79b34 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:43 -0800 Subject: mm: correct a couple of page migration comments It's migrate.c not migration,c, and nowadays putback_movable_pages() not putback_lru_pages(). Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 836e410..d149cbb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1,5 +1,5 @@ /* - * Memory Migration functionality - linux/mm/migration.c + * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * @@ -1113,7 +1113,7 @@ out: * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. - * The caller should call putback_lru_pages() to return pages to the LRU + * The caller should call putback_movable_pages() to return pages to the LRU * or free list only if ret != 0. * * Returns the number of pages that were not migrated, or an error code. -- cgit v1.1 From 2def7424c9be0069831380823fdb5cf72103b919 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:46 -0800 Subject: mm: page migration use the put_new_page whenever necessary I don't know of any problem from the way it's used in our current tree, but there is one defect in page migration's custom put_new_page feature. An unused newpage is expected to be released with the put_new_page(), but there was one MIGRATEPAGE_SUCCESS (0) path which released it with putback_lru_page(): which can be very wrong for a custom pool. Fixed more easily by resetting put_new_page once it won't be needed, than by adding a further flag to modify the rc test. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d149cbb..2f2e223 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -938,10 +938,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, int force, enum migrate_mode mode, enum migrate_reason reason) { - int rc = 0; + int rc = MIGRATEPAGE_SUCCESS; int *result = NULL; - struct page *newpage = get_new_page(page, private, &result); + struct page *newpage; + newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -955,6 +956,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; rc = __unmap_and_move(page, newpage, force, mode); + if (rc == MIGRATEPAGE_SUCCESS) + put_new_page = NULL; out: if (rc != -EAGAIN) { @@ -981,7 +984,7 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + if (put_new_page) { ClearPageSwapBacked(newpage); put_new_page(newpage, private); } else if (unlikely(__is_movable_balloon_page(newpage))) { @@ -1022,7 +1025,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *hpage, int force, enum migrate_mode mode) { - int rc = 0; + int rc = -EAGAIN; int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; @@ -1044,8 +1047,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!new_hpage) return -ENOMEM; - rc = -EAGAIN; - if (!trylock_page(hpage)) { if (!force || mode != MIGRATE_SYNC) goto out; @@ -1070,8 +1071,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); + put_new_page = NULL; + } unlock_page(hpage); out: @@ -1083,7 +1086,7 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + if (put_new_page) put_new_page(new_hpage, private); else putback_active_hugepage(new_hpage); -- cgit v1.1 From 7db7671f835ccad66db20154ac1274140937d9b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:49 -0800 Subject: mm: page migration trylock newpage at same level as oldpage Clean up page migration a little by moving the trylock of newpage from move_to_new_page() into __unmap_and_move(), where the old page has been locked. Adjust unmap_and_move_huge_page() and balloon_page_migrate() accordingly. But make one kind-of-functional change on the way: whereas trylock of newpage used to BUG() if it failed, now simply return -EAGAIN if so. Cutting out BUG()s is good, right? But, to be honest, this is really to extend the usefulness of the custom put_new_page feature, allowing a pool of new pages to be shared perhaps with racing uses. Use an "else" instead of that "skip_unmap" label. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/balloon_compaction.c | 10 ++-------- mm/migrate.c | 46 ++++++++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index fcad832..d3116be 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage, struct balloon_dev_info *balloon = balloon_page_device(page); int rc = -EAGAIN; - /* - * Block others from accessing the 'newpage' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'newpage' at this point. - */ - BUG_ON(!trylock_page(newpage)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); if (WARN_ON(!__is_movable_balloon_page(page))) { dump_page(page, "not movable balloon page"); - unlock_page(newpage); return rc; } if (balloon && balloon->migratepage) rc = balloon->migratepage(balloon, newpage, page, mode); - unlock_page(newpage); return rc; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/migrate.c b/mm/migrate.c index 2f2e223..6d7774e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -727,13 +727,8 @@ static int move_to_new_page(struct page *newpage, struct page *page, struct address_space *mapping; int rc; - /* - * Block others from accessing the page when we get around to - * establishing additional references. We are the only one - * holding a reference to the new page at this point. - */ - if (!trylock_page(newpage)) - BUG(); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); /* Prepare mapping for the new page.*/ newpage->index = page->index; @@ -774,9 +769,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, remove_migration_ptes(page, newpage); page->mapping = NULL; } - - unlock_page(newpage); - return rc; } @@ -861,6 +853,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } + /* + * Block others from accessing the new page when we get around to + * establishing additional references. We are usually the only one + * holding a reference to newpage at this point. We used to have a BUG + * here if trylock_page(newpage) fails, but would like to allow for + * cases where there might be a race with the previous use of newpage. + * This is much like races on refcount of oldpage: just don't BUG(). + */ + if (unlikely(!trylock_page(newpage))) + goto out_unlock; + if (unlikely(isolated_balloon_page(page))) { /* * A ballooned page does not need any special attention from @@ -870,7 +873,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the page migration right away (proteced by page lock). 
*/ rc = balloon_page_migrate(newpage, page, mode); - goto out_unlock; + goto out_unlock_both; } /* @@ -889,30 +892,27 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto out_unlock; + goto out_unlock_both; } - goto skip_unmap; - } - - /* Establish migration ptes or remove ptes */ - if (page_mapped(page)) { + } else if (page_mapped(page)) { + /* Establish migration ptes */ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } -skip_unmap: if (!page_mapped(page)) rc = move_to_new_page(newpage, page, page_was_mapped, mode); if (rc && page_was_mapped) remove_migration_ptes(page, page); +out_unlock_both: + unlock_page(newpage); +out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - -out_unlock: unlock_page(page); out: return rc; @@ -1056,6 +1056,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); + if (unlikely(!trylock_page(new_hpage))) + goto put_anon; + if (page_mapped(hpage)) { try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); @@ -1068,6 +1071,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) remove_migration_ptes(hpage, hpage); + unlock_page(new_hpage); + +put_anon: if (anon_vma) put_anon_vma(anon_vma); -- cgit v1.1 From 5c3f9a67371643b6faa987622bc1b67667bab848 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:53 -0800 Subject: mm: page migration remove_migration_ptes at lock+unlock level Clean up page migration a little more by calling remove_migration_ptes() from the same level, on success or on failure, from __unmap_and_move() or from unmap_and_move_huge_page(). Don't reset page->mapping of a PageAnon old page in move_to_new_page(), leave that to when the page is freed. Except for here in page migration, it has been an invariant that a PageAnon (bit set in page->mapping) page stays PageAnon until it is freed, and I think we're safer to keep to that. And with the above rearrangement, it's necessary because zap_pte_range() wants to identify whether a migration entry represents a file or an anon page, to update the appropriate rss stats without waiting on it. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 6d7774e..7b44ebdf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -722,7 +722,7 @@ static int fallback_migrate_page(struct address_space *mapping, * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int page_was_mapped, enum migrate_mode mode) + enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -755,19 +755,21 @@ static int move_to_new_page(struct page *newpage, struct page *page, * space which also has its own migratepage callback. This * is the most common path for page migration. 
*/ - rc = mapping->a_ops->migratepage(mapping, - newpage, page, mode); + rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc != MIGRATEPAGE_SUCCESS) { + /* + * When successful, old pagecache page->mapping must be cleared before + * page is freed; but stats require that PageAnon be left as PageAnon. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + set_page_memcg(page, NULL); + if (!PageAnon(page)) + page->mapping = NULL; + } else { set_page_memcg(newpage, NULL); newpage->mapping = NULL; - } else { - set_page_memcg(page, NULL); - if (page_was_mapped) - remove_migration_ptes(page, newpage); - page->mapping = NULL; } return rc; } @@ -902,10 +904,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, page_was_mapped, mode); + rc = move_to_new_page(newpage, page, mode); - if (rc && page_was_mapped) - remove_migration_ptes(page, page); + if (page_was_mapped) + remove_migration_ptes(page, + rc == MIGRATEPAGE_SUCCESS ? newpage : page); out_unlock_both: unlock_page(newpage); @@ -1066,10 +1069,11 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); + rc = move_to_new_page(new_hpage, hpage, mode); - if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) - remove_migration_ptes(hpage, hpage); + if (page_was_mapped) + remove_migration_ptes(hpage, + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); unlock_page(new_hpage); -- cgit v1.1 From 03f15c86c8d1b9d81e6d215715e110aef8f936e0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:56 -0800 Subject: mm: simplify page migration's anon_vma comment and flow __unmap_and_move() contains a long stale comment on page_get_anon_vma() and PageSwapCache(), with an odd control flow that's hard to follow. Mostly this reflects our confusion about the lifetime of an anon_vma, in the early days of page migration, before we could take a reference to one. Nowadays this seems quite straightforward: cut it all down to essentials. I cannot see the relevance of swapcache here at all, so don't treat it any differently: I believe the old comment reflects in part our anon_vma confusions, and in part the original v2.6.16 page migration technique, which used actual swap to migrate anon instead of swap-like migration entries. Why should a swapcache page not be migrated with the aid of migration entry ptes like everything else? So lose that comment now, and enable migration entries for swapcache in the next patch. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 7b44ebdf..08a7b6c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -819,6 +819,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, goto out_unlock; wait_on_page_writeback(page); } + /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. @@ -826,34 +827,15 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * of migration. 
File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. + * + * Only page_get_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + * But if we cannot get anon_vma, then we won't need it anyway, + * because that implies that the anon page is no longer mapped + * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) { - /* - * Only page_lock_anon_vma_read() understands the subtleties of - * getting a hold on an anon_vma from outside one of its mms. - */ + if (PageAnon(page) && !PageKsm(page)) anon_vma = page_get_anon_vma(page); - if (anon_vma) { - /* - * Anon page - */ - } else if (PageSwapCache(page)) { - /* - * We cannot be sure that the anon_vma of an unmapped - * swapcache page is safe to use because we don't - * know in advance if the VMA that this page belonged - * to still exists. If the VMA and others sharing the - * data have been freed, then the anon_vma could - * already be invalid. - * - * To avoid this possibility, swapcache pages get - * migrated but are not remapped when migration - * completes - */ - } else { - goto out_unlock; - } - } /* * Block others from accessing the new page when we get around to @@ -898,6 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } else if (page_mapped(page)) { /* Establish migration ptes */ + VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, + page); try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; -- cgit v1.1 From 470f119f012068e5d94458c98dc4eec102f88cd3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:59 -0800 Subject: mm: page migration use migration entry for swapcache too Hitherto page migration has avoided using a migration entry for a swapcache page mapped into userspace, apparently for historical reasons. So any page blessed with swapcache would entail a minor fault when it's next touched, which page migration otherwise tries to avoid. Swapcache in an mlocked area is rare, so won't often matter, but still better fixed. Just rearrange the block in try_to_unmap_one(), to handle TTU_MIGRATION before checking PageAnon, that's all (apart from some reindenting). Well, no, that's not quite all: doesn't this by the way fix a soft_dirty bug, that page migration of a file page was forgetting to transfer the soft_dirty bit? Probably not a serious bug: if I understand correctly, soft_dirty afficionados usually have to handle file pages separately anyway; but we publish the bit in /proc//pagemap on file mappings as well as anonymous, so page migration ought not to perturb it. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Reviewed-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 63 ++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index b93fb54..b577fbb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1379,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, dec_mm_counter(mm, MM_ANONPAGES); else dec_mm_counter(mm, MM_FILEPAGES); + } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { + swp_entry_t entry; + pte_t swp_pte; + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, pte_write(pteval)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; - - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pte, pteval); - ret = SWAP_FAIL; - goto out_unmap; - } - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, MM_ANONPAGES); - inc_mm_counter(mm, MM_SWAPENTS); - } else if (IS_ENABLED(CONFIG_MIGRATION)) { - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - BUG_ON(!(flags & TTU_MIGRATION)); - entry = make_migration_entry(page, pte_write(pteval)); + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { - /* Establish migration entry for a file page */ - swp_entry_t entry; - entry = make_migration_entry(page, pte_write(pteval)); - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else dec_mm_counter(mm, MM_FILEPAGES); -- cgit v1.1 From cf4b769abb8aef01f887543cb8308c0d8671367c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:02 -0800 Subject: mm: page migration avoid touching newpage until no going back We have had trouble in the past from the way in which page migration's newpage is initialized in dribs and drabs - see commit 8bdd63809160 ("mm: fix direct reclaim writeback regression") which proposed a cleanup. 
We have no actual problem now, but I think the procedure would be clearer (and alternative get_new_page pools safer to implement) if we assert that newpage is not touched until we are sure that it's going to be used - except for taking the trylock on it in __unmap_and_move(). So shift the early initializations from move_to_new_page() into migrate_page_move_mapping(), mapping and NULL-mapping paths. Similarly migrate_huge_page_move_mapping(), but its NULL-mapping path can just be deleted: you cannot reach hugetlbfs_migrate_page() with a NULL mapping. Adjust stages 3 to 8 in the Documentation file accordingly. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 08a7b6c..3067e40 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -320,6 +320,14 @@ int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != expected_count) return -EAGAIN; + + /* No turning back from here */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + return MIGRATEPAGE_SUCCESS; } @@ -355,8 +363,15 @@ int migrate_page_move_mapping(struct address_space *mapping, } /* - * Now we know that no one else is looking at the page. + * Now we know that no one else is looking at the page: + * no turning back from here. */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { SetPageSwapCache(newpage); @@ -403,12 +418,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - if (!mapping) { - if (page_count(page) != 1) - return -EAGAIN; - return MIGRATEPAGE_SUCCESS; - } - spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -426,6 +435,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -730,21 +742,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - /* Prepare mapping for the new page.*/ - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); - - /* - * Indirectly called below, migrate_page_copy() copies PG_dirty and thus - * needs newpage's memcg set to transfer memcg dirty page accounting. - * So perform memcg migration in two steps: - * 1. set newpage->mem_cgroup (here) - * 2. 
clear page->mem_cgroup (below) - */ - set_page_memcg(newpage, page_memcg(page)); - mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page, mode); @@ -767,9 +764,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; - } else { - set_page_memcg(newpage, NULL); - newpage->mapping = NULL; } return rc; } @@ -971,10 +965,9 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (put_new_page) { - ClearPageSwapBacked(newpage); + if (put_new_page) put_new_page(newpage, private); - } else if (unlikely(__is_movable_balloon_page(newpage))) { + else if (unlikely(__is_movable_balloon_page(newpage))) { /* drop our reference, page already in the balloon */ put_page(newpage); } else -- cgit v1.1 From 42cb14b110a5698ccf26ce59c4441722605a3743 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:05 -0800 Subject: mm: migrate dirty page without clear_page_dirty_for_io etc clear_page_dirty_for_io() has accumulated writeback and memcg subtleties since v2.6.16 first introduced page migration; and the set_page_dirty() which completed its migration of PageDirty, later had to be moderated to __set_page_dirty_nobuffers(); then PageSwapBacked had to skip that too. No actual problems seen with this procedure recently, but if you look into what the clear_page_dirty_for_io(page)+set_page_dirty(newpage) is actually achieving, it turns out to be nothing more than moving the PageDirty flag, and its NR_FILE_DIRTY stat from one zone to another. It would be good to avoid a pile of irrelevant decrementations and incrementations, and improper event counting, and unnecessary descent of the radix_tree under tree_lock (to set the PAGECACHE_TAG_DIRTY which radix_tree_replace_slot() left in place anyway). Do the NR_FILE_DIRTY movement, like the other stats movements, while interrupts still disabled in migrate_page_move_mapping(); and don't even bother if the zone is the same. Do the PageDirty movement there under tree_lock too, where old page is frozen and newpage not yet visible: bearing in mind that as soon as newpage becomes visible in radix_tree, an un-page-locked set_page_dirty() might interfere (or perhaps that's just not possible: anything doing so should already hold an additional reference to the old page, preventing its migration; but play safe). But we do still need to transfer PageDirty in migrate_page_copy(), for those who don't go the mapping route through migrate_page_move_mapping(). Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 3067e40..2834fab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -313,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + struct zone *oldzone, *newzone; + int dirty; int expected_count = 1 + extra_count; void **pslot; @@ -331,6 +334,9 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } + oldzone = page_zone(page); + newzone = page_zone(newpage); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -378,6 +384,13 @@ int migrate_page_move_mapping(struct address_space *mapping, set_page_private(newpage, page_private(page)); } + /* Move dirty while page refs frozen and newpage not yet exposed */ + dirty = PageDirty(page); + if (dirty) { + ClearPageDirty(page); + SetPageDirty(newpage); + } + radix_tree_replace_slot(pslot, newpage); /* @@ -387,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_unfreeze_refs(page, expected_count - 1); + spin_unlock(&mapping->tree_lock); + /* Leave irq disabled to prevent preemption while updating stats */ + /* * If moved to a different zone then also account * the page for that zone. Other VM counters will be @@ -397,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping, * via NR_FILE_PAGES and NR_ANON_PAGES if they * are mapped to swap space. */ - __dec_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - if (!PageSwapCache(page) && PageSwapBacked(page)) { - __dec_zone_page_state(page, NR_SHMEM); - __inc_zone_page_state(newpage, NR_SHMEM); + if (newzone != oldzone) { + __dec_zone_state(oldzone, NR_FILE_PAGES); + __inc_zone_state(newzone, NR_FILE_PAGES); + if (PageSwapBacked(page) && !PageSwapCache(page)) { + __dec_zone_state(oldzone, NR_SHMEM); + __inc_zone_state(newzone, NR_SHMEM); + } + if (dirty && mapping_cap_account_dirty(mapping)) { + __dec_zone_state(oldzone, NR_FILE_DIRTY); + __inc_zone_state(newzone, NR_FILE_DIRTY); + } } - spin_unlock_irq(&mapping->tree_lock); + local_irq_enable(); return MIGRATEPAGE_SUCCESS; } @@ -524,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageMappedToDisk(page)) SetPageMappedToDisk(newpage); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - /* - * Want to mark the page and the radix tree as dirty, and - * redo the accounting that clear_page_dirty_for_io undid, - * but we can't use set_page_dirty because that function - * is actually a signal that all of the page has become dirty. - * Whereas only part of our page may be dirty. 
- */ - if (PageSwapBacked(page)) - SetPageDirty(newpage); - else - __set_page_dirty_nobuffers(newpage); - } + /* Move dirty on pages not done by migrate_page_move_mapping() */ + if (PageDirty(page)) + SetPageDirty(newpage); if (page_is_young(page)) set_page_young(newpage); -- cgit v1.1 From 3acaea6804b3a10e996ce6ebc342089f481e1cdb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:50:08 -0800 Subject: mm/cma.c: suppress warning mm/cma.c: In function 'cma_alloc': mm/cma.c:366: warning: 'pfn' may be used uninitialized in this function The patch actually improves the tracing a bit: if alloc_contig_range() fails, tracing will display the offending pfn rather than -1. Cc: Stefan Strogin Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Laurent Pinchart Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 4eb56ba..ea506eb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -363,7 +363,9 @@ err: */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) { - unsigned long mask, offset, pfn, start = 0; + unsigned long mask, offset; + unsigned long pfn = -1; + unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) start = bitmap_no + mask + 1; } - trace_cma_alloc(page ? pfn : -1UL, page, count, align); + trace_cma_alloc(pfn, page, count, align); pr_debug("%s(): returned %p\n", __func__, page); return page; -- cgit v1.1 From 9dd861d55b01f1d0848f82007e8665371ae18710 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:50:11 -0800 Subject: mm/maccess.c: actually return -EFAULT from strncpy_from_unsafe As far as I can tell, strncpy_from_unsafe never returns -EFAULT. ret is the result of a __copy_from_user_inatomic(), which is 0 for success and positive (in this case necessarily 1) for access error - it is never negative. So we were always returning the length of the, possibly truncated, destination string. Signed-off-by: Rasmus Villemoes Acked-by: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/maccess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index 1b13638..d159b1c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -104,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_enable(); set_fs(old_fs); - return ret < 0 ? ret : src - unsafe_addr; + return ret ? -EFAULT : src - unsafe_addr; } -- cgit v1.1 From b4e289a6a659c5c2c056a67fa4f31f3dd8317537 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:50:14 -0800 Subject: mm/hugetlb: make node_hstates array static There are no users of the node_hstates array outside of the mm/hugetlb.c. So let's make it static. 
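As a minimal userspace sketch of what dropping external linkage buys here (the array and helper below are illustrative, not the hugetlb code): with static, the symbol stays private to its translation unit, so it can neither clash with nor be referenced from any other file.

/* hypothetical example, not mm/hugetlb.c */
#define MAX_NODES 8

struct node_info {
	int nr_pages;
};

/* internal linkage: visible only inside this file */
static struct node_info node_info_table[MAX_NODES];

static int nodes_in_use(void)
{
	int i, n = 0;

	for (i = 0; i < MAX_NODES; i++)
		if (node_info_table[i].nr_pages)
			n++;
	return n;
}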
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index abfbe8c..4fc590a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2376,7 +2376,7 @@ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; -struct node_hstate node_hstates[MAX_NUMNODES]; +static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices -- cgit v1.1 From 099730d67417dfee273e9b10ac2560ca7fac7eb9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:50:17 -0800 Subject: mm, hugetlb: use memory policy when available I have a hugetlbfs user which is never explicitly allocating huge pages with 'nr_hugepages'. They only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. This works, but they noticed that mbind() was not doing them any good and the pages were being allocated without respect for the policy they specified. The code in question is this: > struct page *alloc_huge_page(struct vm_area_struct *vma, ... > page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); > if (!page) { > page = alloc_buddy_huge_page(h, NUMA_NO_NODE); dequeue_huge_page_vma() is smart and will respect the VMA's memory policy. But, it only grabs _existing_ huge pages from the huge page pool. If the pool is empty, we fall back to alloc_buddy_huge_page() which obviously can't do anything with the VMA's policy because it isn't even passed the VMA. Almost everybody preallocates huge pages. That's probably why nobody has ever noticed this. Looking back at the git history, I don't think this _ever_ worked from when alloc_buddy_huge_page() was introduced in 7893d1d5, 8 years ago. The fix is to pass vma/addr down in to the places where we actually call in to the buddy allocator. It's fairly straightforward plumbing. This has been lightly tested. Signed-off-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: David Rientjes Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 104 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fc590a..899f6a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1437,7 +1437,76 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) dissolve_free_huge_page(pfn_to_page(pfn)); } -static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) +/* + * There are 3 ways this can get called: + * 1. With vma+addr: we use the VMA's memory policy + * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge + * page from any node, and let the buddy allocator itself figure + * it out. + * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page + * strictly from 'nid' + */ +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) +{ + int order = huge_page_order(h); + gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + unsigned int cpuset_mems_cookie; + + /* + * We need a VMA to get a memory policy. 
If we do not + * have one, we use the 'nid' argument + */ + if (!vma) { + /* + * If a specific node is requested, make sure to + * get memory from there, but only when a node + * is explicitly specified. + */ + if (nid != NUMA_NO_NODE) + gfp |= __GFP_THISNODE; + /* + * Make sure to call something that can handle + * nid=NUMA_NO_NODE + */ + return alloc_pages_node(nid, gfp, order); + } + + /* + * OK, so we have a VMA. Fetch the mempolicy and try to + * allocate a huge page with it. + */ + do { + struct page *page; + struct mempolicy *mpol; + struct zonelist *zl; + nodemask_t *nodemask; + + cpuset_mems_cookie = read_mems_allowed_begin(); + zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + mpol_cond_put(mpol); + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + if (page) + return page; + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return NULL; +} + +/* + * There are two ways to allocate a huge page: + * 1. When you have a VMA and an address (like a fault) + * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) + * + * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in + * this case which signifies that the allocation should be done with + * respect for the VMA's memory policy. + * + * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This + * implies that memory policies will not be taken in to account. + */ +static struct page *__alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) { struct page *page; unsigned int r_nid; @@ -1446,6 +1515,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) return NULL; /* + * Make sure that anyone specifying 'nid' is not also specifying a VMA. + * This makes sure the caller is picking _one_ of the modes with which + * we can call this function, not both. + */ + if (vma || (addr != -1)) { + WARN_ON_ONCE(addr == -1); + WARN_ON_ONCE(nid != NUMA_NO_NODE); + } + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed * overcommit @@ -1478,14 +1556,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } spin_unlock(&hugetlb_lock); - if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); - else - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); spin_lock(&hugetlb_lock); if (page) { @@ -1510,6 +1581,27 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } /* + * Allocate a huge page from 'nid'. Note, 'nid' may be + * NUMA_NO_NODE, which means that it may be allocated + * anywhere. + */ +struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) +{ + unsigned long addr = -1; + + return __alloc_buddy_huge_page(h, NULL, addr, nid); +} + +/* + * Use the VMA's mpolicy to allocate a huge page from the buddy. + */ +struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); +} + +/* * This allocation function is useful in the context where vma is irrelevant. * E.g. soft-offlining uses this function because it only cares physical * address of error page. 
@@ -1524,7 +1616,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = alloc_buddy_huge_page(h, nid); + page = __alloc_buddy_huge_page_no_mpol(h, nid); return page; } @@ -1554,7 +1646,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); if (!page) { alloc_ok = false; break; @@ -1787,7 +1879,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; -- cgit v1.1 From e0ec90ee7e6f6cbaa6d59ffb48d2a7af5e80e61d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:50:20 -0800 Subject: mm, hugetlbfs: optimize when NUMA=n My recent patch "mm, hugetlb: use memory policy when available" added some bloat to hugetlb.o. This patch aims to get some of the bloat back, especially when NUMA is not in play. It does this with an implicit #ifdef and marking some things static that should have been static in my first patch. It also makes the warnings only VM_WARN_ON()s. They were responsible for a pretty big chunk of the bloat. Doing this gets our NUMA=n text size back to a wee bit _below_ where we started before the original patch. It also shaves a bit of space off the NUMA=y case, but not much. Enforcing the mempolicy definitely takes some text and it's hard to avoid. size(1) output: text data bss dec hex filename 30745 3433 2492 36670 8f3e hugetlb.o.nonuma.baseline 31305 3755 2492 37552 92b0 hugetlb.o.nonuma.patch1 30713 3433 2492 36638 8f1e hugetlb.o.nonuma.patch2 (this patch) 25235 473 41276 66984 105a8 hugetlb.o.numa.baseline 25715 475 41276 67466 1078a hugetlb.o.numa.patch1 25491 473 41276 67240 106a8 hugetlb.o.numa.patch2 (this patch) Signed-off-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 899f6a8..241de27 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1455,9 +1455,14 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, /* * We need a VMA to get a memory policy. If we do not - * have one, we use the 'nid' argument + * have one, we use the 'nid' argument. + * + * The mempolicy stuff below has some non-inlined bits + * and calls ->vm_ops. That makes it hard to optimize at + * compile-time, even when NUMA is off and it does + * nothing. This helps the compiler optimize it out. */ - if (!vma) { + if (!IS_ENABLED(CONFIG_NUMA) || !vma) { /* * If a specific node is requested, make sure to * get memory from there, but only when a node @@ -1474,7 +1479,8 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, /* * OK, so we have a VMA. Fetch the mempolicy and try to - * allocate a huge page with it. + * allocate a huge page with it. We will only reach this + * when CONFIG_NUMA=y. */ do { struct page *page; @@ -1520,8 +1526,8 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, * we can call this function, not both. 
*/ if (vma || (addr != -1)) { - WARN_ON_ONCE(addr == -1); - WARN_ON_ONCE(nid != NUMA_NO_NODE); + VM_WARN_ON_ONCE(addr == -1); + VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); } /* * Assume we will successfully allocate the surplus page to @@ -1585,6 +1591,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, * NUMA_NO_NODE, which means that it may be allocated * anywhere. */ +static struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) { unsigned long addr = -1; @@ -1595,6 +1602,7 @@ struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ +static struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { -- cgit v1.1 From f5fc3c5d817435970aa301d066820a9ac12c8120 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 Nov 2015 18:50:23 -0800 Subject: mm: memcontrol: eliminate root memory.current memory.current on the root level doesn't add anything that wouldn't be more accurate and detailed using system statistics. It already doesn't include slabs, and it'll be a pain to keep in sync when further memory types are accounted in the memory controller. Remove it. Note that this applies to the new unified hierarchy interface only. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a44494f..59b100f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5026,7 +5026,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_usage(mem_cgroup_from_css(css), false); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } static int memory_low_show(struct seq_file *m, void *v) @@ -5138,6 +5140,7 @@ static int memory_events_show(struct seq_file *m, void *v) static struct cftype memory_files[] = { { .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, { -- cgit v1.1 From 6071ca5201066f4b2a61cfb693dd186d6bc6e9f3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 Nov 2015 18:50:26 -0800 Subject: mm: page_counter: let page_counter_try_charge() return bool page_counter_try_charge() currently returns 0 on success and -ENOMEM on failure, which is surprising behavior given the function name. Make it follow the expected pattern of try_stuff() functions that return a boolean true to indicate success, or false for failure. 
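A small userspace sketch of the try_*() convention being adopted (the counter type and names are made up for illustration; the real page_counter is more involved):

#include <stdbool.h>
#include <errno.h>

struct counter {
	long count;
	long limit;
};

/* true on success, false if the limit would be exceeded */
static bool counter_try_charge(struct counter *c, long nr_pages)
{
	if (c->count + nr_pages > c->limit)
		return false;
	c->count += nr_pages;
	return true;
}

static int charge(struct counter *c, long nr_pages)
{
	/* call sites read naturally and map failure to an errno themselves */
	if (!counter_try_charge(c, nr_pages))
		return -ENOMEM;
	return 0;
}

Call sites then translate false into whatever error code fits their context, as the hugetlb_cgroup and memcg hunks below do with -ENOMEM.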
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 3 ++- mm/memcontrol.c | 11 +++++------ mm/page_counter.c | 14 +++++++------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 6e00574..33d59ab 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -186,7 +186,8 @@ again: } rcu_read_unlock(); - ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + ret = -ENOMEM; css_put(&h_cg->css); done: *ptr = h_cg; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59b100f..d47de73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2016,8 +2016,8 @@ retry: return 0; if (!do_swap_account || - !page_counter_try_charge(&memcg->memsw, batch, &counter)) { - if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) page_counter_uncharge(&memcg->memsw, batch); @@ -2381,14 +2381,13 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, { unsigned int nr_pages = 1 << order; struct page_counter *counter; - int ret = 0; + int ret; if (!memcg_kmem_is_active(memcg)) return 0; - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret) - return ret; + if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) + return -ENOMEM; ret = try_charge(memcg, gfp, nr_pages); if (ret) { diff --git a/mm/page_counter.c b/mm/page_counter.c index 11b4bed..7c6a63d 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * - * Returns 0 on success, or -ENOMEM and @fail if the counter or one of - * its ancestors has hit its configured limit. + * Returns %true on success, or %false and @fail if the counter or one + * of its ancestors has hit its configured limit. */ -int page_counter_try_charge(struct page_counter *counter, - unsigned long nr_pages, - struct page_counter **fail) +bool page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail) { struct page_counter *c; @@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter, if (new > c->watermark) c->watermark = new; } - return 0; + return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); - return -ENOMEM; + return false; } /** -- cgit v1.1 From c12176d3368b9b36ae484d323d41e94be26f9b65 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 5 Nov 2015 18:50:29 -0800 Subject: memcg: fix thresholds for 32b architectures. Commit 424cdc141380 ("memcg: convert threshold to bytes") has fixed a regression introduced by 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") where thresholds were silently converted to use page units rather than bytes when interpreting the user input. The fix is not complete, though, as properly pointed out by Ben Hutchings during stable backport review. The page count is converted to bytes but unsigned long is used to hold the value which would be obviously not sufficient for 32b systems with more than 4G thresholds. 
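As a rough userspace illustration of that overflow, assuming 4K pages and using uint32_t to stand in for a 32-bit kernel's unsigned long:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pages = 1572864;			/* 6G worth of 4K pages */
	uint32_t bytes32 = pages << 12;			/* wraps: 6G does not fit in 32 bits */
	uint64_t bytes64 = (uint64_t)pages << 12;

	printf("32-bit bytes: %u\n", (unsigned)bytes32);		/* 2147483648 */
	printf("64-bit bytes: %llu\n", (unsigned long long)bytes64);	/* 6442450944 */
	return 0;
}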
The same applies to usage as taken from mem_cgroup_usage which might overflow. Let's remove this bytes vs. pages internal tracking differences and handle thresholds in page units internally. Chage mem_cgroup_usage() to return the value in page units and revert 424cdc141380 because this should be sufficient for the consistent handling. mem_cgroup_read_u64 as the only users of mem_cgroup_usage outside of the threshold handling code is converted to give the proper in bytes result. It is doing that already for page_counter output so this is more consistent as well. The value presented to the userspace is still in bytes units. Fixes: 424cdc141380 ("memcg: convert threshold to bytes") Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") Signed-off-by: Michal Hocko Reported-by: Ben Hutchings Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: From: Michal Hocko Subject: memcg-fix-thresholds-for-32b-architectures-fix Cc: Ben Hutchings Cc: Vladimir Davydov Cc: Johannes Weiner From: Andrew Morton Subject: memcg-fix-thresholds-for-32b-architectures-fix-fix don't attempt to inline mem_cgroup_usage() The compiler ignores the inline anwyay. And __always_inlining it adds 600 bytes of goop to the .o file. Cc: Ben Hutchings Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d47de73..38765d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2801,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - u64 val; + unsigned long val; if (mem_cgroup_is_root(memcg)) { val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -2816,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) else val = page_counter_read(&memcg->memsw); } - return val << PAGE_SHIFT; + return val; } enum { @@ -2850,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) - return mem_cgroup_usage(memcg, false); + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) - return mem_cgroup_usage(memcg, true); + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->limit * PAGE_SIZE; @@ -3352,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; - threshold <<= PAGE_SHIFT; mutex_lock(&memcg->thresholds_lock); -- cgit v1.1 From d0424c429f8e0555a337d71e0a13f2289c636ec9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:34 -0800 Subject: tmpfs: avoid a little creat and stat slowdown LKP reports that v4.2 commit afa2db2fb6f1 ("tmpfs: truncate prealloc blocks past i_size") causes a 14.5% slowdown in the AIM9 creat-clo benchmark. creat-clo does just what you'd expect from the name, and creat's O_TRUNC on 0-length file does indeed get into more overhead now shmem_setattr() tests "0 <= 0" instead of "0 < 0". 
I'm not sure how much we care, but I think it would not be too VW-like to add in a check for whether any pages (or swap) are allocated: if none are allocated, there's none to remove from the radix_tree. At first I thought that check would be good enough for the unmaps too, but no: we should not skip the unlikely case of unmapping pages beyond the new EOF, which were COWed from holes which have now been reclaimed, leaving none. This gives me an 8.5% speedup: on Haswell instead of LKP's Westmere, and running a debug config before and after: I hope those account for the lesser speedup. And probably someone has a benchmark where a thousand threads keep on stat'ing the same file repeatedly: forestall that report by adjusting v4.3 commit 44a30220bc0a ("shmem: recalculate file inode when fstat") not to take the spinlock in shmem_getattr() when there's no work to do. Signed-off-by: Hugh Dickins Reported-by: Ying Huang Tested-by: Ying Huang Cc: Josef Bacik Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 6529226..3b8b739 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -548,12 +548,12 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - + if (info->alloced - info->swapped != inode->i_mapping->nrpages) { + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } generic_fillattr(inode, stat); - return 0; } @@ -586,10 +586,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - shmem_truncate_range(inode, newsize, (loff_t)-1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + if (info->alloced) + shmem_truncate_range(inode, + newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); } } -- cgit v1.1 From 0ba8663cbfae066fc504b858db7cbb7d03c2b872 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:43 -0800 Subject: mm/kasan: rename kasan_enabled() to kasan_report_enabled() The function only disable/enable reporting. In the later patch we will be adding a kasan early enable/disable. Rename kasan_enabled to properly reflect its function. 
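A tiny sketch of the depth-counter idea behind the helper (loosely modelled on current->kasan_depth; the names below are illustrative):

#include <stdbool.h>

struct task {
	int kasan_depth;	/* > 0 means reports are suppressed */
};

static bool report_enabled(const struct task *t)
{
	return t->kasan_depth == 0;
}

static void disable_reporting(struct task *t)
{
	t->kasan_depth++;
}

static void enable_reporting(struct task *t)
{
	t->kasan_depth--;
}

Only reporting is gated this way, which is why kasan_report_enabled() describes the helper better than the old name.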
Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 2 +- mm/kasan/report.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c242adf..a6b46cc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -63,7 +63,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_enabled(void) +static inline bool kasan_report_enabled(void) { return !current->kasan_depth; } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e07c94f..6c3f82b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -220,7 +220,7 @@ void kasan_report(unsigned long addr, size_t size, { struct kasan_access_info info; - if (likely(!kasan_enabled())) + if (likely(!kasan_report_enabled())) return; info.access_addr = (void *)addr; -- cgit v1.1 From 527f215b78976e94995dce7163b07539b576d519 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:46 -0800 Subject: mm/kasan: MODULE_VADDR is not available on all archs Use is_module_address instead Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6c3f82b..d269f20 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -85,9 +86,11 @@ static void print_error_description(struct kasan_access_info *info) static inline bool kernel_or_module_addr(const void *addr) { - return (addr >= (void *)_stext && addr < (void *)_end) - || (addr >= (void *)MODULES_VADDR - && addr < (void *)MODULES_END); + if (addr >= (void *)_stext && addr < (void *)_end) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; } static inline bool init_task_stack_addr(const void *addr) -- cgit v1.1 From f2377d4eaab2aabe1938b3974b5b94f5ba4c7ead Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:48 -0800 Subject: mm/kasan: don't use kasan shadow pointer in generic functions We can't use generic functions like print_hex_dump to access kasan shadow region. This require us to setup another kasan shadow region for the address passed (kasan shadow address). Some architectures won't be able to do that. Hence make a copy of the shadow region row and pass that to generic functions. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d269f20..c536708 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -164,14 +164,20 @@ static void print_shadow_for_address(const void *addr) for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { const void *kaddr = kasan_shadow_to_mem(shadow_row); char buffer[4 + (BITS_PER_LONG/8)*2]; + char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), (i == 0) ? ">%p: " : " %p: ", kaddr); - + /* + * We should not pass a shadow pointer to generic + * function, because generic functions may try to + * access kasan mapping for the passed address. 
+ */ kasan_disable_current(); + memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, - shadow_row, SHADOW_BYTES_PER_ROW, 0); + shadow_buf, SHADOW_BYTES_PER_ROW, 0); kasan_enable_current(); if (row_is_guilty(shadow_row, shadow)) -- cgit v1.1 From fc5aeeaf593278f07ffa4d97296e27423ecae867 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:51 -0800 Subject: mm/kasan: prevent deadlock in kasan reporting When we end up calling kasan_report in real mode, our shadow mapping for the spinlock variable will show poisoned. This will result in us calling kasan_report_error with lock_report spin lock held. To prevent this disable kasan reporting when we are priting error w.r.t kasan. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c536708..7833f07 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -173,12 +173,10 @@ static void print_shadow_for_address(const void *addr) * function, because generic functions may try to * access kasan mapping for the passed address. */ - kasan_disable_current(); memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, shadow_buf, SHADOW_BYTES_PER_ROW, 0); - kasan_enable_current(); if (row_is_guilty(shadow_row, shadow)) pr_err("%*c\n", @@ -195,6 +193,10 @@ void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); @@ -204,12 +206,17 @@ void kasan_report_error(struct kasan_access_info *info) pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report_user_access(struct kasan_access_info *info) { unsigned long flags; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); @@ -222,6 +229,7 @@ void kasan_report_user_access(struct kasan_access_info *info) pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report(unsigned long addr, size_t size, -- cgit v1.1 From e91210766341cb356ead7fd39f07493a3d00b80f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:50:55 -0800 Subject: kasan: update reported bug types for not user nor kernel memory accesses Each access with address lower than kasan_shadow_to_mem(KASAN_SHADOW_START) is reported as user-memory-access. This is not always true, the accessed address might not be in user space. Fix this by reporting such accesses as null-ptr-derefs or wild-memory-accesses. There's another reason for this change. For userspace ASan we have a bunch of systems that analyze error types for the purpose of classification and deduplication. Sooner of later we will write them to KASAN as well. Then clearly and explicitly stated error types will bring value. 
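A sketch of the classification this introduces for accesses below the shadow-covered range (the constants are stand-ins, not the kernel's PAGE_SIZE/TASK_SIZE macros):

#define PAGE_SIZE_STANDIN	4096UL
#define TASK_SIZE_STANDIN	0x00007fffffffffffUL	/* illustrative user/kernel split */

static const char *bad_access_type(unsigned long addr)
{
	if (addr < PAGE_SIZE_STANDIN)
		return "null-ptr-deref";
	else if (addr < TASK_SIZE_STANDIN)
		return "user-memory-access";
	else
		return "wild-memory-access";
}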
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 8 +------- mm/kasan/kasan.h | 3 --- mm/kasan/report.c | 50 +++++++++++++++++++++++--------------------------- 3 files changed, 24 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 8da2114..1104cb0 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -235,18 +235,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) static __always_inline void check_memory_region(unsigned long addr, size_t size, bool write) { - struct kasan_access_info info; - if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - info.access_addr = (void *)addr; - info.access_size = size; - info.is_write = write; - info.ip = _RET_IP_; - kasan_report_user_access(&info); + kasan_report(addr, size, write, _RET_IP_); return; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index a6b46cc..4f6c62e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -54,9 +54,6 @@ struct kasan_global { #endif }; -void kasan_report_error(struct kasan_access_info *info); -void kasan_report_user_access(struct kasan_access_info *info); - static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7833f07..964aaf4 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -189,9 +189,10 @@ static void print_shadow_for_address(const void *addr) static DEFINE_SPINLOCK(report_lock); -void kasan_report_error(struct kasan_access_info *info) +static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + const char *bug_type; /* * Make sure we don't end up in loop. @@ -200,32 +201,26 @@ void kasan_report_error(struct kasan_access_info *info) spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); - print_error_description(info); - print_address_description(info); - print_shadow_for_address(info->first_bad_addr); - pr_err("=================================" - "=================================\n"); - spin_unlock_irqrestore(&report_lock, flags); - kasan_enable_current(); -} - -void kasan_report_user_access(struct kasan_access_info *info) -{ - unsigned long flags; - - /* - * Make sure we don't end up in loop. - */ - kasan_disable_current(); - spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); - pr_err("BUG: KASan: user-memory-access on address %p\n", - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); - dump_stack(); + if (info->access_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + pr_err("BUG: KASan: %s on address %p\n", + bug_type, info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, + task_pid_nr(current)); + dump_stack(); + } else { + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + } pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); @@ -244,6 +239,7 @@ void kasan_report(unsigned long addr, size_t size, info.access_size = size; info.is_write = is_write; info.ip = ip; + kasan_report_error(&info); } -- cgit v1.1 From 0952d87fd6a6211ac51b2abdc5c066b49c651fd8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:50:58 -0800 Subject: kasan: update reported bug types for kernel memory accesses Update the names of the bad access types to better reflect the type of the access that happended and make these error types "literals" that can be used for classification and deduplication in scripts. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 964aaf4..cdf4c31 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -49,7 +49,7 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = "unknown crash"; + const char *bug_type = "unknown-crash"; u8 shadow_val; info->first_bad_addr = find_first_bad_addr(info->access_addr, @@ -58,21 +58,25 @@ static void print_error_description(struct kasan_access_info *info) shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); switch (shadow_val) { - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - bug_type = "use after free"; + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; case KASAN_GLOBAL_REDZONE: - case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: - bug_type = "out of bounds access"; + bug_type = "global-out-of-bounds"; break; case KASAN_STACK_LEFT: case KASAN_STACK_MID: case KASAN_STACK_RIGHT: case KASAN_STACK_PARTIAL: - bug_type = "out of bounds on stack"; + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use-after-free"; break; } -- cgit v1.1 From cdf6a273dc4346277ab9d148ef29f6e058624a8c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:01 -0800 Subject: kasan: accurately determine the type of the bad access Makes KASAN accurately determine the type of the bad access. If the shadow byte value is in the [0, KASAN_SHADOW_SCALE_SIZE) range we can look at the next shadow byte to determine the type of the access. 
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cdf4c31..be53a8f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -50,15 +50,26 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; - u8 shadow_val; + u8 *shadow_addr; info->first_bad_addr = find_first_bad_addr(info->access_addr, info->access_size); - shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - switch (shadow_val) { + /* + * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. + */ bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: -- cgit v1.1 From 25add7ec708170e4eaef1f9793a07803b2fb5c71 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:03 -0800 Subject: kasan: update log messages We decided to use KASAN as the short name of the tool and KernelAddressSanitizer as the full one. Update log messages according to that. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- mm/kasan/report.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1104cb0..be9b78d 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -518,7 +518,7 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("WARNING: KASAN doesn't support memory hot-add\n"); pr_err("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index be53a8f..ae6bd36 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -91,7 +91,7 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASan: %s in %pS at addr %p\n", + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", @@ -224,7 +224,7 @@ static void kasan_report_error(struct kasan_access_info *info) bug_type = "user-memory-access"; else bug_type = "wild-memory-access"; - pr_err("BUG: KASan: %s on address %p\n", + pr_err("BUG: KASAN: %s on address %p\n", bug_type, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", info->is_write ? "Write" : "Read", -- cgit v1.1 From 5d0926efe728e00afbd81a1e3c498222cf908d23 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:12 -0800 Subject: kasan: update reference to kasan prototype repo Update the reference to the kasan prototype repository on github, since it was renamed. 
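[Editor's note] Taken together, the bug-type renaming and the shadow-byte lookahead in the KASAN report patches above amount to a small classification routine. The following is a stand-alone userspace sketch of that logic, not the kernel code itself: the scale size and shadow marker values are illustrative stand-ins for the real KASAN_* constants, and the example object is a hypothetical kmalloc(13) allocation whose shadow is [0, 5, redzone]. An access past the object first hits the partially-addressable shadow byte (value 5), steps one byte ahead, finds the slab redzone, and is therefore reported as slab-out-of-bounds.

#include <stdio.h>

#define SHADOW_SCALE_SIZE   8      /* bytes of memory per shadow byte (illustrative) */
#define KMALLOC_REDZONE     0xFC   /* illustrative stand-in for KASAN_KMALLOC_REDZONE */
#define KMALLOC_FREE        0xFB   /* illustrative stand-in for KASAN_KMALLOC_FREE */

static const char *classify(const unsigned char *shadow)
{
    /*
     * A value in (0, SHADOW_SCALE_SIZE) only says "partially addressable";
     * look one shadow byte ahead to see what actually lies beyond the object.
     */
    if (*shadow > 0 && *shadow <= SHADOW_SCALE_SIZE - 1)
        shadow++;

    switch (*shadow) {
    case 0 ... SHADOW_SCALE_SIZE - 1:
        return "out-of-bounds";        /* still possible, e.g. racy shadow update */
    case KMALLOC_REDZONE:
        return "slab-out-of-bounds";
    case KMALLOC_FREE:
        return "use-after-free";
    default:
        return "unknown-crash";
    }
}

int main(void)
{
    /* Shadow for a hypothetical 13-byte slab object followed by its redzone. */
    unsigned char shadow[] = { 0, 5, KMALLOC_REDZONE };

    /* The first bad shadow byte for an access past offset 13 is shadow[1]. */
    printf("%s\n", classify(&shadow[1]));    /* prints: slab-out-of-bounds */
    return 0;
}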
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- mm/kasan/report.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index be9b78d..21c50dc 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -4,7 +4,7 @@ * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov * * This program is free software; you can redistribute it and/or modify diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ae6bd36..f5e068a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -4,7 +4,7 @@ * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov * * This program is free software; you can redistribute it and/or modify -- cgit v1.1 From e0d57714394f5e2ce4e2f9bbebf48e3c7a7fd3be Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 5 Nov 2015 18:51:18 -0800 Subject: kasan: Fix a type conversion error The current KASAN code can not find the following out-of-bounds bugs: char *ptr; ptr = kmalloc(8, GFP_KERNEL); memset(ptr+7, 0, 2); the cause of the problem is the type conversion error in *memory_is_poisoned_n* function. So this patch fix that. Signed-off-by: Wang Long Acked-by: Andrey Ryabinin Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 21c50dc..2b21ccd 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -203,7 +203,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) return true; } return false; -- cgit v1.1 From 10f702627e139e21465f4c9d44f63527bbca163c Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:51:21 -0800 Subject: kasan: use IS_ALIGNED in memory_is_poisoned_8() Use IS_ALIGNED() to determine whether the shadow span two bytes. It generates less code and more readable. Also add some comments in shadow check functions. Signed-off-by: Xishi Qiu Acked-by: Andrey Ryabinin Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 2b21ccd..d41b21b 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr) if (memory_is_poisoned_1(addr + 1)) return true; + /* + * If single shadow byte covers 2-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. 
+ */ if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) return false; @@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr) if (memory_is_poisoned_1(addr + 3)) return true; + /* + * If single shadow byte covers 4-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) return false; @@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr) if (memory_is_poisoned_1(addr + 7)) return true; - if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) + /* + * If single shadow byte covers 8-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return unlikely(*(u8 *)shadow_addr); @@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr) if (unlikely(shadow_first_bytes)) return true; - if (likely(IS_ALIGNED(addr, 8))) + /* + * If two shadow bytes covers 16-byte access, we don't + * need to do anything more. Otherwise, test the last + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return memory_is_poisoned_1(addr + 15); -- cgit v1.1 From 89d3c87e20d95e3238eac85e43de7b3cb1f39d8b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Nov 2015 18:51:23 -0800 Subject: mm, slub, kasan: enable user tracking by default with KASAN=y It's recommended to have slub's user tracking enabled with CONFIG_KASAN, because: a) User tracking disables slab merging which improves detecting out-of-bounds accesses. b) User tracking metadata acts as redzone which also improves detecting out-of-bounds accesses. c) User tracking provides additional information about object. This information helps to understand bugs. Currently it is not enabled by default. Besides recompiling the kernel with KASAN and reinstalling it, user also have to change the boot cmdline, which is not very handy. Enable slub user tracking by default with KASAN=y, since there is no good reason to not do this. [akpm@linux-foundation.org: little fixes, per David] Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 423dbe7..75a5fa9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) /* * Debug settings: */ -#ifdef CONFIG_SLUB_DEBUG_ON +#if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; +#elif defined(CONFIG_KASAN) +static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif -- cgit v1.1 From eb06f43f1c94d502b7867b0998e92cdabbc060bc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Nov 2015 18:51:26 -0800 Subject: kasan: always taint kernel on report Currently we already taint the kernel in some cases. E.g. if we hit some bug in slub memory we call object_err() which will taint the kernel with TAINT_BAD_PAGE flag. But for other kind of bugs kernel left untainted. Always taint with TAINT_BAD_PAGE if kasan found some bug. This is useful for automated testing. 
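[Editor's note] Since the point of the taint is automated testing, a harness can simply poll the taint mask after running a workload under KASAN. A minimal sketch of such a check follows; it assumes TAINT_BAD_PAGE is bit 5 of /proc/sys/kernel/tainted (the 'B' flag), which should be confirmed against the running kernel's headers rather than taken from this note.

#include <stdio.h>

#define TAINT_BAD_PAGE_BIT 5    /* 'B' flag; assumption -- verify against include/linux/kernel.h */

int main(void)
{
    unsigned long tainted = 0;
    FILE *f = fopen("/proc/sys/kernel/tainted", "r");

    if (!f) {
        perror("fopen");
        return 2;
    }
    if (fscanf(f, "%lu", &tainted) != 1)
        tainted = 0;
    fclose(f);

    if (tainted & (1UL << TAINT_BAD_PAGE_BIT)) {
        printf("bad-page taint set: KASAN (or slub) reported a bug\n");
        return 1;
    }
    printf("no bad-page taint recorded\n");
    return 0;
}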
Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Reviewed-by: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f5e068a..12f222d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -238,6 +238,7 @@ static void kasan_report_error(struct kasan_access_info *info) } pr_err("=================================" "=================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); } -- cgit v1.1 From 1aab92ec3de552362397b718744872ea2d17add2 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:29 -0800 Subject: mm: mlock: refactor mlock, munlock, and munlockall code mlock() allows a user to control page out of program memory, but this comes at the cost of faulting in the entire mapping when it is allocated. For large mappings where the entire area is not necessary this is not ideal. Instead of forcing all locked pages to be present when they are allocated, this set creates a middle ground. Pages are marked to be placed on the unevictable LRU (locked) when they are first used, but they are not faulted in by the mlock call. This series introduces a new mlock() system call that takes a flags argument along with the start address and size. This flags argument gives the caller the ability to request memory be locked in the traditional way, or to be locked after the page is faulted in. A new MCL flag is added to mirror the lock on fault behavior from mlock() in mlockall(). There are two main use cases that this set covers. The first is the security focussed mlock case. A buffer is needed that cannot be written to swap. The maximum size is known, but on average the memory used is significantly less than this maximum. With lock on fault, the buffer is guaranteed to never be paged out without consuming the maximum size every time such a buffer is created. The second use case is focussed on performance. Portions of a large file are needed and we want to keep the used portions in memory once accessed. This is the case for large graphical models where the path through the graph is not known until run time. The entire graph is unlikely to be used in a given invocation, but once a node has been used it needs to stay resident for further processing. Given these constraints we have a number of options. We can potentially waste a large amount of memory by mlocking the entire region (this can also cause a significant stall at startup as the entire file is read in). We can mlock every page as we access them without tracking if the page is already resident but this introduces large overhead for each access. The third option is mapping the entire region with PROT_NONE and using a signal handler for SIGSEGV to mprotect(PROT_READ) and mlock() the needed page. Doing this page at a time adds a significant performance penalty. Batching can be used to mitigate this overhead, but in order to safely avoid trying to mprotect pages outside of the mapping, the boundaries of each mapping to be used in this way must be tracked and available to the signal handler. This is precisely what the mm system in the kernel should already be doing. For mlock(MLOCK_ONFAULT) the user is charged against RLIMIT_MEMLOCK as if mlock(MLOCK_LOCKED) or mmap(MAP_LOCKED) was used, so when the VMA is created not when the pages are faulted in. 
For mlockall(MCL_ONFAULT) the user is charged as if MCL_FUTURE was used. This decision was made to keep the accounting checks out of the page fault path. To illustrate the benefit of this set I wrote a test program that mmaps a 5 GB file filled with random data and then makes 15,000,000 accesses to random addresses in that mapping. The test program was run 20 times for each setup. Results are reported for two program portions, setup and execution. The setup phase is calling mmap and optionally mlock on the entire region. For most experiments this is trivial, but it highlights the cost of faulting in the entire region. Results are averages across the 20 runs in milliseconds. mmap with mlock(MLOCK_LOCKED) on entire range: Setup avg: 8228.666 Processing avg: 8274.257 mmap with mlock(MLOCK_LOCKED) before each access: Setup avg: 0.113 Processing avg: 90993.552 mmap with PROT_NONE and signal handler and batch size of 1 page: With the default value in max_map_count, this gets ENOMEM as I attempt to change the permissions, after upping the sysctl significantly I get: Setup avg: 0.058 Processing avg: 69488.073 mmap with PROT_NONE and signal handler and batch size of 8 pages: Setup avg: 0.068 Processing avg: 38204.116 mmap with PROT_NONE and signal handler and batch size of 16 pages: Setup avg: 0.044 Processing avg: 29671.180 mmap with mlock(MLOCK_ONFAULT) on entire range: Setup avg: 0.189 Processing avg: 17904.899 The signal handler in the batch cases faulted in memory in two steps to avoid having to know the start and end of the faulting mapping. The first step covers the page that caused the fault as we know that it will be possible to lock. The second step speculatively tries to mlock and mprotect the batch size - 1 pages that follow. There may be a clever way to avoid this without having the program track each mapping to be covered by this handeler in a globally accessible structure, but I could not find it. It should be noted that with a large enough batch size this two step fault handler can still cause the program to crash if it reaches far beyond the end of the mapping. These results show that if the developer knows that a majority of the mapping will be used, it is better to try and fault it in at once, otherwise mlock(MLOCK_ONFAULT) is significantly faster. The performance cost of these patches are minimal on the two benchmarks I have tested (stream and kernbench). The following are the average values across 20 runs of stream and 10 runs of kernbench after a warmup run whose results were discarded. Avg throughput in MB/s from stream using 1000000 element arrays Test 4.2-rc1 4.2-rc1+lock-on-fault Copy: 10,566.5 10,421 Scale: 10,685 10,503.5 Add: 12,044.1 11,814.2 Triad: 12,064.8 11,846.3 Kernbench optimal load 4.2-rc1 4.2-rc1+lock-on-fault Elapsed Time 78.453 78.991 User Time 64.2395 65.2355 System Time 9.7335 9.7085 Context Switches 22211.5 22412.1 Sleeps 14965.3 14956.1 This patch (of 6): Extending the mlock system call is very difficult because it currently does not take a flags argument. A later patch in this set will extend mlock to support a middle ground between pages that are locked and faulted in immediately and unlocked pages. To pave the way for the new system call, the code needs some reorganization so that all the actual entry point handles is checking input and translating to VMA flags. Signed-off-by: Eric B Munson Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Michael Kerrisk Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Jonathan Corbet Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 550228d..fbd8c03 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -554,7 +554,8 @@ out: return ret; } -static int do_mlock(unsigned long start, size_t len, int on) +static int apply_vma_lock_flags(unsigned long start, size_t len, + vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct * vma, * prev; @@ -576,14 +577,11 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags; - - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + vm_flags_t newflags = vma->vm_flags & ~VM_LOCKED; - newflags = vma->vm_flags & ~VM_LOCKED; - if (on) - newflags |= VM_LOCKED; + newflags |= flags; + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; @@ -605,7 +603,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; @@ -629,7 +627,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) - error = do_mlock(start, len, 1); + error = apply_vma_lock_flags(start, len, flags); up_write(¤t->mm->mmap_sem); if (error) @@ -641,6 +639,11 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return 0; } +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; @@ -649,13 +652,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) start &= PAGE_MASK; down_write(¤t->mm->mmap_sem); - ret = do_mlock(start, len, 0); + ret = apply_vma_lock_flags(start, len, 0); up_write(¤t->mm->mmap_sem); return ret; } -static int do_mlockall(int flags) +static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; @@ -663,6 +666,7 @@ static int do_mlockall(int flags) current->mm->def_flags |= VM_LOCKED; else current->mm->def_flags &= ~VM_LOCKED; + if (flags == MCL_FUTURE) goto out; @@ -703,7 +707,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) - ret = do_mlockall(flags); + ret = apply_mlockall_flags(flags); up_write(¤t->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); @@ -716,7 +720,7 @@ SYSCALL_DEFINE0(munlockall) int ret; down_write(¤t->mm->mmap_sem); - ret = do_mlockall(0); + ret = apply_mlockall_flags(0); up_write(¤t->mm->mmap_sem); return ret; } -- cgit v1.1 From a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:33 -0800 Subject: mm: mlock: add new mlock system call With the refactored mlock code, introduce a new system call for mlock. The new call will allow the user to specify what lock states are being added. 
mlock2 is trivial at the moment, but a follow on patch will add a new mlock state making it useful. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Heiko Carstens Cc: Geert Uytterhoeven Cc: Catalin Marinas Cc: Stephen Rothwell Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index fbd8c03..35dcf8f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -644,6 +644,14 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return do_mlock(start, len, VM_LOCKED); } +SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) +{ + if (flags) + return -EINVAL; + + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; -- cgit v1.1 From de60f5f10c58d4f34b68622442c0e04180367f3f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:36 -0800 Subject: mm: introduce VM_LOCKONFAULT The cost of faulting in all memory to be locked can be very high when working with large mappings. If only portions of the mapping will be used this can incur a high penalty for locking. For the example of a large file, this is the usage pattern for a large statical language model (probably applies to other statical or graphical models as well). For the security example, any application transacting in data that cannot be swapped out (credit card data, medical records, etc). This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in. The VM_LOCKONFAULT flag will be used together with VM_LOCKED and has no effect when set without VM_LOCKED. Setting the VM_LOCKONFAULT flag for a VMA will cause pages faulted into that VMA to be added to the unevictable LRU when they are faulted or if they are already present, but will not cause any missing pages to be faulted in. Exposing this new lock state means that we cannot overload the meaning of the FOLL_POPULATE flag any longer. Prior to this patch it was used to mean that the VMA for a fault was locked. This means we need the new FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE will now only control if the VMA should be populated and in the case of VM_LOCKONFAULT, it will not be set. Signed-off-by: Eric B Munson Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 1 + mm/gup.c | 10 ++++++++-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 2 +- mm/mmap.c | 2 +- 6 files changed, 14 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 6c1b3ea..e784110 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_GROWSDOWN, "growsdown" }, {VM_PFNMAP, "pfnmap" }, {VM_DENYWRITE, "denywrite" }, + {VM_LOCKONFAULT, "lockonfault" }, {VM_LOCKED, "locked" }, {VM_IO, "io" }, {VM_SEQ_READ, "seqread" }, diff --git a/mm/gup.c b/mm/gup.c index a798293..deafa2c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -129,7 +129,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; /* For mm_populate(), just skip the stack guard page. */ if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || @@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE; + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3fd0311..f5c08b4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 241de27..74ef0c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4137,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 35dcf8f..ca38941 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; 
while (start < end) { struct page *page = NULL; diff --git a/mm/mmap.c b/mm/mmap.c index 220effd..2ce04a6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1661,7 +1661,7 @@ out: vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); else - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; } if (file) -- cgit v1.1 From b0f205c2a3082dd9081f9a94e50658c5fa906ff1 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:39 -0800 Subject: mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage The previous patch introduced a flag that specified pages in a VMA should be placed on the unevictable LRU, but they should not be made present when the area is created. This patch adds the ability to set this state via the new mlock system calls. We add MLOCK_ONFAULT for mlock2 and MCL_ONFAULT for mlockall. MLOCK_ONFAULT will set the VM_LOCKONFAULT modifier for VM_LOCKED. MCL_ONFAULT should be used as a modifier to the two other mlockall flags. When used with MCL_CURRENT, all current mappings will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with MCL_FUTURE, the mm->def_flags will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with both MCL_CURRENT and MCL_FUTURE, all current mappings and mm->def_flags will be marked with VM_LOCKED | VM_LOCKONFAULT. Prior to this patch, mlockall() will unconditionally clear the mm->def_flags any time it is called without MCL_FUTURE. This behavior is maintained after adding MCL_ONFAULT. If a call to mlockall(MCL_FUTURE) is followed by mlockall(MCL_CURRENT), the mm->def_flags will be cleared and new VMAs will be unlocked. This remains true with or without MCL_ONFAULT in either mlockall() invocation. munlock() will unconditionally clear both vma flags. munlockall() unconditionally clears for VMA flags on all VMAs and in the mm->def_flags field. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Kirill A. 
Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index ca38941..339d9e0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -506,7 +506,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - goto out; /* don't set VM_LOCKED, don't count */ + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, @@ -577,7 +578,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & ~VM_LOCKED; + vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= flags; @@ -646,10 +647,15 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { - if (flags) + vm_flags_t vm_flags = VM_LOCKED; + + if (flags & ~MLOCK_ONFAULT) return -EINVAL; - return do_mlock(start, len, VM_LOCKED); + if (flags & MLOCK_ONFAULT) + vm_flags |= VM_LOCKONFAULT; + + return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) @@ -666,24 +672,43 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) return ret; } +/* + * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) + * and translate into the appropriate modifications to mm->def_flags and/or the + * flags for all current VMAs. + * + * There are a couple of subtleties with this. If mlockall() is called multiple + * times with different flags, the values do not necessarily stack. If mlockall + * is called once including the MCL_FUTURE flag and then a second time without + * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. + */ static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; + vm_flags_t to_add = 0; - if (flags & MCL_FUTURE) + current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; - else - current->mm->def_flags &= ~VM_LOCKED; - if (flags == MCL_FUTURE) - goto out; + if (flags & MCL_ONFAULT) + current->mm->def_flags |= VM_LOCKONFAULT; + + if (!(flags & MCL_CURRENT)) + goto out; + } + + if (flags & MCL_CURRENT) { + to_add |= VM_LOCKED; + if (flags & MCL_ONFAULT) + to_add |= VM_LOCKONFAULT; + } for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags & ~VM_LOCKED; - if (flags & MCL_CURRENT) - newflags |= VM_LOCKED; + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= to_add; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); @@ -698,7 +723,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) unsigned long lock_limit; int ret; - if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) return -EINVAL; if (!can_do_mlock()) -- cgit v1.1
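[Editor's note] To make the end result of the mlock series concrete, here is a rough userspace sketch of the new lock-on-fault behaviour. It is not part of the patches: the mlock2 syscall number is assumed to be exported by the installed kernel headers via <sys/syscall.h>, and the MLOCK_ONFAULT value of 0x01 is an assumption based on this series' uapi changes, which are not shown in the mm/ diffs above. As the series cover letter explains, the whole range is still charged against RLIMIT_MEMLOCK when mlock2() is called, not when the pages fault in.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01    /* assumed value from this series' uapi patch */
#endif

int main(void)
{
    size_t len = 256UL << 20;    /* 256 MiB we expect to touch only sparsely */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /*
     * Lock pages as they are faulted in instead of pre-faulting the whole
     * mapping the way plain mlock() would.  The full range still counts
     * against RLIMIT_MEMLOCK here, so the limit must cover it (or the
     * caller needs CAP_IPC_LOCK).
     */
    if (syscall(__NR_mlock2, p, len, MLOCK_ONFAULT) != 0) {
        perror("mlock2");
        return 1;
    }

    p[0] = 1;    /* only this page becomes resident -- and unevictable */
    return 0;
}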