From fda901241fb89449244537db4fb27b06e491b74f Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 5 Nov 2015 18:44:59 -0800 Subject: slab: convert slab_is_available() to boolean A good candidate to return a boolean result. Signed-off-by: Denis Kirjanov Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 5ce4fae..113a6fd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -692,7 +692,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -int slab_is_available(void) +bool slab_is_available(void) { return slab_state >= UP; } -- cgit v1.1 From c9a77a792003ce9d70df8937c8c87aee6e177149 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:08 -0800 Subject: mm/slab_common.c: rename cache create/destroy helpers do_kmem_cache_create(), do_kmem_cache_shutdown(), and do_kmem_cache_release() sound awkward for static helper functions that are not supposed to be used outside slab_common.c. Rename them to create_cache(), shutdown_cache(), and release_caches(), respectively. This patch is a pure cleanup and does not introduce any functional changes. Signed-off-by: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 113a6fd..c8d2ed7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static struct kmem_cache * -do_kmem_cache_create(const char *name, size_t object_size, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct mem_cgroup *memcg, struct kmem_cache *root_cache) +static struct kmem_cache *create_cache(const char *name, + size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; int err; @@ -418,9 +418,9 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } - s = do_kmem_cache_create(cache_name, size, size, - calculate_alignment(flags, align, size), - flags, ctor, NULL, NULL); + s = create_cache(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -448,7 +448,7 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int do_kmem_cache_shutdown(struct kmem_cache *s, +static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { if (__kmem_cache_shutdown(s) != 0) { @@ -469,8 +469,7 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, return 0; } -static void do_kmem_cache_release(struct list_head *release, - bool need_rcu_barrier) +static void release_caches(struct list_head *release, bool need_rcu_barrier) { struct kmem_cache *s, *s2; @@ -536,10 +535,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, if (!cache_name) goto out_unlock; - s = do_kmem_cache_create(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, - root_cache->flags, 
root_cache->ctor, - memcg, root_cache); + s = create_cache(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -615,14 +614,14 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + BUG_ON(shutdown_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ @@ -655,12 +654,12 @@ void kmem_cache_destroy(struct kmem_cache *s) goto out_unlock; for_each_memcg_cache_safe(c, c2, s) { - if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + if (shutdown_cache(c, &release, &need_rcu_barrier)) busy = true; } if (!busy) - do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); + shutdown_cache(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); @@ -668,7 +667,7 @@ out_unlock: put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.1 From d60fdcc9e3febde2ebd49fe517e13f428bc12843 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:11 -0800 Subject: mm/slab_common.c: clear pointers to per memcg caches on destroy Currently, we do not clear pointers to per memcg caches in the memcg_params.memcg_caches array when a global cache is destroyed with kmem_cache_destroy. This is fine if the global cache does get destroyed. However, a cache can be left on the list if it still has active objects when kmem_cache_destroy is called (due to a memory leak). If this happens, the entries in the array will point to already freed areas, which is likely to result in data corruption when the cache is reused (via slab merging). 
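The essence of the fix is to clear the slot in the array as soon as the corresponding per memcg cache is shut down, so a root cache that leaks and stays around never follows a stale pointer. As a rough userspace sketch of that pattern (the names and the array are illustrative, not the kernel data structures):

  #include <stdio.h>
  #include <stdlib.h>

  #define NR_CACHES 4

  struct cache { int objects; };

  static struct cache *child_caches[NR_CACHES];

  /* Shut one child cache down and clear its slot in the lookup array. */
  static int shutdown_child(int idx)
  {
          struct cache *c = child_caches[idx];

          if (!c)
                  return 0;
          if (c->objects)                 /* still busy: keep the pointer */
                  return -1;
          free(c);
          child_caches[idx] = NULL;       /* never dereferenced again */
          return 0;
  }

  int main(void)
  {
          child_caches[0] = calloc(1, sizeof(struct cache));
          shutdown_child(0);
          /* A later lookup now sees NULL instead of freed memory. */
          printf("slot 0: %p\n", (void *)child_caches[0]);
          return 0;
  }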
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 6 ---- mm/slab_common.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index a3a967d..bf51a8d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); list_for_each_entry(iter, &(root)->memcg_params.list, \ memcg_params.list) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ - memcg_params.list) - static inline bool is_root_cache(struct kmem_cache *s) { return s->memcg_params.is_root_cache; @@ -265,8 +261,6 @@ extern void slab_init_memcg_params(struct kmem_cache *); #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - for ((void)(iter), (void)(tmp), (void)(root); 0; ) static inline bool is_root_cache(struct kmem_cache *s) { diff --git a/mm/slab_common.c b/mm/slab_common.c index c8d2ed7..ab1f20e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -461,10 +461,6 @@ static int shutdown_cache(struct kmem_cache *s, if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - list_del(&s->memcg_params.list); -#endif list_move(&s->list, release); return 0; } @@ -597,6 +593,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } +static int __shutdown_memcg_cache(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + BUG_ON(is_root_cache(s)); + + if (shutdown_cache(s, release, need_rcu_barrier)) + return -EBUSY; + + list_del(&s->memcg_params.list); + return 0; +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); @@ -614,7 +622,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(shutdown_cache(s, &release, &need_rcu_barrier)); + BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); @@ -623,6 +631,68 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) release_caches(&release, need_rcu_barrier); } + +static int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + struct memcg_cache_array *arr; + struct kmem_cache *c, *c2; + LIST_HEAD(busy); + int i; + + BUG_ON(!is_root_cache(s)); + + /* + * First, shutdown active caches, i.e. caches that belong to online + * memory cgroups. + */ + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = arr->entries[i]; + if (!c) + continue; + if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + /* + * The cache still has objects. Move it to a temporary + * list so as not to try to destroy it for a second + * time while iterating over inactive caches below. + */ + list_move(&c->memcg_params.list, &busy); + else + /* + * The cache is empty and will be destroyed soon. Clear + * the pointer to it in the memcg_caches array so that + * it will never be accessed even if the root cache + * stays alive. 
+ */ + arr->entries[i] = NULL; + } + + /* + * Second, shutdown all caches left from memory cgroups that are now + * offline. + */ + list_for_each_entry_safe(c, c2, &s->memcg_params.list, + memcg_params.list) + __shutdown_memcg_cache(c, release, need_rcu_barrier); + + list_splice(&busy, &s->memcg_params.list); + + /* + * A cache being destroyed must be empty. In particular, this means + * that all per memcg caches attached to it must be empty too. + */ + if (!list_empty(&s->memcg_params.list)) + return -EBUSY; + return 0; +} +#else +static inline int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + return 0; +} #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) @@ -634,16 +704,13 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; - bool busy = false; + int err; if (unlikely(!s)) return; - BUG_ON(!is_root_cache(s)); - get_online_cpus(); get_online_mems(); @@ -653,12 +720,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_safe(c, c2, s) { - if (shutdown_cache(c, &release, &need_rcu_barrier)) - busy = true; - } - - if (!busy) + err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + if (!err) shutdown_cache(s, &release, &need_rcu_barrier); out_unlock: -- cgit v1.1 From cd918c557439c8f0750f64883367aeff264b5fd8 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:14 -0800 Subject: mm/slab_common.c: do not warn that cache is busy on destroy more than once Currently, when kmem_cache_destroy() is called for a global cache, we print a warning for each per memcg cache attached to it that has active objects (see shutdown_cache). This is redundant, because it gives no new information and only clutters the log. If a cache being destroyed has active objects, there must be a memory leak in the module that created the cache, and it does not matter if the cache was used by users in memory cgroups or not. This patch moves the warning from shutdown_cache(), which is called for shutting down both global and per memcg caches, to kmem_cache_destroy(), so that the warning is only printed once if there are objects left in the cache being destroyed. 
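The resulting control flow is simply: the helpers fail quietly with -EBUSY and only the top-level destroy path reports. A condensed userspace sketch of that shape (function names and return values are illustrative, not the kernel code):

  #include <stdio.h>

  /* Helper: fail quietly; the caller decides how, and how often, to report. */
  static int shutdown_cache(int busy)
  {
          return busy ? -1 : 0;
  }

  static void cache_destroy(const char *name, int busy_children, int busy_self)
  {
          int err;

          err = shutdown_cache(busy_children);            /* per memcg caches */
          if (!err)
                  err = shutdown_cache(busy_self);        /* the root cache */
          if (err)        /* one warning, however many caches were busy */
                  fprintf(stderr, "%s: Slab cache still has objects\n", name);
  }

  int main(void)
  {
          cache_destroy("demo-cache", 1, 1);
          return 0;
  }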
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index ab1f20e..fba78e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -451,12 +451,8 @@ EXPORT_SYMBOL(kmem_cache_create); static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; - } if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; @@ -722,8 +718,13 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); if (!err) - shutdown_cache(s, &release, &need_rcu_barrier); + err = shutdown_cache(s, &release, &need_rcu_barrier); + if (err) { + pr_err("kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + } out_unlock: mutex_unlock(&slab_mutex); -- cgit v1.1 From 40911a798b5abbbec6b2e271a42addd6b26228a0 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 5 Nov 2015 18:45:43 -0800 Subject: mm/slab_common.c: initialize kmem_cache pointer to NULL The assignment to NULL within the error condition was written in a 2014 patch to suppress a compiler warning. However it would be cleaner to just initialize the kmem_cache to NULL and just return it in case of an error condition. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index fba78e4..d88e97c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -384,7 +384,7 @@ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; const char *cache_name; int err; @@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align, err = kmem_cache_sanity_check(name, size); if (err) { - s = NULL; /* suppress uninit var warning */ goto out_unlock; } -- cgit v1.1 From 422ff4d70c1b3b2deed431dc095432dc691f4269 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:46 -0800 Subject: mm/slub: correct the comment in calculate_order() In calculate_order(), it tries to calculate the best order by adjusting the fraction and min_objects. On each iteration on min_objects, fraction iterates on 16, 8, 4. Which means the acceptable waste increases with 1/16, 1/8, 1/4. This patch corrects the comment according to the code. 
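To make the direction of the loop concrete, the snippet below prints the waste threshold for an assumed 4KB slab as the divisor steps through the same values the slub code uses (pure userspace illustration):

  #include <stdio.h>

  int main(void)
  {
          unsigned long slab_size = 4096;         /* one 4KB page, assumed */
          int fraction;

          /* Same divisors calculate_order() steps through: 16, 8, 4. */
          for (fraction = 16; fraction >= 4; fraction /= 2)
                  printf("fraction %2d: up to %4lu bytes (1/%d) of waste accepted\n",
                         fraction, slab_size / fraction, fraction);
          return 0;
  }

This prints 256, 512 and 1024 bytes, i.e. the acceptable waste grows as the loop progresses, which is what the corrected comment states.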
Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f614b5d..a94b9f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2943,7 +2943,7 @@ static inline int calculate_order(int size, int reserved) * works by first attempting to generate a layout with * the best configuration and backing off gradually. * - * First we reduce the acceptable waste in a slab. Then + * First we increase the acceptable waste in a slab. Then * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; -- cgit v1.1 From 033fd1bd3c50fdda267d27d02f9bc656f0b9ddb8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:48 -0800 Subject: mm/slub: use get_order() instead of fls() get_order() is more easy to understand. This patch just replaces it. Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a94b9f4..e309ed1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2912,8 +2912,7 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); + for (order = max(min_order, get_order(min_objects * size)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; -- cgit v1.1 From 9f835703ea67633617ca82bc150f6ee70831b40a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:51 -0800 Subject: mm/slub: calculate start order with reserved in consideration In slub_order(), the order starts from max(min_order, get_order(min_objects * size)). When (min_objects * size) has different order from (min_objects * size + reserved), it will skip this order via a check in the loop. This patch optimizes this a little by calculating the start order with `reserved' in consideration and removing the check in loop. 
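The two starting points differ exactly when min_objects * size fills the last page and the reserved bytes push it over. A userspace sketch with assumed numbers and a simplified stand-in for get_order(), not the kernel implementation:

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)

  /* Rough userspace equivalent of the kernel's get_order(). */
  static int get_order(unsigned long size)
  {
          int order = 0;

          size = (size - 1) >> PAGE_SHIFT;
          while (size) {
                  order++;
                  size >>= 1;
          }
          return order;
  }

  int main(void)
  {
          unsigned long min_objects = 16, size = 256, reserved = 64;

          printf("start order without reserved: %d\n",
                 get_order(min_objects * size));
          printf("start order with reserved:    %d\n",
                 get_order(min_objects * size + reserved));
          return 0;
  }

Here 16 * 256 fills a 4KB page exactly, so the order starts at 0 without reserved but at 1 with the 64 reserved bytes counted in; the old code would have started at 0 and skipped that order inside the loop.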
Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e309ed1..e1bb147 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2912,19 +2912,15 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, get_order(min_objects * size)); + for (order = max(min_order, get_order(min_objects * size + reserved)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size + reserved) - continue; - rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; - } return order; -- cgit v1.1 From d4322d88f5fdf92729dd40f923013414fbb2184d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 Nov 2015 18:45:54 -0800 Subject: mm: slab: only move management objects off-slab for sizes larger than KMALLOC_MIN_SIZE On systems with a KMALLOC_MIN_SIZE of 128 (arm64, some mips and powerpc configurations defining ARCH_DMA_MINALIGN to 128), the first kmalloc_caches[] entry to be initialised after slab_early_init = 0 is "kmalloc-128" with index 7. Depending on the debug kernel configuration, sizeof(struct kmem_cache) can be larger than 128 resulting in an INDEX_NODE of 8. Commit 8fc9cf420b36 ("slab: make more slab management structure off the slab") enables off-slab management objects for sizes starting with PAGE_SIZE >> 5 (128 bytes for a 4KB page configuration) and the creation of the "kmalloc-128" cache would try to place the management objects off-slab. However, since KMALLOC_MIN_SIZE is already 128 and freelist_size == 32 in __kmem_cache_create(), kmalloc_slab(freelist_size) returns NULL (kmalloc_caches[7] not populated yet). This triggers the following bug on arm64: kernel BUG at /work/Linux/linux-2.6-aarch64/mm/slab.c:2283! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc4+ #540 Hardware name: Juno (DT) PC is at __kmem_cache_create+0x21c/0x280 LR is at __kmem_cache_create+0x210/0x280 [...] Call trace: __kmem_cache_create+0x21c/0x280 create_boot_cache+0x48/0x80 create_kmalloc_cache+0x50/0x88 create_kmalloc_caches+0x4c/0xf4 kmem_cache_init+0x100/0x118 start_kernel+0x214/0x33c This patch introduces an OFF_SLAB_MIN_SIZE definition to avoid off-slab management objects for sizes equal to or smaller than KMALLOC_MIN_SIZE. Fixes: 8fc9cf420b36 ("slab: make more slab management structure off the slab") Signed-off-by: Catalin Marinas Reported-by: Geert Uytterhoeven Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4fcc5dd..461935b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. 
Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && + if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) /* * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ -- cgit v1.1 From 9fbed25407ccc87a7bb47ea3f411e1ca34a95f8b Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 5 Nov 2015 18:45:57 -0800 Subject: mm/kmemleak.c: remove unneeded initialization of object to NULL Few lines below object is reinitialized by lookup_object() so we don't need to init it by NULL in the beginning of find_and_get_object(). Signed-off-by: Alexey Klimov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 77191ec..19423a4 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object) static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { unsigned long flags; - struct kmemleak_object *object = NULL; + struct kmemleak_object *object; rcu_read_lock(); read_lock_irqsave(&kmemleak_lock, flags); -- cgit v1.1 From 86d2adccfbe7d5a1f050fa08db9638c9168736d9 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 5 Nov 2015 18:46:00 -0800 Subject: mm/mlock.c: reorganize mlockall() return values and remove goto-out label In mlockall syscall wrapper after out-label for goto code just doing return. Remove goto out statements and return error values directly. Also instead of rewriting ret variable before every if-check move returns to 'error'-like path under if-check. Objdump asm listing showed me reducing by few asm lines. Object file size descreased from 220592 bytes to 220528 bytes for me (for aarch64). Signed-off-by: Alexey Klimov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 25936680..7e6ad9c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -684,14 +684,13 @@ out: SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; - int ret = -EINVAL; + int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) - goto out; + return -EINVAL; - ret = -EPERM; if (!can_do_mlock()) - goto out; + return -EPERM; if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ @@ -708,7 +707,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) up_write(¤t->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); -out: + return ret; } -- cgit v1.1 From 0ab32b6f1b88444524e52429fab334ff96683a3f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:46:03 -0800 Subject: uaccess: reimplement probe_kernel_address() using probe_kernel_read() probe_kernel_address() is basically the same as the (later added) probe_kernel_read(). 
The return value on EFAULT is a bit different: probe_kernel_address() returns number-of-bytes-not-copied whereas probe_kernel_read() returns -EFAULT. All callers have been checked, none cared. probe_kernel_read() can be overridden by the architecture whereas probe_kernel_address() cannot. parisc, blackfin and um do this, to insert additional checking. Hence this patch possibly fixes obscure bugs, although there are only two probe_kernel_address() callsites outside arch/. My first attempt involved removing probe_kernel_address() entirely and converting all callsites to use probe_kernel_read() directly, but that got tiresome. This patch shrinks mm/slab_common.o by 218 bytes. For a single probe_kernel_address() callsite. Cc: Steven Miao Cc: Jeff Dike Cc: Richard Weinberger Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/maccess.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index 34fe247..1b13638 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -13,6 +13,11 @@ * * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. + * + * We ensure that the copy_from_user is executed in atomic context so that + * do_page_fault() doesn't attempt to take mmap_sem. This makes + * probe_kernel_read() suitable for use within regions where the caller + * already holds mmap_sem, or other locks which nest inside mmap_sem. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) -- cgit v1.1 From 55e1ceaf2586ab11aafba798a6b9499dd7c14441 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:46:06 -0800 Subject: mm/mmap.c: remove useless statement "vma = NULL" in find_vma() Before the main loop, vma is already is NULL. There is no need to set it to NULL again. Signed-off-by: Chen Gang Reviewed-by: Oleg Nesterov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 79bcc9f..bd932c1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2047,7 +2047,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) return vma; rb_node = mm->mm_rb.rb_node; - vma = NULL; while (rb_node) { struct vm_area_struct *tmp; -- cgit v1.1 From 626ebc4100285be56fe3546f29b6afeb36b6871a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:09 -0800 Subject: memcg: flatten task_struct->memcg_oom task_struct->memcg_oom is a sub-struct containing fields which are used for async memcg oom handling. Most task_struct fields aren't packaged this way and it can lead to unnecessary alignment paddings. This patch flattens it. * task.memcg_oom.memcg -> task.memcg_in_oom * task.memcg_oom.gfp_mask -> task.memcg_oom_gfp_mask * task.memcg_oom.order -> task.memcg_oom_order * task.memcg_oom.may_oom -> task.memcg_may_oom In addition, task.memcg_may_oom is relocated to where other bitfields are which reduces the size of task_struct. 
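The saving is purely an alignment effect: a nested struct carries its own padding, while flattened fields (with the flag packed next to existing bitfields) can reuse slack that is already there. A simplified userspace illustration; these structs are stand-ins, not the real task_struct:

  #include <stdio.h>

  struct task_nested {                    /* old layout: sub-struct */
          unsigned int some_flag:1;       /* stands in for existing bitfields */
          struct {
                  void *memcg;
                  unsigned int gfp_mask;
                  int order;
                  unsigned int may_oom:1;
          } memcg_oom;
  };

  struct task_flat {                      /* new layout: flattened */
          unsigned int some_flag:1;
          unsigned int memcg_may_oom:1;   /* packed with the other bitfields */
          void *memcg_in_oom;
          unsigned int memcg_oom_gfp_mask;
          int memcg_oom_order;
  };

  int main(void)
  {
          printf("nested:    %zu bytes\n", sizeof(struct task_nested));
          printf("flattened: %zu bytes\n", sizeof(struct task_flat));
          return 0;
  }

On a typical LP64 ABI this prints 32 vs 24 bytes; the exact numbers are architecture dependent, but the direction of the change is the same.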
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c57c442..47bd7f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1661,7 +1661,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_oom.may_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -1678,9 +1678,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * and when we know whether the fault was overall successful. */ css_get(&memcg->css); - current->memcg_oom.memcg = memcg; - current->memcg_oom.gfp_mask = mask; - current->memcg_oom.order = order; + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; } /** @@ -1702,7 +1702,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) */ bool mem_cgroup_oom_synchronize(bool handle) { - struct mem_cgroup *memcg = current->memcg_oom.memcg; + struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked; @@ -1730,8 +1730,8 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, - current->memcg_oom.order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, + current->memcg_oom_order); } else { schedule(); mem_cgroup_unmark_under_oom(memcg); @@ -1748,7 +1748,7 @@ bool mem_cgroup_oom_synchronize(bool handle) memcg_oom_recover(memcg); } cleanup: - current->memcg_oom.memcg = NULL; + current->memcg_in_oom = NULL; css_put(&memcg->css); return true; } -- cgit v1.1 From b23afb93d317c65cef553b804f08dec8a7a0f7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:11 -0800 Subject: memcg: punt high overage reclaim to return-to-userland path Currently, try_charge() tries to reclaim memory synchronously when the high limit is breached; however, if the allocation doesn't have __GFP_WAIT, synchronous reclaim is skipped. If a process performs only speculative allocations, it can blow way past the high limit. This is actually easily reproducible by simply doing "find /". slab/slub allocator tries speculative allocations first, so as long as there's memory which can be consumed without blocking, it can keep allocating memory regardless of the high limit. This patch makes try_charge() always punt the over-high reclaim to the return-to-userland path. If try_charge() detects that high limit is breached, it adds the overage to current->memcg_nr_pages_over_high and schedules execution of mem_cgroup_handle_over_high() which performs synchronous reclaim from the return-to-userland path. As long as kernel doesn't have a run-away allocation spree, this should provide enough protection while making kmemcg behave more consistently. It also has the following benefits. - All over-high reclaims can use GFP_KERNEL regardless of the specific gfp mask in use, e.g. GFP_NOFS, when the limit was breached. - It copes with prio inversion. Previously, a low-prio task with small memory.high might perform over-high reclaim with a bunch of locks held. 
If a higher prio task needed any of these locks, it would have to wait until the low prio task finished reclaim and released the locks. By handing over-high reclaim to the task exit path this issue can be avoided. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47bd7f1..327dcda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -2080,17 +2106,22 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_WAIT but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += nr_pages; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; -- cgit v1.1 From 10d53c748bc9531f47e13f98e32ef28be4399862 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:17 -0800 Subject: memcg: ratify and consolidate over-charge handling try_charge() is the main charging logic of memcg. When it hits the limit but either can't fail the allocation due to __GFP_NOFAIL or the task is likely to free memory very soon, being OOM killed, has SIGKILL pending or exiting, it "bypasses" the charge to the root memcg and returns -EINTR. While this is one approach which can be taken for these situations, it has several issues. * It unnecessarily lies about the reality. The number itself doesn't go over the limit but the actual usage does. 
memcg is either forced to or actively chooses to go over the limit because that is the right behavior under the circumstances, which is completely fine, but, if at all avoidable, it shouldn't be misrepresenting what's happening by sneaking the charges into the root memcg. * Despite trying, we already do over-charge. kmemcg can't deal with switching over to the root memcg by the point try_charge() returns -EINTR, so it open-codes over-charing. * It complicates the callers. Each try_charge() user has to handle the weird -EINTR exception. memcg_charge_kmem() does the manual over-charging. mem_cgroup_do_precharge() performs unnecessary uncharging of root memcg, which BTW is inconsistent with what memcg_charge_kmem() does but not broken as [un]charging are noops on root memcg. mem_cgroup_try_charge() needs to switch the returned cgroup to the root one. The reality is that in memcg there are cases where we are forced and/or willing to go over the limit. Each such case needs to be scrutinized and justified but there definitely are situations where that is the right thing to do. We alredy do this but with a superficial and inconsistent disguise which leads to unnecessary complications. This patch updates try_charge() so that it over-charges and returns 0 when deemed necessary. -EINTR return is removed along with all special case handling in the callers. While at it, remove the local variable @ret, which was initialized to zero and never changed, along with done: label which just returned the always zero @ret. Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 69 +++++++++++++++++---------------------------------------- 1 file changed, 20 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 327dcda..b952abe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2008,13 +2008,12 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - int ret = 0; if (mem_cgroup_is_root(memcg)) - goto done; + return 0; retry: if (consume_stock(memcg, nr_pages)) - goto done; + return 0; if (!do_swap_account || !page_counter_try_charge(&memcg->memsw, batch, &counter)) { @@ -2042,7 +2041,7 @@ retry: if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current) || current->flags & PF_EXITING)) - goto bypass; + goto force; if (unlikely(task_in_memcg_oom(current))) goto nomem; @@ -2088,10 +2087,10 @@ retry: goto retry; if (gfp_mask & __GFP_NOFAIL) - goto bypass; + goto force; if (fatal_signal_pending(current)) - goto bypass; + goto force; mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); @@ -2099,8 +2098,18 @@ retry: nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -bypass: - return -EINTR; +force: + /* + * The allocation either can't fail or will lead to more memory + * being freed very soon. Allow memory usage go over the limit + * temporarily by force charging it. 
+ */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + return 0; done_restock: css_get_many(&memcg->css, batch); @@ -2123,8 +2132,8 @@ done_restock: break; } } while ((memcg = parent_mem_cgroup(memcg))); -done: - return ret; + + return 0; } static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2216,28 +2225,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; ret = try_charge(memcg, gfp, nr_pages); - if (ret == -EINTR) { - /* - * try_charge() chose to bypass to root due to OOM kill or - * fatal signal. Since our only options are to either fail - * the allocation or charge it to this cgroup, do it as a - * temporary condition. But we can't fail. From a kmem/slab - * perspective, the cache has already been selected, by - * mem_cgroup_kmem_get_cache(), so it is too late to change - * our minds. - * - * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed - * during try_charge() above. Tasks that were already dying - * when the allocation triggers should have been already - * directed to the root cgroup in memcontrol.h - */ - page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); - ret = 0; - } else if (ret) + if (ret) page_counter_uncharge(&memcg->kmem, nr_pages); return ret; @@ -4438,22 +4426,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } - if (ret == -EINTR) { - cancel_charge(root_mem_cgroup, count); - return ret; - } /* Try charges one by one with reclaim */ while (count--) { ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); - /* - * In case of failure, any residual charges against - * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. However, cancel any charges that are - * bypassed to root right away or they'll be lost. - */ - if (ret == -EINTR) - cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -5358,11 +5334,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, ret = try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); - - if (ret == -EINTR) { - memcg = root_mem_cgroup; - ret = 0; - } out: *memcgp = memcg; return ret; -- cgit v1.1 From 61f9ec1d8e97131ce55159647fcdfeccc0f40647 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 5 Nov 2015 18:46:23 -0800 Subject: mm: fix docbook comment for get_vaddr_frames() get_vaddr_frames() has a comment that's *almost* a docbook comment; add the missing star so that the tools will find it properly. 
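For reference, the kernel-doc tooling only picks up comments that open with the two-star marker and follow the kernel-doc layout; a minimal example of the expected shape (the function itself is made up, not part of this patch):

  /**
   * add_two() - add two integers
   * @a: first addend
   * @b: second addend
   *
   * Return: the sum of @a and @b.
   */
  static int add_two(int a, int b)
  {
          return a + b;
  }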
Signed-off-by: Jonathan Corbet Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/frame_vector.c b/mm/frame_vector.c index cdabcb9..7cf2b71 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -7,7 +7,7 @@ #include #include -/* +/** * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map -- cgit v1.1 From 145949a1387ba7a4fd0df15181e09345ec7b0492 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 5 Nov 2015 18:46:26 -0800 Subject: mm/list_lru.c: replace nr_node_ids for loop with for_each_node() The functions used in the patch are in slowpath, which gets called whenever alloc_super is called during mounts. Though this should not make difference for the architectures with sequential numa node ids, for the powerpc which can potentially have sparse node ids (for e.g., 4 node system having numa ids, 0,1,16,17 is common), this patch saves some unnecessary allocations for non existing numa nodes. Even without that saving, perhaps patch makes code more readable. [vdavydov@parallels.com: take memcg_aware check outside for_each loop] Signed-off-by: Raghavendra K T Reviewed-by: Vladimir Davydov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Anton Blanchard Cc: Nishanth Aravamudan Cc: Greg Kurz Cc: Grant Likely Cc: Nikunj A Dadhania Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index e1da19f..2823747 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru) #ifdef CONFIG_MEMCG_KMEM static inline bool list_lru_memcg_aware(struct list_lru *lru) { + /* + * This needs node 0 to be always present, even + * in the systems supporting sparse numa ids. 
+ */ return !!lru->node[0].memcg_lrus; } @@ -377,16 +381,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { int i; - for (i = 0; i < nr_node_ids; i++) { - if (!memcg_aware) - lru->node[i].memcg_lrus = NULL; - else if (memcg_init_list_lru_node(&lru->node[i])) + if (!memcg_aware) + return 0; + + for_each_node(i) { + if (memcg_init_list_lru_node(&lru->node[i])) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; memcg_destroy_list_lru_node(&lru->node[i]); + } return -ENOMEM; } @@ -397,7 +405,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_destroy_list_lru_node(&lru->node[i]); } @@ -409,16 +417,20 @@ static int memcg_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return 0; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { if (memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; + memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); + } return -ENOMEM; } @@ -430,7 +442,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); } @@ -485,7 +497,7 @@ static void memcg_drain_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); } @@ -522,7 +534,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, if (!lru->node) goto out; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); -- cgit v1.1 From b0d61c7e56815b0b881c81f6779a65f4fdae4bc0 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:32 -0800 Subject: mm/msync: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index bb04d53..24e612f 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; -- cgit v1.1 From 1824cb753354e026ab898cd472bddd540b50b00b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:35 -0800 Subject: mm/nommu: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
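The conversion is purely mechanical; the snippet below checks that the two spellings agree, with a 4KB page size assumed and the macro redefined locally for the demo (essentially the definition linux/mm.h uses):

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)
  #define PAGE_MASK  (~(PAGE_SIZE - 1))

  #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

  int main(void)
  {
          unsigned long addr = 0x12345678;

          printf("open-coded: %#lx\n", addr & ~PAGE_MASK);
          printf("macro:      %#lx\n", offset_in_page(addr));
          return 0;
  }

Both lines print 0x678.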
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index ab14a20..1e0f168 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) goto erase_whole_vma; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) return -EINVAL; - if (end != vma->vm_end && end & ~PAGE_MASK) + if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { ret = split_vma(mm, vma, start, 1); @@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr, if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; if (flags & MREMAP_FIXED && new_addr != addr) -- cgit v1.1 From e7bbdd071314b52507e6c615e2cec90d46f82c57 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:38 -0800 Subject: mm/mincore: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index be25efd..14bb9fb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, /* This also avoids any overflows on PAGE_CACHE_ALIGN */ pages = len >> PAGE_SHIFT; - pages += (len & ~PAGE_MASK) != 0; + pages += (offset_in_page(len)) != 0; if (!access_ok(VERIFY_WRITE, vec, pages)) return -EFAULT; -- cgit v1.1 From 5d57b0146aa942b939bbd77e09130270dc9b97d2 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:40 -0800 Subject: mm/early_ioremap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/early_ioremap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 17ae14b..6d5717b 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; + offset = offset_in_page(phys_addr); phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr + 1) - phys_addr; @@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) return; - offset = virt_addr & ~PAGE_MASK; + offset = offset_in_page(virt_addr); nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; @@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) char *p; while (size) { - slop = src & ~PAGE_MASK; + slop = offset_in_page(src); clen = size; if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; -- cgit v1.1 From f09f1243ca2d5d297881bf2c2148d9ab35314314 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:43 -0800 Subject: mm/percpu: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index a63b4d8..8a943b9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); - PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); - PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); - PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); @@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; @@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { -- cgit v1.1 From ea53cde089e07cfd7996c2072f770ebb984ce8db Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:46 -0800 Subject: mm/util: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 68ff8a5..9af1c12 100644 --- a/mm/util.c +++ b/mm/util.c @@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) + if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -- cgit v1.1 From 8fd9e4883a2b08c52ec00f3c214b45d096fc697a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:49 -0800 Subject: mm/mlock: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 7e6ad9c..550228d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -560,7 +560,7 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) @@ -616,7 +616,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); @@ -645,7 +645,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; down_write(¤t->mm->mmap_sem); -- cgit v1.1 From 891c49abfb097bbd7024b4072dd1c8e1c995d3ec Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:51 -0800 Subject: mm/vmalloc: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af3a519..9db9ef5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -358,7 +358,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *first; BUG_ON(!size); - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), @@ -936,7 +936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) void *vaddr = NULL; unsigned int order; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* @@ -989,7 +989,7 @@ static void vb_free(const void *addr, unsigned long size) unsigned int order; struct vmap_block *vb; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); @@ -1902,7 +1902,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -1941,7 +1941,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -2392,7 +2392,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, bool purged = false; /* verify parameters and allocate data structures */ - BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; -- cgit v1.1 From de1741a1333ea37694dddf7c94aa4cf2d0e58912 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:54 -0800 Subject: mm/mmap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bd932c1..3ec19b6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; /* Do simple checking here so the lower-level routines won't have @@ -1473,7 +1473,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1989,7 +1989,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. 
*/ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; @@ -2025,7 +2025,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; addr = arch_rebalance_pgtables(addr, len); @@ -2535,7 +2535,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; struct vm_area_struct *vma, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; len = PAGE_ALIGN(len); @@ -2733,7 +2733,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (error & ~PAGE_MASK) + if (offset_in_page(error)) return error; error = mlock_future_check(mm, mm->def_flags, len); -- cgit v1.1 From f19cb115a25f3f25752fdc56340e7433462157ba Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:57 -0800 Subject: mm/mremap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 5a71cce..c25bc62 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -401,7 +401,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long charged = 0; unsigned long map_flags; - if (new_addr & ~PAGE_MASK) + if (offset_in_page(new_addr)) goto out; if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) @@ -435,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (ret & ~PAGE_MASK) + if (offset_in_page(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); - if (!(ret & ~PAGE_MASK)) + if (!(offset_in_page(ret))) goto out; out1: vm_unacct_memory(charged); @@ -484,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return ret; old_len = PAGE_ALIGN(old_len); @@ -566,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (new_addr & ~PAGE_MASK) { + if (offset_in_page(new_addr)) { ret = new_addr; goto out; } @@ -574,7 +574,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) { + if (offset_in_page(ret)) { vm_unacct_memory(charged); locked = 0; } -- cgit v1.1 From 35bd16a227534cb6ffc9b26a33061c2dcf91934b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:47:00 -0800 Subject: mm/memblock: make memblock_remove_range() static memblock_remove_range() is only used in the mm/memblock.c, so we can make it static. 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 1c7b647..d300f13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -int __init_memblock memblock_remove_range(struct memblock_type *type, +static int __init_memblock memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; -- cgit v1.1 From f2f81fb2b72b83b661b11da6f1b0bd3526706278 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:03 -0800 Subject: mm, migrate: count pages failing all retries in vmstat and tracepoint Migration tries up to 10 times to migrate pages that return -EAGAIN until it gives up. If some pages fail all retries, they are counted towards the number of failed pages that migrate_pages() returns. They should also be counted in the /proc/vmstat pgmigrate_fail and in the mm_migrate_pages tracepoint. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Naoya Horiguchi Cc: "Kirill A. Shutemov" Cc: "Aneesh Kumar K.V" Cc: Konstantin Khlebnikov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 842ecd7..94961f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1169,7 +1169,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, } } } - rc = nr_failed + retry; + nr_failed += retry; + rc = nr_failed; out: if (nr_succeeded) count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); -- cgit v1.1 From b171e4093017d4d6e411f5e97823e5e4a21266a2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:06 -0800 Subject: mm/page_alloc: remove unused parameter in init_currently_empty_zone() Commit a2f3aa025766 ("[PATCH] Fix sparsemem on Cell") fixed an oops experienced on the Cell architecture when init-time functions, early_*(), are called at runtime by introducing an 'enum memmap_context' parameter to memmap_init_zone() and init_currently_empty_zone(). This parameter is intended to be used to tell whether the call of these two functions is being made on behalf of a hotplug event, or happening at boot-time. However, init_currently_empty_zone() does not use this parameter at all, so remove it. 
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- mm/page_alloc.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0780d11..67d488a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages, - MEMMAP_HOTPLUG); + return init_currently_empty_zone(zone, start_pfn, num_pages); + return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 805bbad..c60605d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4900,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone) int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size, - enum memmap_context context) + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -5413,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, - size, MEMMAP_EARLY); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; -- cgit v1.1 From 600e19afc5f8a6c18ea49cee9511c5797db02391 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 5 Nov 2015 18:47:08 -0800 Subject: mm: use only per-device readahead limit Maximal readahead size is limited now by two values: 1) by global 2Mb constant (MAX_READAHEAD in max_sane_readahead()) 2) by configurable per-device value* (bdi->ra_pages) There are devices, which require custom readahead limit. For instance, for RAIDs it's calculated as number of devices multiplied by chunk size times 2. Readahead size can never be larger than bdi->ra_pages * 2 value (POSIX_FADV_SEQUNTIAL doubles readahead size). If so, why do we need two limits? I suggest to completely remove this max_sane_readahead() stuff and use per-device readahead limit everywhere. Also, using right readahead size for RAID disks can significantly increase i/o performance: before: dd if=/dev/md2 of=/dev/null bs=100M count=100 100+0 records in 100+0 records out 10485760000 bytes (10 GB) copied, 12.9741 s, 808 MB/s after: $ dd if=/dev/md2 of=/dev/null bs=100M count=100 100+0 records in 100+0 records out 10485760000 bytes (10 GB) copied, 8.91317 s, 1.2 GB/s (It's an 8-disks RAID5 storage). This patch doesn't change sys_readahead and madvise(MADV_WILLNEED) behavior introduced by 6d2be915e589b58 ("mm/readahead.c: fix readahead failure for memoryless NUMA nodes and limit readahead pages"). 
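With max_sane_readahead() gone, the mmap read-around window is derived from the per-device ra_pages alone. A small userspace model of that arithmetic (the read_around() helper and the struct are made up for illustration), assuming a device with ra_pages = 512, i.e. a 2 MiB window with 4 KiB pages:

	#include <stdio.h>

	struct window { long start, size, async_size; };

	static struct window read_around(long offset, long ra_pages)
	{
		struct window w;

		w.start = offset - ra_pages / 2;	/* centre the window on the faulting page */
		if (w.start < 0)
			w.start = 0;
		w.size = ra_pages;			/* read the full per-device window */
		w.async_size = ra_pages / 4;		/* start the next batch early */
		return w;
	}

	int main(void)
	{
		struct window w = read_around(1000, 512);

		/* prints: start=744 size=512 async=128 */
		printf("start=%ld size=%ld async=%ld\n", w.start, w.size, w.async_size);
		return 0;
	}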
Signed-off-by: Roman Gushchin Cc: Raghavendra K T Cc: Jan Kara Cc: Wu Fengguang Cc: David Rientjes Cc: onstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 +++----- mm/readahead.c | 14 ++------------ 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 327910c..1fe962b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1807,7 +1807,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct file *file, pgoff_t offset) { - unsigned long ra_pages; struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ @@ -1836,10 +1835,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, /* * mmap read-around */ - ra_pages = max_sane_readahead(ra->ra_pages); - ra->start = max_t(long, 0, offset - ra_pages / 2); - ra->size = ra_pages; - ra->async_size = ra_pages / 4; + ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); } diff --git a/mm/readahead.c b/mm/readahead.c index 24682f6..998ad59 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = max_sane_readahead(nr_to_read); + nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); while (nr_to_read) { int err; @@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } -#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) -/* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a - * sensible upper limit. - */ -unsigned long max_sane_readahead(unsigned long nr) -{ - return min(nr, MAX_READAHEAD); -} - /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large @@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = max_sane_readahead(ra->ra_pages); + unsigned long max = ra->ra_pages; pgoff_t prev_offset; /* -- cgit v1.1 From 5d317b2b6536592a9b51fe65faed43d65ca9158e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:14 -0800 Subject: mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status Currently there's no easy way to get per-process usage of hugetlb pages, which is inconvenient because userspace applications which use hugetlb typically want to control their processes on the basis of how much memory (including hugetlb) they use. So this patch simply provides easy access to the info via /proc/PID/status. 
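After this change the per-process hugetlb footprint appears as a "HugetlbPages:" line in /proc/PID/status, reported in kB like the other memory fields. A minimal userspace sketch of reading it, assuming the field name stays exactly "HugetlbPages":

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/self/status", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* e.g. "HugetlbPages:       2048 kB" */
			if (!strncmp(line, "HugetlbPages:", 13))
				fputs(line, stdout);
		}
		fclose(f);
		return 0;
	}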
Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ mm/rmap.c | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9cc7734..abfbe8c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2790,6 +2790,12 @@ void hugetlb_show_meminfo(void) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "HugetlbPages:\t%8lu kB\n", + atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { @@ -3025,6 +3031,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, get_page(ptepage); page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); + hugetlb_count_add(pages_per_huge_page(h), dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -3105,6 +3112,7 @@ again: if (huge_pte_dirty(pte)) set_page_dirty(page); + hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { @@ -3509,6 +3517,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); + hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); diff --git a/mm/rmap.c b/mm/rmap.c index f5b5c1f..d40e7ae 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1352,7 +1352,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (!PageHuge(page)) { + if (PageHuge(page)) { + hugetlb_count_sub(1 << compound_order(page), mm); + } else { if (PageAnon(page)) dec_mm_counter(mm, MM_ANONPAGES); else -- cgit v1.1 From 29d06bbb41595f82db309a5516426ef8bd0f27b7 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:17 -0800 Subject: mm/vmscan: make inactive_anon_is_low_global return directly Delete unnecessary if to let inactive_anon_is_low_global return directly. No functional changes. Signed-off-by: Yaowei Bai Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f63a93..773fc8d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1866,10 +1866,7 @@ static int inactive_anon_is_low_global(struct zone *zone) active = zone_page_state(zone, NR_ACTIVE_ANON); inactive = zone_page_state(zone, NR_INACTIVE_ANON); - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; + return inactive * zone->inactive_ratio < active; } /** -- cgit v1.1 From 21c527a3cba07f9a9ce17b3a445f110a847793e2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:20 -0800 Subject: mm/compaction.c: add an is_via_compact_memory() helper Introduce is_via_compact_memory() helper indicating compacting via /proc/sys/vm/compact_memory to improve readability. To catch this situation in __compaction_suitable, use order as parameter directly instead of using struct compact_control. This patch has no functional changes. 
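The order == -1 convention that is_via_compact_memory() wraps comes from the sysctl path: writing to /proc/sys/vm/compact_memory compacts every zone with cc->order set to -1. A minimal sketch of triggering that from userspace (needs root; error handling trimmed):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

		if (fd < 0)
			return 1;
		/* any write triggers a full, order == -1 compaction pass */
		write(fd, "1", 1);
		close(fd);
		return 0;
	}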
Signed-off-by: Yaowei Bai Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index c5c627a..a8e6593 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1197,6 +1197,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } +/* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { @@ -1223,11 +1232,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_COMPLETE; } - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ @@ -1290,11 +1295,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, int fragindex; unsigned long watermark; - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (order == -1) + if (is_via_compact_memory(order)) return COMPACT_CONTINUE; watermark = low_wmark_pages(zone); @@ -1658,10 +1659,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) * this makes sure we compact the whole zone regardless of * cached scanner positions. */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) __reset_isolation_suitable(zone); - if (cc->order == -1 || !compaction_deferred(zone, cc->order)) + if (is_via_compact_memory(cc->order) || + !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); if (cc->order > 0) { -- cgit v1.1 From aa750fd71c242dba02ee2034e15fbd7d0cdb2461 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Thu, 5 Nov 2015 18:47:23 -0800 Subject: mm/filemap.c: make global sync not clear error status of individual inodes filemap_fdatawait() is a function to wait for on-going writeback to complete but also consume and clear error status of the mapping set during writeback. The latter functionality is critical for applications to detect writeback error with system calls like fsync(2)/fdatasync(2). However filemap_fdatawait() is also used by sync(2) or FIFREEZE ioctl, which don't check error status of individual mappings. As a result, fsync() may not be able to detect writeback error if events happen in the following order: Application System admin ---------------------------------------------------------- write data on page cache Run sync command writeback completes with error filemap_fdatawait() clears error fsync returns success (but the data is not on disk) This patch adds filemap_fdatawait_keep_errors() for call sites where writeback error is not handled so that they don't clear error status. 
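The intended split is: callers that report errors to userspace (fsync/fdatasync paths) keep using filemap_fdatawait(), which consumes the error bits, while global flushers that ignore the result switch to the new helper. A rough kernel-context sketch of such a caller after the conversion (flush_one_mapping() is a hypothetical name, not a function added by this patch):

	static void flush_one_mapping(struct address_space *mapping)
	{
		/*
		 * filemap_fdatawait() would clear AS_EIO/AS_ENOSPC here, and a
		 * later fsync() on the file could then falsely report success;
		 * the _keep_errors variant waits without consuming the error bits.
		 */
		filemap_fdatawait_keep_errors(mapping);
	}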
Signed-off-by: Jun'ichi Nomura Acked-by: Andi Kleen Reviewed-by: Tejun Heo Cc: Fengguang Wu Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1fe962b..884766d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait_range - wait for writeback to complete - * @mapping: address space structure to wait for - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. - */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, - loff_t end_byte) +static int __filemap_fdatawait_range(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret2, ret = 0; + int ret = 0; if (end_byte < start_byte) goto out; @@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, cond_resched(); } out: + return ret; +} + +/** + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. Check error status of + * the address space and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) +{ + int ret, ret2; + + ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); ret2 = filemap_check_errors(mapping); if (!ret) ret = ret2; @@ -383,11 +397,38 @@ out: EXPORT_SYMBOL(filemap_fdatawait_range); /** + * filemap_fdatawait_keep_errors - wait for writeback without clearing errors + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. Unlike filemap_fdatawait(), this function + * does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), + * fsfreeze(8) + */ +void filemap_fdatawait_keep_errors(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return; + + __filemap_fdatawait_range(mapping, 0, i_size - 1); +} + +/** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space - * and wait for all of them. + * and wait for all of them. Check error status of the address space + * and return it. 
+ * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. */ int filemap_fdatawait(struct address_space *mapping) { -- cgit v1.1 From a5f65109026b35b654b94fdcd26a971185a53adc Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:26 -0800 Subject: mm: hwpoison: ratelimit messages from unpoison_memory() Currently kernel prints out results of every single unpoison event, which i= s not necessary because unpoison is purely a testing feature and testers can = get little or no information from lots of lines of unpoison log storm. So this patch ratelimits printk in unpoison_memory(). This patch introduces a file local ratelimit_state, which adds 64 bytes to memory-failure.o. If we apply pr_info_ratelimited() for 8 callsite below, 2= 56 bytes is added, so it's a win. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9588269..16a0ec3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void) } core_initcall(memory_failure_init); +#define unpoison_pr_info(fmt, pfn, rs) \ +({ \ + if (__ratelimit(rs)) \ + pr_info(fmt, pfn); \ +}) + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn) struct page *p; int freeit = 0; unsigned int nr_pages; + static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (!pfn_valid(pfn)) return -ENXIO; @@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_count(page) > 1) { - pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); + unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_mapped(page)) { - pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); + unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_mapping(page)) { - pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", - pfn); + unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn) * In such case, we yield to memory_failure() and make unpoison fail. */ if (!PageHuge(page) && PageTransHuge(page)) { - pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn) * to the end. 
*/ if (PageHuge(page)) { - pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", + pfn, &unpoison_rs); return 0; } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", + pfn, &unpoison_rs); num_poisoned_pages_sub(nr_pages); freeit = 1; if (PageHuge(page)) -- cgit v1.1 From 3608de0787e51d3d826656e105524b48ade7b16f Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 5 Nov 2015 18:47:29 -0800 Subject: mm/memcontrol.c: fix order calculation in try_charge() Since commit 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()"), the order to pass to mem_cgroup_oom() is calculated by passing the number of pages to get_order() instead of the expected size in bytes. AFAICT, it only affects the value displayed in the oom warning message. This patch fix this. Michal said: : We haven't noticed that just because the OOM is enabled only for page : faults of order-0 (single page) and get_order work just fine. Thanks for : noticing this. If we ever start triggering OOM on different orders this : would be broken. Signed-off-by: Jerome Marchand Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b952abe..a1c05ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2094,7 +2094,8 @@ retry: mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -- cgit v1.1 From 42e2e45777a8c2ec32b6a3c3d81a7d454f6afb6d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:36 -0800 Subject: mm/vmscan: make inactive_anon/file_is_low return bool Make inactive_anon/file_is_low return bool due to these particular functions only using either one or zero as their return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 773fc8d..38d0481 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1859,7 +1859,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } #ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static bool inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1876,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct lruvec *lruvec) +static bool inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation * is pointless. 
*/ if (!total_swap_pages) - return 0; + return false; if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(lruvec); @@ -1891,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec) return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct lruvec *lruvec) +static inline bool inactive_anon_is_low(struct lruvec *lruvec) { - return 0; + return false; } #endif @@ -1911,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_file_is_low(struct lruvec *lruvec) { unsigned long inactive; unsigned long active; @@ -1922,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec) return active > inactive; } -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { if (is_file_lru(lru)) return inactive_file_is_low(lruvec); -- cgit v1.1 From 426fb5e72d92b868912e47a1e3ca2df6eabc3872 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:44 -0800 Subject: mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL It was confirmed that a local unprivileged user can consume all memory reserves and hang up that system using time lag between the OOM killer sets TIF_MEMDIE on an OOM victim and sends SIGKILL to that victim, for printk() inside for_each_process() loop at oom_kill_process() can consume many seconds when there are many thread groups sharing the same memory. Before starting oom-depleter process: Node 0 DMA: 3*4kB (UM) 6*8kB (U) 4*16kB (UEM) 0*32kB 0*64kB 1*128kB (M) 2*256kB (EM) 2*512kB (UE) 2*1024kB (EM) 1*2048kB (E) 1*4096kB (M) = 9980kB Node 0 DMA32: 31*4kB (UEM) 27*8kB (UE) 32*16kB (UE) 13*32kB (UE) 14*64kB (UM) 7*128kB (UM) 8*256kB (UM) 8*512kB (UM) 3*1024kB (U) 4*2048kB (UM) 362*4096kB (UM) = 1503220kB As of invoking the OOM killer: Node 0 DMA: 11*4kB (UE) 8*8kB (UEM) 6*16kB (UE) 2*32kB (EM) 0*64kB 1*128kB (U) 3*256kB (UEM) 2*512kB (UE) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7308kB Node 0 DMA32: 1049*4kB (UEM) 507*8kB (UE) 151*16kB (UE) 53*32kB (UEM) 83*64kB (UEM) 52*128kB (EM) 25*256kB (UEM) 11*512kB (M) 6*1024kB (UM) 1*2048kB (M) 0*4096kB = 44556kB Between the thread group leader got TIF_MEMDIE and receives SIGKILL: Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB The oom-depleter's thread group leader which got TIF_MEMDIE started memset() in user space after the OOM killer set TIF_MEMDIE, and it was free to abuse ALLOC_NO_WATERMARKS by TIF_MEMDIE for memset() in user space until SIGKILL is delivered. If SIGKILL is delivered before TIF_MEMDIE is set, the oom-depleter can terminate without touching memory reserves. Although the possibility of hitting this time lag is very small for 3.19 and earlier kernels because TIF_MEMDIE is set immediately before sending SIGKILL, preemption or long interrupts (an extreme example is SysRq-t) can step between and allow memory allocations which are not needed for terminating the OOM victim. 
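In code terms the fix is only an ordering change inside oom_kill_process(); a condensed sketch of the before/after sequence (surrounding context trimmed):

	/* before: TIF_MEMDIE first, SIGKILL later - the window being exploited */
	mark_oom_victim(victim);
	/* ... potentially long printk loop over every process ... */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);

	/* after: kill first, only then hand out the memory-reserve privilege */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
	mark_oom_victim(victim);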
Fixes: 83363b917a29 ("oom: make sure that TIF_MEMDIE is set under task_lock") Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ecc0bc..8ad35aa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -554,6 +554,12 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; + /* + * We should send SIGKILL before setting TIF_MEMDIE in order to prevent + * the OOM victim from depleting the memory reserves from the user + * space under its control. + */ + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -585,7 +591,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, } rcu_read_unlock(); - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); put_task_struct(victim); } #undef K -- cgit v1.1 From 880b768937e90c433c0c8254a22b1eb63df005a4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:51 -0800 Subject: mm/oom_kill.c: fix potentially killing unrelated process At the for_each_process() loop in oom_kill_process(), we are comparing address of OOM victim's mm without holding a reference to that mm. If there are a lot of processes to compare or a lot of "Kill process %d (%s) sharing same memory" messages to print, for_each_process() loop could take very long time. It is possible that meanwhile the OOM victim exits and releases its mm, and then mm is allocated with the same address and assigned to some unrelated process. When we hit such race, the unrelated process will be killed by error. To make sure that the OOM victim's mm does not go away until for_each_process() loop finishes, get a reference on the OOM victim's mm before calling task_unlock(victim). [oleg@redhat.com: several fixes] Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ad35aa..5ba743a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -552,8 +552,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, victim = p; } - /* mm cannot safely be dereferenced after task_unlock(victim) */ + /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; + atomic_inc(&mm->mm_count); /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user @@ -591,6 +592,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, } rcu_read_unlock(); + mmdrop(mm); put_task_struct(victim); } #undef K -- cgit v1.1 From 840807a8f40bb25a8df5b6412bba6bc156643be5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:54 -0800 Subject: mm/oom_kill.c: suppress unnecessary "sharing same memory" message oom_kill_process() sends SIGKILL to other thread groups sharing victim's mm. But printing "Kill process %d (%s) sharing same memory\n" lines makes no sense if they already have pending SIGKILL. 
This patch reduces the "Kill process" lines by printing that line with info level only if SIGKILL is not pending. Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5ba743a..c170d9f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -583,9 +583,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, !(p->flags & PF_KTHREAD)) { if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; + if (fatal_signal_pending(p)) + continue; task_lock(p); /* Protect ->comm from prctl() */ - pr_err("Kill process %d (%s) sharing same memory\n", + pr_info("Kill process %d (%s) sharing same memory\n", task_pid_nr(p), p->comm); task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); -- cgit v1.1 From fa6c7b46aaa0cc00846703e8c0ec1e1636ff25ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:56 -0800 Subject: mm, compaction: export tracepoints status strings to userspace Some compaction tracepoints convert the integer return values to strings using the compaction_status_string array. This works for in-kernel printing, but not userspace trace printing of raw captured trace such as via trace-cmd report. This patch converts the private array to appropriate tracepoint macros that result in proper userspace support. trace-cmd output before: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret= after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret=partial Signed-off-by: Vlastimil Babka Reviewed-by: Steven Rostedt Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index a8e6593..a5849c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#ifdef CONFIG_TRACEPOINTS -static const char *const compaction_status_string[] = { - "deferred", - "skipped", - "continue", - "partial", - "complete", - "no_suitable_page", - "not_suitable_zone", -}; -#endif #define CREATE_TRACE_POINTS #include -- cgit v1.1 From 2d1e10412c2388ff9b6afc60536eaa195a419289 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:48:02 -0800 Subject: mm, compaction: distinguish contended status in tracepoints Compaction returns prematurely with COMPACT_PARTIAL when contended or has fatal signal pending. This is ok for the callers, but might be misleading in the traces, as the usual reason to return COMPACT_PARTIAL is that we think the allocation should succeed. After this patch we distinguish the premature ending condition in the mm_compaction_finished and mm_compaction_end tracepoints. The contended status covers the following reasons: - lock contention or need_resched() detected in async compaction - fatal signal pending - too many pages isolated in the zone (only for async compaction) Further distinguishing the exact reason seems unnecessary for now. 
Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index a5849c4..de3e1e7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1202,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) - return COMPACT_PARTIAL; + return COMPACT_CONTENDED; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { @@ -1393,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: - ret = COMPACT_PARTIAL; + ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; @@ -1424,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * and we want compact_finished() to detect it */ if (err == -ENOMEM && !compact_scanners_met(cc)) { - ret = COMPACT_PARTIAL; + ret = COMPACT_CONTENDED; goto out; } } @@ -1477,6 +1477,9 @@ out: trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); + if (ret == COMPACT_CONTENDED) + ret = COMPACT_PARTIAL; + return ret; } -- cgit v1.1 From da39da3a54fed88e29024f2f1f6cd7357cd03a44 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:48:05 -0800 Subject: mm, oom: remove task_lock protecting comm printing The oom killer takes task_lock() in a couple of places solely to protect printing the task's comm. A process's comm, including current's comm, may change due to /proc/pid/comm or PR_SET_NAME. The comm will always be NULL-terminated, so the worst race scenario would only be during update. We can tolerate a comm being printed that is in the middle of an update to avoid taking the lock. Other locations in the kernel have already dropped task_lock() when printing comm, so this is consistent. 
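The races being tolerated here come from comm updates such as prctl(PR_SET_NAME) or writes to /proc/pid/comm; the buffer stays NUL-terminated throughout, so an unlocked reader can at worst observe a half-updated name. A minimal userspace example of one writer the OOM killer might race with:

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		char name[16];	/* comm is TASK_COMM_LEN (16) bytes, including the NUL */

		prctl(PR_SET_NAME, "oom-test-worker");
		prctl(PR_GET_NAME, name);
		printf("comm is now: %s\n", name);
		return 0;
	}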
Signed-off-by: David Rientjes Suggested-by: Oleg Nesterov Cc: Michal Hocko Cc: Vladimir Davydov Cc: Sergey Senozhatsky Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c170d9f..58f3d27 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -377,13 +377,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " "oom_score_adj=%hd\n", current->comm, oc->gfp_mask, oc->order, current->signal->oom_score_adj); - cpuset_print_task_mems_allowed(current); - task_unlock(current); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) mem_cgroup_print_oom_info(memcg, p); @@ -509,10 +507,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, if (__ratelimit(&oom_rs)) dump_header(oc, p, memcg); - task_lock(p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); - task_unlock(p); /* * If any of p's children has a different mm and is eligible for kill, @@ -586,10 +582,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, if (fatal_signal_pending(p)) continue; - task_lock(p); /* Protect ->comm from prctl() */ pr_info("Kill process %d (%s) sharing same memory\n", task_pid_nr(p), p->comm); - task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v1.1 From d031a157915e0508ffa1ab9f1bbf977257529cb4 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 5 Nov 2015 18:48:08 -0800 Subject: mm/vmscan.c: fix types of some locals In zone_reclaimable_pages(), `nr' is returned by a function which is declared as returning "unsigned long", so declare it such. Negative values are meaningless here. In zone_pagecache_reclaimable() we should also declare `delta' and `nr_pagecache_reclaimable' as being unsigned longs because they're used to store the values returned by zone_page_state() and zone_unmapped_file_pages() which also happen to return unsigned integers. 
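The underlying concern is plain truncation and sign problems when a value that is "unsigned long" in the VM counters is squeezed into an int. A tiny illustration of what can go wrong on an LP64 machine, using a deliberately exaggerated page count:

	#include <stdio.h>

	int main(void)
	{
		unsigned long pages = 3UL * 1024 * 1024 * 1024;	/* ~3 billion pages */
		int nr = pages;		/* the old declaration in zone_reclaimable_pages() */

		/* prints something like: as unsigned long: 3221225472, as int: -1073741824 */
		printf("as unsigned long: %lu, as int: %d\n", pages, nr);
		return 0;
	}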
[akpm@linux-foundation.org: make zone_pagecache_reclaimable() return ulong rather than long] Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 38d0481..fdd89978 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc) static unsigned long zone_reclaimable_pages(struct zone *zone) { - int nr; + unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); @@ -3693,10 +3693,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long zone_pagecache_reclaimable(struct zone *zone) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered -- cgit v1.1 From 9fd745d450e7e2b0d2f1b386b886e7d568b64404 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:48:11 -0800 Subject: mm: fix overflow in find_zone_movable_pfns_for_nodes() If the user set "movablecore=xx" to a large number, corepages will overflow. Fix the problem. Signed-off-by: Xishi Qiu Reviewed-by: Yasuaki Ishimatsu Acked-by: Tang Chen Acked-by: David Rientjes Cc: Mel Gorman Cc: Tang Chen Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c60605d..4aed338 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5666,6 +5666,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ required_movablecore = roundup(required_movablecore, MAX_ORDER_NR_PAGES); + required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; required_kernelcore = max(required_kernelcore, corepages); -- cgit v1.1 From 87e8827b37c0c391d9915d0dc6a06c9b5f9cac65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:14 -0800 Subject: mm: fix the racy mm->locked_vm change in "mm->locked_vm += grow" and vm_stat_account() in acct_stack_growth() are not safe; multiple threads using the same ->mm can do this at the same time trying to expans different vma's under down_read(mmap_sem). This means that one of the "locked_vm += grow" changes can be lost and we can miss munlock_vma_pages_all() later. Move this code into the caller(s) under mm->page_table_lock. All other updates to ->locked_vm hold mmap_sem for writing. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Cc: Andrey Konovalov Cc: Davidlohr Bueso Cc: "Kirill A. 
Shutemov" Cc: Sasha Levin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3ec19b6..d1ac224 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2138,10 +2138,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; - /* Ok, everything looks good - let it rip */ - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } @@ -2202,6 +2198,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * against concurrent vma expansions. */ spin_lock(&vma->vm_mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2273,6 +2273,10 @@ int expand_downwards(struct vm_area_struct *vma, * against concurrent vma expansions. */ spin_lock(&vma->vm_mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; -- cgit v1.1 From 09357814778a38a5ab2d031cba6c9e9fe090c849 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:17 -0800 Subject: mm: add the "struct mm_struct *mm" local into Cosmetic, but expand_upwards() and expand_downwards() overuse vma->vm_mm, a local variable makes sense imho. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Cc: Andrey Konovalov Cc: Davidlohr Bueso Cc: "Kirill A. Shutemov" Cc: Sasha Levin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d1ac224..3204a7e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2148,6 +2148,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; if (!(vma->vm_flags & VM_GROWSUP)) @@ -2197,10 +2198,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. 
*/ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; @@ -2208,8 +2209,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - vma->vm_mm->highest_vm_end = address; - spin_unlock(&vma->vm_mm->page_table_lock); + mm->highest_vm_end = address; + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -2217,7 +2218,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2228,6 +2229,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; /* @@ -2272,17 +2274,17 @@ int expand_downwards(struct vm_area_struct *vma, * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. */ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -2290,7 +2292,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } -- cgit v1.1 From 0c1b2d783cf3432490bf1e532c742fffeadc0bf3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:20 -0800 Subject: mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process() The fatal_signal_pending() was added to suppress unnecessary "sharing same memory" message, but it can't 100% help anyway because it can be false-negative; SIGKILL can be already dequeued. And worse, it can be false-positive due to exec or coredump. exec is mostly fine, but coredump is not. It is possible that the group leader has the pending SIGKILL because its sub-thread originated the coredump, in this case we must not skip this process. We could probably add the additional ->group_exit_task check but this patch just removes the wrong check along with pr_info(). 
Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 58f3d27..c837d06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -579,11 +579,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, !(p->flags & PF_KTHREAD)) { if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; - if (fatal_signal_pending(p)) - continue; - pr_info("Kill process %d (%s) sharing same memory\n", - task_pid_nr(p), p->comm); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v1.1 From c319025a6c79e532d862e3a0b9506ba316a4d13a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:23 -0800 Subject: mm/oom_kill: cleanup the "kill sharing same memory" loop Purely cosmetic, but the complex "if" condition looks annoying to me. Especially because it is not consistent with OOM_SCORE_ADJ_MIN check which adds another if/continue. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Tetsuo Handa Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c837d06..2b6e880 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -574,14 +574,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * pending fatal signal. */ rcu_read_lock(); - for_each_process(p) - if (p->mm == mm && !same_thread_group(p, victim) && - !(p->flags & PF_KTHREAD)) { - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - continue; + for_each_process(p) { + if (p->mm != mm) + continue; + if (same_thread_group(p, victim)) + continue; + if (unlikely(p->flags & PF_KTHREAD)) + continue; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); - } + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + } rcu_read_unlock(); mmdrop(mm); -- cgit v1.1 From 4d7b3394f76ed72cfdec23ca5571dbab6ec41793 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:26 -0800 Subject: mm/oom_kill: fix the wrong task->mm == mm checks in oom_kill_process() Both "child->mm == mm" and "p->mm != mm" checks in oom_kill_process() are wrong. task->mm can be NULL if the task is the exited group leader. This means in particular that "kill sharing same memory" loop can miss a process with a zombie leader which uses the same ->mm. Note: the process_has_mm(child, p->mm) check is still not 100% correct, p->mm can be NULL too. This is minor, but probably deserves a fix or a comment anyway. [akpm@linux-foundation.org: document process_shares_mm() a bit] Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2b6e880..e477828 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -474,6 +474,24 @@ void oom_killer_enable(void) oom_killer_disabled = false; } +/* + * task->mm can be NULL if the task is the exited group leader. 
So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -521,7 +539,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; - if (child->mm == p->mm) + if (process_shares_mm(child, p->mm)) continue; /* * oom_badness() returns 0 if the thread is unkillable @@ -575,7 +593,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ rcu_read_lock(); for_each_process(p) { - if (p->mm != mm) + if (!process_shares_mm(p, mm)) continue; if (same_thread_group(p, victim)) continue; -- cgit v1.1 From e6ee219fdd69c87ceaeb421bcd753a63937f8f31 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:32 -0800 Subject: mm/mmap.c: remove redundant statement "error = -ENOMEM" It is still a little better to remove it, although it should be skipped by "-O2". Signed-off-by: Chen Gang =0A= Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3204a7e..28d1b35 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1562,7 +1562,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Clear old maps */ - error = -ENOMEM; while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) -- cgit v1.1 From 1e3ee14b9355a688ffe24725fa746ab120c42881 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:35 -0800 Subject: mm/mmap.c: do not initialize retval in mmap_pgoff() When fget() fails we can return -EBADF directly. Signed-off-by: Chen Gang Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 28d1b35..7e69f30 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, pgoff) { struct file *file = NULL; - unsigned long retval = -EBADF; + unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) - goto out; + return -EBADF; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); retval = -EINVAL; @@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, out_fput: if (file) fput(file); -out: return retval; } -- cgit v1.1 From c9427bc043da23de03d142c3c87ce4a57297c471 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 5 Nov 2015 18:48:38 -0800 Subject: mm/nommu.c: drop unlikely inside BUG_ON() (1) For !CONFIG_BUG cases, the bug call is a no-op, so we couldn't care less and the change is ok. (2) ppc and mips, which HAVE_ARCH_BUG_ON, do not rely on branch predictions as it seems to be pointless[1] and thus callers should not be trying to push an optimization in the first place. 
(3) For CONFIG_BUG and !HAVE_ARCH_BUG_ON cases, BUG_ON() contains an unlikely compiler flag already. Hence, we can drop unlikely behind BUG_ON(). [1] http://lkml.iu.edu/hypermail/linux/kernel/1101.3/02289.html Signed-off-by: Geliang Tang Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 1e0f168..92be862 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(last->vm_end <= last->vm_start)); - BUG_ON(unlikely(last->vm_top < last->vm_end)); + BUG_ON(last->vm_end <= last->vm_start); + BUG_ON(last->vm_top < last->vm_end); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(region->vm_end <= region->vm_start)); - BUG_ON(unlikely(region->vm_top < region->vm_end)); - BUG_ON(unlikely(region->vm_start < last->vm_top)); + BUG_ON(region->vm_end <= region->vm_start); + BUG_ON(region->vm_top < region->vm_end); + BUG_ON(region->vm_start < last->vm_top); lastp = p; } -- cgit v1.1 From 27f28b972e12a4080e5f5e4eb36b8224705652d4 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:41 -0800 Subject: mm/mmap.c: change __install_special_mapping() args order Make __install_special_mapping() args order match the caller, so the caller can pass their register args directly to callee with no touch. For most of architectures, args (at least the first 5th args) are in registers, so this change will have effect on most of architectures. For -O2, __install_special_mapping() may be inlined under most of architectures, but for -Os, it should not. So this change can get a little better performance for -Os, at least. 
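The win from matching the argument order is easiest to see with a thin wrapper that forwards its arguments: when wrapper and helper take them in the same order, the values are already sitting in the argument registers (rdi/rsi/rdx/rcx/r8/r9 on x86-64) and the compiler does not have to shuffle them before the call. A small standalone illustration (all names are made up); compile with "gcc -Os -S" and compare the two wrappers:

	__attribute__((noinline))
	static long helper(long a, long b, long c, long d, long e, long f)
	{
		return a + b + c + d + e + f;
	}

	long wrapper_same_order(long a, long b, long c, long d, long e, long f)
	{
		/* arguments can be passed through untouched, often as a tail call */
		return helper(a, b, c, d, e, f);
	}

	long wrapper_swapped(long a, long b, long c, long d, long e, long f)
	{
		/* the last two arguments must be moved between registers first */
		return helper(a, b, c, d, f, e);
	}

When the helper is not inlined, the swapped variant typically needs extra register moves before the call, which is the effect the reordering of __install_special_mapping() avoids.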
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 7e69f30..220effd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3052,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma, static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_operations_struct *ops, - void *priv) + unsigned long vm_flags, void *priv, + const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; @@ -3102,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping( unsigned long addr, unsigned long len, unsigned long vm_flags, const struct vm_special_mapping *spec) { - return __install_special_mapping(mm, addr, len, vm_flags, - &special_mapping_vmops, (void *)spec); + return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, + &special_mapping_vmops); } int install_special_mapping(struct mm_struct *mm, @@ -3111,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm, unsigned long vm_flags, struct page **pages) { struct vm_area_struct *vma = __install_special_mapping( - mm, addr, len, vm_flags, &legacy_special_mapping_vmops, - (void *)pages); + mm, addr, len, vm_flags, (void *)pages, + &legacy_special_mapping_vmops); return PTR_ERR_OR_ZERO(vma); } -- cgit v1.1 From c2d42c16ad83006a706d83e51a7268db04af733a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:48:43 -0800 Subject: mm/vmstat.c: uninline node_page_state() With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of stack. Uninlining node_page_state() reduces this to 440 bytes. The stack consumption issue is fixed by newer gcc (4.8.4) however with that compiler this patch reduces the node.o text size from 7314 bytes to 4578. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index fbf1448..ffcb4f5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) else __inc_zone_state(z, NUMA_OTHER); } + +/* + * Determine the per node value of a stat item. 
+ */ +unsigned long node_page_state(int node, enum zone_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + + return +#ifdef CONFIG_ZONE_DMA + zone_page_state(&zones[ZONE_DMA], item) + +#endif +#ifdef CONFIG_ZONE_DMA32 + zone_page_state(&zones[ZONE_DMA32], item) + +#endif +#ifdef CONFIG_HIGHMEM + zone_page_state(&zones[ZONE_HIGHMEM], item) + +#endif + zone_page_state(&zones[ZONE_NORMAL], item) + + zone_page_state(&zones[ZONE_MOVABLE], item); +} + #endif #ifdef CONFIG_COMPACTION -- cgit v1.1 From a1c34a3bf00af2cede839879502e12dc68491ad5 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 5 Nov 2015 18:48:46 -0800 Subject: mm: Don't offset memmap for flatmem Srinivas Kandagatla reported bad page messages when trying to remove the bottom 2MB on an ARM based IFC6410 board BUG: Bad page state in process swapper pfn:fffa8 page:ef7fb500 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x96640253(locked|error|dirty|active|arch_1|reclaim|mlocked) page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: flags: 0x200041(locked|active|mlocked) Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 3.19.0-rc3-00007-g412f9ba-dirty #816 Hardware name: Qualcomm (Flattened Device Tree) unwind_backtrace show_stack dump_stack bad_page free_pages_prepare free_hot_cold_page __free_pages free_highmem_page mem_init start_kernel Disabling lock debugging due to kernel taint Removing the lower 2MB made the start of the lowmem zone to no longer be page block aligned. IFC6410 uses CONFIG_FLATMEM where alloc_node_mem_map allocates memory for the mem_map. alloc_node_mem_map will offset for unaligned nodes with the assumption the pfn/page translation functions will account for the offset. The functions for CONFIG_FLATMEM do not offset however, resulting in overrunning the memmap array. Just use the allocated memmap without any offset when running with CONFIG_FLATMEM to avoid the overrun. Signed-off-by: Laura Abbott Signed-off-by: Laura Abbott Reported-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Acked-by: Vlastimil Babka Tested-by: Bjorn Andersson Cc: Santosh Shilimkar Cc: Russell King Cc: Kevin Hilman Cc: Arnd Bergman Cc: Stephen Boyd Cc: Andy Gross Cc: Mel Gorman Cc: Steven Rostedt Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aed338..86f7d95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5421,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { + unsigned long __maybe_unused offset = 0; + /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; @@ -5437,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. 
*/ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + offset = pgdat->node_start_pfn - start; end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); @@ -5444,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) if (!map) map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); - pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + pgdat->node_mem_map = map + offset; } #ifndef CONFIG_NEED_MULTIPLE_NODES /* @@ -5452,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); + mem_map -= offset; #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -- cgit v1.1 From a2c1aad3b5fccbb948878b75f9b8f13248666fd6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 5 Nov 2015 18:48:52 -0800 Subject: mm/vmacache: inline vmacache_valid_mm() This function incurs in very hot paths and merely does a few loads for validity check. Lets inline it, such that we can save the function call overhead. (akpm: this is cosmetic - the compiler already inlines vmacache_valid_mm()) Signed-off-by: Davidlohr Bueso Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmacache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmacache.c b/mm/vmacache.c index b6e3662..fd09dc9 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm) * Also handle the case where a kernel thread has adopted this mm via use_mm(). * That kernel thread's vmacache is not applicable to this mm. */ -static bool vmacache_valid_mm(struct mm_struct *mm) +static inline bool vmacache_valid_mm(struct mm_struct *mm) { return current->mm == mm && !(current->flags & PF_KTHREAD); } -- cgit v1.1 From bde304bdf4ec4a5f58cc1e90fe2d9cd2d96304c4 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:48:56 -0800 Subject: mm/page_alloc.c: skip ZONE_MOVABLE if required_kernelcore is larger than totalpages If kernelcore was not specified, or the kernelcore size is zero (required_movablecore >= totalpages), or the kernelcore size is larger than totalpages, there is no ZONE_MOVABLE. We should fill the zone with both kernel memory and movable memory. Signed-off-by: Xishi Qiu Reviewed-by: Yasuaki Ishimatsu Cc: Mel Gorman Cc: David Rientjes Cc: Tang Chen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86f7d95..06e6230 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5675,8 +5675,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) required_kernelcore = max(required_kernelcore, corepages); } - /* If kernelcore was not specified, there is no ZONE_MOVABLE */ - if (!required_kernelcore) + /* + * If kernelcore was not specified or kernelcore size is larger + * than totalpages, there is no ZONE_MOVABLE. 
+ */ + if (!required_kernelcore || required_kernelcore >= totalpages) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ -- cgit v1.1 From d05e83a6f861ad02c2fcba75d4c4cfe49e3bc90f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:48:59 -0800 Subject: memcg: simplify charging kmem pages Charging kmem pages proceeds in two steps. First, we try to charge the allocation size to the memcg the current task belongs to, then we allocate a page and "commit" the charge storing the pointer to the memcg in the page struct. Such a design looks overcomplicated, because there is not much sense in trying charging the allocation before actually allocating a page: we won't be able to consume much memory over the limit even if we charge after doing the actual allocation, besides we already charge user pages post factum, so being pedantic with kmem pages just looks pointless. So this patch simplifies the design by merging the "charge" and the "commit" steps into the same function, which takes the allocated page. Also, rename the charge and uncharge methods to memcg_kmem_charge and memcg_kmem_uncharge and make the charge method return error code instead of bool to conform to mem_cgroup_try_charge. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 39 ++++----------------------------------- mm/page_alloc.c | 18 +++++++++--------- 2 files changed, 13 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1c05ff..e249279 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2404,57 +2404,26 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. - */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret; - *_memcg = NULL; - memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) { css_put(&memcg->css); - return true; + return 0; } ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; css_put(&memcg->css); - return (ret == 0); -} - -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) -{ - VM_BUG_ON(mem_cgroup_is_root(memcg)); - - /* The page allocation failed. 
Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } page->mem_cgroup = memcg; + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06e6230..446bb36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag); struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages(gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages_node(nid, gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) */ void __free_kmem_pages(struct page *page, unsigned int order) { - memcg_kmem_uncharge_pages(page, order); + memcg_kmem_uncharge(page, order); __free_pages(page, order); } -- cgit v1.1 From f3ccb2c42297757d2e9b820ad37960462df7b7c1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:01 -0800 Subject: memcg: unify slab and other kmem pages charging We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and uncharging kmem pages to memcg, but currently they are not used for charging slab pages (i.e. they are only used for charging pages allocated with alloc_kmem_pages). The only reason why the slab subsystem uses special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it needs to charge to the memcg of kmem cache while memcg_charge_kmem charges to the memcg that the current task belongs to. To remove this diversity, this patch adds an extra argument to __memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is not NULL, the function tries to charge to the memcg it points to, otherwise it charge to the current context. Next, it makes the slab subsystem use this function to charge slab pages. Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since __memcg_kmem_charge stores a pointer to the memcg in the page struct, we don't need memcg_uncharge_slab anymore and can use free_kmem_pages. Besides, one can now detect which memcg a slab page belongs to by reading /proc/kpagecgroup. Note, this patch switches slab to charge-after-alloc design. Since this design is already used for all other memcg charges, it should not make any difference. 
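For readers skimming the series, a simplified sketch of the charge-after-alloc pattern that this patch and the previous one converge on (paraphrased from the hunks below; the wrapper name is made up and error handling is trimmed):

static struct page *alloc_charged_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/* allocate first ... */
	page = alloc_pages(gfp_mask, order);
	/* ... then try to charge the memcg, undoing the allocation on failure */
	if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
		__free_pages(page, order);
		page = NULL;
	}
	return page;
}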
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 71 +++++++++++++++++++++++++++------------------------------ mm/slab.c | 12 +++++----- mm/slab.h | 24 ++++++------------- mm/slub.c | 12 ++++------ 4 files changed, 51 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e249279..9575cff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2215,34 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2404,36 +2376,59 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(current->mm); + unsigned int nr_pages = 1 << order; + struct page_counter *counter; + int ret = 0; - if (!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); + if (!memcg_kmem_is_active(memcg)) return 0; + + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret) + return ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); + page->mem_cgroup = memcg; + return 0; +} + +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); - page->mem_cgroup = memcg; return ret; } void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) diff --git a/mm/slab.c b/mm/slab.c index 461935b..272e809 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1593,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - if (memcg_charge_slab(cachep, flags, cachep->gfporder)) - 
return NULL; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } + if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + __free_pages(page, cachep->gfporder); + return NULL; + } + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (page_is_pfmemalloc(page)) pfmemalloc_active = true; @@ -1654,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_pages(page, cachep->gfporder); - memcg_uncharge_slab(cachep, cachep->gfporder); + __free_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index bf51a8d..27492eb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -236,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } -static __always_inline int memcg_charge_slab(struct kmem_cache *s, - gfp_t gfp, int order) +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) { if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); -} - -static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ - if (!memcg_kmem_enabled()) - return; - if (is_root_cache(s)) - return; - memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); + return __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -289,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } -static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) { return 0; } -static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ -} - static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index e1bb147..423dbe7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1328,16 +1328,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, flags |= __GFP_NOTRACK; - if (memcg_charge_slab(s, flags, order)) - return NULL; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else page = __alloc_pages_node(node, flags, order); - if (!page) - memcg_uncharge_slab(s, order); + if (page && memcg_charge_slab(page, flags, order, s)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1476,8 +1475,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); - memcg_uncharge_slab(s, order); + __free_kmem_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.1 From df4065516b0dbfa35ac0e9b8124d441221c0a285 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:04 -0800 Subject: memcg: simplify and inline __mem_cgroup_from_kmem Before the previous patch ("memcg: unify slab and other kmem pages charging"), __mem_cgroup_from_kmem had to handle two types of kmem - slab pages and pages allocated with alloc_kmem_pages - memcg in the page struct. Now we can unify it. 
Since after it, this function becomes tiny we can fold it into mem_cgroup_from_kmem. [hughd@google.com: move mem_cgroup_from_kmem into list_lru.c] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 10 ++++++++++ mm/memcontrol.c | 18 ------------------ 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index 2823747..afc71ea 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -63,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) return &nlru->lru; } +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + struct page *page; + + if (!memcg_kmem_enabled()) + return NULL; + page = virt_to_head_page(ptr); + return page->mem_cgroup; +} + static inline struct list_lru_one * list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9575cff..c0111cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2430,24 +2430,6 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; - - return memcg; -} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.1 From ad12695f177c3403a64348b42718faf9727fe358 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:07 -0800 Subject: ksm: add cond_resched() to the rmap_walks While at it add it to the file and anon walks too. 
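The shape of the change is simple enough to sketch (an illustrative fragment, not copied from the patch; the entry/long_walk names are made up, and the real hunks touch the ksm stable-tree walk and rmap_walk_anon()/rmap_walk_file()): a cond_resched() at the top of each iteration keeps a very long walk over huge mappings from monopolising the CPU on non-preemptible kernels. cond_resched() may sleep, so this is only valid where no spinlock is held, which appears to be the case for these rwsem-protected walks.

struct entry {
	struct list_head link;
};

static void long_walk(struct list_head *head)
{
	struct entry *e;

	list_for_each_entry(e, head, link) {
		cond_resched();	/* may reschedule; no spinlocks held here */
		/* ... per-entry work ... */
	}
}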
Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 ++ mm/rmap.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 7ee101e..e87dec7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1914,9 +1914,11 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; + cond_resched(); anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + cond_resched(); vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/rmap.c b/mm/rmap.c index d40e7ae..78a6928 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1609,6 +1609,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; @@ -1658,6 +1660,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; -- cgit v1.1 From f2e5ff85edea30a59b96cf9e20e8886991b0d097 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:10 -0800 Subject: ksm: don't fail stable tree lookups if walking over stale stable_nodes The stable_nodes can become stale at any time if the underlying pages gets freed. The stable_node gets collected and removed from the stable rbtree if that is detected during the rbtree lookups. Don't fail the lookup if running into stale stable_nodes, just restart the lookup after collecting the stale stable_nodes. Otherwise the CPU spent in the preparation stage is wasted and the lookup must be repeated at the next loop potentially failing a second time in a second stale stable_node. If we don't prune aggressively we delay the merging of the unstable node candidates and at the same time we delay the freeing of the stale stable_nodes. Keeping stale stable_nodes around wastes memory and it can't provide any benefit. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index e87dec7..9f182f9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,8 +1177,18 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. 
+ */ + goto again; + } ret = memcmp_pages(page, tree_page); put_page(tree_page); @@ -1254,12 +1264,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage) unsigned long kpfn; struct rb_root *root; struct rb_node **new; - struct rb_node *parent = NULL; + struct rb_node *parent; struct stable_node *stable_node; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; +again: + parent = NULL; new = &root->rb_node; while (*new) { @@ -1269,8 +1281,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } ret = memcmp_pages(kpage, tree_page); put_page(tree_page); -- cgit v1.1 From 98666f8a2576b12f5f3ebcef61a8cdbefede1be3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:13 -0800 Subject: ksm: use the helper method to do the hlist_empty check This just uses the helper function to cleanup the assumption on the hlist_node internals. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 9f182f9..d4ee159 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -625,7 +625,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) unlock_page(page); put_page(page); - if (stable_node->hlist.first) + if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; -- cgit v1.1 From 85c6e8dd23c6aa9ff299bf5256fe5f0a6c44f100 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:16 -0800 Subject: ksm: use find_mergeable_vma in try_to_merge_with_ksm_page Doing the VM_MERGEABLE check after the page == kpage check won't provide any meaningful benefit. The !vma->anon_vma check of find_mergeable_vma is the only superfluous bit in using find_mergeable_vma because the !PageAnon check of try_to_merge_one_page() implicitly checks for that, but it still looks cleaner to share the same find_mergeable_vma(). 
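For context, find_mergeable_vma() is understood to bundle exactly the checks this caller had been open-coding; roughly (reconstructed from the commit message and the hunks below rather than copied from ksm.c, so treat the details as an approximation):

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}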
Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index d4ee159..0183083 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1021,8 +1021,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (!(vma->vm_flags & VM_MERGEABLE)) - goto out; if (PageTransCompound(page) && page_trans_compound_anon_split(page)) goto out; BUG_ON(PageTransCompound(page)); @@ -1087,10 +1085,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, int err = -EFAULT; down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto out; - vma = find_vma(mm, rmap_item->address); - if (!vma || vma->vm_start > rmap_item->address) + vma = find_mergeable_vma(mm, rmap_item->address); + if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); -- cgit v1.1 From c8f95ed1a9ce373d20d3b09a6a9fb91c8beef27e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:19 -0800 Subject: ksm: unstable_tree_search_insert error checking cleanup get_mergeable_page() can only return NULL (also in case of errors) or the pinned mergeable page. It can't return an error different than NULL. This optimizes away the unnecessary error check. Add a return after the "out:" label in the callee to make it more readable. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 0183083..b5cd647 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) flush_dcache_page(page); } else { put_page(page); -out: page = NULL; +out: + page = NULL; } up_read(&mm->mmap_sem); return page; @@ -1358,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); - if (IS_ERR_OR_NULL(tree_page)) + if (!tree_page) return NULL; /* -- cgit v1.1 From b87537d9e2feb30f6a962f27eb32768682698d3b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:33 -0800 Subject: mm: rmap use pte lock not mmap_sem to set PageMlocked KernelThreadSanitizer (ktsan) has shown that the down_read_trylock() of mmap_sem in try_to_unmap_one() (when going to set PageMlocked on a page found mapped in a VM_LOCKED vma) is ineffective against races with exit_mmap()'s munlock_vma_pages_all(), because mmap_sem is not held when tearing down an mm. But that's okay, those races are benign; and although we've believed for years in that ugly down_read_trylock(), it's unsuitable for the job, and frustrates the good intention of setting PageMlocked when it fails. It just doesn't matter if here we read vm_flags an instant before or after a racing mlock() or munlock() or exit_mmap() sets or clears VM_LOCKED: the syscalls (or exit) work their way up the address space (taking pt locks after updating vm_flags) to establish the final state. We do still need to be careful never to mark a page Mlocked (hence unevictable) by any race that will not be corrected shortly after. The page lock protects from many of the races, but not all (a page is not necessarily locked when it's unmapped). 
But the pte lock we just dropped is good to cover the rest (and serializes even with munlock_vma_pages_all(), so no special barriers required): now hold on to the pte lock while calling mlock_vma_page(). Is that lock ordering safe? Yes, that's how follow_page_pte() calls it, and how page_remove_rmap() calls the complementary clear_page_mlock(). This fixes the following case (though not a case which anyone has complained of), which mmap_sem did not: truncation's preliminary unmap_mapping_range() is supposed to remove even the anonymous COWs of filecache pages, and that might race with try_to_unmap_one() on a VM_LOCKED vma, so that mlock_vma_page() sets PageMlocked just after zap_pte_range() unmaps the page, causing "Bad page state (mlocked)" when freed. The pte lock protects against this. You could say that it also protects against the more ordinary case, racing with the preliminary unmapping of a filecache page itself: but in our current tree, that's independently protected by i_mmap_rwsem; and that race would be why "Bad page state (mlocked)" was seen before commit 48ec833b7851 ("Revert mm/memory.c: share the i_mmap_rwsem"). Vlastimil Babka points out another race which this patch protects against. try_to_unmap_one() might reach its mlock_vma_page() TestSetPageMlocked a moment after munlock_vma_pages_all() did its Phase 1 TestClearPageMlocked: leaving PageMlocked and unevictable when it should be evictable. mmap_sem is ineffective because exit_mmap() does not hold it; page lock ineffective because __munlock_pagevec() only takes it afterwards, in Phase 2; pte lock is effective because __munlock_pagevec_fill() takes it to get the page, after VM_LOCKED was cleared from vm_flags, so visible to try_to_unmap_one. Kirill Shutemov points out that if the compiler chooses to implement a "vma->vm_flags &= VM_WHATEVER" or "vma->vm_flags |= VM_WHATEVER" operation with an intermediate store of unrelated bits set, since I'm here foregoing its usual protection by mmap_sem, try_to_unmap_one() might catch sight of a spurious VM_LOCKED in vm_flags, and make the wrong decision. This does not appear to be an immediate problem, but we may want to define vm_flags accessors in future, to guard against such a possibility. While we're here, make a related optimization in try_to_munmap_one(): if it's doing TTU_MUNLOCK, then there's no point at all in descending the page tables and getting the pt lock, unless the vma is VM_LOCKED. Yes, that can change racily, but it can change racily even without the optimization: it's not critical. Far better not to waste time here. Stopped short of separating try_to_munlock_one() from try_to_munmap_one() on this occasion, but that's probably the sensible next step - with a rename, given that try_to_munlock()'s business is to try to set Mlocked. Updated the unevictable-lru Documentation, to remove its reference to mmap semaphore, but found a few more updates needed in just that area. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 78a6928..b93fb54 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1304,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + /* munlock has nothing to gain from examining un-locked vmas */ + if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) + goto out; + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1314,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) - goto out_mlock; - + if (vma->vm_flags & VM_LOCKED) { + /* Holding pte lock, we do *not* need mmap_sem here */ + mlock_vma_page(page); + ret = SWAP_MLOCK; + goto out_unmap; + } if (flags & TTU_MUNLOCK) goto out_unmap; } @@ -1421,31 +1428,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) + if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; - -out_mlock: - pte_unmap_unlock(pte, ptl); - - - /* - * We need mmap_sem locking, Otherwise VM_LOCKED check makes - * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. - * if trylock failed, the page remain in evictable lru and later - * vmscan could retry to move the page to unevictable lru if the - * page is actually mlocked. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - ret = SWAP_MLOCK; - } - up_read(&vma->vm_mm->mmap_sem); - } - return ret; } bool is_vma_temporary_stack(struct vm_area_struct *vma) -- cgit v1.1 From 51afb12ba809db664682a31154c11e720e2c363c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:37 -0800 Subject: mm: page migration fix PageMlocked on migrated pages Commit e6c509f85455 ("mm: use clear_page_mlock() in page_remove_rmap()") in v3.7 inadvertently made mlock_migrate_page() impotent: page migration unmaps the page from userspace before migrating, and that commit clears PageMlocked on the final unmap, leaving mlock_migrate_page() with nothing to do. Not a serious bug, the next attempt at reclaiming the page would fix it up; but a betrayal of page migration's intent - the new page ought to emerge as PageMlocked. I don't see how to fix it for mlock_migrate_page() itself; but easily fixed in remove_migration_pte(), by calling mlock_vma_page() when the vma is VM_LOCKED - under pte lock as in try_to_unmap_one(). Delete mlock_migrate_page()? Not quite, it does still serve a purpose for migrate_misplaced_transhuge_page(): where we could replace it by a test, clear_page_mlock(), mlock_vma_page() sequence; but would that be an improvement? mlock_migrate_page() is fairly lean, and let's make it leaner by skipping the irq save/restore now clearly not needed. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 9 ++++----- mm/migrate.c | 6 ++++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index bc0fa9a..d4b807d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page); extern void clear_page_mlock(struct page *page); /* - * mlock_migrate_page - called only from migrate_page_copy() to - * migrate the Mlocked page flag; update statistics. + * mlock_migrate_page - called only from migrate_misplaced_transhuge_page() + * (because that does not go through the full procedure of migration ptes): + * to migrate the Mlocked page flag; update statistics. */ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { - unsigned long flags; int nr_pages = hpage_nr_pages(page); - local_irq_save(flags); + /* Holding pmd lock, no change in irq context: __mod is safe */ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); - local_irq_restore(flags); } } diff --git a/mm/migrate.c b/mm/migrate.c index 94961f4..ed72c49 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, ptep); unlock: @@ -537,7 +540,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) cpupid = page_cpupid_xchg_last(page, -1); page_cpupid_xchg_last(newpage, cpupid); - mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* * Please do not reorder this without considering how mm/ksm.c's @@ -1787,7 +1789,6 @@ fail_putback: SetPageActive(page); if (TestClearPageUnevictable(new_page)) SetPageUnevictable(page); - mlock_migrate_page(page, new_page); unlock_page(new_page); put_page(new_page); /* Free it */ @@ -1829,6 +1830,7 @@ fail_putback: goto fail_putback; } + mlock_migrate_page(new_page, page); mem_cgroup_migrate(page, new_page, false); page_remove_rmap(page); -- cgit v1.1 From 45637bab30d6e7651737f51aa99417baef4d114a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:40 -0800 Subject: mm: rename mem_cgroup_migrate to mem_cgroup_replace_page After v4.3's commit 0610c25daa3e ("memcg: fix dirty page migration") mem_cgroup_migrate() doesn't have much to offer in page migration: convert migrate_misplaced_transhuge_page() to set_page_memcg() instead. Then rename mem_cgroup_migrate() to mem_cgroup_replace_page(), since its remaining callers are replace_page_cache_page() and shmem_replace_page(): both of whom passed lrucare true, so just eliminate that argument. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/memcontrol.c | 29 ++++++++--------------------- mm/migrate.c | 5 ++--- mm/shmem.c | 2 +- 4 files changed, 12 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 884766d..58e04e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -551,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); - mem_cgroup_migrate(old, new, true); + mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) freepage(old); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0111cb..a44494f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4531,9 +4531,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. */ if (!trylock_page(page)) goto out; @@ -5495,7 +5494,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to * @lrucare: either or both pages might be on the LRU already @@ -5504,16 +5503,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * * Both pages must be locked, @newpage->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5525,25 +5521,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. 
- */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* diff --git a/mm/migrate.c b/mm/migrate.c index ed72c49..836e410 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1831,8 +1830,8 @@ fail_putback: } mlock_migrate_page(new_page, page); - mem_cgroup_migrate(page, new_page, false); - + set_page_memcg(new_page, page_memcg(page)); + set_page_memcg(page, NULL); page_remove_rmap(page); spin_unlock(ptl); diff --git a/mm/shmem.c b/mm/shmem.c index 48ce829..6529226 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1023,7 +1023,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage, true); + mem_cgroup_replace_page(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v1.1 From 14e0f9bcc95f1aef26a9f860cceda35faee79b34 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:43 -0800 Subject: mm: correct a couple of page migration comments It's migrate.c not migration,c, and nowadays putback_movable_pages() not putback_lru_pages(). Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 836e410..d149cbb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1,5 +1,5 @@ /* - * Memory Migration functionality - linux/mm/migration.c + * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * @@ -1113,7 +1113,7 @@ out: * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. - * The caller should call putback_lru_pages() to return pages to the LRU + * The caller should call putback_movable_pages() to return pages to the LRU * or free list only if ret != 0. * * Returns the number of pages that were not migrated, or an error code. -- cgit v1.1 From 2def7424c9be0069831380823fdb5cf72103b919 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:46 -0800 Subject: mm: page migration use the put_new_page whenever necessary I don't know of any problem from the way it's used in our current tree, but there is one defect in page migration's custom put_new_page feature. An unused newpage is expected to be released with the put_new_page(), but there was one MIGRATEPAGE_SUCCESS (0) path which released it with putback_lru_page(): which can be very wrong for a custom pool. Fixed more easily by resetting put_new_page once it won't be needed, than by adding a further flag to modify the rc test. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d149cbb..2f2e223 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -938,10 +938,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, int force, enum migrate_mode mode, enum migrate_reason reason) { - int rc = 0; + int rc = MIGRATEPAGE_SUCCESS; int *result = NULL; - struct page *newpage = get_new_page(page, private, &result); + struct page *newpage; + newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -955,6 +956,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; rc = __unmap_and_move(page, newpage, force, mode); + if (rc == MIGRATEPAGE_SUCCESS) + put_new_page = NULL; out: if (rc != -EAGAIN) { @@ -981,7 +984,7 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + if (put_new_page) { ClearPageSwapBacked(newpage); put_new_page(newpage, private); } else if (unlikely(__is_movable_balloon_page(newpage))) { @@ -1022,7 +1025,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *hpage, int force, enum migrate_mode mode) { - int rc = 0; + int rc = -EAGAIN; int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; @@ -1044,8 +1047,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!new_hpage) return -ENOMEM; - rc = -EAGAIN; - if (!trylock_page(hpage)) { if (!force || mode != MIGRATE_SYNC) goto out; @@ -1070,8 +1071,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); + put_new_page = NULL; + } unlock_page(hpage); out: @@ -1083,7 +1086,7 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + if (put_new_page) put_new_page(new_hpage, private); else putback_active_hugepage(new_hpage); -- cgit v1.1 From 7db7671f835ccad66db20154ac1274140937d9b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:49 -0800 Subject: mm: page migration trylock newpage at same level as oldpage Clean up page migration a little by moving the trylock of newpage from move_to_new_page() into __unmap_and_move(), where the old page has been locked. Adjust unmap_and_move_huge_page() and balloon_page_migrate() accordingly. But make one kind-of-functional change on the way: whereas trylock of newpage used to BUG() if it failed, now simply return -EAGAIN if so. Cutting out BUG()s is good, right? But, to be honest, this is really to extend the usefulness of the custom put_new_page feature, allowing a pool of new pages to be shared perhaps with racing uses. Use an "else" instead of that "skip_unmap" label. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/balloon_compaction.c | 10 ++-------- mm/migrate.c | 46 ++++++++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index fcad832..d3116be 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage, struct balloon_dev_info *balloon = balloon_page_device(page); int rc = -EAGAIN; - /* - * Block others from accessing the 'newpage' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'newpage' at this point. - */ - BUG_ON(!trylock_page(newpage)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); if (WARN_ON(!__is_movable_balloon_page(page))) { dump_page(page, "not movable balloon page"); - unlock_page(newpage); return rc; } if (balloon && balloon->migratepage) rc = balloon->migratepage(balloon, newpage, page, mode); - unlock_page(newpage); return rc; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/migrate.c b/mm/migrate.c index 2f2e223..6d7774e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -727,13 +727,8 @@ static int move_to_new_page(struct page *newpage, struct page *page, struct address_space *mapping; int rc; - /* - * Block others from accessing the page when we get around to - * establishing additional references. We are the only one - * holding a reference to the new page at this point. - */ - if (!trylock_page(newpage)) - BUG(); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); /* Prepare mapping for the new page.*/ newpage->index = page->index; @@ -774,9 +769,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, remove_migration_ptes(page, newpage); page->mapping = NULL; } - - unlock_page(newpage); - return rc; } @@ -861,6 +853,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } + /* + * Block others from accessing the new page when we get around to + * establishing additional references. We are usually the only one + * holding a reference to newpage at this point. We used to have a BUG + * here if trylock_page(newpage) fails, but would like to allow for + * cases where there might be a race with the previous use of newpage. + * This is much like races on refcount of oldpage: just don't BUG(). + */ + if (unlikely(!trylock_page(newpage))) + goto out_unlock; + if (unlikely(isolated_balloon_page(page))) { /* * A ballooned page does not need any special attention from @@ -870,7 +873,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the page migration right away (proteced by page lock). 
*/ rc = balloon_page_migrate(newpage, page, mode); - goto out_unlock; + goto out_unlock_both; } /* @@ -889,30 +892,27 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto out_unlock; + goto out_unlock_both; } - goto skip_unmap; - } - - /* Establish migration ptes or remove ptes */ - if (page_mapped(page)) { + } else if (page_mapped(page)) { + /* Establish migration ptes */ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } -skip_unmap: if (!page_mapped(page)) rc = move_to_new_page(newpage, page, page_was_mapped, mode); if (rc && page_was_mapped) remove_migration_ptes(page, page); +out_unlock_both: + unlock_page(newpage); +out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - -out_unlock: unlock_page(page); out: return rc; @@ -1056,6 +1056,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); + if (unlikely(!trylock_page(new_hpage))) + goto put_anon; + if (page_mapped(hpage)) { try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); @@ -1068,6 +1071,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) remove_migration_ptes(hpage, hpage); + unlock_page(new_hpage); + +put_anon: if (anon_vma) put_anon_vma(anon_vma); -- cgit v1.1 From 5c3f9a67371643b6faa987622bc1b67667bab848 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:53 -0800 Subject: mm: page migration remove_migration_ptes at lock+unlock level Clean up page migration a little more by calling remove_migration_ptes() from the same level, on success or on failure, from __unmap_and_move() or from unmap_and_move_huge_page(). Don't reset page->mapping of a PageAnon old page in move_to_new_page(), leave that to when the page is freed. Except for here in page migration, it has been an invariant that a PageAnon (bit set in page->mapping) page stays PageAnon until it is freed, and I think we're safer to keep to that. And with the above rearrangement, it's necessary because zap_pte_range() wants to identify whether a migration entry represents a file or an anon page, to update the appropriate rss stats without waiting on it. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 6d7774e..7b44ebdf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -722,7 +722,7 @@ static int fallback_migrate_page(struct address_space *mapping, * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int page_was_mapped, enum migrate_mode mode) + enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -755,19 +755,21 @@ static int move_to_new_page(struct page *newpage, struct page *page, * space which also has its own migratepage callback. This * is the most common path for page migration. 
*/ - rc = mapping->a_ops->migratepage(mapping, - newpage, page, mode); + rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc != MIGRATEPAGE_SUCCESS) { + /* + * When successful, old pagecache page->mapping must be cleared before + * page is freed; but stats require that PageAnon be left as PageAnon. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + set_page_memcg(page, NULL); + if (!PageAnon(page)) + page->mapping = NULL; + } else { set_page_memcg(newpage, NULL); newpage->mapping = NULL; - } else { - set_page_memcg(page, NULL); - if (page_was_mapped) - remove_migration_ptes(page, newpage); - page->mapping = NULL; } return rc; } @@ -902,10 +904,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, page_was_mapped, mode); + rc = move_to_new_page(newpage, page, mode); - if (rc && page_was_mapped) - remove_migration_ptes(page, page); + if (page_was_mapped) + remove_migration_ptes(page, + rc == MIGRATEPAGE_SUCCESS ? newpage : page); out_unlock_both: unlock_page(newpage); @@ -1066,10 +1069,11 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); + rc = move_to_new_page(new_hpage, hpage, mode); - if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) - remove_migration_ptes(hpage, hpage); + if (page_was_mapped) + remove_migration_ptes(hpage, + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); unlock_page(new_hpage); -- cgit v1.1 From 03f15c86c8d1b9d81e6d215715e110aef8f936e0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:56 -0800 Subject: mm: simplify page migration's anon_vma comment and flow __unmap_and_move() contains a long stale comment on page_get_anon_vma() and PageSwapCache(), with an odd control flow that's hard to follow. Mostly this reflects our confusion about the lifetime of an anon_vma, in the early days of page migration, before we could take a reference to one. Nowadays this seems quite straightforward: cut it all down to essentials. I cannot see the relevance of swapcache here at all, so don't treat it any differently: I believe the old comment reflects in part our anon_vma confusions, and in part the original v2.6.16 page migration technique, which used actual swap to migrate anon instead of swap-like migration entries. Why should a swapcache page not be migrated with the aid of migration entry ptes like everything else? So lose that comment now, and enable migration entries for swapcache in the next patch. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 7b44ebdf..08a7b6c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -819,6 +819,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, goto out_unlock; wait_on_page_writeback(page); } + /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. @@ -826,34 +827,15 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * of migration. 
File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. + * + * Only page_get_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + * But if we cannot get anon_vma, then we won't need it anyway, + * because that implies that the anon page is no longer mapped + * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) { - /* - * Only page_lock_anon_vma_read() understands the subtleties of - * getting a hold on an anon_vma from outside one of its mms. - */ + if (PageAnon(page) && !PageKsm(page)) anon_vma = page_get_anon_vma(page); - if (anon_vma) { - /* - * Anon page - */ - } else if (PageSwapCache(page)) { - /* - * We cannot be sure that the anon_vma of an unmapped - * swapcache page is safe to use because we don't - * know in advance if the VMA that this page belonged - * to still exists. If the VMA and others sharing the - * data have been freed, then the anon_vma could - * already be invalid. - * - * To avoid this possibility, swapcache pages get - * migrated but are not remapped when migration - * completes - */ - } else { - goto out_unlock; - } - } /* * Block others from accessing the new page when we get around to @@ -898,6 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } else if (page_mapped(page)) { /* Establish migration ptes */ + VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, + page); try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; -- cgit v1.1 From 470f119f012068e5d94458c98dc4eec102f88cd3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:59 -0800 Subject: mm: page migration use migration entry for swapcache too Hitherto page migration has avoided using a migration entry for a swapcache page mapped into userspace, apparently for historical reasons. So any page blessed with swapcache would entail a minor fault when it's next touched, which page migration otherwise tries to avoid. Swapcache in an mlocked area is rare, so won't often matter, but still better fixed. Just rearrange the block in try_to_unmap_one(), to handle TTU_MIGRATION before checking PageAnon, that's all (apart from some reindenting). Well, no, that's not quite all: doesn't this by the way fix a soft_dirty bug, that page migration of a file page was forgetting to transfer the soft_dirty bit? Probably not a serious bug: if I understand correctly, soft_dirty afficionados usually have to handle file pages separately anyway; but we publish the bit in /proc//pagemap on file mappings as well as anonymous, so page migration ought not to perturb it. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Reviewed-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 63 ++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index b93fb54..b577fbb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1379,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, dec_mm_counter(mm, MM_ANONPAGES); else dec_mm_counter(mm, MM_FILEPAGES); + } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { + swp_entry_t entry; + pte_t swp_pte; + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, pte_write(pteval)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; - - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pte, pteval); - ret = SWAP_FAIL; - goto out_unmap; - } - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, MM_ANONPAGES); - inc_mm_counter(mm, MM_SWAPENTS); - } else if (IS_ENABLED(CONFIG_MIGRATION)) { - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - BUG_ON(!(flags & TTU_MIGRATION)); - entry = make_migration_entry(page, pte_write(pteval)); + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { - /* Establish migration entry for a file page */ - swp_entry_t entry; - entry = make_migration_entry(page, pte_write(pteval)); - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else dec_mm_counter(mm, MM_FILEPAGES); -- cgit v1.1 From cf4b769abb8aef01f887543cb8308c0d8671367c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:02 -0800 Subject: mm: page migration avoid touching newpage until no going back We have had trouble in the past from the way in which page migration's newpage is initialized in dribs and drabs - see commit 8bdd63809160 ("mm: fix direct reclaim writeback regression") which proposed a cleanup. 
We have no actual problem now, but I think the procedure would be clearer (and alternative get_new_page pools safer to implement) if we assert that newpage is not touched until we are sure that it's going to be used - except for taking the trylock on it in __unmap_and_move(). So shift the early initializations from move_to_new_page() into migrate_page_move_mapping(), mapping and NULL-mapping paths. Similarly migrate_huge_page_move_mapping(), but its NULL-mapping path can just be deleted: you cannot reach hugetlbfs_migrate_page() with a NULL mapping. Adjust stages 3 to 8 in the Documentation file accordingly. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 08a7b6c..3067e40 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -320,6 +320,14 @@ int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != expected_count) return -EAGAIN; + + /* No turning back from here */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + return MIGRATEPAGE_SUCCESS; } @@ -355,8 +363,15 @@ int migrate_page_move_mapping(struct address_space *mapping, } /* - * Now we know that no one else is looking at the page. + * Now we know that no one else is looking at the page: + * no turning back from here. */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { SetPageSwapCache(newpage); @@ -403,12 +418,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - if (!mapping) { - if (page_count(page) != 1) - return -EAGAIN; - return MIGRATEPAGE_SUCCESS; - } - spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -426,6 +435,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -730,21 +742,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - /* Prepare mapping for the new page.*/ - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); - - /* - * Indirectly called below, migrate_page_copy() copies PG_dirty and thus - * needs newpage's memcg set to transfer memcg dirty page accounting. - * So perform memcg migration in two steps: - * 1. set newpage->mem_cgroup (here) - * 2. 
clear page->mem_cgroup (below) - */ - set_page_memcg(newpage, page_memcg(page)); - mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page, mode); @@ -767,9 +764,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; - } else { - set_page_memcg(newpage, NULL); - newpage->mapping = NULL; } return rc; } @@ -971,10 +965,9 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (put_new_page) { - ClearPageSwapBacked(newpage); + if (put_new_page) put_new_page(newpage, private); - } else if (unlikely(__is_movable_balloon_page(newpage))) { + else if (unlikely(__is_movable_balloon_page(newpage))) { /* drop our reference, page already in the balloon */ put_page(newpage); } else -- cgit v1.1 From 42cb14b110a5698ccf26ce59c4441722605a3743 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:05 -0800 Subject: mm: migrate dirty page without clear_page_dirty_for_io etc clear_page_dirty_for_io() has accumulated writeback and memcg subtleties since v2.6.16 first introduced page migration; and the set_page_dirty() which completed its migration of PageDirty, later had to be moderated to __set_page_dirty_nobuffers(); then PageSwapBacked had to skip that too. No actual problems seen with this procedure recently, but if you look into what the clear_page_dirty_for_io(page)+set_page_dirty(newpage) is actually achieving, it turns out to be nothing more than moving the PageDirty flag, and its NR_FILE_DIRTY stat from one zone to another. It would be good to avoid a pile of irrelevant decrementations and incrementations, and improper event counting, and unnecessary descent of the radix_tree under tree_lock (to set the PAGECACHE_TAG_DIRTY which radix_tree_replace_slot() left in place anyway). Do the NR_FILE_DIRTY movement, like the other stats movements, while interrupts still disabled in migrate_page_move_mapping(); and don't even bother if the zone is the same. Do the PageDirty movement there under tree_lock too, where old page is frozen and newpage not yet visible: bearing in mind that as soon as newpage becomes visible in radix_tree, an un-page-locked set_page_dirty() might interfere (or perhaps that's just not possible: anything doing so should already hold an additional reference to the old page, preventing its migration; but play safe). But we do still need to transfer PageDirty in migrate_page_copy(), for those who don't go the mapping route through migrate_page_move_mapping(). Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 3067e40..2834fab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -313,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + struct zone *oldzone, *newzone; + int dirty; int expected_count = 1 + extra_count; void **pslot; @@ -331,6 +334,9 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } + oldzone = page_zone(page); + newzone = page_zone(newpage); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -378,6 +384,13 @@ int migrate_page_move_mapping(struct address_space *mapping, set_page_private(newpage, page_private(page)); } + /* Move dirty while page refs frozen and newpage not yet exposed */ + dirty = PageDirty(page); + if (dirty) { + ClearPageDirty(page); + SetPageDirty(newpage); + } + radix_tree_replace_slot(pslot, newpage); /* @@ -387,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_unfreeze_refs(page, expected_count - 1); + spin_unlock(&mapping->tree_lock); + /* Leave irq disabled to prevent preemption while updating stats */ + /* * If moved to a different zone then also account * the page for that zone. Other VM counters will be @@ -397,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping, * via NR_FILE_PAGES and NR_ANON_PAGES if they * are mapped to swap space. */ - __dec_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - if (!PageSwapCache(page) && PageSwapBacked(page)) { - __dec_zone_page_state(page, NR_SHMEM); - __inc_zone_page_state(newpage, NR_SHMEM); + if (newzone != oldzone) { + __dec_zone_state(oldzone, NR_FILE_PAGES); + __inc_zone_state(newzone, NR_FILE_PAGES); + if (PageSwapBacked(page) && !PageSwapCache(page)) { + __dec_zone_state(oldzone, NR_SHMEM); + __inc_zone_state(newzone, NR_SHMEM); + } + if (dirty && mapping_cap_account_dirty(mapping)) { + __dec_zone_state(oldzone, NR_FILE_DIRTY); + __inc_zone_state(newzone, NR_FILE_DIRTY); + } } - spin_unlock_irq(&mapping->tree_lock); + local_irq_enable(); return MIGRATEPAGE_SUCCESS; } @@ -524,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageMappedToDisk(page)) SetPageMappedToDisk(newpage); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - /* - * Want to mark the page and the radix tree as dirty, and - * redo the accounting that clear_page_dirty_for_io undid, - * but we can't use set_page_dirty because that function - * is actually a signal that all of the page has become dirty. - * Whereas only part of our page may be dirty. 
- */ - if (PageSwapBacked(page)) - SetPageDirty(newpage); - else - __set_page_dirty_nobuffers(newpage); - } + /* Move dirty on pages not done by migrate_page_move_mapping() */ + if (PageDirty(page)) + SetPageDirty(newpage); if (page_is_young(page)) set_page_young(newpage); -- cgit v1.1 From 3acaea6804b3a10e996ce6ebc342089f481e1cdb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:50:08 -0800 Subject: mm/cma.c: suppress warning mm/cma.c: In function 'cma_alloc': mm/cma.c:366: warning: 'pfn' may be used uninitialized in this function The patch actually improves the tracing a bit: if alloc_contig_range() fails, tracing will display the offending pfn rather than -1. Cc: Stefan Strogin Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Laurent Pinchart Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 4eb56ba..ea506eb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -363,7 +363,9 @@ err: */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) { - unsigned long mask, offset, pfn, start = 0; + unsigned long mask, offset; + unsigned long pfn = -1; + unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) start = bitmap_no + mask + 1; } - trace_cma_alloc(page ? pfn : -1UL, page, count, align); + trace_cma_alloc(pfn, page, count, align); pr_debug("%s(): returned %p\n", __func__, page); return page; -- cgit v1.1 From 9dd861d55b01f1d0848f82007e8665371ae18710 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:50:11 -0800 Subject: mm/maccess.c: actually return -EFAULT from strncpy_from_unsafe As far as I can tell, strncpy_from_unsafe never returns -EFAULT. ret is the result of a __copy_from_user_inatomic(), which is 0 for success and positive (in this case necessarily 1) for access error - it is never negative. So we were always returning the length of the, possibly truncated, destination string. Signed-off-by: Rasmus Villemoes Acked-by: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/maccess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index 1b13638..d159b1c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -104,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_enable(); set_fs(old_fs); - return ret < 0 ? ret : src - unsafe_addr; + return ret ? -EFAULT : src - unsafe_addr; } -- cgit v1.1 From b4e289a6a659c5c2c056a67fa4f31f3dd8317537 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:50:14 -0800 Subject: mm/hugetlb: make node_hstates array static There are no users of the node_hstates array outside of the mm/hugetlb.c. So let's make it static. 
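As a minimal userspace sketch of what dropping external linkage buys here (the array and helper below are illustrative, not the hugetlb code): with static, the symbol stays private to its translation unit, so it can neither clash with nor be referenced from any other file.

/* hypothetical example, not mm/hugetlb.c */
#define MAX_NODES 8

struct node_info {
	int nr_pages;
};

/* internal linkage: visible only inside this file */
static struct node_info node_info_table[MAX_NODES];

static int nodes_in_use(void)
{
	int i, n = 0;

	for (i = 0; i < MAX_NODES; i++)
		if (node_info_table[i].nr_pages)
			n++;
	return n;
}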
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index abfbe8c..4fc590a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2376,7 +2376,7 @@ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; -struct node_hstate node_hstates[MAX_NUMNODES]; +static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices -- cgit v1.1 From 099730d67417dfee273e9b10ac2560ca7fac7eb9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:50:17 -0800 Subject: mm, hugetlb: use memory policy when available I have a hugetlbfs user which is never explicitly allocating huge pages with 'nr_hugepages'. They only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. This works, but they noticed that mbind() was not doing them any good and the pages were being allocated without respect for the policy they specified. The code in question is this: > struct page *alloc_huge_page(struct vm_area_struct *vma, ... > page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); > if (!page) { > page = alloc_buddy_huge_page(h, NUMA_NO_NODE); dequeue_huge_page_vma() is smart and will respect the VMA's memory policy. But, it only grabs _existing_ huge pages from the huge page pool. If the pool is empty, we fall back to alloc_buddy_huge_page() which obviously can't do anything with the VMA's policy because it isn't even passed the VMA. Almost everybody preallocates huge pages. That's probably why nobody has ever noticed this. Looking back at the git history, I don't think this _ever_ worked from when alloc_buddy_huge_page() was introduced in 7893d1d5, 8 years ago. The fix is to pass vma/addr down in to the places where we actually call in to the buddy allocator. It's fairly straightforward plumbing. This has been lightly tested. Signed-off-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: David Rientjes Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 104 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fc590a..899f6a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1437,7 +1437,76 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) dissolve_free_huge_page(pfn_to_page(pfn)); } -static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) +/* + * There are 3 ways this can get called: + * 1. With vma+addr: we use the VMA's memory policy + * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge + * page from any node, and let the buddy allocator itself figure + * it out. + * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page + * strictly from 'nid' + */ +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) +{ + int order = huge_page_order(h); + gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + unsigned int cpuset_mems_cookie; + + /* + * We need a VMA to get a memory policy. 
If we do not + * have one, we use the 'nid' argument + */ + if (!vma) { + /* + * If a specific node is requested, make sure to + * get memory from there, but only when a node + * is explicitly specified. + */ + if (nid != NUMA_NO_NODE) + gfp |= __GFP_THISNODE; + /* + * Make sure to call something that can handle + * nid=NUMA_NO_NODE + */ + return alloc_pages_node(nid, gfp, order); + } + + /* + * OK, so we have a VMA. Fetch the mempolicy and try to + * allocate a huge page with it. + */ + do { + struct page *page; + struct mempolicy *mpol; + struct zonelist *zl; + nodemask_t *nodemask; + + cpuset_mems_cookie = read_mems_allowed_begin(); + zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + mpol_cond_put(mpol); + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + if (page) + return page; + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return NULL; +} + +/* + * There are two ways to allocate a huge page: + * 1. When you have a VMA and an address (like a fault) + * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) + * + * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in + * this case which signifies that the allocation should be done with + * respect for the VMA's memory policy. + * + * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This + * implies that memory policies will not be taken in to account. + */ +static struct page *__alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) { struct page *page; unsigned int r_nid; @@ -1446,6 +1515,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) return NULL; /* + * Make sure that anyone specifying 'nid' is not also specifying a VMA. + * This makes sure the caller is picking _one_ of the modes with which + * we can call this function, not both. + */ + if (vma || (addr != -1)) { + WARN_ON_ONCE(addr == -1); + WARN_ON_ONCE(nid != NUMA_NO_NODE); + } + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed * overcommit @@ -1478,14 +1556,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } spin_unlock(&hugetlb_lock); - if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); - else - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); spin_lock(&hugetlb_lock); if (page) { @@ -1510,6 +1581,27 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } /* + * Allocate a huge page from 'nid'. Note, 'nid' may be + * NUMA_NO_NODE, which means that it may be allocated + * anywhere. + */ +struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) +{ + unsigned long addr = -1; + + return __alloc_buddy_huge_page(h, NULL, addr, nid); +} + +/* + * Use the VMA's mpolicy to allocate a huge page from the buddy. + */ +struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); +} + +/* * This allocation function is useful in the context where vma is irrelevant. * E.g. soft-offlining uses this function because it only cares physical * address of error page. 
@@ -1524,7 +1616,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = alloc_buddy_huge_page(h, nid); + page = __alloc_buddy_huge_page_no_mpol(h, nid); return page; } @@ -1554,7 +1646,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); if (!page) { alloc_ok = false; break; @@ -1787,7 +1879,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; -- cgit v1.1 From e0ec90ee7e6f6cbaa6d59ffb48d2a7af5e80e61d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:50:20 -0800 Subject: mm, hugetlbfs: optimize when NUMA=n My recent patch "mm, hugetlb: use memory policy when available" added some bloat to hugetlb.o. This patch aims to get some of the bloat back, especially when NUMA is not in play. It does this with an implicit #ifdef and marking some things static that should have been static in my first patch. It also makes the warnings only VM_WARN_ON()s. They were responsible for a pretty big chunk of the bloat. Doing this gets our NUMA=n text size back to a wee bit _below_ where we started before the original patch. It also shaves a bit of space off the NUMA=y case, but not much. Enforcing the mempolicy definitely takes some text and it's hard to avoid. size(1) output: text data bss dec hex filename 30745 3433 2492 36670 8f3e hugetlb.o.nonuma.baseline 31305 3755 2492 37552 92b0 hugetlb.o.nonuma.patch1 30713 3433 2492 36638 8f1e hugetlb.o.nonuma.patch2 (this patch) 25235 473 41276 66984 105a8 hugetlb.o.numa.baseline 25715 475 41276 67466 1078a hugetlb.o.numa.patch1 25491 473 41276 67240 106a8 hugetlb.o.numa.patch2 (this patch) Signed-off-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 899f6a8..241de27 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1455,9 +1455,14 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, /* * We need a VMA to get a memory policy. If we do not - * have one, we use the 'nid' argument + * have one, we use the 'nid' argument. + * + * The mempolicy stuff below has some non-inlined bits + * and calls ->vm_ops. That makes it hard to optimize at + * compile-time, even when NUMA is off and it does + * nothing. This helps the compiler optimize it out. */ - if (!vma) { + if (!IS_ENABLED(CONFIG_NUMA) || !vma) { /* * If a specific node is requested, make sure to * get memory from there, but only when a node @@ -1474,7 +1479,8 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, /* * OK, so we have a VMA. Fetch the mempolicy and try to - * allocate a huge page with it. + * allocate a huge page with it. We will only reach this + * when CONFIG_NUMA=y. */ do { struct page *page; @@ -1520,8 +1526,8 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, * we can call this function, not both. 
*/ if (vma || (addr != -1)) { - WARN_ON_ONCE(addr == -1); - WARN_ON_ONCE(nid != NUMA_NO_NODE); + VM_WARN_ON_ONCE(addr == -1); + VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); } /* * Assume we will successfully allocate the surplus page to @@ -1585,6 +1591,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, * NUMA_NO_NODE, which means that it may be allocated * anywhere. */ +static struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) { unsigned long addr = -1; @@ -1595,6 +1602,7 @@ struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ +static struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { -- cgit v1.1 From f5fc3c5d817435970aa301d066820a9ac12c8120 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 Nov 2015 18:50:23 -0800 Subject: mm: memcontrol: eliminate root memory.current memory.current on the root level doesn't add anything that wouldn't be more accurate and detailed using system statistics. It already doesn't include slabs, and it'll be a pain to keep in sync when further memory types are accounted in the memory controller. Remove it. Note that this applies to the new unified hierarchy interface only. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a44494f..59b100f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5026,7 +5026,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_usage(mem_cgroup_from_css(css), false); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } static int memory_low_show(struct seq_file *m, void *v) @@ -5138,6 +5140,7 @@ static int memory_events_show(struct seq_file *m, void *v) static struct cftype memory_files[] = { { .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, { -- cgit v1.1 From 6071ca5201066f4b2a61cfb693dd186d6bc6e9f3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 Nov 2015 18:50:26 -0800 Subject: mm: page_counter: let page_counter_try_charge() return bool page_counter_try_charge() currently returns 0 on success and -ENOMEM on failure, which is surprising behavior given the function name. Make it follow the expected pattern of try_stuff() functions that return a boolean true to indicate success, or false for failure. 
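A small userspace sketch of the try_*() convention being adopted (the counter type and names are made up for illustration; the real page_counter is more involved):

#include <stdbool.h>
#include <errno.h>

struct counter {
	long count;
	long limit;
};

/* true on success, false if the limit would be exceeded */
static bool counter_try_charge(struct counter *c, long nr_pages)
{
	if (c->count + nr_pages > c->limit)
		return false;
	c->count += nr_pages;
	return true;
}

static int charge(struct counter *c, long nr_pages)
{
	/* call sites read naturally and map failure to an errno themselves */
	if (!counter_try_charge(c, nr_pages))
		return -ENOMEM;
	return 0;
}

Call sites then translate false into whatever error code fits their context, as the hugetlb_cgroup and memcg hunks below do with -ENOMEM.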
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 3 ++- mm/memcontrol.c | 11 +++++------ mm/page_counter.c | 14 +++++++------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 6e00574..33d59ab 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -186,7 +186,8 @@ again: } rcu_read_unlock(); - ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + ret = -ENOMEM; css_put(&h_cg->css); done: *ptr = h_cg; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59b100f..d47de73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2016,8 +2016,8 @@ retry: return 0; if (!do_swap_account || - !page_counter_try_charge(&memcg->memsw, batch, &counter)) { - if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) page_counter_uncharge(&memcg->memsw, batch); @@ -2381,14 +2381,13 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, { unsigned int nr_pages = 1 << order; struct page_counter *counter; - int ret = 0; + int ret; if (!memcg_kmem_is_active(memcg)) return 0; - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret) - return ret; + if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) + return -ENOMEM; ret = try_charge(memcg, gfp, nr_pages); if (ret) { diff --git a/mm/page_counter.c b/mm/page_counter.c index 11b4bed..7c6a63d 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * - * Returns 0 on success, or -ENOMEM and @fail if the counter or one of - * its ancestors has hit its configured limit. + * Returns %true on success, or %false and @fail if the counter or one + * of its ancestors has hit its configured limit. */ -int page_counter_try_charge(struct page_counter *counter, - unsigned long nr_pages, - struct page_counter **fail) +bool page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail) { struct page_counter *c; @@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter, if (new > c->watermark) c->watermark = new; } - return 0; + return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); - return -ENOMEM; + return false; } /** -- cgit v1.1 From c12176d3368b9b36ae484d323d41e94be26f9b65 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 5 Nov 2015 18:50:29 -0800 Subject: memcg: fix thresholds for 32b architectures. Commit 424cdc141380 ("memcg: convert threshold to bytes") has fixed a regression introduced by 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") where thresholds were silently converted to use page units rather than bytes when interpreting the user input. The fix is not complete, though, as properly pointed out by Ben Hutchings during stable backport review. The page count is converted to bytes but unsigned long is used to hold the value which would be obviously not sufficient for 32b systems with more than 4G thresholds. 
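As a rough userspace illustration of that overflow, assuming 4K pages and using uint32_t to stand in for a 32-bit kernel's unsigned long:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pages = 1572864;			/* 6G worth of 4K pages */
	uint32_t bytes32 = pages << 12;			/* wraps: 6G does not fit in 32 bits */
	uint64_t bytes64 = (uint64_t)pages << 12;

	printf("32-bit bytes: %u\n", (unsigned)bytes32);		/* 2147483648 */
	printf("64-bit bytes: %llu\n", (unsigned long long)bytes64);	/* 6442450944 */
	return 0;
}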
The same applies to usage as taken from mem_cgroup_usage which might overflow. Let's remove this bytes vs. pages internal tracking differences and handle thresholds in page units internally. Chage mem_cgroup_usage() to return the value in page units and revert 424cdc141380 because this should be sufficient for the consistent handling. mem_cgroup_read_u64 as the only users of mem_cgroup_usage outside of the threshold handling code is converted to give the proper in bytes result. It is doing that already for page_counter output so this is more consistent as well. The value presented to the userspace is still in bytes units. Fixes: 424cdc141380 ("memcg: convert threshold to bytes") Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") Signed-off-by: Michal Hocko Reported-by: Ben Hutchings Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: From: Michal Hocko Subject: memcg-fix-thresholds-for-32b-architectures-fix Cc: Ben Hutchings Cc: Vladimir Davydov Cc: Johannes Weiner From: Andrew Morton Subject: memcg-fix-thresholds-for-32b-architectures-fix-fix don't attempt to inline mem_cgroup_usage() The compiler ignores the inline anwyay. And __always_inlining it adds 600 bytes of goop to the .o file. Cc: Ben Hutchings Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d47de73..38765d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2801,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - u64 val; + unsigned long val; if (mem_cgroup_is_root(memcg)) { val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -2816,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) else val = page_counter_read(&memcg->memsw); } - return val << PAGE_SHIFT; + return val; } enum { @@ -2850,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) - return mem_cgroup_usage(memcg, false); + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) - return mem_cgroup_usage(memcg, true); + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->limit * PAGE_SIZE; @@ -3352,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; - threshold <<= PAGE_SHIFT; mutex_lock(&memcg->thresholds_lock); -- cgit v1.1 From d0424c429f8e0555a337d71e0a13f2289c636ec9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:34 -0800 Subject: tmpfs: avoid a little creat and stat slowdown LKP reports that v4.2 commit afa2db2fb6f1 ("tmpfs: truncate prealloc blocks past i_size") causes a 14.5% slowdown in the AIM9 creat-clo benchmark. creat-clo does just what you'd expect from the name, and creat's O_TRUNC on 0-length file does indeed get into more overhead now shmem_setattr() tests "0 <= 0" instead of "0 < 0". 
I'm not sure how much we care, but I think it would not be too VW-like to add in a check for whether any pages (or swap) are allocated: if none are allocated, there's none to remove from the radix_tree. At first I thought that check would be good enough for the unmaps too, but no: we should not skip the unlikely case of unmapping pages beyond the new EOF, which were COWed from holes which have now been reclaimed, leaving none. This gives me an 8.5% speedup: on Haswell instead of LKP's Westmere, and running a debug config before and after: I hope those account for the lesser speedup. And probably someone has a benchmark where a thousand threads keep on stat'ing the same file repeatedly: forestall that report by adjusting v4.3 commit 44a30220bc0a ("shmem: recalculate file inode when fstat") not to take the spinlock in shmem_getattr() when there's no work to do. Signed-off-by: Hugh Dickins Reported-by: Ying Huang Tested-by: Ying Huang Cc: Josef Bacik Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 6529226..3b8b739 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -548,12 +548,12 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - + if (info->alloced - info->swapped != inode->i_mapping->nrpages) { + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } generic_fillattr(inode, stat); - return 0; } @@ -586,10 +586,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - shmem_truncate_range(inode, newsize, (loff_t)-1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + if (info->alloced) + shmem_truncate_range(inode, + newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); } } -- cgit v1.1 From 0ba8663cbfae066fc504b858db7cbb7d03c2b872 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:43 -0800 Subject: mm/kasan: rename kasan_enabled() to kasan_report_enabled() The function only disable/enable reporting. In the later patch we will be adding a kasan early enable/disable. Rename kasan_enabled to properly reflect its function. 
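A tiny sketch of the depth-counter idea behind the helper (loosely modelled on current->kasan_depth; the names below are illustrative):

#include <stdbool.h>

struct task {
	int kasan_depth;	/* > 0 means reports are suppressed */
};

static bool report_enabled(const struct task *t)
{
	return t->kasan_depth == 0;
}

static void disable_reporting(struct task *t)
{
	t->kasan_depth++;
}

static void enable_reporting(struct task *t)
{
	t->kasan_depth--;
}

Only reporting is gated this way, which is why kasan_report_enabled() describes the helper better than the old name.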
Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 2 +- mm/kasan/report.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c242adf..a6b46cc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -63,7 +63,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_enabled(void) +static inline bool kasan_report_enabled(void) { return !current->kasan_depth; } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e07c94f..6c3f82b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -220,7 +220,7 @@ void kasan_report(unsigned long addr, size_t size, { struct kasan_access_info info; - if (likely(!kasan_enabled())) + if (likely(!kasan_report_enabled())) return; info.access_addr = (void *)addr; -- cgit v1.1 From 527f215b78976e94995dce7163b07539b576d519 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:46 -0800 Subject: mm/kasan: MODULE_VADDR is not available on all archs Use is_module_address instead Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6c3f82b..d269f20 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -85,9 +86,11 @@ static void print_error_description(struct kasan_access_info *info) static inline bool kernel_or_module_addr(const void *addr) { - return (addr >= (void *)_stext && addr < (void *)_end) - || (addr >= (void *)MODULES_VADDR - && addr < (void *)MODULES_END); + if (addr >= (void *)_stext && addr < (void *)_end) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; } static inline bool init_task_stack_addr(const void *addr) -- cgit v1.1 From f2377d4eaab2aabe1938b3974b5b94f5ba4c7ead Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:48 -0800 Subject: mm/kasan: don't use kasan shadow pointer in generic functions We can't use generic functions like print_hex_dump to access kasan shadow region. This require us to setup another kasan shadow region for the address passed (kasan shadow address). Some architectures won't be able to do that. Hence make a copy of the shadow region row and pass that to generic functions. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d269f20..c536708 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -164,14 +164,20 @@ static void print_shadow_for_address(const void *addr) for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { const void *kaddr = kasan_shadow_to_mem(shadow_row); char buffer[4 + (BITS_PER_LONG/8)*2]; + char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), (i == 0) ? ">%p: " : " %p: ", kaddr); - + /* + * We should not pass a shadow pointer to generic + * function, because generic functions may try to + * access kasan mapping for the passed address. 
+ */ kasan_disable_current(); + memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, - shadow_row, SHADOW_BYTES_PER_ROW, 0); + shadow_buf, SHADOW_BYTES_PER_ROW, 0); kasan_enable_current(); if (row_is_guilty(shadow_row, shadow)) -- cgit v1.1 From fc5aeeaf593278f07ffa4d97296e27423ecae867 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:51 -0800 Subject: mm/kasan: prevent deadlock in kasan reporting When we end up calling kasan_report in real mode, our shadow mapping for the spinlock variable will show poisoned. This will result in us calling kasan_report_error with lock_report spin lock held. To prevent this disable kasan reporting when we are priting error w.r.t kasan. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c536708..7833f07 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -173,12 +173,10 @@ static void print_shadow_for_address(const void *addr) * function, because generic functions may try to * access kasan mapping for the passed address. */ - kasan_disable_current(); memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, shadow_buf, SHADOW_BYTES_PER_ROW, 0); - kasan_enable_current(); if (row_is_guilty(shadow_row, shadow)) pr_err("%*c\n", @@ -195,6 +193,10 @@ void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); @@ -204,12 +206,17 @@ void kasan_report_error(struct kasan_access_info *info) pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report_user_access(struct kasan_access_info *info) { unsigned long flags; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); @@ -222,6 +229,7 @@ void kasan_report_user_access(struct kasan_access_info *info) pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report(unsigned long addr, size_t size, -- cgit v1.1 From e91210766341cb356ead7fd39f07493a3d00b80f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:50:55 -0800 Subject: kasan: update reported bug types for not user nor kernel memory accesses Each access with address lower than kasan_shadow_to_mem(KASAN_SHADOW_START) is reported as user-memory-access. This is not always true, the accessed address might not be in user space. Fix this by reporting such accesses as null-ptr-derefs or wild-memory-accesses. There's another reason for this change. For userspace ASan we have a bunch of systems that analyze error types for the purpose of classification and deduplication. Sooner of later we will write them to KASAN as well. Then clearly and explicitly stated error types will bring value. 
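A sketch of the classification this introduces for accesses below the shadow-covered range (the constants are stand-ins, not the kernel's PAGE_SIZE/TASK_SIZE macros):

#define PAGE_SIZE_STANDIN	4096UL
#define TASK_SIZE_STANDIN	0x00007fffffffffffUL	/* illustrative user/kernel split */

static const char *bad_access_type(unsigned long addr)
{
	if (addr < PAGE_SIZE_STANDIN)
		return "null-ptr-deref";
	else if (addr < TASK_SIZE_STANDIN)
		return "user-memory-access";
	else
		return "wild-memory-access";
}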
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 8 +------- mm/kasan/kasan.h | 3 --- mm/kasan/report.c | 50 +++++++++++++++++++++++--------------------------- 3 files changed, 24 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 8da2114..1104cb0 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -235,18 +235,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) static __always_inline void check_memory_region(unsigned long addr, size_t size, bool write) { - struct kasan_access_info info; - if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - info.access_addr = (void *)addr; - info.access_size = size; - info.is_write = write; - info.ip = _RET_IP_; - kasan_report_user_access(&info); + kasan_report(addr, size, write, _RET_IP_); return; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index a6b46cc..4f6c62e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -54,9 +54,6 @@ struct kasan_global { #endif }; -void kasan_report_error(struct kasan_access_info *info); -void kasan_report_user_access(struct kasan_access_info *info); - static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7833f07..964aaf4 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -189,9 +189,10 @@ static void print_shadow_for_address(const void *addr) static DEFINE_SPINLOCK(report_lock); -void kasan_report_error(struct kasan_access_info *info) +static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + const char *bug_type; /* * Make sure we don't end up in loop. @@ -200,32 +201,26 @@ void kasan_report_error(struct kasan_access_info *info) spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); - print_error_description(info); - print_address_description(info); - print_shadow_for_address(info->first_bad_addr); - pr_err("=================================" - "=================================\n"); - spin_unlock_irqrestore(&report_lock, flags); - kasan_enable_current(); -} - -void kasan_report_user_access(struct kasan_access_info *info) -{ - unsigned long flags; - - /* - * Make sure we don't end up in loop. - */ - kasan_disable_current(); - spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); - pr_err("BUG: KASan: user-memory-access on address %p\n", - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); - dump_stack(); + if (info->access_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + pr_err("BUG: KASan: %s on address %p\n", + bug_type, info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, + task_pid_nr(current)); + dump_stack(); + } else { + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + } pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); @@ -244,6 +239,7 @@ void kasan_report(unsigned long addr, size_t size, info.access_size = size; info.is_write = is_write; info.ip = ip; + kasan_report_error(&info); } -- cgit v1.1 From 0952d87fd6a6211ac51b2abdc5c066b49c651fd8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:50:58 -0800 Subject: kasan: update reported bug types for kernel memory accesses Update the names of the bad access types to better reflect the type of the access that happended and make these error types "literals" that can be used for classification and deduplication in scripts. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 964aaf4..cdf4c31 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -49,7 +49,7 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = "unknown crash"; + const char *bug_type = "unknown-crash"; u8 shadow_val; info->first_bad_addr = find_first_bad_addr(info->access_addr, @@ -58,21 +58,25 @@ static void print_error_description(struct kasan_access_info *info) shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); switch (shadow_val) { - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - bug_type = "use after free"; + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; case KASAN_GLOBAL_REDZONE: - case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: - bug_type = "out of bounds access"; + bug_type = "global-out-of-bounds"; break; case KASAN_STACK_LEFT: case KASAN_STACK_MID: case KASAN_STACK_RIGHT: case KASAN_STACK_PARTIAL: - bug_type = "out of bounds on stack"; + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use-after-free"; break; } -- cgit v1.1 From cdf6a273dc4346277ab9d148ef29f6e058624a8c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:01 -0800 Subject: kasan: accurately determine the type of the bad access Makes KASAN accurately determine the type of the bad access. If the shadow byte value is in the [0, KASAN_SHADOW_SCALE_SIZE) range we can look at the next shadow byte to determine the type of the access. 
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cdf4c31..be53a8f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -50,15 +50,26 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; - u8 shadow_val; + u8 *shadow_addr; info->first_bad_addr = find_first_bad_addr(info->access_addr, info->access_size); - shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - switch (shadow_val) { + /* + * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. + */ bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: -- cgit v1.1 From 25add7ec708170e4eaef1f9793a07803b2fb5c71 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:03 -0800 Subject: kasan: update log messages We decided to use KASAN as the short name of the tool and KernelAddressSanitizer as the full one. Update log messages according to that. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- mm/kasan/report.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1104cb0..be9b78d 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -518,7 +518,7 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("WARNING: KASAN doesn't support memory hot-add\n"); pr_err("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index be53a8f..ae6bd36 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -91,7 +91,7 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASan: %s in %pS at addr %p\n", + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", @@ -224,7 +224,7 @@ static void kasan_report_error(struct kasan_access_info *info) bug_type = "user-memory-access"; else bug_type = "wild-memory-access"; - pr_err("BUG: KASan: %s on address %p\n", + pr_err("BUG: KASAN: %s on address %p\n", bug_type, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", info->is_write ? "Write" : "Read", -- cgit v1.1 From 5d0926efe728e00afbd81a1e3c498222cf908d23 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:12 -0800 Subject: kasan: update reference to kasan prototype repo Update the reference to the kasan prototype repository on github, since it was renamed. 
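[Editor's note] Taken together, the bug-type renaming and the shadow-byte lookahead in the KASAN report patches above amount to a small classification routine. The following is a stand-alone userspace sketch of that logic, not the kernel code itself: the scale size and shadow marker values are illustrative stand-ins for the real KASAN_* constants, and the example object is a hypothetical kmalloc(13) allocation whose shadow is [0, 5, redzone]. An access past the object first hits the partially-addressable shadow byte (value 5), steps one byte ahead, finds the slab redzone, and is therefore reported as slab-out-of-bounds.

#include <stdio.h>

#define SHADOW_SCALE_SIZE   8      /* bytes of memory per shadow byte (illustrative) */
#define KMALLOC_REDZONE     0xFC   /* illustrative stand-in for KASAN_KMALLOC_REDZONE */
#define KMALLOC_FREE        0xFB   /* illustrative stand-in for KASAN_KMALLOC_FREE */

static const char *classify(const unsigned char *shadow)
{
    /*
     * A value in (0, SHADOW_SCALE_SIZE) only says "partially addressable";
     * look one shadow byte ahead to see what actually lies beyond the object.
     */
    if (*shadow > 0 && *shadow <= SHADOW_SCALE_SIZE - 1)
        shadow++;

    switch (*shadow) {
    case 0 ... SHADOW_SCALE_SIZE - 1:
        return "out-of-bounds";        /* still possible, e.g. racy shadow update */
    case KMALLOC_REDZONE:
        return "slab-out-of-bounds";
    case KMALLOC_FREE:
        return "use-after-free";
    default:
        return "unknown-crash";
    }
}

int main(void)
{
    /* Shadow for a hypothetical 13-byte slab object followed by its redzone. */
    unsigned char shadow[] = { 0, 5, KMALLOC_REDZONE };

    /* The first bad shadow byte for an access past offset 13 is shadow[1]. */
    printf("%s\n", classify(&shadow[1]));    /* prints: slab-out-of-bounds */
    return 0;
}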
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- mm/kasan/report.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index be9b78d..21c50dc 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -4,7 +4,7 @@ * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov * * This program is free software; you can redistribute it and/or modify diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ae6bd36..f5e068a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -4,7 +4,7 @@ * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov * * This program is free software; you can redistribute it and/or modify -- cgit v1.1 From e0d57714394f5e2ce4e2f9bbebf48e3c7a7fd3be Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 5 Nov 2015 18:51:18 -0800 Subject: kasan: Fix a type conversion error The current KASAN code can not find the following out-of-bounds bugs: char *ptr; ptr = kmalloc(8, GFP_KERNEL); memset(ptr+7, 0, 2); the cause of the problem is the type conversion error in *memory_is_poisoned_n* function. So this patch fix that. Signed-off-by: Wang Long Acked-by: Andrey Ryabinin Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 21c50dc..2b21ccd 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -203,7 +203,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) return true; } return false; -- cgit v1.1 From 10f702627e139e21465f4c9d44f63527bbca163c Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:51:21 -0800 Subject: kasan: use IS_ALIGNED in memory_is_poisoned_8() Use IS_ALIGNED() to determine whether the shadow span two bytes. It generates less code and more readable. Also add some comments in shadow check functions. Signed-off-by: Xishi Qiu Acked-by: Andrey Ryabinin Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 2b21ccd..d41b21b 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr) if (memory_is_poisoned_1(addr + 1)) return true; + /* + * If single shadow byte covers 2-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. 
+ */ if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) return false; @@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr) if (memory_is_poisoned_1(addr + 3)) return true; + /* + * If single shadow byte covers 4-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) return false; @@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr) if (memory_is_poisoned_1(addr + 7)) return true; - if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) + /* + * If single shadow byte covers 8-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return unlikely(*(u8 *)shadow_addr); @@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr) if (unlikely(shadow_first_bytes)) return true; - if (likely(IS_ALIGNED(addr, 8))) + /* + * If two shadow bytes covers 16-byte access, we don't + * need to do anything more. Otherwise, test the last + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return memory_is_poisoned_1(addr + 15); -- cgit v1.1 From 89d3c87e20d95e3238eac85e43de7b3cb1f39d8b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Nov 2015 18:51:23 -0800 Subject: mm, slub, kasan: enable user tracking by default with KASAN=y It's recommended to have slub's user tracking enabled with CONFIG_KASAN, because: a) User tracking disables slab merging which improves detecting out-of-bounds accesses. b) User tracking metadata acts as redzone which also improves detecting out-of-bounds accesses. c) User tracking provides additional information about object. This information helps to understand bugs. Currently it is not enabled by default. Besides recompiling the kernel with KASAN and reinstalling it, user also have to change the boot cmdline, which is not very handy. Enable slub user tracking by default with KASAN=y, since there is no good reason to not do this. [akpm@linux-foundation.org: little fixes, per David] Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 423dbe7..75a5fa9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) /* * Debug settings: */ -#ifdef CONFIG_SLUB_DEBUG_ON +#if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; +#elif defined(CONFIG_KASAN) +static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif -- cgit v1.1 From eb06f43f1c94d502b7867b0998e92cdabbc060bc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Nov 2015 18:51:26 -0800 Subject: kasan: always taint kernel on report Currently we already taint the kernel in some cases. E.g. if we hit some bug in slub memory we call object_err() which will taint the kernel with TAINT_BAD_PAGE flag. But for other kind of bugs kernel left untainted. Always taint with TAINT_BAD_PAGE if kasan found some bug. This is useful for automated testing. 
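[Editor's note] Since the point of the taint is automated testing, a harness can simply poll the taint mask after running a workload under KASAN. A minimal sketch of such a check follows; it assumes TAINT_BAD_PAGE is bit 5 of /proc/sys/kernel/tainted (the 'B' flag), which should be confirmed against the running kernel's headers rather than taken from this note.

#include <stdio.h>

#define TAINT_BAD_PAGE_BIT 5    /* 'B' flag; assumption -- verify against include/linux/kernel.h */

int main(void)
{
    unsigned long tainted = 0;
    FILE *f = fopen("/proc/sys/kernel/tainted", "r");

    if (!f) {
        perror("fopen");
        return 2;
    }
    if (fscanf(f, "%lu", &tainted) != 1)
        tainted = 0;
    fclose(f);

    if (tainted & (1UL << TAINT_BAD_PAGE_BIT)) {
        printf("bad-page taint set: KASAN (or slub) reported a bug\n");
        return 1;
    }
    printf("no bad-page taint recorded\n");
    return 0;
}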
Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Reviewed-by: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f5e068a..12f222d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -238,6 +238,7 @@ static void kasan_report_error(struct kasan_access_info *info) } pr_err("=================================" "=================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); } -- cgit v1.1 From 1aab92ec3de552362397b718744872ea2d17add2 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:29 -0800 Subject: mm: mlock: refactor mlock, munlock, and munlockall code mlock() allows a user to control page out of program memory, but this comes at the cost of faulting in the entire mapping when it is allocated. For large mappings where the entire area is not necessary this is not ideal. Instead of forcing all locked pages to be present when they are allocated, this set creates a middle ground. Pages are marked to be placed on the unevictable LRU (locked) when they are first used, but they are not faulted in by the mlock call. This series introduces a new mlock() system call that takes a flags argument along with the start address and size. This flags argument gives the caller the ability to request memory be locked in the traditional way, or to be locked after the page is faulted in. A new MCL flag is added to mirror the lock on fault behavior from mlock() in mlockall(). There are two main use cases that this set covers. The first is the security focussed mlock case. A buffer is needed that cannot be written to swap. The maximum size is known, but on average the memory used is significantly less than this maximum. With lock on fault, the buffer is guaranteed to never be paged out without consuming the maximum size every time such a buffer is created. The second use case is focussed on performance. Portions of a large file are needed and we want to keep the used portions in memory once accessed. This is the case for large graphical models where the path through the graph is not known until run time. The entire graph is unlikely to be used in a given invocation, but once a node has been used it needs to stay resident for further processing. Given these constraints we have a number of options. We can potentially waste a large amount of memory by mlocking the entire region (this can also cause a significant stall at startup as the entire file is read in). We can mlock every page as we access them without tracking if the page is already resident but this introduces large overhead for each access. The third option is mapping the entire region with PROT_NONE and using a signal handler for SIGSEGV to mprotect(PROT_READ) and mlock() the needed page. Doing this page at a time adds a significant performance penalty. Batching can be used to mitigate this overhead, but in order to safely avoid trying to mprotect pages outside of the mapping, the boundaries of each mapping to be used in this way must be tracked and available to the signal handler. This is precisely what the mm system in the kernel should already be doing. For mlock(MLOCK_ONFAULT) the user is charged against RLIMIT_MEMLOCK as if mlock(MLOCK_LOCKED) or mmap(MAP_LOCKED) was used, so when the VMA is created not when the pages are faulted in. 
For mlockall(MCL_ONFAULT) the user is charged as if MCL_FUTURE was used. This decision was made to keep the accounting checks out of the page fault path. To illustrate the benefit of this set I wrote a test program that mmaps a 5 GB file filled with random data and then makes 15,000,000 accesses to random addresses in that mapping. The test program was run 20 times for each setup. Results are reported for two program portions, setup and execution. The setup phase is calling mmap and optionally mlock on the entire region. For most experiments this is trivial, but it highlights the cost of faulting in the entire region. Results are averages across the 20 runs in milliseconds. mmap with mlock(MLOCK_LOCKED) on entire range: Setup avg: 8228.666 Processing avg: 8274.257 mmap with mlock(MLOCK_LOCKED) before each access: Setup avg: 0.113 Processing avg: 90993.552 mmap with PROT_NONE and signal handler and batch size of 1 page: With the default value in max_map_count, this gets ENOMEM as I attempt to change the permissions, after upping the sysctl significantly I get: Setup avg: 0.058 Processing avg: 69488.073 mmap with PROT_NONE and signal handler and batch size of 8 pages: Setup avg: 0.068 Processing avg: 38204.116 mmap with PROT_NONE and signal handler and batch size of 16 pages: Setup avg: 0.044 Processing avg: 29671.180 mmap with mlock(MLOCK_ONFAULT) on entire range: Setup avg: 0.189 Processing avg: 17904.899 The signal handler in the batch cases faulted in memory in two steps to avoid having to know the start and end of the faulting mapping. The first step covers the page that caused the fault as we know that it will be possible to lock. The second step speculatively tries to mlock and mprotect the batch size - 1 pages that follow. There may be a clever way to avoid this without having the program track each mapping to be covered by this handeler in a globally accessible structure, but I could not find it. It should be noted that with a large enough batch size this two step fault handler can still cause the program to crash if it reaches far beyond the end of the mapping. These results show that if the developer knows that a majority of the mapping will be used, it is better to try and fault it in at once, otherwise mlock(MLOCK_ONFAULT) is significantly faster. The performance cost of these patches are minimal on the two benchmarks I have tested (stream and kernbench). The following are the average values across 20 runs of stream and 10 runs of kernbench after a warmup run whose results were discarded. Avg throughput in MB/s from stream using 1000000 element arrays Test 4.2-rc1 4.2-rc1+lock-on-fault Copy: 10,566.5 10,421 Scale: 10,685 10,503.5 Add: 12,044.1 11,814.2 Triad: 12,064.8 11,846.3 Kernbench optimal load 4.2-rc1 4.2-rc1+lock-on-fault Elapsed Time 78.453 78.991 User Time 64.2395 65.2355 System Time 9.7335 9.7085 Context Switches 22211.5 22412.1 Sleeps 14965.3 14956.1 This patch (of 6): Extending the mlock system call is very difficult because it currently does not take a flags argument. A later patch in this set will extend mlock to support a middle ground between pages that are locked and faulted in immediately and unlocked pages. To pave the way for the new system call, the code needs some reorganization so that all the actual entry point handles is checking input and translating to VMA flags. Signed-off-by: Eric B Munson Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Michael Kerrisk Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Jonathan Corbet Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 550228d..fbd8c03 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -554,7 +554,8 @@ out: return ret; } -static int do_mlock(unsigned long start, size_t len, int on) +static int apply_vma_lock_flags(unsigned long start, size_t len, + vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct * vma, * prev; @@ -576,14 +577,11 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags; - - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + vm_flags_t newflags = vma->vm_flags & ~VM_LOCKED; - newflags = vma->vm_flags & ~VM_LOCKED; - if (on) - newflags |= VM_LOCKED; + newflags |= flags; + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; @@ -605,7 +603,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; @@ -629,7 +627,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) - error = do_mlock(start, len, 1); + error = apply_vma_lock_flags(start, len, flags); up_write(¤t->mm->mmap_sem); if (error) @@ -641,6 +639,11 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return 0; } +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; @@ -649,13 +652,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) start &= PAGE_MASK; down_write(¤t->mm->mmap_sem); - ret = do_mlock(start, len, 0); + ret = apply_vma_lock_flags(start, len, 0); up_write(¤t->mm->mmap_sem); return ret; } -static int do_mlockall(int flags) +static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; @@ -663,6 +666,7 @@ static int do_mlockall(int flags) current->mm->def_flags |= VM_LOCKED; else current->mm->def_flags &= ~VM_LOCKED; + if (flags == MCL_FUTURE) goto out; @@ -703,7 +707,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) - ret = do_mlockall(flags); + ret = apply_mlockall_flags(flags); up_write(¤t->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); @@ -716,7 +720,7 @@ SYSCALL_DEFINE0(munlockall) int ret; down_write(¤t->mm->mmap_sem); - ret = do_mlockall(0); + ret = apply_mlockall_flags(0); up_write(¤t->mm->mmap_sem); return ret; } -- cgit v1.1 From a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:33 -0800 Subject: mm: mlock: add new mlock system call With the refactored mlock code, introduce a new system call for mlock. The new call will allow the user to specify what lock states are being added. 
mlock2 is trivial at the moment, but a follow on patch will add a new mlock state making it useful. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Heiko Carstens Cc: Geert Uytterhoeven Cc: Catalin Marinas Cc: Stephen Rothwell Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index fbd8c03..35dcf8f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -644,6 +644,14 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return do_mlock(start, len, VM_LOCKED); } +SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) +{ + if (flags) + return -EINVAL; + + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; -- cgit v1.1 From de60f5f10c58d4f34b68622442c0e04180367f3f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:36 -0800 Subject: mm: introduce VM_LOCKONFAULT The cost of faulting in all memory to be locked can be very high when working with large mappings. If only portions of the mapping will be used this can incur a high penalty for locking. For the example of a large file, this is the usage pattern for a large statical language model (probably applies to other statical or graphical models as well). For the security example, any application transacting in data that cannot be swapped out (credit card data, medical records, etc). This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in. The VM_LOCKONFAULT flag will be used together with VM_LOCKED and has no effect when set without VM_LOCKED. Setting the VM_LOCKONFAULT flag for a VMA will cause pages faulted into that VMA to be added to the unevictable LRU when they are faulted or if they are already present, but will not cause any missing pages to be faulted in. Exposing this new lock state means that we cannot overload the meaning of the FOLL_POPULATE flag any longer. Prior to this patch it was used to mean that the VMA for a fault was locked. This means we need the new FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE will now only control if the VMA should be populated and in the case of VM_LOCKONFAULT, it will not be set. Signed-off-by: Eric B Munson Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 1 + mm/gup.c | 10 ++++++++-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 2 +- mm/mmap.c | 2 +- 6 files changed, 14 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 6c1b3ea..e784110 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_GROWSDOWN, "growsdown" }, {VM_PFNMAP, "pfnmap" }, {VM_DENYWRITE, "denywrite" }, + {VM_LOCKONFAULT, "lockonfault" }, {VM_LOCKED, "locked" }, {VM_IO, "io" }, {VM_SEQ_READ, "seqread" }, diff --git a/mm/gup.c b/mm/gup.c index a798293..deafa2c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -129,7 +129,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; /* For mm_populate(), just skip the stack guard page. */ if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || @@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE; + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3fd0311..f5c08b4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 241de27..74ef0c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4137,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 35dcf8f..ca38941 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; 
while (start < end) { struct page *page = NULL; diff --git a/mm/mmap.c b/mm/mmap.c index 220effd..2ce04a6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1661,7 +1661,7 @@ out: vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); else - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; } if (file) -- cgit v1.1 From b0f205c2a3082dd9081f9a94e50658c5fa906ff1 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:39 -0800 Subject: mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage The previous patch introduced a flag that specified pages in a VMA should be placed on the unevictable LRU, but they should not be made present when the area is created. This patch adds the ability to set this state via the new mlock system calls. We add MLOCK_ONFAULT for mlock2 and MCL_ONFAULT for mlockall. MLOCK_ONFAULT will set the VM_LOCKONFAULT modifier for VM_LOCKED. MCL_ONFAULT should be used as a modifier to the two other mlockall flags. When used with MCL_CURRENT, all current mappings will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with MCL_FUTURE, the mm->def_flags will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with both MCL_CURRENT and MCL_FUTURE, all current mappings and mm->def_flags will be marked with VM_LOCKED | VM_LOCKONFAULT. Prior to this patch, mlockall() will unconditionally clear the mm->def_flags any time it is called without MCL_FUTURE. This behavior is maintained after adding MCL_ONFAULT. If a call to mlockall(MCL_FUTURE) is followed by mlockall(MCL_CURRENT), the mm->def_flags will be cleared and new VMAs will be unlocked. This remains true with or without MCL_ONFAULT in either mlockall() invocation. munlock() will unconditionally clear both vma flags. munlockall() unconditionally clears for VMA flags on all VMAs and in the mm->def_flags field. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Kirill A. 
Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index ca38941..339d9e0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -506,7 +506,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - goto out; /* don't set VM_LOCKED, don't count */ + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, @@ -577,7 +578,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & ~VM_LOCKED; + vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= flags; @@ -646,10 +647,15 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { - if (flags) + vm_flags_t vm_flags = VM_LOCKED; + + if (flags & ~MLOCK_ONFAULT) return -EINVAL; - return do_mlock(start, len, VM_LOCKED); + if (flags & MLOCK_ONFAULT) + vm_flags |= VM_LOCKONFAULT; + + return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) @@ -666,24 +672,43 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) return ret; } +/* + * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) + * and translate into the appropriate modifications to mm->def_flags and/or the + * flags for all current VMAs. + * + * There are a couple of subtleties with this. If mlockall() is called multiple + * times with different flags, the values do not necessarily stack. If mlockall + * is called once including the MCL_FUTURE flag and then a second time without + * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. + */ static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; + vm_flags_t to_add = 0; - if (flags & MCL_FUTURE) + current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; - else - current->mm->def_flags &= ~VM_LOCKED; - if (flags == MCL_FUTURE) - goto out; + if (flags & MCL_ONFAULT) + current->mm->def_flags |= VM_LOCKONFAULT; + + if (!(flags & MCL_CURRENT)) + goto out; + } + + if (flags & MCL_CURRENT) { + to_add |= VM_LOCKED; + if (flags & MCL_ONFAULT) + to_add |= VM_LOCKONFAULT; + } for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags & ~VM_LOCKED; - if (flags & MCL_CURRENT) - newflags |= VM_LOCKED; + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= to_add; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); @@ -698,7 +723,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) unsigned long lock_limit; int ret; - if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) return -EINVAL; if (!can_do_mlock()) -- cgit v1.1
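[Editor's note] To make the end result of the mlock series concrete, here is a rough userspace sketch of the new lock-on-fault behaviour. It is not part of the patches: the mlock2 syscall number is assumed to be exported by the installed kernel headers via <sys/syscall.h>, and the MLOCK_ONFAULT value of 0x01 is an assumption based on this series' uapi changes, which are not shown in the mm/ diffs above. As the series cover letter explains, the whole range is still charged against RLIMIT_MEMLOCK when mlock2() is called, not when the pages fault in.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01    /* assumed value from this series' uapi patch */
#endif

int main(void)
{
    size_t len = 256UL << 20;    /* 256 MiB we expect to touch only sparsely */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /*
     * Lock pages as they are faulted in instead of pre-faulting the whole
     * mapping the way plain mlock() would.  The full range still counts
     * against RLIMIT_MEMLOCK here, so the limit must cover it (or the
     * caller needs CAP_IPC_LOCK).
     */
    if (syscall(__NR_mlock2, p, len, MLOCK_ONFAULT) != 0) {
        perror("mlock2");
        return 1;
    }

    p[0] = 1;    /* only this page becomes resident -- and unevictable */
    return 0;
}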