From 4b8164b91d9fdff4dbac0a742d076bdff7fda21b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 31 Jan 2015 20:08:47 -0500
Subject: new helper: dup_iter()

Copy iter and kmemdup the underlying array for the copy.  Returns
a pointer to result of kmemdup() to be kfree()'d later.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'mm')

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 8277320..9d96e283 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -751,3 +751,18 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 	return npages;
 }
 EXPORT_SYMBOL(iov_iter_npages);
+
+const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
+{
+	*new = *old;
+	if (new->type & ITER_BVEC)
+		return new->bvec = kmemdup(new->bvec,
+				    new->nr_segs * sizeof(struct bio_vec),
+				    flags);
+	else
+		/* iovec and kvec have identical layout */
+		return new->iov = kmemdup(new->iov,
+				   new->nr_segs * sizeof(struct iovec),
+				   flags);
+}
+EXPORT_SYMBOL(dup_iter);
-- 
cgit v1.1


From d879cb83417a71c435f1263e1160a9fce8e95d87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2014 16:05:55 -0500
Subject: move iov_iter.c from mm/ to lib/

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/Makefile   |   2 +-
 mm/iov_iter.c | 768 ----------------------------------------------------------
 2 files changed, 1 insertion(+), 769 deletions(-)
 delete mode 100644 mm/iov_iter.c

(limited to 'mm')

diff --git a/mm/Makefile b/mm/Makefile
index 3c1caa2..15dbe99 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,7 +21,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   iov_iter.o debug.o $(mmu-y)
+			   debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
deleted file mode 100644
index 9d96e283..0000000
--- a/mm/iov_iter.c
+++ /dev/null
@@ -1,768 +0,0 @@
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <net/checksum.h>
-
-#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
-	size_t left;					\
-	size_t wanted = n;				\
-	__p = i->iov;					\
-	__v.iov_len = min(n, __p->iov_len - skip);	\
-	if (likely(__v.iov_len)) {			\
-		__v.iov_base = __p->iov_base + skip;	\
-		left = (STEP);				\
-		__v.iov_len -= left;			\
-		skip += __v.iov_len;			\
-		n -= __v.iov_len;			\
-	} else {					\
-		left = 0;				\
-	}						\
-	while (unlikely(!left && n)) {			\
-		__p++;					\
-		__v.iov_len = min(n, __p->iov_len);	\
-		if (unlikely(!__v.iov_len))		\
-			continue;			\
-		__v.iov_base = __p->iov_base;		\
-		left = (STEP);				\
-		__v.iov_len -= left;			\
-		skip = __v.iov_len;			\
-		n -= __v.iov_len;			\
-	}						\
-	n = wanted - n;					\
-}
-
-#define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
-	size_t wanted = n;				\
-	__p = i->kvec;					\
-	__v.iov_len = min(n, __p->iov_len - skip);	\
-	if (likely(__v.iov_len)) {			\
-		__v.iov_base = __p->iov_base + skip;	\
-		(void)(STEP);				\
-		skip += __v.iov_len;			\
-		n -= __v.iov_len;			\
-	}						\
-	while (unlikely(n)) {				\
-		__p++;					\
-		__v.iov_len = min(n, __p->iov_len);	\
-		if (unlikely(!__v.iov_len))		\
-			continue;			\
-		__v.iov_base = __p->iov_base;		\
-		(void)(STEP);				\
-		skip = __v.iov_len;			\
-		n -= __v.iov_len;			\
-	}						\
-	n = wanted;					\
-}
-
-#define iterate_bvec(i, n, __v, __p, skip, STEP) {	\
-	size_t wanted = n;				\
-	__p = i->bvec;					\
-	__v.bv_len = min_t(size_t, n, __p->bv_len - skip);	\
-	if (likely(__v.bv_len)) {			\
-		__v.bv_page = __p->bv_page;		\
-		__v.bv_offset = __p->bv_offset + skip; 	\
-		(void)(STEP);				\
-		skip += __v.bv_len;			\
-		n -= __v.bv_len;			\
-	}						\
-	while (unlikely(n)) {				\
-		__p++;					\
-		__v.bv_len = min_t(size_t, n, __p->bv_len);	\
-		if (unlikely(!__v.bv_len))		\
-			continue;			\
-		__v.bv_page = __p->bv_page;		\
-		__v.bv_offset = __p->bv_offset;		\
-		(void)(STEP);				\
-		skip = __v.bv_len;			\
-		n -= __v.bv_len;			\
-	}						\
-	n = wanted;					\
-}
-
-#define iterate_all_kinds(i, n, v, I, B, K) {			\
-	size_t skip = i->iov_offset;				\
-	if (unlikely(i->type & ITER_BVEC)) {			\
-		const struct bio_vec *bvec;			\
-		struct bio_vec v;				\
-		iterate_bvec(i, n, v, bvec, skip, (B))		\
-	} else if (unlikely(i->type & ITER_KVEC)) {		\
-		const struct kvec *kvec;			\
-		struct kvec v;					\
-		iterate_kvec(i, n, v, kvec, skip, (K))		\
-	} else {						\
-		const struct iovec *iov;			\
-		struct iovec v;					\
-		iterate_iovec(i, n, v, iov, skip, (I))		\
-	}							\
-}
-
-#define iterate_and_advance(i, n, v, I, B, K) {			\
-	size_t skip = i->iov_offset;				\
-	if (unlikely(i->type & ITER_BVEC)) {			\
-		const struct bio_vec *bvec;			\
-		struct bio_vec v;				\
-		iterate_bvec(i, n, v, bvec, skip, (B))		\
-		if (skip == bvec->bv_len) {			\
-			bvec++;					\
-			skip = 0;				\
-		}						\
-		i->nr_segs -= bvec - i->bvec;			\
-		i->bvec = bvec;					\
-	} else if (unlikely(i->type & ITER_KVEC)) {		\
-		const struct kvec *kvec;			\
-		struct kvec v;					\
-		iterate_kvec(i, n, v, kvec, skip, (K))		\
-		if (skip == kvec->iov_len) {			\
-			kvec++;					\
-			skip = 0;				\
-		}						\
-		i->nr_segs -= kvec - i->kvec;			\
-		i->kvec = kvec;					\
-	} else {						\
-		const struct iovec *iov;			\
-		struct iovec v;					\
-		iterate_iovec(i, n, v, iov, skip, (I))		\
-		if (skip == iov->iov_len) {			\
-			iov++;					\
-			skip = 0;				\
-		}						\
-		i->nr_segs -= iov - i->iov;			\
-		i->iov = iov;					\
-	}							\
-	i->count -= n;						\
-	i->iov_offset = skip;					\
-}
-
-static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *from;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (!fault_in_pages_writeable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		from = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_to_user_inatomic(buf, from, copy);
-		copy -= left;
-		skip += copy;
-		from += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_to_user_inatomic(buf, from, copy);
-			copy -= left;
-			skip = copy;
-			from += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = from - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	from = kaddr + offset;
-	left = __copy_to_user(buf, from, copy);
-	copy -= left;
-	skip += copy;
-	from += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_to_user(buf, from, copy);
-		copy -= left;
-		skip = copy;
-		from += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	if (skip == iov->iov_len) {
-		iov++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-
-static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *to;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (!fault_in_pages_readable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		to = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_from_user_inatomic(to, buf, copy);
-		copy -= left;
-		skip += copy;
-		to += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_from_user_inatomic(to, buf, copy);
-			copy -= left;
-			skip = copy;
-			to += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = to - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	to = kaddr + offset;
-	left = __copy_from_user(to, buf, copy);
-	copy -= left;
-	skip += copy;
-	to += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_from_user(to, buf, copy);
-		copy -= left;
-		skip = copy;
-		to += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	if (skip == iov->iov_len) {
-		iov++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
-	if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		bytes = min(bytes, i->iov->iov_len - i->iov_offset);
-		return fault_in_pages_readable(buf, bytes);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-void iov_iter_init(struct iov_iter *i, int direction,
-			const struct iovec *iov, unsigned long nr_segs,
-			size_t count)
-{
-	/* It will get better.  Eventually... */
-	if (segment_eq(get_fs(), KERNEL_DS)) {
-		direction |= ITER_KVEC;
-		i->type = direction;
-		i->kvec = (struct kvec *)iov;
-	} else {
-		i->type = direction;
-		i->iov = iov;
-	}
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count;
-}
-EXPORT_SYMBOL(iov_iter_init);
-
-static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
-{
-	char *from = kmap_atomic(page);
-	memcpy(to, from + offset, len);
-	kunmap_atomic(from);
-}
-
-static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len)
-{
-	char *to = kmap_atomic(page);
-	memcpy(to + offset, from, len);
-	kunmap_atomic(to);
-}
-
-static void memzero_page(struct page *page, size_t offset, size_t len)
-{
-	char *addr = kmap_atomic(page);
-	memset(addr + offset, 0, len);
-	kunmap_atomic(addr);
-}
-
-size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
-	char *from = addr;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	iterate_and_advance(i, bytes, v,
-		__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
-			       v.iov_len),
-		memcpy_to_page(v.bv_page, v.bv_offset,
-			       (from += v.bv_len) - v.bv_len, v.bv_len),
-		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
-	)
-
-	return bytes;
-}
-EXPORT_SYMBOL(copy_to_iter);
-
-size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
-	char *to = addr;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	iterate_and_advance(i, bytes, v,
-		__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
-				 v.iov_len),
-		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
-	)
-
-	return bytes;
-}
-EXPORT_SYMBOL(copy_from_iter);
-
-size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
-{
-	char *to = addr;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	iterate_and_advance(i, bytes, v,
-		__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
-					 v.iov_base, v.iov_len),
-		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
-	)
-
-	return bytes;
-}
-EXPORT_SYMBOL(copy_from_iter_nocache);
-
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i)
-{
-	if (i->type & (ITER_BVEC|ITER_KVEC)) {
-		void *kaddr = kmap_atomic(page);
-		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
-		kunmap_atomic(kaddr);
-		return wanted;
-	} else
-		return copy_page_to_iter_iovec(page, offset, bytes, i);
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i)
-{
-	if (i->type & (ITER_BVEC|ITER_KVEC)) {
-		void *kaddr = kmap_atomic(page);
-		size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
-		kunmap_atomic(kaddr);
-		return wanted;
-	} else
-		return copy_page_from_iter_iovec(page, offset, bytes, i);
-}
-EXPORT_SYMBOL(copy_page_from_iter);
-
-size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
-{
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	iterate_and_advance(i, bytes, v,
-		__clear_user(v.iov_base, v.iov_len),
-		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
-		memset(v.iov_base, 0, v.iov_len)
-	)
-
-	return bytes;
-}
-EXPORT_SYMBOL(iov_iter_zero);
-
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
-	iterate_all_kinds(i, bytes, v,
-		__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
-					  v.iov_base, v.iov_len),
-		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len),
-		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
-	)
-	kunmap_atomic(kaddr);
-	return bytes;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-void iov_iter_advance(struct iov_iter *i, size_t size)
-{
-	iterate_and_advance(i, size, v, 0, 0, 0)
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
-	if (i->nr_segs == 1)
-		return i->count;
-	else if (i->type & ITER_BVEC)
-		return min(i->count, i->bvec->bv_len - i->iov_offset);
-	else
-		return min(i->count, i->iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
-void iov_iter_kvec(struct iov_iter *i, int direction,
-			const struct kvec *kvec, unsigned long nr_segs,
-			size_t count)
-{
-	BUG_ON(!(direction & ITER_KVEC));
-	i->type = direction;
-	i->kvec = kvec;
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count;
-}
-EXPORT_SYMBOL(iov_iter_kvec);
-
-void iov_iter_bvec(struct iov_iter *i, int direction,
-			const struct bio_vec *bvec, unsigned long nr_segs,
-			size_t count)
-{
-	BUG_ON(!(direction & ITER_BVEC));
-	i->type = direction;
-	i->bvec = bvec;
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count;
-}
-EXPORT_SYMBOL(iov_iter_bvec);
-
-unsigned long iov_iter_alignment(const struct iov_iter *i)
-{
-	unsigned long res = 0;
-	size_t size = i->count;
-
-	if (!size)
-		return 0;
-
-	iterate_all_kinds(i, size, v,
-		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
-		res |= v.bv_offset | v.bv_len,
-		res |= (unsigned long)v.iov_base | v.iov_len
-	)
-	return res;
-}
-EXPORT_SYMBOL(iov_iter_alignment);
-
-ssize_t iov_iter_get_pages(struct iov_iter *i,
-		   struct page **pages, size_t maxsize, unsigned maxpages,
-		   size_t *start)
-{
-	if (maxsize > i->count)
-		maxsize = i->count;
-
-	if (!maxsize)
-		return 0;
-
-	iterate_all_kinds(i, maxsize, v, ({
-		unsigned long addr = (unsigned long)v.iov_base;
-		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
-		int n;
-		int res;
-
-		if (len > maxpages * PAGE_SIZE)
-			len = maxpages * PAGE_SIZE;
-		addr &= ~(PAGE_SIZE - 1);
-		n = DIV_ROUND_UP(len, PAGE_SIZE);
-		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
-		if (unlikely(res < 0))
-			return res;
-		return (res == n ? len : res * PAGE_SIZE) - *start;
-	0;}),({
-		/* can't be more than PAGE_SIZE */
-		*start = v.bv_offset;
-		get_page(*pages = v.bv_page);
-		return v.bv_len;
-	}),({
-		return -EFAULT;
-	})
-	)
-	return 0;
-}
-EXPORT_SYMBOL(iov_iter_get_pages);
-
-static struct page **get_pages_array(size_t n)
-{
-	struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
-	if (!p)
-		p = vmalloc(n * sizeof(struct page *));
-	return p;
-}
-
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
-		   struct page ***pages, size_t maxsize,
-		   size_t *start)
-{
-	struct page **p;
-
-	if (maxsize > i->count)
-		maxsize = i->count;
-
-	if (!maxsize)
-		return 0;
-
-	iterate_all_kinds(i, maxsize, v, ({
-		unsigned long addr = (unsigned long)v.iov_base;
-		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
-		int n;
-		int res;
-
-		addr &= ~(PAGE_SIZE - 1);
-		n = DIV_ROUND_UP(len, PAGE_SIZE);
-		p = get_pages_array(n);
-		if (!p)
-			return -ENOMEM;
-		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
-		if (unlikely(res < 0)) {
-			kvfree(p);
-			return res;
-		}
-		*pages = p;
-		return (res == n ? len : res * PAGE_SIZE) - *start;
-	0;}),({
-		/* can't be more than PAGE_SIZE */
-		*start = v.bv_offset;
-		*pages = p = get_pages_array(1);
-		if (!p)
-			return -ENOMEM;
-		get_page(*p = v.bv_page);
-		return v.bv_len;
-	}),({
-		return -EFAULT;
-	})
-	)
-	return 0;
-}
-EXPORT_SYMBOL(iov_iter_get_pages_alloc);
-
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
-			       struct iov_iter *i)
-{
-	char *to = addr;
-	__wsum sum, next;
-	size_t off = 0;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	sum = *csum;
-	iterate_and_advance(i, bytes, v, ({
-		int err = 0;
-		next = csum_and_copy_from_user(v.iov_base, 
-					       (to += v.iov_len) - v.iov_len,
-					       v.iov_len, 0, &err);
-		if (!err) {
-			sum = csum_block_add(sum, next, off);
-			off += v.iov_len;
-		}
-		err ? v.iov_len : 0;
-	}), ({
-		char *p = kmap_atomic(v.bv_page);
-		next = csum_partial_copy_nocheck(p + v.bv_offset,
-						 (to += v.bv_len) - v.bv_len,
-						 v.bv_len, 0);
-		kunmap_atomic(p);
-		sum = csum_block_add(sum, next, off);
-		off += v.bv_len;
-	}),({
-		next = csum_partial_copy_nocheck(v.iov_base,
-						 (to += v.iov_len) - v.iov_len,
-						 v.iov_len, 0);
-		sum = csum_block_add(sum, next, off);
-		off += v.iov_len;
-	})
-	)
-	*csum = sum;
-	return bytes;
-}
-EXPORT_SYMBOL(csum_and_copy_from_iter);
-
-size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
-			     struct iov_iter *i)
-{
-	char *from = addr;
-	__wsum sum, next;
-	size_t off = 0;
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	sum = *csum;
-	iterate_and_advance(i, bytes, v, ({
-		int err = 0;
-		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
-					     v.iov_base, 
-					     v.iov_len, 0, &err);
-		if (!err) {
-			sum = csum_block_add(sum, next, off);
-			off += v.iov_len;
-		}
-		err ? v.iov_len : 0;
-	}), ({
-		char *p = kmap_atomic(v.bv_page);
-		next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
-						 p + v.bv_offset,
-						 v.bv_len, 0);
-		kunmap_atomic(p);
-		sum = csum_block_add(sum, next, off);
-		off += v.bv_len;
-	}),({
-		next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
-						 v.iov_base,
-						 v.iov_len, 0);
-		sum = csum_block_add(sum, next, off);
-		off += v.iov_len;
-	})
-	)
-	*csum = sum;
-	return bytes;
-}
-EXPORT_SYMBOL(csum_and_copy_to_iter);
-
-int iov_iter_npages(const struct iov_iter *i, int maxpages)
-{
-	size_t size = i->count;
-	int npages = 0;
-
-	if (!size)
-		return 0;
-
-	iterate_all_kinds(i, size, v, ({
-		unsigned long p = (unsigned long)v.iov_base;
-		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
-			- p / PAGE_SIZE;
-		if (npages >= maxpages)
-			return maxpages;
-	0;}),({
-		npages++;
-		if (npages >= maxpages)
-			return maxpages;
-	}),({
-		unsigned long p = (unsigned long)v.iov_base;
-		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
-			- p / PAGE_SIZE;
-		if (npages >= maxpages)
-			return maxpages;
-	})
-	)
-	return npages;
-}
-EXPORT_SYMBOL(iov_iter_npages);
-
-const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
-{
-	*new = *old;
-	if (new->type & ITER_BVEC)
-		return new->bvec = kmemdup(new->bvec,
-				    new->nr_segs * sizeof(struct bio_vec),
-				    flags);
-	else
-		/* iovec and kvec have identical layout */
-		return new->iov = kmemdup(new->iov,
-				   new->nr_segs * sizeof(struct iovec),
-				   flags);
-}
-EXPORT_SYMBOL(dup_iter);
-- 
cgit v1.1


From 53da3bc2ba9e4899f32707b5cd7d18421b943687 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 12 Mar 2015 08:45:46 -0700
Subject: mm: fix up numa read-only thread grouping logic

Dave Chinner reported that commit 4d9424669946 ("mm: convert
p[te|md]_mknonnuma and remaining page table manipulations") slowed down
his xfsrepair test enormously.  In particular, it was using more system
time due to extra TLB flushing.

The ultimate reason turns out to be how the change to use the regular
page table accessor functions broke the NUMA grouping logic.  The old
special mknuma/mknonnuma code accessed the page table present bit and
the magic NUMA bit directly, while the new code just changes the page
protections using PROT_NONE and the regular vma protections.

That sounds equivalent, and from a fault standpoint it really is, but a
subtle side effect is that the *other* protection bits of the page table
entries also change.  And the code to decide how to group the NUMA
entries together used the writable bit to decide whether a particular
page was likely to be shared read-only or not.

And with the change to make the NUMA handling use the regular permission
setting functions, that writable bit was basically always cleared for
private mappings due to COW.  So even if the page actually ends up being
written to in the end, the NUMA balancing would act as if it was always
shared RO.

This code is a heuristic anyway, so the fix - at least for now - is to
instead check whether the page is dirty rather than writable.  The bit
doesn't change with protection changes.

NOTE! This also adds a FIXME comment to revisit this issue,

Not only should we probably re-visit the whole "is this a shared
read-only page" heuristic (we might want to take the vma permissions
into account and base this more on those than the per-page ones, and
also look at whether the particular access that triggers it is a write
or not), but the whole COW issue shows that we should think about the
NUMA fault handling some more.

For example, maybe we should do the early-COW thing that a regular fault
does.  Or maybe we should accept that while using the same bits as
PROTNONE was a good thing (and got rid of the specual NUMA bit), we
might still want to just preseve the other protection bits across NUMA
faulting.

Those are bigger questions, left for later.  This just fixes up the
heuristic so that it at least approximates working again.  More analysis
and work needed.

Reported-by: Dave Chinner <david@fromorbit.com>
Tested-by: Mel Gorman <mgorman@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>,
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 7 ++++++-
 mm/memory.c      | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fc00c8c..89b9075 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1295,8 +1295,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Avoid grouping on DSO/COW pages in specific and RO pages
 	 * in general, RO pages shouldn't hurt as much anyway since
 	 * they can be in shared cache state.
+	 *
+	 * FIXME! This checks "pmd_dirty()" as an approximation of
+	 * "is this a read-only page", since checking "pmd_write()"
+	 * is even more broken. We haven't actually turned this into
+	 * a writable page, so pmd_write() will always be false.
 	 */
-	if (!pmd_write(pmd))
+	if (!pmd_dirty(pmd))
 		flags |= TNF_NO_GROUP;
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index 8068893..411144f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3072,8 +3072,13 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Avoid grouping on DSO/COW pages in specific and RO pages
 	 * in general, RO pages shouldn't hurt as much anyway since
 	 * they can be in shared cache state.
+	 *
+	 * FIXME! This checks "pmd_dirty()" as an approximation of
+	 * "is this a read-only page", since checking "pmd_write()"
+	 * is even more broken. We haven't actually turned this into
+	 * a writable page, so pmd_write() will always be false.
 	 */
-	if (!pte_write(pte))
+	if (!pte_dirty(pte))
 		flags |= TNF_NO_GROUP;
 
 	/*
-- 
cgit v1.1


From ba68bc0115ebfc37f911db4e87bf5f7991f89698 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Sat, 7 Mar 2015 15:20:48 +0000
Subject: mm: thp: Return the correct value for change_huge_pmd

The wrong value is being returned by change_huge_pmd since commit
10c1045f28e8 ("mm: numa: avoid unnecessary TLB flushes when setting
NUMA hinting entries") which allows a fallthrough that tries to adjust
non-existent PTEs. This patch corrects it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 89b9075..626e93d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1487,6 +1487,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		pmd_t entry;
+		ret = 1;
 
 		/*
 		 * Avoid trapping faults against the zero page. The read-only
@@ -1495,11 +1496,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (prot_numa && is_huge_zero_pmd(*pmd)) {
 			spin_unlock(ptl);
-			return 0;
+			return ret;
 		}
 
 		if (!prot_numa || !pmd_protnone(*pmd)) {
-			ret = 1;
 			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
-- 
cgit v1.1


From e009d5dc0a94a7133e5f1c083732d760bfd038e6 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Mar 2015 16:25:52 -0700
Subject: mm, oom: do not fail __GFP_NOFAIL allocation if oom killer is
 disabled

Tetsuo Handa has pointed out that __GFP_NOFAIL allocations might fail
after OOM killer is disabled if the allocation is performed by a kernel
thread.  This behavior was introduced from the very beginning by
7f33d49a2ed5 ("mm, PM/Freezer: Disable OOM killer when tasks are frozen").
 This means that the basic contract for the allocation request is broken
and the context requesting such an allocation might blow up unexpectedly.

There are basically two ways forward.

1) move oom_killer_disable after kernel threads are frozen.  This has a
   risk that the OOM victim wouldn't be able to finish because it would
   depend on an already frozen kernel thread.  This would be really tricky
   to debug.

2) do not fail GFP_NOFAIL allocation no matter what and risk a
   potential Freezable kernel threads will loop and fail the suspend.
   Incidental allocations after kernel threads are frozen will at least
   dump a warning - if we are lucky and the serial console is still active
   of course...

This patch implements the later option because it is safer.  We would see
warning rather than allocation failures for the kernel threads which would
blow up otherwise and have a higher chances to identify __GFP_NOFAIL users
from deeper pm code.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: David Rientjes <rientjes@gooogle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7abfa70..40e2942 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2373,7 +2373,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+	if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
+			|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
 		*did_some_progress = 1;
 out:
 	oom_zonelist_unlock(ac->zonelist, gfp_mask);
-- 
cgit v1.1


From 44fc80573cc760a7154f41fd0a958ee10eba1a81 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 12 Mar 2015 16:25:54 -0700
Subject: mm, hugetlb: close race when setting PageTail for gigantic pages

Now that gigantic pages are dynamically allocatable, care must be taken to
ensure that p->first_page is valid before setting PageTail.

If this isn't done, then it is possible to race and have compound_head()
return NULL.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a9ac6c..c41b2a0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -917,7 +917,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 	__SetPageHead(page);
 	__ClearPageReserved(page);
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-		__SetPageTail(p);
 		/*
 		 * For gigantic hugepages allocated through bootmem at
 		 * boot, it's safer to be consistent with the not-gigantic
@@ -933,6 +932,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 		__ClearPageReserved(p);
 		set_page_count(p, 0);
 		p->first_page = page;
+		/* Make sure p->first_page is always valid for PageTail() */
+		smp_wmb();
+		__SetPageTail(p);
 	}
 }
 
-- 
cgit v1.1


From 850fc430f47aad52092deaaeb32b99f97f0e6aca Mon Sep 17 00:00:00 2001
From: Danesh Petigara <dpetigara@broadcom.com>
Date: Thu, 12 Mar 2015 16:25:57 -0700
Subject: mm: cma: fix CMA aligned offset calculation

The CMA aligned offset calculation is incorrect for non-zero order_per_bit
values.

For example, if cma->order_per_bit=1, cma->base_pfn= 0x2f800000 and
align_order=12, the function returns a value of 0x17c00 instead of 0x400.

This patch fixes the CMA aligned offset calculation.

The previous calculation was wrong and would return too-large values for
the offset, so that when cma_alloc looks for free pages in the bitmap with
the requested alignment > order_per_bit, it starts too far into the bitmap
and so CMA allocations will fail despite there actually being plenty of
free pages remaining.  It will also probably have the wrong alignment.
With this change, we will get the correct offset into the bitmap.

One affected user is powerpc KVM, which has kvm_cma->order_per_bit set to
KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, or 18 - 12 = 6.

[gregory.0xf0@gmail.com: changelog additions]
Signed-off-by: Danesh Petigara <dpetigara@broadcom.com>
Reviewed-by: Gregory Fong <gregory.0xf0@gmail.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/cma.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/cma.c b/mm/cma.c
index 75016fd..68ecb7a 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -64,15 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
 	return (1UL << (align_order - cma->order_per_bit)) - 1;
 }
 
+/*
+ * Find a PFN aligned to the specified order and return an offset represented in
+ * order_per_bits.
+ */
 static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
 {
-	unsigned int alignment;
-
 	if (align_order <= cma->order_per_bit)
 		return 0;
-	alignment = 1UL << (align_order - cma->order_per_bit);
-	return ALIGN(cma->base_pfn, alignment) -
-		(cma->base_pfn >> cma->order_per_bit);
+
+	return (ALIGN(cma->base_pfn, (1UL << align_order))
+		- cma->base_pfn) >> cma->order_per_bit;
 }
 
 static unsigned long cma_bitmap_maxno(struct cma *cma)
-- 
cgit v1.1


From 5b8bf30721980b254be7a07315c353b3a3175b74 Mon Sep 17 00:00:00 2001
From: gchen gchen <xili_gchen_5257@hotmail.com>
Date: Thu, 12 Mar 2015 16:26:05 -0700
Subject: mm/nommu.c: export symbol max_mapnr

Several modules may need max_mapnr, so export, the related error with
allmodconfig under c6x:

  MODPOST 3327 modules
  ERROR: "max_mapnr" [fs/pstore/ramoops.ko] undefined!
  ERROR: "max_mapnr" [drivers/media/v4l2-core/videobuf2-dma-contig.ko] undefined!

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 3e67e75..3fba2dc9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,6 +62,7 @@ void *high_memory;
 EXPORT_SYMBOL(high_memory);
 struct page *mem_map;
 unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
 unsigned long highest_memmap_pfn;
 struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-- 
cgit v1.1


From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Thu, 12 Mar 2015 16:26:11 -0700
Subject: kasan, module, vmalloc: rework shadow allocation for modules

Current approach in handling shadow memory for modules is broken.

Shadow memory could be freed only after memory shadow corresponds it is no
longer used.  vfree() called from interrupt context could use memory its
freeing to store 'struct llist_node' in it:

    void vfree(const void *addr)
    {
    ...
        if (unlikely(in_interrupt())) {
            struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
            if (llist_add((struct llist_node *)addr, &p->list))
                    schedule_work(&p->wq);

Later this list node used in free_work() which actually frees memory.
Currently module_memfree() called in interrupt context will free shadow
before freeing module's memory which could provoke kernel crash.

So shadow memory should be freed after module's memory.  However, such
deallocation order could race with kasan_module_alloc() in module_alloc().

Free shadow right before releasing vm area.  At this point vfree()'d
memory is not used anymore and yet not available for other allocations.
New VM_KASAN flag used to indicate that vm area has dynamically allocated
shadow memory so kasan frees shadow only if it was previously allocated.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/kasan.c | 14 +++++++++++---
 mm/vmalloc.c     |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 78fee63..936d816 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -29,6 +29,7 @@
 #include <linux/stacktrace.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/vmalloc.h>
 #include <linux/kasan.h>
 
 #include "kasan.h"
@@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size)
 			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
 			PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
 			__builtin_return_address(0));
-	return ret ? 0 : -ENOMEM;
+
+	if (ret) {
+		find_vm_area(addr)->flags |= VM_KASAN;
+		return 0;
+	}
+
+	return -ENOMEM;
 }
 
-void kasan_module_free(void *addr)
+void kasan_free_shadow(const struct vm_struct *vm)
 {
-	vfree(kasan_mem_to_shadow(addr));
+	if (vm->flags & VM_KASAN)
+		vfree(kasan_mem_to_shadow(vm->addr));
 }
 
 static void register_global(struct kasan_global *global)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35b25e1..49abccf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 		spin_unlock(&vmap_area_lock);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
+		kasan_free_shadow(vm);
 		free_unmap_vmap_area(va);
 		vm->size -= PAGE_SIZE;
 
-- 
cgit v1.1


From a5a6579db33af91f4f5134e14be758dc71c1b694 Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Thu, 12 Mar 2015 16:26:17 -0700
Subject: mm: reorder can_do_mlock to fix audit denial

A userspace call to mmap(MAP_LOCKED) may result in the successful locking
of memory while also producing a confusing audit log denial.  can_do_mlock
checks capable and rlimit.  If either of these return positive
can_do_mlock returns true.  The capable check leads to an LSM hook used by
apparmour and selinux which produce the audit denial.  Reordering so
rlimit is checked first eliminates the denial on success, only recording a
denial when the lock is unsuccessful as a result of the denial.

Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
Acked-by: Nick Kralevich <nnk@google.com>
Cc: Jeff Vander Stoep <jeffv@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Paul Cassella <cassella@cray.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mlock.c b/mm/mlock.c
index 73cf098..8a54cd2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -26,10 +26,10 @@
 
 int can_do_mlock(void)
 {
-	if (capable(CAP_IPC_LOCK))
-		return 1;
 	if (rlimit(RLIMIT_MEMLOCK) != 0)
 		return 1;
+	if (capable(CAP_IPC_LOCK))
+		return 1;
 	return 0;
 }
 EXPORT_SYMBOL(can_do_mlock);
-- 
cgit v1.1


From 7feee590bb18ffc42636975f74c2c3120ce1901c Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Mar 2015 16:26:19 -0700
Subject: memcg: disable hierarchy support if bound to the legacy cgroup
 hierarchy

If the memory cgroup controller is initially mounted in the scope of the
default cgroup hierarchy and then remounted to a legacy hierarchy, it will
still have hierarchy support enabled, which is incorrect.  We should
disable hierarchy support if bound to the legacy cgroup hierarchy.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9fe0769..b34ef4a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5232,7 +5232,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 	 * on for the root memcg is enough.
 	 */
 	if (cgroup_on_dfl(root_css->cgroup))
-		mem_cgroup_from_css(root_css)->use_hierarchy = true;
+		root_mem_cgroup->use_hierarchy = true;
+	else
+		root_mem_cgroup->use_hierarchy = false;
 }
 
 static u64 memory_current_read(struct cgroup_subsys_state *css,
-- 
cgit v1.1