From 4b8164b91d9fdff4dbac0a742d076bdff7fda21b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 31 Jan 2015 20:08:47 -0500 Subject: new helper: dup_iter() Copy iter and kmemdup the underlying array for the copy. Returns a pointer to result of kmemdup() to be kfree()'d later. Signed-off-by: Al Viro --- mm/iov_iter.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 8277320..9d96e283 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -751,3 +751,18 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return npages; } EXPORT_SYMBOL(iov_iter_npages); + +const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) +{ + *new = *old; + if (new->type & ITER_BVEC) + return new->bvec = kmemdup(new->bvec, + new->nr_segs * sizeof(struct bio_vec), + flags); + else + /* iovec and kvec have identical layout */ + return new->iov = kmemdup(new->iov, + new->nr_segs * sizeof(struct iovec), + flags); +} +EXPORT_SYMBOL(dup_iter); -- cgit v1.1 From d879cb83417a71c435f1263e1160a9fce8e95d87 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 16:05:55 -0500 Subject: move iov_iter.c from mm/ to lib/ Signed-off-by: Al Viro --- mm/Makefile | 2 +- mm/iov_iter.c | 768 ---------------------------------------------------------- 2 files changed, 1 insertion(+), 769 deletions(-) delete mode 100644 mm/iov_iter.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 3c1caa2..15dbe99 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,7 +21,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - iov_iter.o debug.o $(mmu-y) + debug.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/iov_iter.c b/mm/iov_iter.c deleted file mode 100644 index 9d96e283..0000000 --- a/mm/iov_iter.c +++ /dev/null @@ -1,768 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ - size_t left; \ - size_t wanted = n; \ - __p = i->iov; \ - __v.iov_len = min(n, __p->iov_len - skip); \ - if (likely(__v.iov_len)) { \ - __v.iov_base = __p->iov_base + skip; \ - left = (STEP); \ - __v.iov_len -= left; \ - skip += __v.iov_len; \ - n -= __v.iov_len; \ - } else { \ - left = 0; \ - } \ - while (unlikely(!left && n)) { \ - __p++; \ - __v.iov_len = min(n, __p->iov_len); \ - if (unlikely(!__v.iov_len)) \ - continue; \ - __v.iov_base = __p->iov_base; \ - left = (STEP); \ - __v.iov_len -= left; \ - skip = __v.iov_len; \ - n -= __v.iov_len; \ - } \ - n = wanted - n; \ -} - -#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->kvec; \ - __v.iov_len = min(n, __p->iov_len - skip); \ - if (likely(__v.iov_len)) { \ - __v.iov_base = __p->iov_base + skip; \ - (void)(STEP); \ - skip += __v.iov_len; \ - n -= __v.iov_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.iov_len = min(n, __p->iov_len); \ - if (unlikely(!__v.iov_len)) \ - continue; \ - __v.iov_base = __p->iov_base; \ - (void)(STEP); \ - skip = __v.iov_len; \ - n -= __v.iov_len; \ - } \ - n = wanted; \ -} - -#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->bvec; \ - __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ - if (likely(__v.bv_len)) { \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset + skip; \ - (void)(STEP); \ - skip += __v.bv_len; \ - n -= __v.bv_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.bv_len = min_t(size_t, n, __p->bv_len); \ - if (unlikely(!__v.bv_len)) \ - continue; \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset; \ - (void)(STEP); \ - skip = __v.bv_len; \ - n -= __v.bv_len; \ - } \ - n = wanted; \ -} - -#define iterate_all_kinds(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - } \ -} - -#define iterate_and_advance(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - if (skip == kvec->iov_len) { \ - kvec++; \ - skip = 0; \ - } \ - i->nr_segs -= kvec - i->kvec; \ - i->kvec = kvec; \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - if (skip == iov->iov_len) { \ - iov++; \ - skip = 0; \ - } \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ - } \ - i->count -= n; \ - i->iov_offset = skip; \ -} - -static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *from; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_writeable(buf, copy)) { - kaddr = kmap_atomic(page); - from = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = from - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - from = kaddr + offset; - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - kunmap(page); -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *to; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_readable(buf, copy)) { - kaddr = kmap_atomic(page); - to = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = to - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - to = kaddr + offset; - left = __copy_from_user(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - kunmap(page); -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - if (!(i->type & (ITER_BVEC|ITER_KVEC))) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); - } - return 0; -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -void iov_iter_init(struct iov_iter *i, int direction, - const struct iovec *iov, unsigned long nr_segs, - size_t count) -{ - /* It will get better. Eventually... */ - if (segment_eq(get_fs(), KERNEL_DS)) { - direction |= ITER_KVEC; - i->type = direction; - i->kvec = (struct kvec *)iov; - } else { - i->type = direction; - i->iov = iov; - } - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_init); - -static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) -{ - char *from = kmap_atomic(page); - memcpy(to, from + offset, len); - kunmap_atomic(from); -} - -static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) -{ - char *to = kmap_atomic(page); - memcpy(to + offset, from, len); - kunmap_atomic(to); -} - -static void memzero_page(struct page *page, size_t offset, size_t len) -{ - char *addr = kmap_atomic(page); - memset(addr + offset, 0, len); - kunmap_atomic(addr); -} - -size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - char *from = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, - v.iov_len), - memcpy_to_page(v.bv_page, v.bv_offset, - (from += v.bv_len) - v.bv_len, v.bv_len), - memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_to_iter); - -size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, - v.iov_len), - memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_from_iter); - -size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) -{ - char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_from_user_nocache((to += v.iov_len) - v.iov_len, - v.iov_base, v.iov_len), - memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_from_iter_nocache); - -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - if (i->type & (ITER_BVEC|ITER_KVEC)) { - void *kaddr = kmap_atomic(page); - size_t wanted = copy_to_iter(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; - } else - return copy_page_to_iter_iovec(page, offset, bytes, i); -} -EXPORT_SYMBOL(copy_page_to_iter); - -size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - if (i->type & (ITER_BVEC|ITER_KVEC)) { - void *kaddr = kmap_atomic(page); - size_t wanted = copy_from_iter(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; - } else - return copy_page_from_iter_iovec(page, offset, bytes, i); -} -EXPORT_SYMBOL(copy_page_from_iter); - -size_t iov_iter_zero(size_t bytes, struct iov_iter *i) -{ - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __clear_user(v.iov_base, v.iov_len), - memzero_page(v.bv_page, v.bv_offset, v.bv_len), - memset(v.iov_base, 0, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(iov_iter_zero); - -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr = kmap_atomic(page), *p = kaddr + offset; - iterate_all_kinds(i, bytes, v, - __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, - v.iov_base, v.iov_len), - memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - kunmap_atomic(kaddr); - return bytes; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -void iov_iter_advance(struct iov_iter *i, size_t size) -{ - iterate_and_advance(i, size, v, 0, 0, 0) -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - if (i->nr_segs == 1) - return i->count; - else if (i->type & ITER_BVEC) - return min(i->count, i->bvec->bv_len - i->iov_offset); - else - return min(i->count, i->iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - -void iov_iter_kvec(struct iov_iter *i, int direction, - const struct kvec *kvec, unsigned long nr_segs, - size_t count) -{ - BUG_ON(!(direction & ITER_KVEC)); - i->type = direction; - i->kvec = kvec; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_kvec); - -void iov_iter_bvec(struct iov_iter *i, int direction, - const struct bio_vec *bvec, unsigned long nr_segs, - size_t count) -{ - BUG_ON(!(direction & ITER_BVEC)); - i->type = direction; - i->bvec = bvec; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_bvec); - -unsigned long iov_iter_alignment(const struct iov_iter *i) -{ - unsigned long res = 0; - size_t size = i->count; - - if (!size) - return 0; - - iterate_all_kinds(i, size, v, - (res |= (unsigned long)v.iov_base | v.iov_len, 0), - res |= v.bv_offset | v.bv_len, - res |= (unsigned long)v.iov_base | v.iov_len - ) - return res; -} -EXPORT_SYMBOL(iov_iter_alignment); - -ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - if (maxsize > i->count) - maxsize = i->count; - - if (!maxsize) - return 0; - - iterate_all_kinds(i, maxsize, v, ({ - unsigned long addr = (unsigned long)v.iov_base; - size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); - int n; - int res; - - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - addr &= ~(PAGE_SIZE - 1); - n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); - if (unlikely(res < 0)) - return res; - return (res == n ? len : res * PAGE_SIZE) - *start; - 0;}),({ - /* can't be more than PAGE_SIZE */ - *start = v.bv_offset; - get_page(*pages = v.bv_page); - return v.bv_len; - }),({ - return -EFAULT; - }) - ) - return 0; -} -EXPORT_SYMBOL(iov_iter_get_pages); - -static struct page **get_pages_array(size_t n) -{ - struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - return p; -} - -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - struct page **p; - - if (maxsize > i->count) - maxsize = i->count; - - if (!maxsize) - return 0; - - iterate_all_kinds(i, maxsize, v, ({ - unsigned long addr = (unsigned long)v.iov_base; - size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); - int n; - int res; - - addr &= ~(PAGE_SIZE - 1); - n = DIV_ROUND_UP(len, PAGE_SIZE); - p = get_pages_array(n); - if (!p) - return -ENOMEM; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); - if (unlikely(res < 0)) { - kvfree(p); - return res; - } - *pages = p; - return (res == n ? len : res * PAGE_SIZE) - *start; - 0;}),({ - /* can't be more than PAGE_SIZE */ - *start = v.bv_offset; - *pages = p = get_pages_array(1); - if (!p) - return -ENOMEM; - get_page(*p = v.bv_page); - return v.bv_len; - }),({ - return -EFAULT; - }) - ) - return 0; -} -EXPORT_SYMBOL(iov_iter_get_pages_alloc); - -size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - char *to = addr; - __wsum sum, next; - size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - sum = *csum; - iterate_and_advance(i, bytes, v, ({ - int err = 0; - next = csum_and_copy_from_user(v.iov_base, - (to += v.iov_len) - v.iov_len, - v.iov_len, 0, &err); - if (!err) { - sum = csum_block_add(sum, next, off); - off += v.iov_len; - } - err ? v.iov_len : 0; - }), ({ - char *p = kmap_atomic(v.bv_page); - next = csum_partial_copy_nocheck(p + v.bv_offset, - (to += v.bv_len) - v.bv_len, - v.bv_len, 0); - kunmap_atomic(p); - sum = csum_block_add(sum, next, off); - off += v.bv_len; - }),({ - next = csum_partial_copy_nocheck(v.iov_base, - (to += v.iov_len) - v.iov_len, - v.iov_len, 0); - sum = csum_block_add(sum, next, off); - off += v.iov_len; - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_from_iter); - -size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - char *from = addr; - __wsum sum, next; - size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - sum = *csum; - iterate_and_advance(i, bytes, v, ({ - int err = 0; - next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, - v.iov_len, 0, &err); - if (!err) { - sum = csum_block_add(sum, next, off); - off += v.iov_len; - } - err ? v.iov_len : 0; - }), ({ - char *p = kmap_atomic(v.bv_page); - next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, - p + v.bv_offset, - v.bv_len, 0); - kunmap_atomic(p); - sum = csum_block_add(sum, next, off); - off += v.bv_len; - }),({ - next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, - v.iov_base, - v.iov_len, 0); - sum = csum_block_add(sum, next, off); - off += v.iov_len; - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_to_iter); - -int iov_iter_npages(const struct iov_iter *i, int maxpages) -{ - size_t size = i->count; - int npages = 0; - - if (!size) - return 0; - - iterate_all_kinds(i, size, v, ({ - unsigned long p = (unsigned long)v.iov_base; - npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - - p / PAGE_SIZE; - if (npages >= maxpages) - return maxpages; - 0;}),({ - npages++; - if (npages >= maxpages) - return maxpages; - }),({ - unsigned long p = (unsigned long)v.iov_base; - npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - - p / PAGE_SIZE; - if (npages >= maxpages) - return maxpages; - }) - ) - return npages; -} -EXPORT_SYMBOL(iov_iter_npages); - -const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) -{ - *new = *old; - if (new->type & ITER_BVEC) - return new->bvec = kmemdup(new->bvec, - new->nr_segs * sizeof(struct bio_vec), - flags); - else - /* iovec and kvec have identical layout */ - return new->iov = kmemdup(new->iov, - new->nr_segs * sizeof(struct iovec), - flags); -} -EXPORT_SYMBOL(dup_iter); -- cgit v1.1 From 53da3bc2ba9e4899f32707b5cd7d18421b943687 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Mar 2015 08:45:46 -0700 Subject: mm: fix up numa read-only thread grouping logic Dave Chinner reported that commit 4d9424669946 ("mm: convert p[te|md]_mknonnuma and remaining page table manipulations") slowed down his xfsrepair test enormously. In particular, it was using more system time due to extra TLB flushing. The ultimate reason turns out to be how the change to use the regular page table accessor functions broke the NUMA grouping logic. The old special mknuma/mknonnuma code accessed the page table present bit and the magic NUMA bit directly, while the new code just changes the page protections using PROT_NONE and the regular vma protections. That sounds equivalent, and from a fault standpoint it really is, but a subtle side effect is that the *other* protection bits of the page table entries also change. And the code to decide how to group the NUMA entries together used the writable bit to decide whether a particular page was likely to be shared read-only or not. And with the change to make the NUMA handling use the regular permission setting functions, that writable bit was basically always cleared for private mappings due to COW. So even if the page actually ends up being written to in the end, the NUMA balancing would act as if it was always shared RO. This code is a heuristic anyway, so the fix - at least for now - is to instead check whether the page is dirty rather than writable. The bit doesn't change with protection changes. NOTE! This also adds a FIXME comment to revisit this issue, Not only should we probably re-visit the whole "is this a shared read-only page" heuristic (we might want to take the vma permissions into account and base this more on those than the per-page ones, and also look at whether the particular access that triggers it is a write or not), but the whole COW issue shows that we should think about the NUMA fault handling some more. For example, maybe we should do the early-COW thing that a regular fault does. Or maybe we should accept that while using the same bits as PROTNONE was a good thing (and got rid of the specual NUMA bit), we might still want to just preseve the other protection bits across NUMA faulting. Those are bigger questions, left for later. This just fixes up the heuristic so that it at least approximates working again. More analysis and work needed. Reported-by: Dave Chinner Tested-by: Mel Gorman Cc: Andrew Morton Cc: Aneesh Kumar Cc: Ingo Molnar , Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- mm/memory.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fc00c8c..89b9075 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1295,8 +1295,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Avoid grouping on DSO/COW pages in specific and RO pages * in general, RO pages shouldn't hurt as much anyway since * they can be in shared cache state. + * + * FIXME! This checks "pmd_dirty()" as an approximation of + * "is this a read-only page", since checking "pmd_write()" + * is even more broken. We haven't actually turned this into + * a writable page, so pmd_write() will always be false. */ - if (!pmd_write(pmd)) + if (!pmd_dirty(pmd)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 8068893..411144f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3072,8 +3072,13 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Avoid grouping on DSO/COW pages in specific and RO pages * in general, RO pages shouldn't hurt as much anyway since * they can be in shared cache state. + * + * FIXME! This checks "pmd_dirty()" as an approximation of + * "is this a read-only page", since checking "pmd_write()" + * is even more broken. We haven't actually turned this into + * a writable page, so pmd_write() will always be false. */ - if (!pte_write(pte)) + if (!pte_dirty(pte)) flags |= TNF_NO_GROUP; /* -- cgit v1.1 From ba68bc0115ebfc37f911db4e87bf5f7991f89698 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 7 Mar 2015 15:20:48 +0000 Subject: mm: thp: Return the correct value for change_huge_pmd The wrong value is being returned by change_huge_pmd since commit 10c1045f28e8 ("mm: numa: avoid unnecessary TLB flushes when setting NUMA hinting entries") which allows a fallthrough that tries to adjust non-existent PTEs. This patch corrects it. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 89b9075..626e93d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1487,6 +1487,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + ret = 1; /* * Avoid trapping faults against the zero page. The read-only @@ -1495,11 +1496,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (prot_numa && is_huge_zero_pmd(*pmd)) { spin_unlock(ptl); - return 0; + return ret; } if (!prot_numa || !pmd_protnone(*pmd)) { - ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; -- cgit v1.1 From e009d5dc0a94a7133e5f1c083732d760bfd038e6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Mar 2015 16:25:52 -0700 Subject: mm, oom: do not fail __GFP_NOFAIL allocation if oom killer is disabled Tetsuo Handa has pointed out that __GFP_NOFAIL allocations might fail after OOM killer is disabled if the allocation is performed by a kernel thread. This behavior was introduced from the very beginning by 7f33d49a2ed5 ("mm, PM/Freezer: Disable OOM killer when tasks are frozen"). This means that the basic contract for the allocation request is broken and the context requesting such an allocation might blow up unexpectedly. There are basically two ways forward. 1) move oom_killer_disable after kernel threads are frozen. This has a risk that the OOM victim wouldn't be able to finish because it would depend on an already frozen kernel thread. This would be really tricky to debug. 2) do not fail GFP_NOFAIL allocation no matter what and risk a potential Freezable kernel threads will loop and fail the suspend. Incidental allocations after kernel threads are frozen will at least dump a warning - if we are lucky and the serial console is still active of course... This patch implements the later option because it is safer. We would see warning rather than allocation failures for the kernel threads which would blow up otherwise and have a higher chances to identify __GFP_NOFAIL users from deeper pm code. Signed-off-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Tetsuo Handa Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7abfa70..40e2942 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2373,7 +2373,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) + || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); -- cgit v1.1 From 44fc80573cc760a7154f41fd0a958ee10eba1a81 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 12 Mar 2015 16:25:54 -0700 Subject: mm, hugetlb: close race when setting PageTail for gigantic pages Now that gigantic pages are dynamically allocatable, care must be taken to ensure that p->first_page is valid before setting PageTail. If this isn't done, then it is possible to race and have compound_head() return NULL. Signed-off-by: David Rientjes Acked-by: Davidlohr Bueso Cc: Luiz Capitulino Cc: Joonsoo Kim Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a9ac6c..c41b2a0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -917,7 +917,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -933,6 +932,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } -- cgit v1.1 From 850fc430f47aad52092deaaeb32b99f97f0e6aca Mon Sep 17 00:00:00 2001 From: Danesh Petigara Date: Thu, 12 Mar 2015 16:25:57 -0700 Subject: mm: cma: fix CMA aligned offset calculation The CMA aligned offset calculation is incorrect for non-zero order_per_bit values. For example, if cma->order_per_bit=1, cma->base_pfn= 0x2f800000 and align_order=12, the function returns a value of 0x17c00 instead of 0x400. This patch fixes the CMA aligned offset calculation. The previous calculation was wrong and would return too-large values for the offset, so that when cma_alloc looks for free pages in the bitmap with the requested alignment > order_per_bit, it starts too far into the bitmap and so CMA allocations will fail despite there actually being plenty of free pages remaining. It will also probably have the wrong alignment. With this change, we will get the correct offset into the bitmap. One affected user is powerpc KVM, which has kvm_cma->order_per_bit set to KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, or 18 - 12 = 6. [gregory.0xf0@gmail.com: changelog additions] Signed-off-by: Danesh Petigara Reviewed-by: Gregory Fong Acked-by: Michal Nazarewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 75016fd..68ecb7a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -64,15 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) return (1UL << (align_order - cma->order_per_bit)) - 1; } +/* + * Find a PFN aligned to the specified order and return an offset represented in + * order_per_bits. + */ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) { - unsigned int alignment; - if (align_order <= cma->order_per_bit) return 0; - alignment = 1UL << (align_order - cma->order_per_bit); - return ALIGN(cma->base_pfn, alignment) - - (cma->base_pfn >> cma->order_per_bit); + + return (ALIGN(cma->base_pfn, (1UL << align_order)) + - cma->base_pfn) >> cma->order_per_bit; } static unsigned long cma_bitmap_maxno(struct cma *cma) -- cgit v1.1 From 5b8bf30721980b254be7a07315c353b3a3175b74 Mon Sep 17 00:00:00 2001 From: gchen gchen Date: Thu, 12 Mar 2015 16:26:05 -0700 Subject: mm/nommu.c: export symbol max_mapnr Several modules may need max_mapnr, so export, the related error with allmodconfig under c6x: MODPOST 3327 modules ERROR: "max_mapnr" [fs/pstore/ramoops.ko] undefined! ERROR: "max_mapnr" [drivers/media/v4l2-core/videobuf2-dma-contig.ko] undefined! Signed-off-by: Chen Gang Cc: Mark Salter Cc: Aurelien Jacquiot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 3e67e75..3fba2dc9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -62,6 +62,7 @@ void *high_memory; EXPORT_SYMBOL(high_memory); struct page *mem_map; unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -- cgit v1.1 From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 12 Mar 2015 16:26:11 -0700 Subject: kasan, module, vmalloc: rework shadow allocation for modules Current approach in handling shadow memory for modules is broken. Shadow memory could be freed only after memory shadow corresponds it is no longer used. vfree() called from interrupt context could use memory its freeing to store 'struct llist_node' in it: void vfree(const void *addr) { ... if (unlikely(in_interrupt())) { struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); Later this list node used in free_work() which actually frees memory. Currently module_memfree() called in interrupt context will free shadow before freeing module's memory which could provoke kernel crash. So shadow memory should be freed after module's memory. However, such deallocation order could race with kasan_module_alloc() in module_alloc(). Free shadow right before releasing vm area. At this point vfree()'d memory is not used anymore and yet not available for other allocations. New VM_KASAN flag used to indicate that vm area has dynamically allocated shadow memory so kasan frees shadow only if it was previously allocated. Signed-off-by: Andrey Ryabinin Acked-by: Rusty Russell Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 14 +++++++++++--- mm/vmalloc.c | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee63..936d816 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "kasan.h" @@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size) GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); - return ret ? 0 : -ENOMEM; + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; } -void kasan_module_free(void *addr) +void kasan_free_shadow(const struct vm_struct *vm) { - vfree(kasan_mem_to_shadow(addr)); + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); } static void register_global(struct kasan_global *global) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1..49abccf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; -- cgit v1.1 From a5a6579db33af91f4f5134e14be758dc71c1b694 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Thu, 12 Mar 2015 16:26:17 -0700 Subject: mm: reorder can_do_mlock to fix audit denial A userspace call to mmap(MAP_LOCKED) may result in the successful locking of memory while also producing a confusing audit log denial. can_do_mlock checks capable and rlimit. If either of these return positive can_do_mlock returns true. The capable check leads to an LSM hook used by apparmour and selinux which produce the audit denial. Reordering so rlimit is checked first eliminates the denial on success, only recording a denial when the lock is unsuccessful as a result of the denial. Signed-off-by: Jeff Vander Stoep Acked-by: Nick Kralevich Cc: Jeff Vander Stoep Cc: Sasha Levin Cc: "Paul E. McKenney" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Paul Cassella Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 73cf098..8a54cd2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -26,10 +26,10 @@ int can_do_mlock(void) { - if (capable(CAP_IPC_LOCK)) - return 1; if (rlimit(RLIMIT_MEMLOCK) != 0) return 1; + if (capable(CAP_IPC_LOCK)) + return 1; return 0; } EXPORT_SYMBOL(can_do_mlock); -- cgit v1.1 From 7feee590bb18ffc42636975f74c2c3120ce1901c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Mar 2015 16:26:19 -0700 Subject: memcg: disable hierarchy support if bound to the legacy cgroup hierarchy If the memory cgroup controller is initially mounted in the scope of the default cgroup hierarchy and then remounted to a legacy hierarchy, it will still have hierarchy support enabled, which is incorrect. We should disable hierarchy support if bound to the legacy cgroup hierarchy. Signed-off-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9fe0769..b34ef4a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5232,7 +5232,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) * on for the root memcg is enough. */ if (cgroup_on_dfl(root_css->cgroup)) - mem_cgroup_from_css(root_css)->use_hierarchy = true; + root_mem_cgroup->use_hierarchy = true; + else + root_mem_cgroup->use_hierarchy = false; } static u64 memory_current_read(struct cgroup_subsys_state *css, -- cgit v1.1