diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/maccess.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 11 | ||||
-rw-r--r-- | mm/memory-failure.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 40 | ||||
-rw-r--r-- | mm/nommu.c | 144 | ||||
-rw-r--r-- | mm/page_alloc.c | 15 | ||||
-rw-r--r-- | mm/percpu.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 4 | ||||
-rw-r--r-- | mm/truncate.c | 30 | ||||
-rw-r--r-- | mm/util.c | 46 | ||||
-rw-r--r-- | mm/vmalloc.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 3 |
14 files changed, 205 insertions, 120 deletions
@@ -253,7 +253,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "HWPoison pages injector" - depends on MEMORY_FAILURE && DEBUG_KERNEL + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65f38c2..e91b81b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, sz); return; } diff --git a/mm/maccess.c b/mm/maccess.c index 9073695..4e348db 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -14,7 +14,11 @@ * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_read(void *dst, void *src, size_t size) + +long __weak probe_kernel_read(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 488b644..954032b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) if (free_all) goto try_to_free; move_account: - while (mem->res.usage > 0) { + do { ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) goto out; @@ -2614,8 +2614,8 @@ move_account: if (ret == -ENOMEM) goto try_to_free; cond_resched(); - } - ret = 0; + /* "ret" should also be checked to ensure all lists are empty. */ + } while (mem->res.usage > 0 || ret); out: css_put(&mem->css); return ret; @@ -2648,10 +2648,7 @@ try_to_free: } lru_add_drain(); /* try move_account...there may be some *locked* pages. */ - if (mem->res.usage) - goto move_account; - ret = 0; - goto out; + goto move_account; } int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a0466e..17299fd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -52,6 +52,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + u32 hwpoison_filter_enable = 0; u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; @@ -164,6 +166,13 @@ int hwpoison_filter(struct page *p) return 0; } +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + EXPORT_SYMBOL_GPL(hwpoison_filter); /* @@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + if (unlikely(flags & MAP_HUGETLB)) + return -EINVAL; + file = fget(fd); + if (!file) + goto out; + } else if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * Some shared mappigns will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); + kenter("%p{%d}", region, region->vm_usage); BUG_ON(!nommu_region_tree.rb_node); - if (atomic_dec_and_test(®ion->vm_usage)) { + if (--region->vm_usage == 0) { if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file, if (!vma) goto error_getting_vma; - atomic_set(®ion->vm_usage, 1); + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we've found a region we can share */ - atomic_inc(&pregion->vm_usage); + pregion->vm_usage++; vma->vm_region = pregion; start = pregion->vm_start; start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; @@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = NULL; vma->vm_start = 0; vma->vm_end = 0; - atomic_dec(&pregion->vm_usage); + pregion->vm_usage--; pregion = NULL; goto error_just_free; } @@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; @@ -1398,6 +1403,31 @@ error_getting_region: } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. @@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, kenter(""); - /* we're only permitted to split anonymous regions that have a single - * owner */ - if (vma->vm_file || - atomic_read(&vma->vm_region->vm_usage) != 1) + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) @@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm, /* cut the backing region down to size */ region = vma->vm_region; - BUG_ON(atomic_read(®ion->vm_usage) != 1); + BUG_ON(region->vm_usage != 1); down_write(&nommu_region_sem); delete_nommu_region(region); @@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* - * ask for an unmapped area at which to create a mapping on a file - */ -unsigned long get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - - if (!get_area) - return -ENOSYS; - - return get_area(file, addr, len, pgoff, flags); -} -EXPORT_SYMBOL(get_unmapped_area); - -/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. @@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else { @@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * automatically grant mappings that are too large. + */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + up_write(&nommu_region_sem); + return 0; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d79b925..d2a8889 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1222,10 +1222,10 @@ again: } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); @@ -2402,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, { char saved_string[NUMA_ZONELIST_ORDER_LEN]; int ret; + static DEFINE_MUTEX(zl_order_mutex); + mutex_lock(&zl_order_mutex); if (write) - strncpy(saved_string, (char*)table->data, - NUMA_ZONELIST_ORDER_LEN); + strcpy(saved_string, (char*)table->data); ret = proc_dostring(table, write, buffer, length, ppos); if (ret) - return ret; + goto out; if (write) { int oldval = user_zonelist_order; if (__parse_numa_zonelist_order((char*)table->data)) { @@ -2421,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } else if (oldval != user_zonelist_order) build_all_zonelists(); } - return 0; +out: + mutex_unlock(&zl_order_mutex); + return ret; } @@ -3995,7 +3998,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].end_pfn && + if (start_pfn < early_node_map[i].start_pfn && end_pfn >= early_node_map[i].start_pfn) { early_node_map[i].start_pfn = start_pfn; return; diff --git a/mm/percpu.c b/mm/percpu.c index 442010c..083e7c9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work) */ void free_percpu(void *ptr) { - void *addr = __pcpu_ptr_to_addr(ptr); + void *addr; struct pcpu_chunk *chunk; unsigned long flags; int off; @@ -1279,6 +1279,8 @@ void free_percpu(void *ptr) if (!ptr) return; + addr = __pcpu_ptr_to_addr(ptr); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); @@ -654,7 +654,7 @@ static void init_node_lock_keys(int q) l3 = s->cs_cachep->nodelists[q]; if (!l3 || OFF_SLAB(s->cs_cachep)) - return; + continue; lockdep_set_class(&l3->list_lock, &on_slab_l3_key); alc = l3->alien; /* @@ -665,7 +665,7 @@ static void init_node_lock_keys(int q) * for alloc_alien_cache, */ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; + continue; for_each_node(r) { if (alc[r]) lockdep_set_class(&alc[r]->lock, diff --git a/mm/truncate.c b/mm/truncate.c index 342deee..e87e372 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); */ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) { - if (new < old) { - struct address_space *mapping = inode->i_mapping; - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - } + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -4,10 +4,6 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/sched.h> -#include <linux/hugetlb.h> -#include <linux/syscalls.h> -#include <linux/mman.h> -#include <linux/file.h> #include <asm/uaccess.h> #define CREATE_TRACE_POINTS @@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; @@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) -{ - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; - file = fget(fd); - if (!file) - goto out; - } else if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; - /* - * VM_NORESERVE is used because the reservations will be - * taken when vm_ops->mmap() is called - * A dummy user value is used because we are not locking - * memory so no accounting is necessary - */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); - if (IS_ERR(file)) - return PTR_ERR(file); - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return retval; -} - /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 37e6929..d55d905 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -555,10 +555,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } rcu_read_unlock(); - if (nr) { - BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + if (nr) atomic_sub(nr, &vmap_lazy_nr); - } if (nr || force_flush) flush_tlb_kernel_range(*start, *end); diff --git a/mm/vmscan.c b/mm/vmscan.c index 885207a..c26986c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!populated_zone(zone)) continue; + if (zone_is_all_unreclaimable(zone)) + continue; + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 0, 0)) return 1; |