summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2011-06-10 14:46:48 +0200
committerJiri Kosina <jkosina@suse.cz>2011-06-10 14:46:57 +0200
commit5be5758c114b18260c6fd4c8373bf89e39b0fe82 (patch)
tree54390f904df6ff11e570f764c444356cf2709fda /mm
parent71f66a6580c4e42df377bebbcca5c72661a40700 (diff)
parent7f45e5cd1718ed769295033ca214032848a0097d (diff)
downloadop-kernel-dev-5be5758c114b18260c6fd4c8373bf89e39b0fe82.zip
op-kernel-dev-5be5758c114b18260c6fd4c8373bf89e39b0fe82.tar.gz
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply patches against new code I have in queue.
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig23
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/cleancache.c244
-rw-r--r--mm/filemap.c101
-rw-r--r--mm/filemap_xip.c4
-rw-r--r--mm/fremap.c6
-rw-r--r--mm/huge_memory.c25
-rw-r--r--mm/hugetlb.c22
-rw-r--r--mm/init-mm.c1
-rw-r--r--mm/internal.h4
-rw-r--r--mm/ksm.c7
-rw-r--r--mm/maccess.c8
-rw-r--r--mm/memcontrol.c377
-rw-r--r--mm/memory-failure.c21
-rw-r--r--mm/memory.c444
-rw-r--r--mm/memory_hotplug.c21
-rw-r--r--mm/mempolicy.c164
-rw-r--r--mm/migrate.c17
-rw-r--r--mm/mlock.c8
-rw-r--r--mm/mmap.c129
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/nobootmem.c23
-rw-r--r--mm/nommu.c108
-rw-r--r--mm/oom_kill.c36
-rw-r--r--mm/page_alloc.c121
-rw-r--r--mm/page_cgroup.c28
-rw-r--r--mm/percpu.c6
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c195
-rw-r--r--mm/shmem.c333
-rw-r--r--mm/slub.c4
-rw-r--r--mm/swap.c52
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c19
-rw-r--r--mm/vmscan.c184
-rw-r--r--mm/vmstat.c264
39 files changed, 2007 insertions, 1040 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61..8ca47a5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
depends on !SMP
bool
default y
+
+config CLEANCACHE
+ bool "Enable cleancache driver to cache clean pages if tmem is present"
+ default n
+ help
+ Cleancache can be thought of as a page-granularity victim cache
+ for clean pages that the kernel's pageframe replacement algorithm
+ (PFRA) would like to keep around, but can't since there isn't enough
+ memory. So when the PFRA "evicts" a page, it first attempts to use
+ cleancacne code to put the data contained in that page into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. And when a cleancache-enabled
+ filesystem wishes to access a page in a file on disk, it first
+ checks cleancache to see if it already contains it; if it does,
+ the page is copied into the kernel and a disk access is avoided.
+ When a transcendent memory driver is available (such as zcache or
+ Xen transcendent memory), a significant I/O reduction
+ may be achieved. When none is available, all cleancache calls
+ are reduced to a single pointer-compare-against-NULL resulting
+ in a negligible performance hit.
+
+ If unsure, say Y to enable cleancache
diff --git a/mm/Makefile b/mm/Makefile
index 42a8326..836e416 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index befc875..f032e6e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+ unsigned long nr_dirty, nr_io, nr_more_io;
struct inode *inode;
- nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+ nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_wb_list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 0000000..bcaae4c
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache. See
+ * Documentation/vm/cleancache.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/cleancache.h>
+
+/*
+ * This global enablement flag may be read thousands of times per second
+ * by cleancache_get/put/flush even on systems where cleancache_ops
+ * is not claimed (e.g. cleancache is config'ed on but remains
+ * disabled), so is preferred to the slower alternative: a function
+ * call that checks a non-global.
+ */
+int cleancache_enabled;
+EXPORT_SYMBOL(cleancache_enabled);
+
+/*
+ * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static struct cleancache_ops cleancache_ops;
+
+/* useful stats available in /sys/kernel/mm/cleancache */
+static unsigned long cleancache_succ_gets;
+static unsigned long cleancache_failed_gets;
+static unsigned long cleancache_puts;
+static unsigned long cleancache_flushes;
+
+/*
+ * register operations for cleancache, returning previous thus allowing
+ * detection of multiple backends and possible nesting
+ */
+struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
+{
+ struct cleancache_ops old = cleancache_ops;
+
+ cleancache_ops = *ops;
+ cleancache_enabled = 1;
+ return old;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+ sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+ sb->cleancache_poolid =
+ (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+ struct cleancache_filekey *key)
+{
+ int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+ int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+ struct super_block *sb = inode->i_sb;
+
+ key->u.ino = inode->i_ino;
+ if (sb->s_export_op != NULL) {
+ fhfn = sb->s_export_op->encode_fh;
+ if (fhfn) {
+ struct dentry d;
+ d.d_inode = inode;
+ len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+ if (len <= 0 || len == 255)
+ return -1;
+ if (maxlen > CLEANCACHE_KEY_MAX)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleanache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * The pageframe is unchanged and returns -1 if the get fails.
+ * Page must be locked by caller.
+ */
+int __cleancache_get_page(struct page *page)
+{
+ int ret = -1;
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
+ goto out;
+
+ if (cleancache_get_key(page->mapping->host, &key) < 0)
+ goto out;
+
+ ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
+ if (ret == 0)
+ cleancache_succ_gets++;
+ else
+ cleancache_failed_gets++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's,
+ * inode and page index. Page must be locked. Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ */
+void __cleancache_put_page(struct page *page)
+{
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id >= 0 &&
+ cleancache_get_key(page->mapping->host, &key) >= 0) {
+ (*cleancache_ops.put_page)(pool_id, key, page->index, page);
+ cleancache_puts++;
+ }
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Flush any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ */
+void __cleancache_flush_page(struct address_space *mapping, struct page *page)
+{
+ /* careful... page->mapping is NULL sometimes when this is called */
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (pool_id >= 0) {
+ VM_BUG_ON(!PageLocked(page));
+ if (cleancache_get_key(mapping->host, &key) >= 0) {
+ (*cleancache_ops.flush_page)(pool_id, key, page->index);
+ cleancache_flushes++;
+ }
+ }
+}
+EXPORT_SYMBOL(__cleancache_flush_page);
+
+/*
+ * Flush all data from cleancache associated with the poolid and the
+ * mappings's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ */
+void __cleancache_flush_inode(struct address_space *mapping)
+{
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+ (*cleancache_ops.flush_inode)(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_flush_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be reutrned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs
+ */
+void __cleancache_flush_fs(struct super_block *sb)
+{
+ if (sb->cleancache_poolid >= 0) {
+ int old_poolid = sb->cleancache_poolid;
+ sb->cleancache_poolid = -1;
+ (*cleancache_ops.flush_fs)(old_poolid);
+ }
+}
+EXPORT_SYMBOL(__cleancache_flush_fs);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
+
+#define CLEANCACHE_SYSFS_RO(_name) \
+ static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+ { \
+ return sprintf(buf, "%lu\n", cleancache_##_name); \
+ } \
+ static struct kobj_attribute cleancache_##_name##_attr = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = cleancache_##_name##_show, \
+ }
+
+CLEANCACHE_SYSFS_RO(succ_gets);
+CLEANCACHE_SYSFS_RO(failed_gets);
+CLEANCACHE_SYSFS_RO(puts);
+CLEANCACHE_SYSFS_RO(flushes);
+
+static struct attribute *cleancache_attrs[] = {
+ &cleancache_succ_gets_attr.attr,
+ &cleancache_failed_gets_attr.attr,
+ &cleancache_puts_attr.attr,
+ &cleancache_flushes_attr.attr,
+ NULL,
+};
+
+static struct attribute_group cleancache_attr_group = {
+ .attrs = cleancache_attrs,
+ .name = "cleancache",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_cleancache(void)
+{
+#ifdef CONFIG_SYSFS
+ int err;
+
+ err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
+#endif /* CONFIG_SYSFS */
+ return 0;
+}
+module_init(init_cleancache)
diff --git a/mm/filemap.c b/mm/filemap.c
index c641edf..a8251a8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
#include "internal.h"
/*
@@ -58,16 +59,16 @@
/*
* Lock ordering:
*
- * ->i_mmap_lock (truncate_pagecache)
+ * ->i_mmap_mutex (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_lock (truncate->unmap_mapping_range)
+ * ->i_mmap_mutex (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
@@ -84,7 +85,7 @@
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
@@ -106,7 +107,7 @@
*
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
- * ->i_mmap_lock
+ * ->i_mmap_mutex
*/
/*
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ /*
+ * if we're uptodate, flush out into the cleancache, otherwise
+ * invalidate any existing cleancache entries. We can't leave
+ * stale data around in the cleancache once our page is gone
+ */
+ if (PageUptodate(page) && PageMappedToDisk(page))
+ cleancache_put_page(page);
+ else
+ cleancache_flush_page(mapping, page);
+
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
@@ -562,6 +573,17 @@ void wait_on_page_bit(struct page *page, int bit_nr)
}
EXPORT_SYMBOL(wait_on_page_bit);
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ if (!test_bit(bit_nr, &page->flags))
+ return 0;
+
+ return __wait_on_bit(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
@@ -643,15 +665,32 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
- if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
- __lock_page(page);
- return 1;
- } else {
- if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
- up_read(&mm->mmap_sem);
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ /*
+ * CAUTION! In this case, mmap_sem is not released
+ * even though return 0.
+ */
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ up_read(&mm->mmap_sem);
+ if (flags & FAULT_FLAG_KILLABLE)
+ wait_on_page_locked_killable(page);
+ else
wait_on_page_locked(page);
- }
return 0;
+ } else {
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
+
+ ret = __lock_page_killable(page);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
}
}
@@ -1528,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
return;
+ if (!ra->ra_pages)
+ return;
- if (VM_SequentialReadHint(vma) ||
- offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+ if (VM_SequentialReadHint(vma)) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
}
- if (ra->mmap_miss < INT_MAX)
+ /* Avoid banging the cache line if not needed */
+ if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
ra->mmap_miss++;
/*
@@ -1550,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
* mmap read-around
*/
ra_pages = max_sane_readahead(ra->ra_pages);
- if (ra_pages) {
- ra->start = max_t(long, 0, offset - ra_pages/2);
- ra->size = ra_pages;
- ra->async_size = 0;
- ra_submit(ra, mapping, file);
- }
+ ra->start = max_t(long, 0, offset - ra_pages / 2);
+ ra->size = ra_pages;
+ ra->async_size = ra_pages / 4;
+ ra_submit(ra, mapping, file);
}
/*
@@ -1622,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
@@ -1660,7 +1700,6 @@ retry_find:
return VM_FAULT_SIGBUS;
}
- ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1943,16 +1982,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
int file_remove_suid(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
- int killsuid = should_remove_suid(dentry);
- int killpriv = security_inode_need_killpriv(dentry);
+ struct inode *inode = dentry->d_inode;
+ int killsuid;
+ int killpriv;
int error = 0;
+ /* Fast path for nothing security related */
+ if (IS_NOSEC(inode))
+ return 0;
+
+ killsuid = should_remove_suid(dentry);
+ killpriv = security_inode_need_killpriv(dentry);
+
if (killpriv < 0)
return killpriv;
if (killpriv)
error = security_inode_killpriv(dentry);
if (!error && killsuid)
error = __remove_suid(dentry, killsuid);
+ if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+ inode->i_flags |= S_NOSEC;
return error;
}
@@ -2288,7 +2337,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
repeat:
page = find_lock_page(mapping, index);
if (page)
- return page;
+ goto found;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
if (!page)
@@ -2301,6 +2350,8 @@ repeat:
goto repeat;
return NULL;
}
+found:
+ wait_on_page_writeback(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 83364df..93356cd 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
return;
retry:
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
@@ -201,7 +201,7 @@ retry:
page_cache_release(page);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (locked) {
mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index ec520c7..b8e0e2d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
goto out;
}
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
vma_prio_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
if (vma->vm_flags & VM_LOCKED) {
/*
* drop PG_Mlocked flag for over-mapped range
*/
- unsigned int saved_flags = vma->vm_flags;
+ vm_flags_t saved_flags = vma->vm_flags;
munlock_vma_pages_range(vma, start, start + size);
vma->vm_flags = saved_flags;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 83326ad..615d974 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1139,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->lock to
+ * and it won't wait on the anon_vma->root->mutex to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush_notify(vma, address, pmd);
@@ -1333,7 +1333,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->lock hold */
+/* must be called with anon_vma->root->mutex hold */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
@@ -1771,12 +1771,9 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#ifndef CONFIG_NUMA
+ up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
new_page = *hpage;
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
- up_read(&mm->mmap_sem);
- return;
- }
#else
VM_BUG_ON(*hpage);
/*
@@ -1791,22 +1788,26 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
node, __GFP_OTHER_NODE);
+
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+ */
+ up_read(&mm->mmap_sem);
if (unlikely(!new_page)) {
- up_read(&mm->mmap_sem);
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
return;
}
+#endif
+
count_vm_event(THP_COLLAPSE_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
- up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
put_page(new_page);
+#endif
return;
}
-#endif
-
- /* after allocating the hugepage upgrade to mmap_sem write mode */
- up_read(&mm->mmap_sem);
/*
* Prevent all access to pagetables with the exception of
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bbb4a5b..6402458 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(chg);
+ return ERR_PTR(-VM_FAULT_OOM);
if (chg)
if (hugetlb_get_quota(inode->i_mapping, chg))
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -2205,7 +2205,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long sz = huge_page_size(h);
/*
- * A page gathering list, protected by per file i_mmap_lock. The
+ * A page gathering list, protected by per file i_mmap_mutex. The
* lock is used to avoid list corruption from multiple unmapping
* of the same page since we are using page->lru.
*/
@@ -2274,9 +2274,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
__unmap_hugepage_range(vma, start, end, ref_page);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
/*
@@ -2308,7 +2308,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
@@ -2326,7 +2326,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
address, address + huge_page_size(h),
page);
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return 1;
}
@@ -2810,7 +2810,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
ptep = huge_pte_offset(mm, address);
@@ -2825,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
flush_tlb_range(vma, start, end);
}
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
int hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
- int acctflag)
+ vm_flags_t vm_flags)
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* attempt will be made for VM_NORESERVE to allocate a page
* and filesystem quota without using reserves
*/
- if (acctflag & VM_NORESERVE)
+ if (vm_flags & VM_NORESERVE)
return 0;
/*
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 1d29cdf..4019979 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -21,6 +21,5 @@ struct mm_struct init_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
- .cpu_vm_mask = CPU_MASK_ALL,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 9d0ced8..d071d38 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -66,6 +66,10 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
+/* mm/util.c */
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent);
+
#ifdef CONFIG_MMU
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
diff --git a/mm/ksm.c b/mm/ksm.c
index 942dfc7..d708b3e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -35,6 +35,7 @@
#include <linux/ksm.h>
#include <linux/hash.h>
#include <linux/freezer.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -1894,9 +1895,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
- current->flags |= PF_OOM_ORIGIN;
+ int oom_score_adj;
+
+ oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = unmerge_and_remove_all_rmap_items();
- current->flags &= ~PF_OOM_ORIGIN;
+ test_set_oom_score_adj(oom_score_adj);
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
diff --git a/mm/maccess.c b/mm/maccess.c
index e2b6f56..4cee182 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -15,10 +15,10 @@
* happens, handle that and return -EFAULT.
*/
-long __weak probe_kernel_read(void *dst, void *src, size_t size)
+long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
-long __probe_kernel_read(void *dst, void *src, size_t size)
+long __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
@@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
-long __probe_kernel_write(void *dst, void *src, size_t size)
+long __probe_kernel_write(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 010f916..bd9052a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -94,6 +94,8 @@ enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
+ MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
+ MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
};
/*
@@ -231,6 +233,11 @@ struct mem_cgroup {
* reclaimed from.
*/
int last_scanned_child;
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ unsigned long next_scan_node_update;
+#endif
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -585,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}
+void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
+{
+ this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
+}
+
+void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
+{
+ this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
+}
+
static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
enum mem_cgroup_events_index idx)
{
@@ -624,18 +641,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
preempt_enable();
}
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+ struct mem_cgroup_per_zone *mz;
+ u64 total = 0;
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ total += MEM_CGROUP_ZSTAT(mz, idx);
+ }
+ return total;
+}
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
- int nid, zid;
- struct mem_cgroup_per_zone *mz;
+ int nid;
u64 total = 0;
for_each_online_node(nid)
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- total += MEM_CGROUP_ZSTAT(mz, idx);
- }
+ total += mem_cgroup_get_zonestat_node(mem, nid, idx);
return total;
}
@@ -813,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
return (mem == root_mem_cgroup);
}
+void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
+{
+ struct mem_cgroup *mem;
+
+ if (!mm)
+ return;
+
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem))
+ goto out;
+
+ switch (idx) {
+ case PGMAJFAULT:
+ mem_cgroup_pgmajfault(mem, 1);
+ break;
+ case PGFAULT:
+ mem_cgroup_pgfault(mem, 1);
+ break;
+ default:
+ BUG();
+ }
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(mem_cgroup_count_vm_event);
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -1064,9 +1117,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
return (active > inactive);
}
-unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
- struct zone *zone,
- enum lru_list lru)
+unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
+ struct zone *zone,
+ enum lru_list lru)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
@@ -1075,6 +1128,93 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
return MEM_CGROUP_ZSTAT(mz, lru);
}
+#ifdef CONFIG_NUMA
+static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
+ int nid)
+{
+ unsigned long ret;
+
+ ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
+ mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
+
+ return ret;
+}
+
+static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
+{
+ u64 total = 0;
+ int nid;
+
+ for_each_node_state(nid, N_HIGH_MEMORY)
+ total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
+
+ return total;
+}
+
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+ int nid)
+{
+ unsigned long ret;
+
+ ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+ mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+
+ return ret;
+}
+
+static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
+{
+ u64 total = 0;
+ int nid;
+
+ for_each_node_state(nid, N_HIGH_MEMORY)
+ total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
+
+ return total;
+}
+
+static unsigned long
+mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
+{
+ return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
+}
+
+static unsigned long
+mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
+{
+ u64 total = 0;
+ int nid;
+
+ for_each_node_state(nid, N_HIGH_MEMORY)
+ total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
+
+ return total;
+}
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid)
+{
+ enum lru_list l;
+ u64 total = 0;
+
+ for_each_lru(l)
+ total += mem_cgroup_get_zonestat_node(memcg, nid, l);
+
+ return total;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
+{
+ u64 total = 0;
+ int nid;
+
+ for_each_node_state(nid, N_HIGH_MEMORY)
+ total += mem_cgroup_node_nr_lru_pages(memcg, nid);
+
+ return total;
+}
+#endif /* CONFIG_NUMA */
+
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
@@ -1418,6 +1558,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
return ret;
}
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+ int nid;
+
+ if (time_after(mem->next_scan_node_update, jiffies))
+ return;
+
+ mem->next_scan_node_update = jiffies + 10*HZ;
+ /* make a nodemask where this memcg uses memory from */
+ mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+ for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+ if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+ continue;
+
+ if (total_swap_pages &&
+ (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+ continue;
+ node_clear(nid, mem->scan_nodes);
+ }
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, start from anywhere is O,K. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ int node;
+
+ mem_cgroup_may_update_nodemask(mem);
+ node = mem->last_scanned_node;
+
+ node = next_node(node, mem->scan_nodes);
+ if (node == MAX_NUMNODES)
+ node = first_node(mem->scan_nodes);
+ /*
+ * We call this when we hit limit, not when pages are added to LRU.
+ * No LRU may hold pages because all pages are UNEVICTABLE or
+ * memcg is too small and all pages are not on LRU. In that case,
+ * we use curret node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
+ mem->last_scanned_node = node;
+ return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ return 0;
+}
+#endif
+
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1433,7 +1648,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
struct zone *zone,
gfp_t gfp_mask,
- unsigned long reclaim_options)
+ unsigned long reclaim_options,
+ unsigned long *total_scanned)
{
struct mem_cgroup *victim;
int ret, total = 0;
@@ -1442,6 +1658,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess;
+ unsigned long nr_scanned;
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
@@ -1484,10 +1701,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
continue;
}
/* we use swappiness of local cgroup */
- if (check_soft)
+ if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone);
- else
+ noswap, get_swappiness(victim), zone,
+ &nr_scanned);
+ *total_scanned += nr_scanned;
+ } else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
css_put(&victim->css);
@@ -1503,7 +1722,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
if (!res_counter_soft_limit_excess(&root_mem->res))
return total;
} else if (mem_cgroup_margin(root_mem))
- return 1 + total;
+ return total;
}
return total;
}
@@ -1928,7 +2147,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
return CHARGE_WOULDBLOCK;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
+ gfp_mask, flags, NULL);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
return CHARGE_RETRY;
/*
@@ -3211,7 +3430,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
break;
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_SHRINK);
+ MEM_CGROUP_RECLAIM_SHRINK,
+ NULL);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3271,7 +3491,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
- MEM_CGROUP_RECLAIM_SHRINK);
+ MEM_CGROUP_RECLAIM_SHRINK,
+ NULL);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3285,7 +3506,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask)
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -3293,6 +3515,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
unsigned long long excess;
+ unsigned long nr_scanned;
if (order > 0)
return 0;
@@ -3311,10 +3534,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
if (!mz)
break;
+ nr_scanned = 0;
reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
gfp_mask,
- MEM_CGROUP_RECLAIM_SOFT);
+ MEM_CGROUP_RECLAIM_SOFT,
+ &nr_scanned);
nr_reclaimed += reclaimed;
+ *total_scanned += nr_scanned;
spin_lock(&mctz->lock);
/*
@@ -3337,10 +3563,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
*/
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
- if (next_mz == mz) {
+ if (next_mz == mz)
css_put(&next_mz->mem->css);
- next_mz = NULL;
- } else /* next_mz == NULL or other memcg */
+ else /* next_mz == NULL or other memcg */
break;
} while (1);
}
@@ -3772,6 +3997,8 @@ enum {
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
+ MCS_PGFAULT,
+ MCS_PGMAJFAULT,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -3794,6 +4021,8 @@ struct {
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
+ {"pgfault", "total_pgfault"},
+ {"pgmajfault", "total_pgmajfault"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -3822,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
+ s->stat[MCS_PGFAULT] += val;
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
+ s->stat[MCS_PGMAJFAULT] += val;
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3845,6 +4078,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
mem_cgroup_get_local_stat(iter, s);
}
+#ifdef CONFIG_NUMA
+static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
+{
+ int nid;
+ unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
+ unsigned long node_nr;
+ struct cgroup *cont = m->private;
+ struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+
+ total_nr = mem_cgroup_nr_lru_pages(mem_cont);
+ seq_printf(m, "total=%lu", total_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
+ seq_printf(m, "file=%lu", file_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
+ seq_printf(m, "anon=%lu", anon_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
+ seq_printf(m, "unevictable=%lu", unevictable_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
+ nid);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+ return 0;
+}
+#endif /* CONFIG_NUMA */
+
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
@@ -3855,6 +4133,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_local_stat(mem_cont, &mystat);
+
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
@@ -4278,6 +4557,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
return 0;
}
+#ifdef CONFIG_NUMA
+static const struct file_operations mem_control_numa_stat_file_operations = {
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
+{
+ struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
+
+ file->f_op = &mem_control_numa_stat_file_operations;
+ return single_open(file, mem_control_numa_stat_show, cont);
+}
+#endif /* CONFIG_NUMA */
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -4341,6 +4636,12 @@ static struct cftype mem_cgroup_files[] = {
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .open = mem_control_numa_stat_open,
+ },
+#endif
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4596,6 +4897,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
+ mem->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&mem->oom_notify);
if (parent)
@@ -4953,8 +5255,7 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
int ret = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4993,8 +5294,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
mem_cgroup_clear_mc();
}
@@ -5112,8 +5412,7 @@ retry:
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
struct mm_struct *mm;
@@ -5131,22 +5430,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
#endif
@@ -5169,19 +5465,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
- if (!(*s) || !strcmp(s, "=1"))
+ if (!strcmp(s, "1"))
really_do_swap_account = 1;
- else if (!strcmp(s, "=0"))
+ else if (!strcmp(s, "0"))
really_do_swap_account = 0;
return 1;
}
-__setup("swapaccount", enable_swap_account);
+__setup("swapaccount=", enable_swap_account);
-static int __init disable_swap_account(char *s)
-{
- printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
- enable_swap_account("=0");
- return 1;
-}
-__setup("noswapaccount", disable_swap_account);
#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2b9a5ee..5c8f7e0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,7 +239,11 @@ void shake_page(struct page *p, int access)
if (access) {
int nr;
do {
- nr = shrink_slab(1000, GFP_KERNEL, 1000);
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
break;
} while (nr > 10);
@@ -429,7 +433,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
*/
read_lock(&tasklist_lock);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -449,7 +453,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
read_unlock(&tasklist_lock);
}
@@ -1440,16 +1444,12 @@ int soft_offline_page(struct page *page, int flags)
*/
ret = invalidate_inode_page(page);
unlock_page(page);
-
/*
- * Drop count because page migration doesn't like raised
- * counts. The page could get re-allocated, but if it becomes
- * LRU the isolation will just fail.
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- put_page(page);
if (ret == 1) {
+ put_page(page);
ret = 0;
pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
@@ -1461,6 +1461,11 @@ int soft_offline_page(struct page *page, int flags)
* handles a large number of cases for us.
*/
ret = isolate_lru_page(page);
+ /*
+ * Drop page reference which is came from get_any_page()
+ * successful isolate_lru_page() already took another one.
+ */
+ put_page(page);
if (!ret) {
LIST_HEAD(pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index 61e66f0..6953d39 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
__sync_task_rss_stat(task, mm);
}
-#else
+#else /* SPLIT_RSS_COUNTING */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -191,8 +191,205 @@ static void check_sync_rss_stat(struct task_struct *task)
{
}
+#endif /* SPLIT_RSS_COUNTING */
+
+#ifdef HAVE_GENERIC_MMU_GATHER
+
+static int tlb_next_batch(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ batch = tlb->active;
+ if (batch->next) {
+ tlb->active = batch->next;
+ return 1;
+ }
+
+ batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ if (!batch)
+ return 0;
+
+ batch->next = NULL;
+ batch->nr = 0;
+ batch->max = MAX_GATHER_BATCH;
+
+ tlb->active->next = batch;
+ tlb->active = batch;
+
+ return 1;
+}
+
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @fullmm argument is used when @mm is without
+ * users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
+{
+ tlb->mm = mm;
+
+ tlb->fullmm = fullmm;
+ tlb->need_flush = 0;
+ tlb->fast_mode = (num_possible_cpus() == 1);
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb->batch = NULL;
+#endif
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ if (!tlb->need_flush)
+ return;
+ tlb->need_flush = 0;
+ tlb_flush(tlb);
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb_table_flush(tlb);
#endif
+ if (tlb_fast_mode(tlb))
+ return;
+
+ for (batch = &tlb->local; batch; batch = batch->next) {
+ free_pages_and_swap_cache(batch->pages, batch->nr);
+ batch->nr = 0;
+ }
+ tlb->active = &tlb->local;
+}
+
+/* tlb_finish_mmu
+ * Called at the end of the shootdown operation to free up any resources
+ * that were required.
+ */
+void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+ struct mmu_gather_batch *batch, *next;
+
+ tlb_flush_mmu(tlb);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+
+ for (batch = tlb->local.next; batch; batch = next) {
+ next = batch->next;
+ free_pages((unsigned long)batch, 0);
+ }
+ tlb->local.next = NULL;
+}
+
+/* __tlb_remove_page
+ * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
+ * handling the additional races in SMP caused by other CPUs caching valid
+ * mappings in their TLBs. Returns the number of free page slots left.
+ * When out of page slots we must call tlb_flush_mmu().
+ */
+int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ struct mmu_gather_batch *batch;
+
+ tlb->need_flush = 1;
+
+ if (tlb_fast_mode(tlb)) {
+ free_page_and_swap_cache(page);
+ return 1; /* avoid calling tlb_flush_mmu() */
+ }
+
+ batch = tlb->active;
+ batch->pages[batch->nr++] = page;
+ if (batch->nr == batch->max) {
+ if (!tlb_next_batch(tlb))
+ return 0;
+ }
+ VM_BUG_ON(batch->nr > batch->max);
+
+ return batch->max - batch->nr;
+}
+
+#endif /* HAVE_GENERIC_MMU_GATHER */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely on
+ * IRQ disabling. See the comment near struct mmu_table_batch.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ __tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+ struct mmu_table_batch *batch;
+ int i;
+
+ batch = container_of(head, struct mmu_table_batch, rcu);
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch) {
+ call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
+ *batch = NULL;
+ }
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ tlb->need_flush = 1;
+
+ /*
+ * When there's less then two users of this mm there cannot be a
+ * concurrent page-table walk.
+ */
+ if (atomic_read(&tlb->mm->mm_users) < 2) {
+ __tlb_remove_table(table);
+ return;
+ }
+
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ tlb_remove_table_one(table);
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH)
+ tlb_table_flush(tlb);
+}
+
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
@@ -533,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
add_taint(TAINT_BAD_PAGE);
}
-static inline int is_cow_mapping(unsigned int flags)
+static inline int is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
@@ -909,26 +1106,24 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
- pte_t *pte;
- spinlock_t *ptl;
+ int force_flush = 0;
int rss[NR_MM_COUNTERS];
+ spinlock_t *ptl;
+ pte_t *pte;
+again:
init_rss_vec(rss);
-
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
- (*zap_work)--;
continue;
}
- (*zap_work) -= PAGE_SIZE;
-
if (pte_present(ptent)) {
struct page *page;
@@ -974,7 +1169,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- tlb_remove_page(tlb, page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
continue;
}
/*
@@ -995,19 +1192,31 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
print_bad_pte(vma, addr, ptent, NULL);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+ } while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (addr != end)
+ goto again;
+ }
+
return addr;
}
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -1019,19 +1228,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (next-addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
- } else if (zap_huge_pmd(tlb, vma, pmd)) {
- (*zap_work)--;
+ } else if (zap_huge_pmd(tlb, vma, pmd))
continue;
- }
/* fall through */
}
- if (pmd_none_or_clear_bad(pmd)) {
- (*zap_work)--;
+ if (pmd_none_or_clear_bad(pmd))
continue;
- }
- next = zap_pte_range(tlb, vma, pmd, addr, next,
- zap_work, details);
- } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
return addr;
}
@@ -1039,7 +1244,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -1047,13 +1252,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud)) {
- (*zap_work)--;
+ if (pud_none_or_clear_bad(pud))
continue;
- }
- next = zap_pmd_range(tlb, vma, pud, addr, next,
- zap_work, details);
- } while (pud++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+ } while (pud++, addr = next, addr != end);
return addr;
}
@@ -1061,7 +1263,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
static unsigned long unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -1075,13 +1277,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd)) {
- (*zap_work)--;
+ if (pgd_none_or_clear_bad(pgd))
continue;
- }
- next = zap_pud_range(tlb, vma, pgd, addr, next,
- zap_work, details);
- } while (pgd++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pud_range(tlb, vma, pgd, addr, next, details);
+ } while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
mem_cgroup_uncharge_end();
@@ -1121,17 +1320,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- long zap_work = ZAP_BLOCK_SIZE;
- unsigned long tlb_start = 0; /* For tlb_finish_mmu */
- int tlb_start_valid = 0;
unsigned long start = start_addr;
- spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = (*tlbp)->fullmm;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1346,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
untrack_pfn_vma(vma, 0, 0);
while (start != end) {
- if (!tlb_start_valid) {
- tlb_start = start;
- tlb_start_valid = 1;
- }
-
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
@@ -1169,39 +1358,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
*/
- if (vma->vm_file) {
+ if (vma->vm_file)
unmap_hugepage_range(vma, start, end, NULL);
- zap_work -= (end - start) /
- pages_per_huge_page(hstate_vma(vma));
- }
start = end;
} else
- start = unmap_page_range(*tlbp, vma,
- start, end, &zap_work, details);
-
- if (zap_work > 0) {
- BUG_ON(start != end);
- break;
- }
-
- tlb_finish_mmu(*tlbp, tlb_start, start);
-
- if (need_resched() ||
- (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
- if (i_mmap_lock) {
- *tlbp = NULL;
- goto out;
- }
- cond_resched();
- }
-
- *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
- tlb_start_valid = 0;
- zap_work = ZAP_BLOCK_SIZE;
+ start = unmap_page_range(tlb, vma, start, end, details);
}
}
-out:
+
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
return start; /* which is now the end (or restart) address */
}
@@ -1217,16 +1382,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
- if (tlb)
- tlb_finish_mmu(tlb, address, end);
+ tlb_finish_mmu(&tlb, address, end);
return end;
}
@@ -2535,96 +2699,11 @@ unwritable_page:
return ret;
}
-/*
- * Helper functions for unmap_mapping_range().
- *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
- *
- * We have to restart searching the prio_tree whenever we drop the lock,
- * since the iterator is only valid while the lock is held, and anyway
- * a later vma might be split and reinserted earlier while lock dropped.
- *
- * The list of nonlinear vmas could be handled more efficiently, using
- * a placeholder, but handle it in the same way until a need is shown.
- * It is important to search the prio_tree before nonlinear list: a vma
- * may become nonlinear and be shifted from prio_tree to nonlinear list
- * while the lock is dropped; but never shifted from list to prio_tree.
- *
- * In order to make forward progress despite restarting the search,
- * vm_truncate_count is used to mark a vma as now dealt with, so we can
- * quickly skip it next time around. Since the prio_tree search only
- * shows us those vmas affected by unmapping the range in question, we
- * can't efficiently keep all vmas in step with mapping->truncate_count:
- * so instead reset them all whenever it wraps back to 0 (then go to 1).
- * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
- *
- * In order to make forward progress despite repeatedly restarting some
- * large vma, note the restart_addr from unmap_vmas when it breaks out:
- * and restart from that address when we reach that vma again. It might
- * have been split or merged, shrunk or extended, but never shifted: so
- * restart_addr remains valid so long as it remains in the vma's range.
- * unmap_mapping_range forces truncate_count to leap over page-aligned
- * values so we can save vma's restart_addr in its truncate_count field.
- */
-#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
-
-static void reset_vma_truncate_counts(struct address_space *mapping)
-{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
- vma->vm_truncate_count = 0;
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
- vma->vm_truncate_count = 0;
-}
-
-static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
- unsigned long restart_addr;
- int need_break;
-
- /*
- * files that support invalidating or truncating portions of the
- * file from under mmaped areas must have their ->fault function
- * return a locked page (and set VM_FAULT_LOCKED in the return).
- * This provides synchronisation against concurrent unmapping here.
- */
-
-again:
- restart_addr = vma->vm_truncate_count;
- if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
- start_addr = restart_addr;
- if (start_addr >= end_addr) {
- /* Top of vma has been split off since last time */
- vma->vm_truncate_count = details->truncate_count;
- return 0;
- }
- }
-
- restart_addr = zap_page_range(vma, start_addr,
- end_addr - start_addr, details);
- need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
-
- if (restart_addr >= end_addr) {
- /* We have now completed this vma: mark it so */
- vma->vm_truncate_count = details->truncate_count;
- if (!need_break)
- return 0;
- } else {
- /* Note restart_addr in vma's truncate_count field */
- vma->vm_truncate_count = restart_addr;
- if (!need_break)
- goto again;
- }
-
- spin_unlock(details->i_mmap_lock);
- cond_resched();
- spin_lock(details->i_mmap_lock);
- return -EINTR;
+ zap_page_range(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2634,12 +2713,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
struct prio_tree_iter iter;
pgoff_t vba, vea, zba, zea;
-restart:
vma_prio_tree_foreach(vma, &iter, root,
details->first_index, details->last_index) {
- /* Skip quickly over those we have already dealt with */
- if (vma->vm_truncate_count == details->truncate_count)
- continue;
vba = vma->vm_pgoff;
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2651,11 +2726,10 @@ restart:
if (zea > vea)
zea = vea;
- if (unmap_mapping_range_vma(vma,
+ unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
- details) < 0)
- goto restart;
+ details);
}
}
@@ -2670,15 +2744,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
-restart:
list_for_each_entry(vma, head, shared.vm_set.list) {
- /* Skip quickly over those we have already dealt with */
- if (vma->vm_truncate_count == details->truncate_count)
- continue;
details->nonlinear_vma = vma;
- if (unmap_mapping_range_vma(vma, vma->vm_start,
- vma->vm_end, details) < 0)
- goto restart;
+ unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
}
}
@@ -2717,26 +2785,14 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
- details.i_mmap_lock = &mapping->i_mmap_lock;
- mutex_lock(&mapping->unmap_mutex);
- spin_lock(&mapping->i_mmap_lock);
-
- /* Protect against endless unmapping loops */
- mapping->truncate_count++;
- if (unlikely(is_restart_addr(mapping->truncate_count))) {
- if (mapping->truncate_count == 0)
- reset_vma_truncate_counts(mapping);
- mapping->truncate_count++;
- }
- details.truncate_count = mapping->truncate_count;
+ mutex_lock(&mapping->i_mmap_mutex);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- spin_unlock(&mapping->i_mmap_lock);
- mutex_unlock(&mapping->unmap_mutex);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2818,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -2966,7 +3023,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (prev && prev->vm_end == address)
return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
- expand_stack(vma, address - PAGE_SIZE);
+ expand_downwards(vma, address - PAGE_SIZE);
}
if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
struct vm_area_struct *next = vma->vm_next;
@@ -3357,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
+ mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9ca1d60..9f64637 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -374,10 +374,6 @@ void online_page(struct page *page)
totalhigh_pages++;
#endif
-#ifdef CONFIG_FLATMEM
- max_mapnr = max(pfn, max_mapnr);
-#endif
-
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
@@ -400,7 +396,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
}
-int online_pages(unsigned long pfn, unsigned long nr_pages)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
{
unsigned long onlined_pages = 0;
struct zone *zone;
@@ -459,8 +455,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
zone_pcp_update(zone);
mutex_unlock(&zonelists_mutex);
- setup_per_zone_wmarks();
- calculate_zone_inactive_ratio(zone);
+
+ init_per_zone_wmark_min();
+
if (onlined_pages) {
kswapd_run(zone_to_nid(zone));
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -705,7 +702,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
- if (!page_count(page))
+ if (!get_page_unless_zero(page))
continue;
/*
* We can skip free pages. And we can only deal with pages on
@@ -713,6 +710,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
*/
ret = isolate_lru_page(page);
if (!ret) { /* Success */
+ put_page(page);
list_add_tail(&page->lru, &source);
move_pages--;
inc_zone_page_state(page, NR_ISOLATED_ANON +
@@ -724,6 +722,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn);
dump_page(page);
#endif
+ put_page(page);
/* Because we don't have big zone->lock. we should
check this again here. */
if (page_count(page)) {
@@ -795,7 +794,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-static int offline_pages(unsigned long start_pfn,
+static int __ref offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
unsigned long pfn, nr_pages, expire;
@@ -893,8 +892,8 @@ repeat:
zone->zone_pgdat->node_present_pages -= offlined_pages;
totalram_pages -= offlined_pages;
- setup_per_zone_wmarks();
- calculate_zone_inactive_ratio(zone);
+ init_per_zone_wmark_min();
+
if (!node_present_pages(node)) {
node_clear_state(node, N_HIGH_MEMORY);
kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 959a8b8..e7fb9d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -99,7 +99,6 @@
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
-#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
@@ -457,7 +456,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
},
};
-static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
@@ -492,9 +490,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
continue;
- if (flags & MPOL_MF_STATS)
- gather_stats(page, private, pte_dirty(*pte));
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, private, flags);
else
break;
@@ -1489,7 +1485,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-static struct mempolicy *get_vma_policy(struct task_struct *task,
+struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
@@ -2529,159 +2525,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
}
return p - buffer;
}
-
-struct numa_maps {
- unsigned long pages;
- unsigned long anon;
- unsigned long active;
- unsigned long writeback;
- unsigned long mapcount_max;
- unsigned long dirty;
- unsigned long swapcache;
- unsigned long node[MAX_NUMNODES];
-};
-
-static void gather_stats(struct page *page, void *private, int pte_dirty)
-{
- struct numa_maps *md = private;
- int count = page_mapcount(page);
-
- md->pages++;
- if (pte_dirty || PageDirty(page))
- md->dirty++;
-
- if (PageSwapCache(page))
- md->swapcache++;
-
- if (PageActive(page) || PageUnevictable(page))
- md->active++;
-
- if (PageWriteback(page))
- md->writeback++;
-
- if (PageAnon(page))
- md->anon++;
-
- if (count > md->mapcount_max)
- md->mapcount_max = count;
-
- md->node[page_to_nid(page)]++;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-static void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
- unsigned long addr;
- struct page *page;
- struct hstate *h = hstate_vma(vma);
- unsigned long sz = huge_page_size(h);
-
- for (addr = start; addr < end; addr += sz) {
- pte_t *ptep = huge_pte_offset(vma->vm_mm,
- addr & huge_page_mask(h));
- pte_t pte;
-
- if (!ptep)
- continue;
-
- pte = *ptep;
- if (pte_none(pte))
- continue;
-
- page = pte_page(pte);
- if (!page)
- continue;
-
- gather_stats(page, md, pte_dirty(*ptep));
- }
-}
-#else
-static inline void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
-}
-#endif
-
-/*
- * Display pages allocated per node and memory policy via /proc.
- */
-int show_numa_map(struct seq_file *m, void *v)
-{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct numa_maps *md;
- struct file *file = vma->vm_file;
- struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
- int n;
- char buffer[50];
-
- if (!mm)
- return 0;
-
- md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
- if (!md)
- return 0;
-
- pol = get_vma_policy(priv->task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol, 0);
- mpol_cond_put(pol);
-
- seq_printf(m, "%08lx %s", vma->vm_start, buffer);
-
- if (file) {
- seq_printf(m, " file=");
- seq_path(m, &file->f_path, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
- seq_printf(m, " heap");
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- seq_printf(m, " stack");
- }
-
- if (is_vm_hugetlb_page(vma)) {
- check_huge_range(vma, vma->vm_start, vma->vm_end, md);
- seq_printf(m, " huge");
- } else {
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
- }
-
- if (!md->pages)
- goto out;
-
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
-
- if (md->dirty)
- seq_printf(m," dirty=%lu",md->dirty);
-
- if (md->pages != md->anon && md->pages != md->dirty)
- seq_printf(m, " mapped=%lu", md->pages);
-
- if (md->mapcount_max > 1)
- seq_printf(m, " mapmax=%lu", md->mapcount_max);
-
- if (md->swapcache)
- seq_printf(m," swapcache=%lu", md->swapcache);
-
- if (md->active < md->pages && !is_vm_hugetlb_page(vma))
- seq_printf(m," active=%lu", md->active);
-
- if (md->writeback)
- seq_printf(m," writeback=%lu", md->writeback);
-
- for_each_node_state(n, N_HIGH_MEMORY)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
-out:
- seq_putc(m, '\n');
- kfree(md);
-
- if (m->count < m->size)
- m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
- return 0;
-}
diff --git a/mm/migrate.c b/mm/migrate.c
index 34132f8..e4a5c91 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -721,15 +721,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* Only page_lock_anon_vma() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_get_anon_vma(page);
if (anon_vma) {
/*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
+ * Anon page
*/
- get_anon_vma(anon_vma);
- page_unlock_anon_vma(anon_vma);
} else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
@@ -857,13 +853,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
lock_page(hpage);
}
- if (PageAnon(hpage)) {
- anon_vma = page_lock_anon_vma(hpage);
- if (anon_vma) {
- get_anon_vma(anon_vma);
- page_unlock_anon_vma(anon_vma);
- }
- }
+ if (PageAnon(hpage))
+ anon_vma = page_get_anon_vma(hpage);
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
diff --git a/mm/mlock.c b/mm/mlock.c
index 516b2c2..048260c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* For vmas that pass the filters, merge/split as appropriate.
*/
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, unsigned int newflags)
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
pgoff_t pgoff;
int nr_pages;
int ret = 0;
- int lock = newflags & VM_LOCKED;
+ int lock = !!(newflags & VM_LOCKED);
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
prev = vma;
for (nstart = start ; ; ) {
- unsigned int newflags;
+ vm_flags_t newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
goto out;
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
- unsigned int newflags;
+ vm_flags_t newflags;
newflags = vma->vm_flags | VM_LOCKED;
if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index 772140c..bbdc9af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -84,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
}
EXPORT_SYMBOL(vm_get_page_prot);
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
+int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-struct percpu_counter vm_committed_as;
+/*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
* Check that a process has enough memory to allocate a new virtual
@@ -190,7 +194,7 @@ error:
}
/*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_mutex
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
@@ -218,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
__remove_shared_vm_struct(vma, file, mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
}
@@ -394,29 +398,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
return vma;
}
-static inline void
-__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- mm->mmap = vma;
- if (rb_parent)
- next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- else
- next = NULL;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
@@ -464,16 +445,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
- if (mapping) {
- spin_lock(&mapping->i_mmap_lock);
- vma->vm_truncate_count = mapping->truncate_count;
- }
+ if (mapping)
+ mutex_lock(&mapping->i_mmap_mutex);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
mm->map_count++;
validate_mm(mm);
@@ -576,17 +555,8 @@ again: remove_next = 1 + (end > next->vm_end);
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
root = &mapping->i_mmap;
- spin_lock(&mapping->i_mmap_lock);
- if (importer &&
- vma->vm_truncate_count != next->vm_truncate_count) {
- /*
- * unmap_mapping_range might be in progress:
- * ensure that the expanding vma is rescanned.
- */
- importer->vm_truncate_count = 0;
- }
+ mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
- insert->vm_truncate_count = vma->vm_truncate_count;
/*
* Put into prio_tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
@@ -605,7 +575,7 @@ again: remove_next = 1 + (end > next->vm_end);
* lock may be shared between many sibling processes. Skipping
* the lock for brk adjustments makes a difference sometimes.
*/
- if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+ if (vma->anon_vma && (importer || start != vma->vm_start)) {
anon_vma = vma->anon_vma;
anon_vma_lock(anon_vma);
}
@@ -652,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma)
anon_vma_unlock(anon_vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (remove_next) {
if (file) {
@@ -699,9 +669,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
}
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2)
+ struct anon_vma *anon_vma2,
+ struct vm_area_struct *vma)
{
- return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ if ((!anon_vma1 || !anon_vma2) && (!vma ||
+ list_is_singular(&vma->anon_vma_chain)))
+ return 1;
+ return anon_vma1 == anon_vma2;
}
/*
@@ -720,7 +698,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
if (is_mergeable_vma(vma, file, vm_flags) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
}
@@ -739,7 +717,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
if (is_mergeable_vma(vma, file, vm_flags) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -817,7 +795,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen) &&
is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma)) {
+ next->anon_vma, NULL)) {
/* cases 1, 6 */
err = vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL);
@@ -982,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
{
struct mm_struct * mm = current->mm;
struct inode *inode;
- unsigned int vm_flags;
+ vm_flags_t vm_flags;
int error;
unsigned long reqprot = prot;
@@ -1187,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
*/
int vma_wants_writenotify(struct vm_area_struct *vma)
{
- unsigned int vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
/* If it was private or non-writable, the write bit is already clear */
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1215,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
* We account for memory if it's a private writeable mapping,
* not hugepages and VM_NORESERVE wasn't set.
*/
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
/*
* hugetlb has its own accounting separate from the core VM
@@ -1229,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
- unsigned int vm_flags, unsigned long pgoff)
+ vm_flags_t vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -1785,7 +1763,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-static int expand_downwards(struct vm_area_struct *vma,
+int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
@@ -1832,11 +1810,6 @@ static int expand_downwards(struct vm_area_struct *vma,
return error;
}
-int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_downwards(vma, address);
-}
-
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
@@ -1919,17 +1892,17 @@ static void unmap_region(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
- next? next->vm_start: 0);
- tlb_finish_mmu(tlb, start, end);
+ free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : 0);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -2271,7 +2244,7 @@ EXPORT_SYMBOL(do_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2296,14 +2269,14 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb = tlb_gather_mmu(mm, 1);
+ tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(tlb, 0, end);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ tlb_finish_mmu(&tlb, 0, end);
/*
* Walk the list again, actually closing and freeing it,
@@ -2317,7 +2290,7 @@ void exit_mmap(struct mm_struct *mm)
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_mutex is taken here.
*/
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
@@ -2529,15 +2502,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
+ mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->lock. If some other vma in this mm shares
+ * anon_vma->root->mutex. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->lock.
+ * anon_vma->root->mutex.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->head.next))
@@ -2559,7 +2532,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
}
}
@@ -2586,7 +2559,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
@@ -2642,7 +2615,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->lock.
+ * anon_vma->root->mutex.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->head.next))
@@ -2658,7 +2631,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c1f9f..506fa44 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -93,8 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
- new_vma->vm_truncate_count = 0;
+ mutex_lock(&mapping->i_mmap_mutex);
}
/*
@@ -123,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 9109049..6e93dc7 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -307,30 +307,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
-#ifdef MAX_DMA32_PFN
- unsigned long end_pfn;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- /* update goal according ...MAX_DMA32_PFN */
- end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
-
- if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
- (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
- void *ptr;
- unsigned long new_goal;
-
- new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- new_goal, -1ULL);
- if (ptr)
- return ptr;
- }
-#endif
-
return __alloc_bootmem_node(pgdat, size, align, goal);
-
}
#ifdef CONFIG_SPARSEMEM
diff --git a/mm/nommu.c b/mm/nommu.c
index c4c542c..1fd0c51 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -680,9 +680,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, **pp, *next;
+ struct vm_area_struct *pvma, *prev;
struct address_space *mapping;
- struct rb_node **p, *parent;
+ struct rb_node **p, *parent, *rb_prev;
kenter(",%p", vma);
@@ -703,7 +703,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
}
/* add the VMA to the tree */
- parent = NULL;
+ parent = rb_prev = NULL;
p = &mm->mm_rb.rb_node;
while (*p) {
parent = *p;
@@ -713,17 +713,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
* (the latter is necessary as we may get identical VMAs) */
if (vma->vm_start < pvma->vm_start)
p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start)
+ else if (vma->vm_start > pvma->vm_start) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- else if (vma->vm_end < pvma->vm_end)
+ } else if (vma->vm_end < pvma->vm_end)
p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end)
+ else if (vma->vm_end > pvma->vm_end) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- else if (vma < pvma)
+ } else if (vma < pvma)
p = &(*p)->rb_left;
- else if (vma > pvma)
+ else if (vma > pvma) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- else
+ } else
BUG();
}
@@ -731,20 +734,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
/* add VMA to the VMA list also */
- for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
- if (pvma->vm_start > vma->vm_start)
- break;
- if (pvma->vm_start < vma->vm_start)
- continue;
- if (pvma->vm_end < vma->vm_end)
- break;
- }
+ prev = NULL;
+ if (rb_prev)
+ prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- next = *pp;
- *pp = vma;
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
+ __vma_link_list(mm, vma, prev, parent);
}
/*
@@ -752,7 +746,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
- struct vm_area_struct **pp;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
@@ -775,12 +768,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
/* remove from the MM's tree and list */
rb_erase(&vma->vm_rb, &mm->mm_rb);
- for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
- if (*pp == vma) {
- *pp = vma->vm_next;
- break;
- }
- }
+
+ if (vma->vm_prev)
+ vma->vm_prev->vm_next = vma->vm_next;
+ else
+ mm->mmap = vma->vm_next;
+
+ if (vma->vm_next)
+ vma->vm_next->vm_prev = vma->vm_prev;
vma->vm_mm = NULL;
}
@@ -809,17 +804,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
- struct rb_node *n = mm->mm_rb.rb_node;
/* check the cache first */
vma = mm->mmap_cache;
if (vma && vma->vm_start <= addr && vma->vm_end > addr)
return vma;
- /* trawl the tree (there may be multiple mappings in which addr
+ /* trawl the list (there may be multiple mappings in which addr
* resides) */
- for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
- vma = rb_entry(n, struct vm_area_struct, vm_rb);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
@@ -859,7 +852,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long len)
{
struct vm_area_struct *vma;
- struct rb_node *n = mm->mm_rb.rb_node;
unsigned long end = addr + len;
/* check the cache first */
@@ -867,10 +859,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma && vma->vm_start == addr && vma->vm_end == end)
return vma;
- /* trawl the tree (there may be multiple mappings in which addr
+ /* trawl the list (there may be multiple mappings in which addr
* resides) */
- for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
- vma = rb_entry(n, struct vm_area_struct, vm_rb);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->vm_start < addr)
continue;
if (vma->vm_start > addr)
@@ -1133,7 +1124,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
unsigned long capabilities)
{
struct page *pages;
- unsigned long total, point, n, rlen;
+ unsigned long total, point, n;
void *base;
int ret, order;
@@ -1157,13 +1148,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
* make a private copy of the data and map that instead */
}
- rlen = PAGE_ALIGN(len);
/* allocate some memory to hold the mapping
* - note that this may not return a page-aligned address if the object
* we're allocating is smaller than a page
*/
- order = get_order(rlen);
+ order = get_order(len);
kdebug("alloc order %d for %lx", order, len);
pages = alloc_pages(GFP_KERNEL, order);
@@ -1173,7 +1163,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
total = 1 << order;
atomic_long_add(total, &mmap_pages_allocated);
- point = rlen >> PAGE_SHIFT;
+ point = len >> PAGE_SHIFT;
/* we allocated a power-of-2 sized page set, so we may want to trim off
* the excess */
@@ -1195,7 +1185,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
base = page_address(pages);
region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
region->vm_start = (unsigned long) base;
- region->vm_end = region->vm_start + rlen;
+ region->vm_end = region->vm_start + len;
region->vm_top = region->vm_start + (total << PAGE_SHIFT);
vma->vm_start = region->vm_start;
@@ -1211,22 +1201,22 @@ static int do_mmap_private(struct vm_area_struct *vma,
old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
+ ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
set_fs(old_fs);
if (ret < 0)
goto error_free;
/* clear the last little bit */
- if (ret < rlen)
- memset(base + ret, 0, rlen - ret);
+ if (ret < len)
+ memset(base + ret, 0, len - ret);
}
return 0;
error_free:
- free_page_series(region->vm_start, region->vm_end);
+ free_page_series(region->vm_start, region->vm_top);
region->vm_start = vma->vm_start = 0;
region->vm_end = vma->vm_end = 0;
region->vm_top = 0;
@@ -1235,7 +1225,7 @@ error_free:
enomem:
printk("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
}
@@ -1268,6 +1258,7 @@ unsigned long do_mmap_pgoff(struct file *file,
/* we ignore the address hint */
addr = 0;
+ len = PAGE_ALIGN(len);
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
@@ -1385,15 +1376,15 @@ unsigned long do_mmap_pgoff(struct file *file,
if (capabilities & BDI_CAP_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
- if (IS_ERR((void *) addr)) {
+ if (IS_ERR_VALUE(addr)) {
ret = addr;
- if (ret != (unsigned long) -ENOSYS)
+ if (ret != -ENOSYS)
goto error_just_free;
/* the driver refused to tell us where to site
* the mapping so we'll have to attempt to copy
* it */
- ret = (unsigned long) -ENODEV;
+ ret = -ENODEV;
if (!(capabilities & BDI_CAP_MAP_COPY))
goto error_just_free;
@@ -1468,14 +1459,14 @@ error_getting_vma:
printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
" from process %d failed\n",
len, current->pid);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
error_getting_region:
printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
" from process %d failed\n",
len, current->pid);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
}
EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1644,15 +1635,17 @@ static int shrink_vma(struct mm_struct *mm,
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
struct vm_area_struct *vma;
- struct rb_node *rb;
- unsigned long end = start + len;
+ unsigned long end;
int ret;
kenter(",%lx,%zx", start, len);
+ len = PAGE_ALIGN(len);
if (len == 0)
return -EINVAL;
+ end = start + len;
+
/* find the first potentially overlapping VMA */
vma = find_vma(mm, start);
if (!vma) {
@@ -1677,9 +1670,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
}
if (end == vma->vm_end)
goto erase_whole_vma;
- rb = rb_next(&vma->vm_rb);
- vma = rb_entry(rb, struct vm_area_struct, vm_rb);
- } while (rb);
+ vma = vma->vm_next;
+ } while (vma);
kleave(" = -EINVAL [split file]");
return -EINVAL;
} else {
@@ -1773,6 +1765,8 @@ unsigned long do_mremap(unsigned long addr,
struct vm_area_struct *vma;
/* insanity checks first */
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
if (old_len == 0 || new_len == 0)
return (unsigned long) -EINVAL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52e85c..e4b0991 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,6 +38,33 @@ int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
+/**
+ * test_set_oom_score_adj() - set current's oom_score_adj and return old value
+ * @new_val: new oom_score_adj value
+ *
+ * Sets the oom_score_adj value for current to @new_val with proper
+ * synchronization and returns the old value. Usually used to temporarily
+ * set a value, save the old value in the caller, and then reinstate it later.
+ */
+int test_set_oom_score_adj(int new_val)
+{
+ struct sighand_struct *sighand = current->sighand;
+ int old_val;
+
+ spin_lock_irq(&sighand->siglock);
+ old_val = current->signal->oom_score_adj;
+ if (new_val != old_val) {
+ if (new_val == OOM_SCORE_ADJ_MIN)
+ atomic_inc(&current->mm->oom_disable_count);
+ else if (old_val == OOM_SCORE_ADJ_MIN)
+ atomic_dec(&current->mm->oom_disable_count);
+ current->signal->oom_score_adj = new_val;
+ }
+ spin_unlock_irq(&sighand->siglock);
+
+ return old_val;
+}
+
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
@@ -155,15 +182,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
}
/*
- * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
- * priority for oom killing.
- */
- if (p->flags & PF_OOM_ORIGIN) {
- task_unlock(p);
- return 1000;
- }
-
- /*
* The memory controller may have a limit of 0 bytes, so avoid a divide
* by zero, if necessary.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d5498e..4e8985a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
@@ -39,6 +40,7 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
@@ -1735,6 +1737,45 @@ static inline bool should_suppress_show_mem(void)
return ret;
}
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+ va_list args;
+ unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ return;
+
+ /*
+ * This documents exceptions given to allocations in certain
+ * contexts that are allowed to allocate outside current's set
+ * of allowed nodes.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (test_thread_flag(TIF_MEMDIE) ||
+ (current->flags & (PF_MEMALLOC | PF_EXITING)))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+ if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+
+ if (fmt) {
+ printk(KERN_WARNING);
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+ }
+
+ pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
+
+ dump_stack();
+ if (!should_suppress_show_mem())
+ show_mem(filter);
+}
+
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed)
@@ -2065,6 +2106,7 @@ restart:
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
+rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2072,7 +2114,6 @@ restart:
if (page)
goto got_pg;
-rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2106,7 +2147,7 @@ rebalance:
sync_migration);
if (page)
goto got_pg;
- sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
+ sync_migration = true;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2177,27 +2218,7 @@ rebalance:
}
nopage:
- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- unsigned int filter = SHOW_MEM_FILTER_NODES;
-
- /*
- * This documents exceptions given to allocations in certain
- * contexts that are allowed to allocate outside current's set
- * of allowed nodes.
- */
- if (!(gfp_mask & __GFP_NOMEMALLOC))
- if (test_thread_flag(TIF_MEMDIE) ||
- (current->flags & (PF_MEMALLOC | PF_EXITING)))
- filter &= ~SHOW_MEM_FILTER_NODES;
- if (in_interrupt() || !wait)
- filter &= ~SHOW_MEM_FILTER_NODES;
-
- pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
- dump_stack();
- if (!should_suppress_show_mem())
- show_mem(filter);
- }
+ warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
@@ -2473,10 +2494,10 @@ void si_meminfo_node(struct sysinfo *val, int nid)
#endif
/*
- * Determine whether the zone's node should be displayed or not, depending on
- * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
*/
-static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
+bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
@@ -2484,8 +2505,7 @@ static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
goto out;
get_mems_allowed();
- ret = !node_isset(zone->zone_pgdat->node_id,
- cpuset_current_mems_allowed);
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
put_mems_allowed();
out:
return ret;
@@ -2500,13 +2520,13 @@ out:
* Suppresses nodes that are not allowed by current's cpuset if
* SHOW_MEM_FILTER_NODES is passed.
*/
-void __show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter)
{
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
- if (skip_free_areas_zone(filter, zone))
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk("%s per-cpu:\n", zone->name);
@@ -2549,7 +2569,7 @@ void __show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
int i;
- if (skip_free_areas_zone(filter, zone))
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk("%s"
@@ -2618,7 +2638,7 @@ void __show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
- if (skip_free_areas_zone(filter, zone))
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk("%s: ", zone->name);
@@ -2639,11 +2659,6 @@ void __show_free_areas(unsigned int filter)
show_swap_cache_info();
}
-void show_free_areas(void)
-{
- __show_free_areas(0);
-}
-
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
@@ -3314,6 +3329,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Mark a number of pageblocks as MIGRATE_RESERVE. The number
* of blocks reserved is based on min_wmark_pages(zone). The memory within
* the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3322,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
*/
static void setup_zone_migrate_reserve(struct zone *zone)
{
- unsigned long start_pfn, pfn, end_pfn;
+ unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
struct page *page;
unsigned long block_migratetype;
int reserve;
@@ -3352,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
continue;
/* Blocks with reserved pages will never free, skip them. */
- if (PageReserved(page))
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
continue;
block_migratetype = get_pageblock_migratetype(page);
@@ -4289,10 +4319,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l) {
+ for_each_lru(l)
INIT_LIST_HEAD(&zone->lru[l].list);
- zone->reclaim_stat.nr_saved_scan[l] = 0;
- }
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
@@ -5100,7 +5128,7 @@ void setup_per_zone_wmarks(void)
* 1TB 101 10GB
* 10TB 320 32GB
*/
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
{
unsigned int gb, ratio;
@@ -5114,7 +5142,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
zone->inactive_ratio = ratio;
}
-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
{
struct zone *zone;
@@ -5146,7 +5174,7 @@ static void __init setup_per_zone_inactive_ratio(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
@@ -5158,6 +5186,7 @@ static int __init init_per_zone_wmark_min(void)
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
setup_per_zone_wmarks();
+ refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
@@ -5508,10 +5537,8 @@ int set_migratetype_isolate(struct page *page)
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
- int zone_idx;
zone = page_zone(page);
- zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 2daadc3..74ccff6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -130,7 +130,7 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
return page;
}
-static void *__init_refok alloc_page_cgroup(size_t size, int nid)
+static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
void *addr = NULL;
@@ -162,7 +162,7 @@ static void free_page_cgroup(void *addr)
}
#endif
-static int __init_refok init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn)
{
struct page_cgroup *base, *pc;
struct mem_section *section;
@@ -475,7 +475,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
if (!do_swap_account)
return 0;
- length = ((max_pages/SC_PER_PAGE) + 1);
+ length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
array = vmalloc(array_size);
@@ -492,8 +492,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
/* memory shortage */
ctrl->map = NULL;
ctrl->length = 0;
- vfree(array);
mutex_unlock(&swap_cgroup_mutex);
+ vfree(array);
goto nomem;
}
mutex_unlock(&swap_cgroup_mutex);
@@ -508,7 +508,8 @@ nomem:
void swap_cgroup_swapoff(int type)
{
- int i;
+ struct page **map;
+ unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
if (!do_swap_account)
@@ -516,17 +517,20 @@ void swap_cgroup_swapoff(int type)
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
- if (ctrl->map) {
- for (i = 0; i < ctrl->length; i++) {
- struct page *page = ctrl->map[i];
+ map = ctrl->map;
+ length = ctrl->length;
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+
+ if (map) {
+ for (i = 0; i < length; i++) {
+ struct page *page = map[i];
if (page)
__free_page(page);
}
- vfree(ctrl->map);
- ctrl->map = NULL;
- ctrl->length = 0;
+ vfree(map);
}
- mutex_unlock(&swap_cgroup_mutex);
}
#endif
diff --git a/mm/percpu.c b/mm/percpu.c
index a160db3..bf80e55 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1215,8 +1215,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
+ PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
#endif
PCPU_SETUP_BUG_ON(!base_addr);
+ PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
@@ -1645,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n",
- max_distance, VMALLOC_END - VMALLOC_START);
+ "space 0x%lx\n", max_distance,
+ (unsigned long)(VMALLOC_END - VMALLOC_START));
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
diff --git a/mm/readahead.c b/mm/readahead.c
index 2c0cc48..867f9dd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -180,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (page)
continue;
- page = page_cache_alloc_cold(mapping);
+ page = page_cache_alloc_readahead(mapping);
if (!page)
break;
page->index = page_offset;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8da044a..0eb463e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,8 +24,8 @@
* inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_lock
- * anon_vma->lock
+ * mapping->i_mmap_mutex
+ * anon_vma->mutex
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
@@ -40,7 +40,7 @@
*
* (code doesn't rely on that order so it could be switched around)
* ->tasklist_lock
- * anon_vma->lock (memory_failure, collect_procs_anon)
+ * anon_vma->mutex (memory_failure, collect_procs_anon)
* pte map lock
*/
@@ -86,6 +86,29 @@ static inline struct anon_vma *anon_vma_alloc(void)
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
VM_BUG_ON(atomic_read(&anon_vma->refcount));
+
+ /*
+ * Synchronize against page_lock_anon_vma() such that
+ * we can safely hold the lock without the anon_vma getting
+ * freed.
+ *
+ * Relies on the full mb implied by the atomic_dec_and_test() from
+ * put_anon_vma() against the acquire barrier implied by
+ * mutex_trylock() from page_lock_anon_vma(). This orders:
+ *
+ * page_lock_anon_vma() VS put_anon_vma()
+ * mutex_trylock() atomic_dec_and_test()
+ * LOCK MB
+ * atomic_read() mutex_is_locked()
+ *
+ * LOCK should suffice since the actual taking of the lock must
+ * happen _before_ what follows.
+ */
+ if (mutex_is_locked(&anon_vma->root->mutex)) {
+ anon_vma_lock(anon_vma);
+ anon_vma_unlock(anon_vma);
+ }
+
kmem_cache_free(anon_vma_cachep, anon_vma);
}
@@ -307,7 +330,7 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
- spin_lock_init(&anon_vma->lock);
+ mutex_init(&anon_vma->mutex);
atomic_set(&anon_vma->refcount, 0);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -320,12 +343,31 @@ void __init anon_vma_init(void)
}
/*
- * Getting a lock on a stable anon_vma from a page off the LRU is
- * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization what so ever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * In case it was remapped to a different anon_vma, the new anon_vma will be a
+ * child of the old anon_vma, and the anon_vma lifetime rules will therefore
+ * ensure that any anon_vma obtained from the page will still be valid for as
+ * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
+ * that the anon_vma pointer from page->mapping is valid if there is a
+ * mapcount, we can dereference the anon_vma after observing those.
*/
-struct anon_vma *__page_lock_anon_vma(struct page *page)
+struct anon_vma *page_get_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma, *root_anon_vma;
+ struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
@@ -336,32 +378,100 @@ struct anon_vma *__page_lock_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- root_anon_vma = ACCESS_ONCE(anon_vma->root);
- spin_lock(&root_anon_vma->lock);
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
/*
* If this page is still mapped, then its anon_vma cannot have been
- * freed. But if it has been unmapped, we have no security against
- * the anon_vma structure being freed and reused (for another anon_vma:
- * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
- * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
- * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+ * freed. But if it has been unmapped, we have no security against the
+ * anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
+ * above cannot corrupt).
*/
- if (page_mapped(page))
- return anon_vma;
+ if (!page_mapped(page)) {
+ put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ }
+out:
+ rcu_read_unlock();
+
+ return anon_vma;
+}
+
+/*
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * Its a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the mutex.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma = NULL;
+ struct anon_vma *root_anon_vma;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ if (mutex_trylock(&root_anon_vma->mutex)) {
+ /*
+ * If the page is still mapped, then this anon_vma is still
+ * its anon_vma, and holding the mutex ensures that it will
+ * not go away, see anon_vma_free().
+ */
+ if (!page_mapped(page)) {
+ mutex_unlock(&root_anon_vma->mutex);
+ anon_vma = NULL;
+ }
+ goto out;
+ }
+
+ /* trylock failed, we got to sleep */
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ if (!page_mapped(page)) {
+ put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ goto out;
+ }
+
+ /* we pinned the anon_vma, its safe to sleep */
+ rcu_read_unlock();
+ anon_vma_lock(anon_vma);
+
+ if (atomic_dec_and_test(&anon_vma->refcount)) {
+ /*
+ * Oops, we held the last refcount, release the lock
+ * and bail -- can't simply use put_anon_vma() because
+ * we'll deadlock on the anon_vma_lock() recursion.
+ */
+ anon_vma_unlock(anon_vma);
+ __put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ }
+
+ return anon_vma;
- spin_unlock(&root_anon_vma->lock);
out:
rcu_read_unlock();
- return NULL;
+ return anon_vma;
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
- __releases(&anon_vma->root->lock)
- __releases(RCU)
{
anon_vma_unlock(anon_vma);
- rcu_read_unlock();
}
/*
@@ -646,14 +756,14 @@ static int page_referenced_file(struct page *page,
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_lock.
+ * so we can safely take mapping->i_mmap_mutex.
*/
BUG_ON(!PageLocked(page));
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
/*
- * i_mmap_lock does not stabilize mapcount at all, but mapcount
+ * i_mmap_mutex does not stabilize mapcount at all, but mapcount
* is more likely to be accurate if we note it after spinning.
*/
mapcount = page_mapcount(page);
@@ -675,7 +785,7 @@ static int page_referenced_file(struct page *page,
break;
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return referenced;
}
@@ -719,7 +829,7 @@ int page_referenced(struct page *page,
unlock_page(page);
}
out:
- if (page_test_and_clear_young(page))
+ if (page_test_and_clear_young(page_to_pfn(page)))
referenced++;
return referenced;
@@ -762,7 +872,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
BUG_ON(PageAnon(page));
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
@@ -771,7 +881,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
ret += page_mkclean_one(page, vma, address);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
@@ -785,10 +895,8 @@ int page_mkclean(struct page *page)
struct address_space *mapping = page_mapping(page);
if (mapping) {
ret = page_mkclean_file(mapping, page);
- if (page_test_dirty(page)) {
- page_clear_dirty(page, 1);
+ if (page_test_and_clear_dirty(page_to_pfn(page), 1))
ret = 1;
- }
}
}
@@ -914,7 +1022,7 @@ void do_page_add_anon_rmap(struct page *page,
return;
VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address, exclusive);
else
@@ -981,10 +1089,9 @@ void page_remove_rmap(struct page *page)
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
*/
- if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
- page_clear_dirty(page, 1);
+ if ((!PageAnon(page) || PageSwapCache(page)) &&
+ page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
- }
/*
* Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
* and not charged by memcg for now.
@@ -1122,7 +1229,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->lock or mapping->i_mmap_lock.
+ * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1348,7 +1455,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned long max_nl_size = 0;
unsigned int mapcount;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1394,7 +1501,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
mapcount = page_mapcount(page);
if (!mapcount)
goto out;
- cond_resched_lock(&mapping->i_mmap_lock);
+ cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
if (max_nl_cursor == 0)
@@ -1416,7 +1523,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
}
vma->vm_private_data = (void *) max_nl_cursor;
}
- cond_resched_lock(&mapping->i_mmap_lock);
+ cond_resched();
max_nl_cursor += CLUSTER_SIZE;
} while (max_nl_cursor <= max_nl_size);
@@ -1428,7 +1535,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
vma->vm_private_data = NULL;
out:
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
@@ -1547,7 +1654,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
if (!mapping)
return ret;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1561,7 +1668,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
* never contain migration ptes. Decide what to do about this
* limitation to linear when we need rmap_walk() on nonlinear.
*/
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
@@ -1610,7 +1717,7 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(&page->_mapcount);
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
diff --git a/mm/shmem.c b/mm/shmem.c
index ba4ad28..d221a1c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -99,6 +99,13 @@ static struct vfsmount *shm_mnt;
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
+struct shmem_xattr {
+ struct list_head list; /* anchored by shmem_inode_info->xattr_list */
+ char *name; /* xattr name */
+ size_t size;
+ char value[0];
+};
+
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
@@ -822,6 +829,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_xattr *xattr, *nxattr;
if (inode->i_mapping->a_ops == &shmem_aops) {
truncate_inode_pages(inode->i_mapping, 0);
@@ -834,6 +842,11 @@ static void shmem_evict_inode(struct inode *inode)
mutex_unlock(&shmem_swaplist_mutex);
}
}
+
+ list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
+ kfree(xattr->name);
+ kfree(xattr);
+ }
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
end_writeback(inode);
@@ -1101,8 +1114,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
delete_from_page_cache(page);
shmem_swp_set(info, entry, swap.val);
shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
swap_shmem_alloc(swap);
+ spin_unlock(&info->lock);
BUG_ON(page_mapped(page));
swap_writepage(page, wbc);
return 0;
@@ -1292,12 +1305,10 @@ repeat:
swappage = lookup_swap_cache(swap);
if (!swappage) {
shmem_swp_unmap(entry);
+ spin_unlock(&info->lock);
/* here we actually do the io */
- if (type && !(*type & VM_FAULT_MAJOR)) {
- __count_vm_event(PGMAJFAULT);
+ if (type)
*type |= VM_FAULT_MAJOR;
- }
- spin_unlock(&info->lock);
swappage = shmem_swapin(swap, gfp, info, idx);
if (!swappage) {
spin_lock(&info->lock);
@@ -1536,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
-
+ if (ret & VM_FAULT_MAJOR) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ }
return ret | VM_FAULT_LOCKED;
}
@@ -1615,6 +1629,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
spin_lock_init(&info->lock);
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
+ INIT_LIST_HEAD(&info->xattr_list);
cache_no_acl(inode);
switch (mode & S_IFMT) {
@@ -2014,9 +2029,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
info = SHMEM_I(inode);
inode->i_size = len-1;
- if (len <= (char *)inode - (char *)info) {
+ if (len <= SHMEM_SYMLINK_INLINE_LEN) {
/* do it inline */
- memcpy(info, symname, len);
+ memcpy(info->inline_symlink, symname, len);
inode->i_op = &shmem_symlink_inline_operations;
} else {
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -2042,7 +2057,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
+ nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
return NULL;
}
@@ -2066,63 +2081,253 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
}
}
-static const struct inode_operations shmem_symlink_inline_operations = {
- .readlink = generic_readlink,
- .follow_link = shmem_follow_link_inline,
-};
-
-static const struct inode_operations shmem_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = shmem_follow_link,
- .put_link = shmem_put_link,
-};
-
-#ifdef CONFIG_TMPFS_POSIX_ACL
+#ifdef CONFIG_TMPFS_XATTR
/*
- * Superblocks without xattr inode operations will get security.* xattr
- * support from the VFS "for free". As soon as we have any other xattrs
+ * Superblocks without xattr inode operations may get some security.* xattr
+ * support from the LSM "for free". As soon as we have any other xattrs
* like ACLs, we also need to implement the security.* handlers at
* filesystem level, though.
*/
-static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int handler_flags)
+static int shmem_xattr_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- return security_inode_listsecurity(dentry->d_inode, list, list_len);
-}
+ struct shmem_inode_info *info;
+ struct shmem_xattr *xattr;
+ int ret = -ENODATA;
-static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int handler_flags)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return xattr_getsecurity(dentry->d_inode, name, buffer, size);
+ info = SHMEM_I(dentry->d_inode);
+
+ spin_lock(&info->lock);
+ list_for_each_entry(xattr, &info->xattr_list, list) {
+ if (strcmp(name, xattr->name))
+ continue;
+
+ ret = xattr->size;
+ if (buffer) {
+ if (size < xattr->size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr->value, xattr->size);
+ }
+ break;
+ }
+ spin_unlock(&info->lock);
+ return ret;
}
-static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int handler_flags)
+static int shmem_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return security_inode_setsecurity(dentry->d_inode, name, value,
- size, flags);
+ struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_xattr *xattr;
+ struct shmem_xattr *new_xattr = NULL;
+ size_t len;
+ int err = 0;
+
+ /* value == NULL means remove */
+ if (value) {
+ /* wrap around? */
+ len = sizeof(*new_xattr) + size;
+ if (len <= sizeof(*new_xattr))
+ return -ENOMEM;
+
+ new_xattr = kmalloc(len, GFP_KERNEL);
+ if (!new_xattr)
+ return -ENOMEM;
+
+ new_xattr->name = kstrdup(name, GFP_KERNEL);
+ if (!new_xattr->name) {
+ kfree(new_xattr);
+ return -ENOMEM;
+ }
+
+ new_xattr->size = size;
+ memcpy(new_xattr->value, value, size);
+ }
+
+ spin_lock(&info->lock);
+ list_for_each_entry(xattr, &info->xattr_list, list) {
+ if (!strcmp(name, xattr->name)) {
+ if (flags & XATTR_CREATE) {
+ xattr = new_xattr;
+ err = -EEXIST;
+ } else if (new_xattr) {
+ list_replace(&xattr->list, &new_xattr->list);
+ } else {
+ list_del(&xattr->list);
+ }
+ goto out;
+ }
+ }
+ if (flags & XATTR_REPLACE) {
+ xattr = new_xattr;
+ err = -ENODATA;
+ } else {
+ list_add(&new_xattr->list, &info->xattr_list);
+ xattr = NULL;
+ }
+out:
+ spin_unlock(&info->lock);
+ if (xattr)
+ kfree(xattr->name);
+ kfree(xattr);
+ return err;
}
-static const struct xattr_handler shmem_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = shmem_xattr_security_list,
- .get = shmem_xattr_security_get,
- .set = shmem_xattr_security_set,
-};
static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
&generic_acl_access_handler,
&generic_acl_default_handler,
- &shmem_xattr_security_handler,
+#endif
NULL
};
+
+static int shmem_xattr_validate(const char *name)
+{
+ struct { const char *prefix; size_t len; } arr[] = {
+ { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
+ { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ size_t preflen = arr[i].len;
+ if (strncmp(name, arr[i].prefix, preflen) == 0) {
+ if (!name[preflen])
+ return -EINVAL;
+ return 0;
+ }
+ }
+ return -EOPNOTSUPP;
+}
+
+static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, buffer, size);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return shmem_xattr_get(dentry, name, buffer, size);
+}
+
+static int shmem_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+
+ return shmem_xattr_set(dentry, name, value, size, flags);
+
+}
+
+static int shmem_removexattr(struct dentry *dentry, const char *name)
+{
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
+}
+
+static bool xattr_is_trusted(const char *name)
+{
+ return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+}
+
+static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ bool trusted = capable(CAP_SYS_ADMIN);
+ struct shmem_xattr *xattr;
+ struct shmem_inode_info *info;
+ size_t used = 0;
+
+ info = SHMEM_I(dentry->d_inode);
+
+ spin_lock(&info->lock);
+ list_for_each_entry(xattr, &info->xattr_list, list) {
+ size_t len;
+
+ /* skip "trusted." attributes for unprivileged callers */
+ if (!trusted && xattr_is_trusted(xattr->name))
+ continue;
+
+ len = strlen(xattr->name) + 1;
+ used += len;
+ if (buffer) {
+ if (size < used) {
+ used = -ERANGE;
+ break;
+ }
+ memcpy(buffer, xattr->name, len);
+ buffer += len;
+ }
+ }
+ spin_unlock(&info->lock);
+
+ return used;
+}
+#endif /* CONFIG_TMPFS_XATTR */
+
+static const struct inode_operations shmem_symlink_inline_operations = {
+ .readlink = generic_readlink,
+ .follow_link = shmem_follow_link_inline,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
#endif
+};
+
+static const struct inode_operations shmem_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = shmem_follow_link,
+ .put_link = shmem_put_link,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
+};
static struct dentry *shmem_get_parent(struct dentry *child)
{
@@ -2402,8 +2607,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
sb->s_time_gran = 1;
-#ifdef CONFIG_TMPFS_POSIX_ACL
+#ifdef CONFIG_TMPFS_XATTR
sb->s_xattr = shmem_xattr_handlers;
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
@@ -2501,11 +2708,13 @@ static const struct file_operations shmem_file_operations = {
static const struct inode_operations shmem_inode_operations = {
.setattr = shmem_notify_change,
.truncate_range = shmem_truncate_range,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = generic_listxattr,
- .removexattr = generic_removexattr,
.check_acl = generic_check_acl,
#endif
@@ -2523,23 +2732,27 @@ static const struct inode_operations shmem_dir_inode_operations = {
.mknod = shmem_mknod,
.rename = shmem_rename,
#endif
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_notify_change,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = generic_listxattr,
- .removexattr = generic_removexattr,
.check_acl = generic_check_acl,
#endif
};
static const struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_notify_change,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = generic_listxattr,
- .removexattr = generic_removexattr,
.check_acl = generic_check_acl,
#endif
};
diff --git a/mm/slub.c b/mm/slub.c
index 4ea7f1a..7be0223 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1831,7 +1831,6 @@ load_freelist:
page->inuse = page->objects;
page->freelist = NULL;
-unlock_out:
slab_unlock(page);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
@@ -1884,7 +1883,8 @@ debug:
deactivate_slab(s, c);
c->page = NULL;
c->node = NUMA_NO_NODE;
- goto unlock_out;
+ local_irq_restore(flags);
+ return object;
}
/*
diff --git a/mm/swap.c b/mm/swap.c
index 5602f1a..3a442f1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -272,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
memcg_reclaim_stat->recent_rotated[file]++;
}
-/*
- * FIXME: speed this up?
- */
-void activate_page(struct page *page)
+static void __activate_page(struct page *page, void *arg)
{
struct zone *zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int file = page_is_file_cache(page);
int lru = page_lru_base_type(page);
@@ -292,8 +288,45 @@ void activate_page(struct page *page)
update_page_reclaim_stat(zone, page, file, 1);
}
+}
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+
+static void activate_page_drain(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+}
+
+void activate_page(struct page *page)
+{
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ put_cpu_var(activate_page_pvecs);
+ }
+}
+
+#else
+static inline void activate_page_drain(int cpu)
+{
+}
+
+void activate_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ __activate_page(page, NULL);
spin_unlock_irq(&zone->lru_lock);
}
+#endif
/*
* Mark a page as having seen activity.
@@ -464,6 +497,8 @@ static void drain_cpu_pagevecs(int cpu)
pvec = &per_cpu(lru_deactivate_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
+ activate_page_drain(cpu);
}
/**
@@ -476,6 +511,13 @@ static void drain_cpu_pagevecs(int cpu)
*/
void deactivate_page(struct page *page)
{
+ /*
+ * In a workload with many unevictable page such as mprotect, unevictable
+ * page deactivation for accelerating reclaim is pointless.
+ */
+ if (PageUnevictable(page))
+ return;
+
if (likely(get_page_unless_zero(page))) {
struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8c6b3ce..d537d29 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,7 @@
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
+#include <linux/oom.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -1555,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
char *pathname;
+ int oom_score_adj;
int i, type, prev;
int err;
@@ -1613,9 +1615,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
- current->flags |= PF_OOM_ORIGIN;
+ oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = try_to_unuse(type);
- current->flags &= ~PF_OOM_ORIGIN;
+ test_set_oom_score_adj(oom_score_adj);
if (err) {
/*
diff --git a/mm/truncate.c b/mm/truncate.c
index a956675..3a29a61 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
+#include <linux/cleancache.h>
#include "internal.h"
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+ cleancache_flush_page(page->mapping, page);
if (page_has_private(page))
do_invalidatepage(page, partial);
}
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pgoff_t next;
int i;
+ cleancache_flush_inode(mapping);
if (mapping->nrpages == 0)
return;
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
}
+ cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int did_range_unmap = 0;
int wrapped = 0;
+ cleancache_flush_inode(mapping);
pagevec_init(&pvec, 0);
next = start;
while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
mem_cgroup_uncharge_end();
cond_resched();
}
+ cleancache_flush_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/util.c b/mm/util.c
index e7b103a..88ea1bd 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,8 @@
#include <linux/sched.h>
#include <asm/uaccess.h>
+#include "internal.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
@@ -215,6 +217,28 @@ char *strndup_user(const char __user *s, long n)
}
EXPORT_SYMBOL(strndup_user);
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent)
+{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
+ if (prev) {
+ next = prev->vm_next;
+ prev->vm_next = vma;
+ } else {
+ mm->mmap = vma;
+ if (rb_parent)
+ next = rb_entry(rb_parent,
+ struct vm_area_struct, vm_rb);
+ else
+ next = NULL;
+ }
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
+}
+
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5d60302..1d34d75 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -375,7 +375,7 @@ nocache:
/* find starting point for our search */
if (free_vmap_cache) {
first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ addr = ALIGN(first->va_end, align);
if (addr < vstart)
goto nocache;
if (addr + size - 1 < addr)
@@ -406,10 +406,10 @@ nocache:
}
/* from the starting point, walk areas until a suitable hole is found */
- while (addr + size >= first->va_start && addr + size <= vend) {
+ while (addr + size > first->va_start && addr + size <= vend) {
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ addr = ALIGN(first->va_end, align);
if (addr + size - 1 < addr)
goto overflow;
@@ -1534,6 +1534,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, void *caller)
{
+ const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1560,11 +1561,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
+ gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
if (node < 0)
- page = alloc_page(gfp_mask);
+ page = alloc_page(tmp_mask);
else
- page = alloc_pages_node(node, gfp_mask, 0);
+ page = alloc_pages_node(node, tmp_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1579,6 +1581,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
+ warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
+ "allocated %ld of %ld bytes\n",
+ (area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}
@@ -2148,10 +2153,6 @@ struct vm_struct *alloc_vm_area(size_t size)
return NULL;
}
- /* Make sure the pagetables are constructed in process kernel
- mappings */
- vmalloc_sync_all();
-
return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c917720..faa0a08 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -173,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
struct scan_control *sc, enum lru_list lru)
{
if (!scanning_global_lru(sc))
- return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
+ return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
return zone_page_state(zone, NR_LRU_BASE + lru);
}
@@ -202,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
+static inline int do_shrinker_shrink(struct shrinker *shrinker,
+ struct shrink_control *sc,
+ unsigned long nr_to_scan)
+{
+ sc->nr_to_scan = nr_to_scan;
+ return (*shrinker->shrink)(shrinker, sc);
+}
+
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
@@ -222,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+unsigned long shrink_slab(struct shrink_control *shrink,
+ unsigned long nr_pages_scanned,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
- if (scanned == 0)
- scanned = SWAP_CLUSTER_MAX;
+ if (nr_pages_scanned == 0)
+ nr_pages_scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
+ if (!down_read_trylock(&shrinker_rwsem)) {
+ /* Assume we'll be able to shrink next time */
+ ret = 1;
+ goto out;
+ }
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass;
- max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- delta = (4 * scanned) / shrinker->seeks;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
@@ -267,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
- gfp_mask);
+ nr_before = do_shrinker_shrink(shrinker, shrink, 0);
+ shrink_ret = do_shrinker_shrink(shrinker, shrink,
+ this_scan);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
@@ -283,6 +295,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+out:
+ cond_resched();
return ret;
}
@@ -1202,13 +1216,16 @@ int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
+ VM_BUG_ON(!page_count(page));
+
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && get_page_unless_zero(page)) {
+ if (PageLRU(page)) {
int lru = page_lru(page);
ret = 0;
+ get_page(page);
ClearPageLRU(page);
del_page_from_lru_list(zone, page, lru);
@@ -1701,26 +1718,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
}
/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan)
-{
- unsigned long nr;
-
- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
-
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
- else
- nr = 0;
-
- return nr;
-}
-
-/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
@@ -1738,6 +1735,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
+ int force_scan = 0;
+
+
+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
+ if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
+ /* kswapd does zone balancing and need to scan this zone */
+ if (scanning_global_lru(sc) && current_is_kswapd())
+ force_scan = 1;
+ /* memcg may have small limit and need to avoid priority drop */
+ if (!scanning_global_lru(sc))
+ force_scan = 1;
+ }
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1748,11 +1761,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
goto out;
}
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
@@ -1819,8 +1827,23 @@ out:
scan >>= priority;
scan = div64_u64(scan * fraction[file], denominator);
}
- nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l]);
+
+ /*
+ * If zone is small or memcg is small, nr[l] can be 0.
+ * This results no-scan on this priority and priority drop down.
+ * For global direct reclaim, it can visit next zone and tend
+ * not to have problems. For global kswapd, it's for zone
+ * balancing and it need to scan a small amounts. When using
+ * memcg, priority drop can cause big latency. So, it's better
+ * to scan small amount. See may_noscan above.
+ */
+ if (!scan && force_scan) {
+ if (file)
+ scan = SWAP_CLUSTER_MAX;
+ else if (!noswap)
+ scan = SWAP_CLUSTER_MAX;
+ }
+ nr[l] = scan;
}
}
@@ -1960,11 +1983,14 @@ restart:
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
+ unsigned long total_scanned = 0;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1981,8 +2007,17 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue; /* Let kswapd poll it */
}
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ total_scanned += nr_soft_scanned;
+
shrink_zone(priority, zone, sc);
}
+
+ return total_scanned;
}
static bool zone_reclaimable(struct zone *zone)
@@ -2027,7 +2062,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc)
+ struct scan_control *sc,
+ struct shrink_control *shrink)
{
int priority;
unsigned long total_scanned = 0;
@@ -2046,7 +2082,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- shrink_zones(priority, zonelist, sc);
+ total_scanned += shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -2061,7 +2097,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
lru_pages += zone_reclaimable_pages(zone);
}
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(shrink, sc->nr_scanned, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -2133,12 +2169,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.mem_cgroup = NULL,
.nodemask = nodemask,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
@@ -2150,9 +2189,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
- struct zone *zone)
+ struct zone *zone,
+ unsigned long *nr_scanned)
{
struct scan_control sc = {
+ .nr_scanned = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -2161,6 +2202,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.order = 0,
.mem_cgroup = mem,
};
+
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2179,6 +2221,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ *nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
@@ -2189,6 +2232,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -2198,17 +2242,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.order = 0,
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
+ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ };
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
};
- sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
- (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+ * take care of from where we get pages. So the node where we start the
+ * scan does not need to be the current node.
+ */
+ nid = mem_cgroup_select_victim_node(mem_cont);
+
+ zonelist = NODE_DATA(nid)->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -2287,7 +2341,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
* must be balanced
*/
if (order)
- return pgdat_balanced(pgdat, balanced, classzone_idx);
+ return !pgdat_balanced(pgdat, balanced, classzone_idx);
else
return !all_zones_ok;
}
@@ -2323,6 +2377,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
@@ -2336,6 +2392,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
.order = order,
.mem_cgroup = NULL,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
@@ -2412,11 +2471,15 @@ loop_again:
sc.nr_scanned = 0;
+ nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
- * For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ order, sc.gfp_mask,
+ &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+ total_scanned += nr_soft_scanned;
/*
* We put equal pressure on every zone, unless
@@ -2435,8 +2498,7 @@ loop_again:
end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
@@ -2788,7 +2850,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.swappiness = vm_swappiness,
.order = 0,
};
- struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
@@ -2797,7 +2862,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -2972,6 +3037,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.swappiness = vm_swappiness,
.order = order,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
unsigned long nr_slab_pages0, nr_slab_pages1;
cond_resched();
@@ -3013,7 +3081,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
unsigned long lru_pages = zone_reclaimable_pages(zone);
/* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
break;
/* Freed enough memory */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 897ea9e..20c18b7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -157,7 +157,7 @@ int calculate_normal_threshold(struct zone *zone)
/*
* Refresh the thresholds for each zone.
*/
-static void refresh_zone_stat_thresholds(void)
+void refresh_zone_stat_thresholds(void)
{
struct zone *zone;
int cpu;
@@ -659,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
}
#endif
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
+#ifdef CONFIG_ZONE_DMA
+#define TEXT_FOR_DMA(xx) xx "_dma",
+#else
+#define TEXT_FOR_DMA(xx)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+
+#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
+ TEXT_FOR_HIGHMEM(xx) xx "_movable",
+
+const char * const vmstat_text[] = {
+ /* Zoned VM counters */
+ "nr_free_pages",
+ "nr_inactive_anon",
+ "nr_active_anon",
+ "nr_inactive_file",
+ "nr_active_file",
+ "nr_unevictable",
+ "nr_mlock",
+ "nr_anon_pages",
+ "nr_mapped",
+ "nr_file_pages",
+ "nr_dirty",
+ "nr_writeback",
+ "nr_slab_reclaimable",
+ "nr_slab_unreclaimable",
+ "nr_page_table_pages",
+ "nr_kernel_stack",
+ "nr_unstable",
+ "nr_bounce",
+ "nr_vmscan_write",
+ "nr_writeback_temp",
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "nr_shmem",
+ "nr_dirtied",
+ "nr_written",
+
+#ifdef CONFIG_NUMA
+ "numa_hit",
+ "numa_miss",
+ "numa_foreign",
+ "numa_interleave",
+ "numa_local",
+ "numa_other",
+#endif
+ "nr_anon_transparent_hugepages",
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ "pgpgin",
+ "pgpgout",
+ "pswpin",
+ "pswpout",
+
+ TEXTS_FOR_ZONES("pgalloc")
+
+ "pgfree",
+ "pgactivate",
+ "pgdeactivate",
+
+ "pgfault",
+ "pgmajfault",
+
+ TEXTS_FOR_ZONES("pgrefill")
+ TEXTS_FOR_ZONES("pgsteal")
+ TEXTS_FOR_ZONES("pgscan_kswapd")
+ TEXTS_FOR_ZONES("pgscan_direct")
+
+#ifdef CONFIG_NUMA
+ "zone_reclaim_failed",
+#endif
+ "pginodesteal",
+ "slabs_scanned",
+ "kswapd_steal",
+ "kswapd_inodesteal",
+ "kswapd_low_wmark_hit_quickly",
+ "kswapd_high_wmark_hit_quickly",
+ "kswapd_skip_congestion_wait",
+ "pageoutrun",
+ "allocstall",
+
+ "pgrotated",
+
+#ifdef CONFIG_COMPACTION
+ "compact_blocks_moved",
+ "compact_pages_moved",
+ "compact_pagemigrate_failed",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+ "htlb_buddy_alloc_success",
+ "htlb_buddy_alloc_fail",
+#endif
+ "unevictable_pgs_culled",
+ "unevictable_pgs_scanned",
+ "unevictable_pgs_rescued",
+ "unevictable_pgs_mlocked",
+ "unevictable_pgs_munlocked",
+ "unevictable_pgs_cleared",
+ "unevictable_pgs_stranded",
+ "unevictable_pgs_mlockfreed",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "thp_fault_alloc",
+ "thp_fault_fallback",
+ "thp_collapse_alloc",
+ "thp_collapse_alloc_failed",
+ "thp_split",
+#endif
+
+#endif /* CONFIG_VM_EVENTS_COUNTERS */
+};
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
+
+
#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
@@ -831,135 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
.release = seq_release,
};
-#ifdef CONFIG_ZONE_DMA
-#define TEXT_FOR_DMA(xx) xx "_dma",
-#else
-#define TEXT_FOR_DMA(xx)
-#endif
-
-#ifdef CONFIG_ZONE_DMA32
-#define TEXT_FOR_DMA32(xx) xx "_dma32",
-#else
-#define TEXT_FOR_DMA32(xx)
-#endif
-
-#ifdef CONFIG_HIGHMEM
-#define TEXT_FOR_HIGHMEM(xx) xx "_high",
-#else
-#define TEXT_FOR_HIGHMEM(xx)
-#endif
-
-#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
-
-static const char * const vmstat_text[] = {
- /* Zoned VM counters */
- "nr_free_pages",
- "nr_inactive_anon",
- "nr_active_anon",
- "nr_inactive_file",
- "nr_active_file",
- "nr_unevictable",
- "nr_mlock",
- "nr_anon_pages",
- "nr_mapped",
- "nr_file_pages",
- "nr_dirty",
- "nr_writeback",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
- "nr_page_table_pages",
- "nr_kernel_stack",
- "nr_unstable",
- "nr_bounce",
- "nr_vmscan_write",
- "nr_writeback_temp",
- "nr_isolated_anon",
- "nr_isolated_file",
- "nr_shmem",
- "nr_dirtied",
- "nr_written",
-
-#ifdef CONFIG_NUMA
- "numa_hit",
- "numa_miss",
- "numa_foreign",
- "numa_interleave",
- "numa_local",
- "numa_other",
-#endif
- "nr_anon_transparent_hugepages",
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
-
-#ifdef CONFIG_VM_EVENT_COUNTERS
- "pgpgin",
- "pgpgout",
- "pswpin",
- "pswpout",
-
- TEXTS_FOR_ZONES("pgalloc")
-
- "pgfree",
- "pgactivate",
- "pgdeactivate",
-
- "pgfault",
- "pgmajfault",
-
- TEXTS_FOR_ZONES("pgrefill")
- TEXTS_FOR_ZONES("pgsteal")
- TEXTS_FOR_ZONES("pgscan_kswapd")
- TEXTS_FOR_ZONES("pgscan_direct")
-
-#ifdef CONFIG_NUMA
- "zone_reclaim_failed",
-#endif
- "pginodesteal",
- "slabs_scanned",
- "kswapd_steal",
- "kswapd_inodesteal",
- "kswapd_low_wmark_hit_quickly",
- "kswapd_high_wmark_hit_quickly",
- "kswapd_skip_congestion_wait",
- "pageoutrun",
- "allocstall",
-
- "pgrotated",
-
-#ifdef CONFIG_COMPACTION
- "compact_blocks_moved",
- "compact_pages_moved",
- "compact_pagemigrate_failed",
- "compact_stall",
- "compact_fail",
- "compact_success",
-#endif
-
-#ifdef CONFIG_HUGETLB_PAGE
- "htlb_buddy_alloc_success",
- "htlb_buddy_alloc_fail",
-#endif
- "unevictable_pgs_culled",
- "unevictable_pgs_scanned",
- "unevictable_pgs_rescued",
- "unevictable_pgs_mlocked",
- "unevictable_pgs_munlocked",
- "unevictable_pgs_cleared",
- "unevictable_pgs_stranded",
- "unevictable_pgs_mlockfreed",
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- "thp_fault_alloc",
- "thp_fault_fallback",
- "thp_collapse_alloc",
- "thp_collapse_alloc_failed",
- "thp_split",
-#endif
-
-#endif /* CONFIG_VM_EVENTS_COUNTERS */
-};
-
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1198,7 +1201,6 @@ static int __init setup_vmstat(void)
#ifdef CONFIG_SMP
int cpu;
- refresh_zone_stat_thresholds();
register_cpu_notifier(&vmstat_notifier);
for_each_online_cpu(cpu)
OpenPOWER on IntegriCloud