diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-24 07:53:22 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-24 07:53:22 -0700 |
commit | db16826367fefcb0ddb93d76b66adc52eb4e6339 (patch) | |
tree | 626224c1eb1eb79c522714591f208b4fdbdcd9d4 /mm | |
parent | cd6045138ed1bb5d8773e940d51c34318eef3ef2 (diff) | |
parent | 465fdd97cbe16ef8727221857e96ef62dd352017 (diff) | |
download | op-kernel-dev-db16826367fefcb0ddb93d76b66adc52eb4e6339.zip op-kernel-dev-db16826367fefcb0ddb93d76b66adc52eb4e6339.tar.gz |
Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (21 commits)
HWPOISON: Enable error_remove_page on btrfs
HWPOISON: Add simple debugfs interface to inject hwpoison on arbitary PFNs
HWPOISON: Add madvise() based injector for hardware poisoned pages v4
HWPOISON: Enable error_remove_page for NFS
HWPOISON: Enable .remove_error_page for migration aware file systems
HWPOISON: The high level memory error handler in the VM v7
HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process
HWPOISON: shmem: call set_page_dirty() with locked page
HWPOISON: Define a new error_remove_page address space op for async truncation
HWPOISON: Add invalidate_inode_page
HWPOISON: Refactor truncate to allow direct truncating of page v2
HWPOISON: check and isolate corrupted free pages v2
HWPOISON: Handle hardware poisoned pages in try_to_unmap
HWPOISON: Use bitmask/action code for try_to_unmap behaviour
HWPOISON: x86: Add VM_FAULT_HWPOISON handling to x86 page fault handler v2
HWPOISON: Add poison check to page fault handling
HWPOISON: Add basic support for poisoned pages in fault handler v3
HWPOISON: Add new SIGBUS error codes for hardware poison signals
HWPOISON: Add support for poison swap entries v2
HWPOISON: Export some rmap vma locking to outside world
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 14 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/filemap.c | 4 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 41 | ||||
-rw-r--r-- | mm/madvise.c | 30 | ||||
-rw-r--r-- | mm/memory-failure.c | 832 | ||||
-rw-r--r-- | mm/memory.c | 24 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 7 | ||||
-rw-r--r-- | mm/page_alloc.c | 20 | ||||
-rw-r--r-- | mm/rmap.c | 60 | ||||
-rw-r--r-- | mm/shmem.c | 5 | ||||
-rw-r--r-- | mm/swapfile.c | 4 | ||||
-rw-r--r-- | mm/truncate.c | 72 | ||||
-rw-r--r-- | mm/vmscan.c | 2 |
15 files changed, 1067 insertions, 52 deletions
@@ -245,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR /proc/sys/vm/mmap_min_addr tunable. +config MEMORY_FAILURE + depends on MMU + depends on X86_MCE + bool "Enable recovery from hardware memory errors" + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "Poison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL + config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" depends on !MMU diff --git a/mm/Makefile b/mm/Makefile index 88193d7..515fd79 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/filemap.c b/mm/filemap.c index bcc7372..c1fc205 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -104,6 +104,10 @@ * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) + * + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 0000000..e1d8513 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,41 @@ +/* Inject a hwpoison memory failure on a arbitary pfn */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +static struct dentry *hwpoison_dir, *corrupt_pfn; + +static int hwpoison_inject(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); + return __memory_failure(val, 18, 0); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); + +static void pfn_inject_exit(void) +{ + if (hwpoison_dir) + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + NULL, &hwpoison_fops); + if (corrupt_pfn == NULL) { + pfn_inject_exit(); + return -ENOMEM; + } + return 0; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/madvise.c b/mm/madvise.c index d9ae206..35b1479b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages(current, current->mm, start, 1, + 0, 0, &p, NULL); + if (ret != 1) + return ret; + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, 1); + put_page(p); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON) + return madvise_hwpoison(start, start+len_in); +#endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 0000000..729d4b1 --- /dev/null +++ b/mm/memory-failure.c @@ -0,0 +1,832 @@ +/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * The operation to map back from RMAP chains to processes has to walk + * the complete process list and has non linear complexity with the number + * mappings. In short it can be quite slow. But since memory corruptions + * are rare we hope to get away with this. + */ + +/* + * Notebook: + * - hugetlb needs more code + * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages + * - pass bad pages to kdump next kernel + */ +#define DEBUG 1 /* remove me in 2.6.34 */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/sched.h> +#include <linux/rmap.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/backing-dev.h> +#include "internal.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); + +/* + * Send all the processes who have the page mapped an ``action optional'' + * signal. + */ +static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn) +{ + struct siginfo si; + int ret; + + printk(KERN_ERR + "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + pfn, t->comm, t->pid); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_MCEERR_AO; + si.si_addr = (void *)addr; +#ifdef __ARCH_SI_TRAPNO + si.si_trapno = trapno; +#endif + si.si_addr_lsb = PAGE_SHIFT; + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully noone will do that? + */ + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + if (ret < 0) + printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); + return ret; +} + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + unsigned addr_valid:1; +}; + +/* + * Failure handling: if we can't find or can't kill a process there's + * not much we can do. We just print a message and ignore otherwise. + */ + +/* + * Schedule a process for later kill. + * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * TBD would GFP_NOIO be enough? + */ +static void add_to_kill(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, + struct list_head *to_kill, + struct to_kill **tkc) +{ + struct to_kill *tk; + + if (*tkc) { + tk = *tkc; + *tkc = NULL; + } else { + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + printk(KERN_ERR + "MCE: Out of memory while machine check handling\n"); + return; + } + } + tk->addr = page_address_in_vma(p, vma); + tk->addr_valid = 1; + + /* + * In theory we don't have to kill when the page was + * munmaped. But it could be also a mremap. Since that's + * likely very rare kill anyways just out of paranoia, but use + * a SIGKILL because the error is not contained anymore. + */ + if (tk->addr == -EFAULT) { + pr_debug("MCE: Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); + tk->addr_valid = 0; + } + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Kill the processes that have been collected earlier. + * + * Only do anything when DOIT is set, otherwise just free the list + * (this is used for clean pages which do not need killing) + * Also when FAIL is set do a force kill because something went + * wrong earlier. + */ +static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, + int fail, unsigned long pfn) +{ + struct to_kill *tk, *next; + + list_for_each_entry_safe (tk, next, to_kill, nd) { + if (doit) { + /* + * In case something went wrong with munmaping + * make sure the process doesn't catch the + * signal and then access the memory. Just kill it. + * the signal handlers + */ + if (fail || tk->addr_valid == 0) { + printk(KERN_ERR + "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); + force_sig(SIGKILL, tk->tsk); + } + + /* + * In theory the process could have mapped + * something else on the address in-between. We could + * check for that, but we need to tell the + * process anyways. + */ + else if (kill_proc_ao(tk->tsk, tk->addr, trapno, + pfn) < 0) + printk(KERN_ERR + "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); + } + put_task_struct(tk->tsk); + kfree(tk); + } +} + +static int task_early_kill(struct task_struct *tsk) +{ + if (!tsk->mm) + return 0; + if (tsk->flags & PF_MCE_PROCESS) + return !!(tsk->flags & PF_MCE_EARLY); + return sysctl_memory_failure_early_kill; +} + +/* + * Collect processes when the error hit an anonymous page. + */ +static void collect_procs_anon(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct anon_vma *av; + + read_lock(&tasklist_lock); + av = page_lock_anon_vma(page); + if (av == NULL) /* Not actually mapped anymore */ + goto out; + for_each_process (tsk) { + if (!task_early_kill(tsk)) + continue; + list_for_each_entry (vma, &av->head, anon_vma_node) { + if (!page_mapped_in_vma(page, vma)) + continue; + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + page_unlock_anon_vma(av); +out: + read_unlock(&tasklist_lock); +} + +/* + * Collect processes when the error hit a file mapped page. + */ +static void collect_procs_file(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct prio_tree_iter iter; + struct address_space *mapping = page->mapping; + + /* + * A note on the locking order between the two locks. + * We don't rely on this particular order. + * If you have some other code that needs a different order + * feel free to switch them around. Or add a reverse link + * from mm_struct to task_struct, then this could be all + * done without taking tasklist_lock and looping over all tasks. + */ + + read_lock(&tasklist_lock); + spin_lock(&mapping->i_mmap_lock); + for_each_process(tsk) { + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!task_early_kill(tsk)) + continue; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + pgoff) { + /* + * Send early kill signal to tasks where a vma covers + * the page but the corrupted page is not necessarily + * mapped it in its pte. + * Assume applications who requested early kill want + * to be informed of all such data corruptions. + */ + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + spin_unlock(&mapping->i_mmap_lock); + read_unlock(&tasklist_lock); +} + +/* + * Collect the processes who have the corrupted page mapped to kill. + * This is done in two steps for locking reasons. + * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +static void collect_procs(struct page *page, struct list_head *tokill) +{ + struct to_kill *tk; + + if (!page->mapping) + return; + + tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); + if (!tk) + return; + if (PageAnon(page)) + collect_procs_anon(page, tokill, &tk); + else + collect_procs_file(page, tokill, &tk); + kfree(tk); +} + +/* + * Error handlers for various types of pages. + */ + +enum outcome { + FAILED, /* Error handling failed */ + DELAYED, /* Will be handled later */ + IGNORED, /* Error safely ignored */ + RECOVERED, /* Successfully recovered */ +}; + +static const char *action_name[] = { + [FAILED] = "Failed", + [DELAYED] = "Delayed", + [IGNORED] = "Ignored", + [RECOVERED] = "Recovered", +}; + +/* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. + */ +static int me_kernel(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Already poisoned page. + */ +static int me_ignore(struct page *p, unsigned long pfn) +{ + return IGNORED; +} + +/* + * Page in unknown state. Do nothing. + */ +static int me_unknown(struct page *p, unsigned long pfn) +{ + printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + return FAILED; +} + +/* + * Free memory + */ +static int me_free(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Clean (or cleaned) page cache page. + */ +static int me_pagecache_clean(struct page *p, unsigned long pfn) +{ + int err; + int ret = FAILED; + struct address_space *mapping; + + if (!isolate_lru_page(p)) + page_cache_release(p); + + /* + * For anonymous pages we're done the only reference left + * should be the one m_f() holds. + */ + if (PageAnon(p)) + return RECOVERED; + + /* + * Now truncate the page in the page cache. This is really + * more like a "temporary hole punch" + * Don't do this for block devices when someone else + * has a reference, because it could be file system metadata + * and that's not safe to truncate. + */ + mapping = page_mapping(p); + if (!mapping) { + /* + * Page has been teared down in the meanwhile + */ + return FAILED; + } + + /* + * Truncation is a bit tricky. Enable it per file system for now. + * + * Open: to take i_mutex or not for this? Right now we don't. + */ + if (mapping->a_ops->error_remove_page) { + err = mapping->a_ops->error_remove_page(mapping, p); + if (err != 0) { + printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_debug("MCE %#lx: failed to release buffers\n", pfn); + } else { + ret = RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = RECOVERED; + else + printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", + pfn); + } + return ret; +} + +/* + * Dirty cache page page + * Issues: when the error hit a hole page the error is not properly + * propagated. + */ +static int me_pagecache_dirty(struct page *p, unsigned long pfn) +{ + struct address_space *mapping = page_mapping(p); + + SetPageError(p); + /* TBD: print more information about the file. */ + if (mapping) { + /* + * IO error will be reported by write(), fsync(), etc. + * who check the mapping. + * This way the application knows that something went + * wrong with its dirty file data. + * + * There's one open issue: + * + * The EIO will be only reported on the next IO + * operation and then cleared through the IO map. + * Normally Linux has two mechanisms to pass IO error + * first through the AS_EIO flag in the address space + * and then through the PageError flag in the page. + * Since we drop pages on memory failure handling the + * only mechanism open to use is through AS_AIO. + * + * This has the disadvantage that it gets cleared on + * the first operation that returns an error, while + * the PageError bit is more sticky and only cleared + * when the page is reread or dropped. If an + * application assumes it will always get error on + * fsync, but does other operations on the fd before + * and the page is dropped inbetween then the error + * will not be properly reported. + * + * This can already happen even without hwpoisoned + * pages: first on metadata IO errors (which only + * report through AS_EIO) or when the page is dropped + * at the wrong time. + * + * So right now we assume that the application DTRT on + * the first EIO, but we're not worse than other parts + * of the kernel. + */ + mapping_set_error(mapping, EIO); + } + + return me_pagecache_clean(p, pfn); +} + +/* + * Clean and dirty swap cache. + * + * Dirty swap cache page is tricky to handle. The page could live both in page + * cache and swap cache(ie. page is freshly swapped in). So it could be + * referenced concurrently by 2 types of PTEs: + * normal PTEs and swap PTEs. We try to handle them consistently by calling + * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * and then + * - clear dirty bit to prevent IO + * - remove from LRU + * - but keep in the swap cache, so that when we return to it on + * a later page fault, we know the application is accessing + * corrupted data and shall be killed (we installed simple + * interception code in do_swap_page to catch it). + * + * Clean swap cache pages can be directly isolated. A later page fault will + * bring in the known good data from disk. + */ +static int me_swapcache_dirty(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + ClearPageDirty(p); + /* Trigger EIO in shmem: */ + ClearPageUptodate(p); + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = DELAYED; + } + + return ret; +} + +static int me_swapcache_clean(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = RECOVERED; + } + delete_from_swap_cache(p); + return ret; +} + +/* + * Huge pages. Needs work. + * Issues: + * No rmap support so we cannot find the original mapper. In theory could walk + * all MMs and look for the mappings, but that would be non atomic and racy. + * Need rmap for hugepages for this. Alternatively we could employ a heuristic, + * like just walking the current process and hoping it has it mapped (that + * should be usually true for the common "shared database cache" case) + * Should handle free huge pages and dequeue them too, but this needs to + * handle huge page accounting correctly. + */ +static int me_huge_page(struct page *p, unsigned long pfn) +{ + return FAILED; +} + +/* + * Various page states we can handle. + * + * A page state is defined by its current page->flags bits. + * The table matches them in order and calls the right handler. + * + * This is quite tricky because we can access page at any time + * in its live cycle, so all accesses have to be extremly careful. + * + * This is not complete. More states could be added. + * For any missing state don't attempt recovery. + */ + +#define dirty (1UL << PG_dirty) +#define sc (1UL << PG_swapcache) +#define unevict (1UL << PG_unevictable) +#define mlock (1UL << PG_mlocked) +#define writeback (1UL << PG_writeback) +#define lru (1UL << PG_lru) +#define swapbacked (1UL << PG_swapbacked) +#define head (1UL << PG_head) +#define tail (1UL << PG_tail) +#define compound (1UL << PG_compound) +#define slab (1UL << PG_slab) +#define buddy (1UL << PG_buddy) +#define reserved (1UL << PG_reserved) + +static struct page_state { + unsigned long mask; + unsigned long res; + char *msg; + int (*action)(struct page *p, unsigned long pfn); +} error_states[] = { + { reserved, reserved, "reserved kernel", me_ignore }, + { buddy, buddy, "free kernel", me_free }, + + /* + * Could in theory check if slab page is free or if we can drop + * currently unused objects without touching them. But just + * treat it as standard kernel for now. + */ + { slab, slab, "kernel slab", me_kernel }, + +#ifdef CONFIG_PAGEFLAGS_EXTENDED + { head, head, "huge", me_huge_page }, + { tail, tail, "huge", me_huge_page }, +#else + { compound, compound, "huge", me_huge_page }, +#endif + + { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "swapcache", me_swapcache_clean }, + + { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, + { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT + { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "mlocked LRU", me_pagecache_clean }, +#endif + + { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, + + /* + * Catchall entry: must be at end. + */ + { 0, 0, "unknown page state", me_unknown }, +}; + +#undef lru + +static void action_result(unsigned long pfn, char *msg, int result) +{ + struct page *page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", + pfn, + page && PageDirty(page) ? "dirty " : "", + msg, action_name[result]); +} + +static int page_action(struct page_state *ps, struct page *p, + unsigned long pfn, int ref) +{ + int result; + + result = ps->action(p, pfn); + action_result(pfn, ps->msg, result); + if (page_count(p) != 1 + ref) + printk(KERN_ERR + "MCE %#lx: %s page still referenced by %d users\n", + pfn, ps->msg, page_count(p) - 1); + + /* Could do more checks here if page looks ok */ + /* + * Could adjust zone counters here to correct for the missing page. + */ + + return result == RECOVERED ? 0 : -EBUSY; +} + +#define N_UNMAP_TRIES 5 + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static void hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno) +{ + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int i; + int kill = 1; + + if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + return; + + if (!PageLRU(p)) + lru_add_drain_all(); + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(p)) + return; + + if (PageSwapCache(p)) { + printk(KERN_ERR + "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + ttu |= TTU_IGNORE_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + */ + mapping = page_mapping(p); + if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(p)) { + SetPageDirty(p); + } else { + kill = 0; + ttu |= TTU_IGNORE_HWPOISON; + printk(KERN_INFO + "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + * + * Error handling: We ignore errors here because + * there's nothing that can be done. + */ + if (kill) + collect_procs(p, &tokill); + + /* + * try_to_unmap can fail temporarily due to races. + * Try a few times (RED-PEN better strategy?) + */ + for (i = 0; i < N_UNMAP_TRIES; i++) { + ret = try_to_unmap(p, ttu); + if (ret == SWAP_SUCCESS) + break; + pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); + } + + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(p)); + + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if + * killing is needed or not. Only kill when the page + * was dirty, otherwise the tokill list is merely + * freed. When there was a problem unmapping earlier + * use a more force-full uncatchable kill to prevent + * any accesses to the poisoned memory. + */ + kill_procs_ao(&tokill, !!PageDirty(p), trapno, + ret != SWAP_SUCCESS, pfn); +} + +int __memory_failure(unsigned long pfn, int trapno, int ref) +{ + struct page_state *ps; + struct page *p; + int res; + + if (!sysctl_memory_failure_recovery) + panic("Memory failure from trap %d on page %lx", trapno, pfn); + + if (!pfn_valid(pfn)) { + action_result(pfn, "memory outside kernel control", IGNORED); + return -EIO; + } + + p = pfn_to_page(pfn); + if (TestSetPageHWPoison(p)) { + action_result(pfn, "already hardware poisoned", IGNORED); + return 0; + } + + atomic_long_add(1, &mce_bad_pages); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. + * 2) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. + * In fact it's dangerous to directly bump up page count from 0, + * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + */ + if (!get_page_unless_zero(compound_head(p))) { + action_result(pfn, "free or high order kernel", IGNORED); + return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + } + + /* + * Lock the page and wait for writeback to finish. + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ + lock_page_nosync(p); + wait_on_page_writeback(p); + + /* + * Now take care of user space mappings. + */ + hwpoison_user_mappings(p, pfn, trapno); + + /* + * Torn down by someone else? + */ + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + action_result(pfn, "already truncated LRU", IGNORED); + res = 0; + goto out; + } + + res = -EBUSY; + for (ps = error_states;; ps++) { + if ((p->flags & ps->mask) == ps->res) { + res = page_action(ps, p, pfn, ref); + break; + } + } +out: + unlock_page(p); + return res; +} +EXPORT_SYMBOL_GPL(__memory_failure); + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +void memory_failure(unsigned long pfn, int trapno) +{ + __memory_failure(pfn, trapno, 0); +} diff --git a/mm/memory.c b/mm/memory.c index b1443ac07..987389a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1325,7 +1325,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - else if (ret & VM_FAULT_SIGBUS) + if (ret & + (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -2559,8 +2560,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else { + print_bad_pte(vma, address, orig_pte, NULL); + ret = VM_FAULT_OOM; + } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); @@ -2584,6 +2592,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + } else if (PageHWPoison(page)) { + ret = VM_FAULT_HWPOISON; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out; } lock_page(page); @@ -2760,6 +2772,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + return VM_FAULT_HWPOISON; + } + /* * For consistency in subsequent calls, make the faulted page always * locked. diff --git a/mm/migrate.c b/mm/migrate.c index 16052e8..1a4bf48 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, 1); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); skip_unmap: if (!page_mapped(page)) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index be197f7..d99664e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1149,6 +1149,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* + * Dirty a page. + * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 88248b3..bf72055 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -234,6 +234,12 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + __ClearPageBuddy(page); + return; + } + /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static inline int check_new_page(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) bad_page(page); return 1; } + return 0; +} + +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (unlikely(check_new_page(p))) + return 1; + } set_page_private(page, 0); set_page_refcounted(page); @@ -36,6 +36,11 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) + * + * (code doesn't rely on that order so it could be switched around) + * ->tasklist_lock + * anon_vma->lock (memory_failure, collect_procs_anon) + * pte map lock */ #include <linux/mm.h> @@ -191,7 +196,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -211,7 +216,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * if the page is not mapped into the page tables of this VMA. Only * valid for normal file or anonymous VMAs. */ -static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { unsigned long address; pte_t *pte; @@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int migration) + enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration) { + if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { ret = SWAP_MLOCK; goto out_unmap; } + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageAnon(page)) { + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageAnon(page)) + dec_mm_counter(mm, anon_rss); + else + dec_mm_counter(mm, file_rss); + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; if (PageSwapCache(page)) { @@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(!migration); + BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && migration) { + } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_anon(struct page *page, int unlock, int migration) +static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; struct vm_area_struct *vma; unsigned int mlocked = 0; int ret = SWAP_AGAIN; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) continue; /* must visit all unlocked vmas */ ret = SWAP_MLOCK; /* saw at least one mlocked vma */ } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) /** * try_to_unmap_file - unmap/unlock file page using the object-based rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int unlock, int migration) +static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) unsigned long max_nl_size = 0; unsigned int mapcount; unsigned int mlocked = 0; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) ret = SWAP_MLOCK; /* leave mlocked == 0 */ goto out; /* no need to look further */ } - if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !migration && + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; @@ -1177,7 +1193,7 @@ out: /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped - * @migration: migration flag + * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. @@ -1188,16 +1204,16 @@ out: * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. */ -int try_to_unmap(struct page *page, int migration) +int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, 0, migration); + ret = try_to_unmap_anon(page, flags); else - ret = try_to_unmap_file(page, 0, migration); + ret = try_to_unmap_file(page, flags); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page) VM_BUG_ON(!PageLocked(page) || PageLRU(page)); if (PageAnon(page)) - return try_to_unmap_anon(page, 1, 0); + return try_to_unmap_anon(page, TTU_MUNLOCK); else - return try_to_unmap_file(page, 1, 0); + return try_to_unmap_file(page, TTU_MUNLOCK); } @@ -1633,8 +1633,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - unlock_page(page); set_page_dirty(page); + unlock_page(page); page_cache_release(page); return copied; @@ -1971,13 +1971,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } - unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); + unlock_page(page); page_cache_release(page); } if (dir->i_mode & S_ISGID) @@ -2420,6 +2420,7 @@ static const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif .migratepage = migrate_page, + .error_remove_page = generic_error_remove_page, }; static const struct file_operations shmem_file_operations = { diff --git a/mm/swapfile.c b/mm/swapfile.c index f1bf19d..4de7f02 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry) struct swap_info_struct *p; struct page *page = NULL; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return 1; p = swap_info_get(entry); @@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) int count; bool has_cache; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return -EINVAL; type = swp_type(entry); diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf..a17b397 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page); * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void +static int truncate_complete_page(struct address_space *mapping, struct page *page) { if (page->mapping != mapping) - return; + return -EIO; if (page_has_private(page)) do_invalidatepage(page, 0); @@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ + return 0; } /* @@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return ret; } +int truncate_inode_page(struct address_space *mapping, struct page *page) +{ + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + return truncate_complete_page(mapping, page); +} + +/* + * Used to get rid of pages on hardware memory corruption. + */ +int generic_error_remove_page(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return -EINVAL; + /* + * Only punch for normal data pages for now. + * Handling other types like directories would need more auditing. + */ + if (!S_ISREG(mapping->host->i_mode)) + return -EIO; + return truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_error_remove_page); + +/* + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. The page must be locked. + * + * Returns 1 if the page is successfully invalidated, otherwise 0. + */ +int invalidate_inode_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + if (!mapping) + return 0; + if (PageDirty(page) || PageWriteback(page)) + return 0; + if (page_mapped(page)) + return 0; + return invalidate_complete_page(mapping, page); +} + /** * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - truncate_complete_page(mapping, page); + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } + truncate_inode_page(mapping, page); if (page->index > next) next = page->index; next++; - truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - if (PageDirty(page) || PageWriteback(page)) - goto unlock; - if (page_mapped(page)) - goto unlock; - ret += invalidate_complete_page(mapping, page); -unlock: + ret += invalidate_inode_page(page); + unlock_page(page); if (next > end) break; diff --git a/mm/vmscan.c b/mm/vmscan.c index f444b74..1219ceb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: |