6 files changed, 216 insertions, 35 deletions
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
new file mode 100644
index 0000000..3ffadf8
--- /dev/null
+++ b/Documentation/vm/hwpoison.txt
@@ -0,0 +1,136 @@
+What is hwpoison?
+
+Upcoming Intel CPUs have support for recovering from some memory errors
+(``MCA recovery''). This requires the OS to declare a page "poisoned",
+kill the processes associated with it and avoid using it in the future.
+
+This patchkit implements the necessary infrastructure in the VM.
+
+To quote the overview comment:
+
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * failure.
+ *
+ * This focusses on pages detected as corrupted in the background.
+ * When the current CPU tries to consume corruption the currently
+ * running process can just be killed directly instead. This implies
+ * that if the error cannot be handled for some reason it's safe to
+ * just ignore it because no corruption has been consumed yet. Instead
+ * when that happens another machine check will happen.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronous to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * Some of the operations here are somewhat inefficient and have non
+ * linear algorithmic complexity, because the data structures have not
+ * been optimized for this case. This is in particular the case
+ * for the mapping from a vma to a process. Since this case is expected
+ * to be rare we hope we can get away with this.
+
+The code consists of the high level handler in mm/memory-failure.c,
+a new page poison bit and various checks in the VM to handle poisoned
+pages.
+
+The main target right now is KVM guests, but it works for all kinds
+of applications. KVM support requires a recent qemu-kvm release.
+
+For the KVM use there was need for a new signal type so that
+KVM can inject the machine check into the guest with the proper
+address. This in theory allows other applications to handle
+memory failures too. The expectation is that nearly all applications
+won't do that, but some very specialized ones might.
+
+---
+
+There are two (actually three) modes memory failure recovery can be in:
+
+vm.memory_failure_recovery sysctl set to zero:
+	All memory failures cause a panic. Do not attempt recovery.
+	(on x86 this can also be affected by the tolerant level of the
+	MCE subsystem)
+
+early kill
+	(can be controlled globally and per process)
+	Send SIGBUS to the application as soon as the error is detected.
+	This allows applications that can process memory errors in a gentle
+	way (e.g. drop the affected object) to handle the error themselves.
+	This is the mode used by KVM qemu.
+
+late kill
+	Send SIGBUS when the application runs into the corrupted page.
+	This is best for memory error unaware applications and is the default.
+	Note some pages are always handled as late kill.
+
+---
+
+User control:
+
+vm.memory_failure_recovery
+	See sysctl.txt
+
+vm.memory_failure_early_kill
+	Enable early kill mode globally
+
+PR_MCE_KILL
+	Set early/late kill mode/revert to system default
+	arg1: PR_MCE_KILL_CLEAR: Revert to system default
+	arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode
+		PR_MCE_KILL_EARLY: Early kill
+		PR_MCE_KILL_LATE:  Late kill
+		PR_MCE_KILL_DEFAULT: Use system global default
+PR_MCE_KILL_GET
+	return current mode
+
+
+---
+
+Testing:
+
+madvise(MADV_POISON, ....)
+	(as root)
+	Poison a page in the process for testing
+
+
+hwpoison-inject module through debugfs
+	/sys/kernel/debug/hwpoison/corrupt-pfn
+
+Inject hwpoison fault at PFN echoed into this file
+
+
+Architecture specific MCE injector
+
+x86 has mce-inject, mce-test
+
+Some portable hwpoison test programs in mce-test, see below.
+
+---
+
+References:
+
+http://halobates.de/mce-lc09-2.pdf
+	Overview presentation from LinuxCon 09
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	Test suite (hwpoison specific portable tests in tsrc)
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86 specific injector
+
+
+---
+
+Limitations:
+
+- Not all page types are supported and never will be. Most kernel internal
+objects cannot be recovered, only LRU pages for now.
+- Right now hugepage support is missing.
+
+---
+Andi Kleen, Oct 2009
+
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 9311505..a3baeb2 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,6 +88,18 @@
 #define PR_TASK_PERF_EVENTS_DISABLE		31
 #define PR_TASK_PERF_EVENTS_ENABLE		32
 
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
 #define PR_MCE_KILL	33
+# define PR_MCE_KILL_CLEAR   0
+# define PR_MCE_KILL_SET     1
+
+# define PR_MCE_KILL_LATE    0
+# define PR_MCE_KILL_EARLY   1
+# define PR_MCE_KILL_DEFAULT 2
+
+#define PR_MCE_KILL_GET 34
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 1828f8d..ce17760 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1548,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			if (arg4 | arg5)
 				return -EINVAL;
 			switch (arg2) {
-			case 0:
+			case PR_MCE_KILL_CLEAR:
 				if (arg3 != 0)
 					return -EINVAL;
 				current->flags &= ~PF_MCE_PROCESS;
 				break;
-			case 1:
+			case PR_MCE_KILL_SET:
 				current->flags |= PF_MCE_PROCESS;
-				if (arg3 != 0)
+				if (arg3 == PR_MCE_KILL_EARLY)
 					current->flags |= PF_MCE_EARLY;
-				else
+				else if (arg3 == PR_MCE_KILL_LATE)
 					current->flags &= ~PF_MCE_EARLY;
+				else if (arg3 == PR_MCE_KILL_DEFAULT)
+					current->flags &=
+						~(PF_MCE_EARLY|PF_MCE_PROCESS);
+				else
+					return -EINVAL;
 				break;
 			default:
 				return -EINVAL;
 			}
 			error = 0;
 			break;
-
+		case PR_MCE_KILL_GET:
+			if (arg2 | arg3 | arg4 | arg5)
+				return -EINVAL;
+			if (current->flags & PF_MCE_PROCESS)
+				error = (current->flags & PF_MCE_EARLY) ?
+					PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+			else
+				error = PR_MCE_KILL_DEFAULT;
+			break;
 		default:
 			error = -EINVAL;
 			break;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 47cdd7e..1232814 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
+	int orig = -1;
 	struct work_struct *works;
 
 	works = alloc_percpu(struct work_struct);
 	if (!works)
 		return -ENOMEM;
 
+	/*
+	 * When running in keventd don't schedule a work item on itself.
+	 * Can just call directly because the work queue is already bound.
+	 * This also is faster.
+	 * Make this a generic parameter for other workqueues?
+	 */
+	if (current_is_keventd()) {
+		orig = raw_smp_processor_id();
+		INIT_WORK(per_cpu_ptr(works, orig), func);
+		func(per_cpu_ptr(works, orig));
+	}
+
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
+		if (cpu == orig)
+			continue;
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
-	for_each_online_cpu(cpu)
-		flush_work(per_cpu_ptr(works, cpu));
+	for_each_online_cpu(cpu) {
+		if (cpu != orig)
+			flush_work(per_cpu_ptr(works, cpu));
+	}
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7fc2130..dacc641 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -371,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	int ret = FAILED;
 	struct address_space *mapping;
 
-	if (!isolate_lru_page(p))
-		page_cache_release(p);
-
 	/*
 	 * For anonymous pages we're done the only reference left
 	 * should be the one m_f() holds.
@@ -499,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
 	ClearPageDirty(p);
 	/* Trigger EIO in shmem: */
 	ClearPageUptodate(p);
 
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = DELAYED;
-	}
-
-	return ret;
+	return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = RECOVERED;
-	}
 	delete_from_swap_cache(p);
-	return ret;
+
+	return RECOVERED;
 }
 
 /*
@@ -612,8 +597,6 @@ static struct page_state {
 	{ 0,		0,		"unknown page state",	me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
 	struct page *page = NULL;
@@ -630,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
 			unsigned long pfn, int ref)
 {
 	int result;
+	int count;
 
 	result = ps->action(p, pfn);
 	action_result(pfn, ps->msg, result);
-	if (page_count(p) != 1 + ref)
+
+	count = page_count(p) - 1 - ref;
+	if (count != 0)
 		printk(KERN_ERR
 		       "MCE %#lx: %s page still referenced by %d users\n",
-		       pfn, ps->msg, page_count(p) - 1);
+		       pfn, ps->msg, count);
 
 	/* Could do more checks here if page looks ok */
 	/*
@@ -665,9 +651,6 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
 		return;
 
-	if (!PageLRU(p))
-		lru_add_drain_all();
-
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
@@ -739,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+	unsigned long lru_flag;
 	struct page_state *ps;
 	struct page *p;
 	int res;
@@ -776,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	}
 
 	/*
+	 * We ignore non-LRU pages for good reasons.
+	 * - PG_locked is only well defined for LRU pages and a few others
+	 * - to avoid races with __set_page_locked()
+	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+	 * The check (unnecessarily) ignores LRU pages being isolated and
+	 * walked by the page reclaim code, however that's not a big loss.
+	 */
+	if (!PageLRU(p))
+		lru_add_drain_all();
+	lru_flag = p->flags & lru;
+	if (isolate_lru_page(p)) {
+		action_result(pfn, "non LRU", IGNORED);
+		put_page(p);
+		return -EBUSY;
+	}
+	page_cache_release(p);
+
+	/*
 	 * Lock the page and wait for writeback to finish.
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
@@ -791,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	/*
 	 * Torn down by someone else?
 	 */
-	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+	if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
 		action_result(pfn, "already truncated LRU", IGNORED);
 		res = 0;
 		goto out;
@@ -799,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
 	res = -EBUSY;
 	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
+		if (((p->flags | lru_flag)& ps->mask) == ps->res) {
 			res = page_action(ps, p, pfn, ref);
 			break;
 		}
diff --git a/mm/memory.c b/mm/memory.c
index 60ea601..6ab19dd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2542,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else if (PageHWPoison(page)) {
 		ret = VM_FAULT_HWPOISON;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-		goto out;
+		goto out_release;
 	}
 
 	lock_page(page);
@@ -2614,6 +2614,7 @@ out_nomap:
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
+out_release:
 	page_cache_release(page);
 	return ret;
 }