From 61e01be22e954f53a4bbac8066015d9f4ab9e42d Mon Sep 17 00:00:00 2001 From: Jens Rottmann Date: Tue, 21 Aug 2012 16:15:43 -0700 Subject: cs5535-clockevt: typo, it's MFGPT, not MFPGT Signed-off-by: Jens Rottmann Cc: Thomas Gleixner Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/clocksource/cs5535-clockevt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/cs5535-clockevt.c b/drivers/clocksource/cs5535-clockevt.c index 540795c..d927938 100644 --- a/drivers/clocksource/cs5535-clockevt.c +++ b/drivers/clocksource/cs5535-clockevt.c @@ -53,7 +53,7 @@ static struct cs5535_mfgpt_timer *cs5535_event_clock; #define MFGPT_PERIODIC (MFGPT_HZ / HZ) /* - * The MFPGT timers on the CS5536 provide us with suitable timers to use + * The MFGPT timers on the CS5536 provide us with suitable timers to use * as clock event sources - not as good as a HPET or APIC, but certainly * better than the PIT. This isn't a general purpose MFGPT driver, but * a simplified one designed specifically to act as a clock event source. @@ -144,7 +144,7 @@ static int __init cs5535_mfgpt_init(void) timer = cs5535_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); if (!timer) { - printk(KERN_ERR DRV_NAME ": Could not allocate MFPGT timer\n"); + printk(KERN_ERR DRV_NAME ": Could not allocate MFGPT timer\n"); return -ENODEV; } cs5535_event_clock = timer; -- cgit v1.1 From f9aed62a2b12a8e04077737c9942111e14ed738e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 21 Aug 2012 16:15:45 -0700 Subject: mm: change nr_ptes BUG_ON to WARN_ON Occasionally an isolated BUG_ON(mm->nr_ptes) gets reported, indicating that not all the page tables allocated could be found and freed when exit_mmap() tore down the user address space. There's usually nothing we can say about it, beyond that it's probably a sign of some bad memory or memory corruption; though it might still indicate a bug in vma or page table management (and did recently reveal a race in THP, fixed a few months ago). But one overdue change we can make is from BUG_ON to WARN_ON. It's fairly likely that the system will crash shortly afterwards in some other way (for example, the BUG_ON(page_mapped(page)) in __delete_from_page_cache(), once an inode mapped into the lost page tables gets evicted); but might tell us more before that. Change the BUG_ON(page_mapped) to WARN_ON too? Later perhaps: I'm less eager, since that one has several times led to fixes. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index e3e8691..9adee9f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2309,7 +2309,7 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address -- cgit v1.1 From d65226e2bfe9dd96f193d61892e20fcda9524d22 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 21 Aug 2012 16:15:46 -0700 Subject: Documentation: update mount option in filesystem/vfat.txt Update two mount options(discard, nfs) in vfat.txt. 
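For illustration, a minimal sketch of exercising both newly documented options from C via mount(2); the device path and mount point here are hypothetical, not taken from the patch:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The "data" argument carries the filesystem-specific option
	 * string; here it enables both of the new vfat options. */
	if (mount("/dev/sdb1", "/mnt/flash", "vfat", 0, "discard,nfs") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}

The equivalent command line would be something like "mount -t vfat -o discard,nfs /dev/sdb1 /mnt/flash".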
Signed-off-by: Namjae Jeon Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/vfat.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ead764b..de1e6c4 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -137,6 +137,17 @@ errors=panic|continue|remount-ro without doing anything or remount the partition in read-only mode (default behavior). +discard -- If set, issues discard/TRIM commands to the block + device when blocks are freed. This is useful for SSD devices + and sparse/thinly-provisioned LUNs. + +nfs -- This option maintains an index (cache) of directory + inodes by i_logstart which is used by the nfs-related code to + improve look-ups. + + Enable this only if you want to export the FAT filesystem + over NFS + <bool>: 0,1,yes,no,true,false TODO -- cgit v1.1 From b0cf0b118c90477d1a6811f2cd2307f6a5578362 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 21 Aug 2012 16:15:49 -0700 Subject: cciss: fix incorrect scsi status reporting Delete code which sets the SCSI status incorrectly, as it has already been set correctly above this incorrect code. The bug was introduced in 2009 by commit b0e15f6db111 ("cciss: fix typo that causes scsi status to be lost.") Signed-off-by: Stephen M. Cameron Reported-by: Roel van Meer Tested-by: Roel van Meer Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss_scsi.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index acda773..38aa6dd 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -763,16 +763,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, { case CMD_TARGET_STATUS: /* Pass it up to the upper layers... */ - if( ei->ScsiStatus) - { -#if 0 - printk(KERN_WARNING "cciss: cmd %p " - "has SCSI Status = %x\n", - c, ei->ScsiStatus); -#endif - cmd->result |= (ei->ScsiStatus << 1); - } - else { /* scsi status is zero??? How??? */ + if (!ei->ScsiStatus) { /* Ordinarily, this case should never happen, but there is a bug in some released firmware revisions that allows it to happen -- cgit v1.1 From eb48c071464757414538c68a6033c8f8c15196f8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2012 16:15:52 -0700 Subject: mm: hugetlbfs: correctly populate shared pmd Each page mapped in a process's address space must be correctly accounted for in _mapcount. Normally the rules for this are straightforward but hugetlbfs page table sharing is different. The page table pages at the PMD level are reference counted while the mapcount remains the same. If this accounting is wrong, it causes bugs like this one reported by Larry Woodman:

kernel BUG at mm/filemap.c:135!
invalid opcode: 0000 [#1] SMP
CPU 22
Modules linked in: bridge stp llc sunrpc binfmt_misc dcdbas microcode pcspkr acpi_pad acpi]
Pid: 18001, comm: mpitest Tainted: G W 3.3.0+ #4
Dell Inc.
PowerEdge R620/07NDJ2
RIP: 0010:[] [] __delete_from_page_cache+0x15d/0x170
Process mpitest (pid: 18001, threadinfo ffff880428972000, task ffff880428b5cc20)
Call Trace:
 delete_from_page_cache+0x40/0x80
 truncate_hugepages+0x115/0x1f0
 hugetlbfs_evict_inode+0x18/0x30
 evict+0x9f/0x1b0
 iput_final+0xe3/0x1e0
 iput+0x3e/0x50
 d_kill+0xf8/0x110
 dput+0xe2/0x1b0
 __fput+0x162/0x240

During fork(), copy_hugetlb_page_range() detects if huge_pte_alloc() shared page tables with the check dst_pte == src_pte. The logic is that if the PMD page is the same, they must be shared. This assumes that the sharing is between the parent and child. However, if the sharing is with a different process entirely then this check fails, as in this diagram:

parent
  |
  ------------>pmd
               src_pte----------> data page
                                      ^
other--------->pmd--------------------|
                ^
child-----------|
dst_pte

For this situation to occur, it must be possible for Parent and Other to have faulted and failed to share page tables with each other. This is possible due to the following style of race.

PROC A                                  PROC B
copy_hugetlb_page_range                 copy_hugetlb_page_range
src_pte == huge_pte_offset
                                        src_pte == huge_pte_offset
!src_pte so no sharing
                                        !src_pte so no sharing

(time passes)

hugetlb_fault
                                        hugetlb_fault
huge_pte_alloc                          huge_pte_alloc
huge_pmd_share                          huge_pmd_share
LOCK(i_mmap_mutex)
find nothing, no sharing
UNLOCK(i_mmap_mutex)
                                        LOCK(i_mmap_mutex)
                                        find nothing, no sharing
                                        UNLOCK(i_mmap_mutex)
pmd_alloc                               pmd_alloc
LOCK(instantiation_mutex)
fault
UNLOCK(instantiation_mutex)
                                        LOCK(instantiation_mutex)
                                        fault
                                        UNLOCK(instantiation_mutex)

These two processes are now pointing to the same data page but are not sharing page tables because the opportunity was missed. When either process later forks, the src_pte == dst_pte check is potentially insufficient. As the check falls through, the wrong PTE information is copied in (harmless but wrong) and the mapcount is bumped for a page mapped by a shared page table, leading to the BUG_ON.

This patch addresses the issue by moving pmd_alloc() into huge_pmd_share(), which guarantees that the shared pud is populated in the same critical section as the pmd. This also means that the huge_pte_offset() test in huge_pmd_share() is now serialized correctly, which in turn means that the success rate of the sharing will be higher as the racing tasks see the pud and pmd populated together. Race identified and changelog written mostly by Mel Gorman. [akpm@linux-foundation.org: attempt to make the huge_pmd_share() comment comprehensible, clean up coding style] Reported-by: Larry Woodman Tested-by: Larry Woodman Reviewed-by: Mel Gorman Signed-off-by: Michal Hocko Reviewed-by: Rik van Riel Cc: David Gibson Cc: Ken Chen Cc: Cong Wang Cc: Hillf Danton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f6679a7..b91e485 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) } /* - * search for a shareable pmd page for hugetlb. + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner.
pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +static pte_t * +huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; @@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; + pte_t *pte; if (!vma_shareable(vma, addr)) - return; + return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { @@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); + return pte; } /* @@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); -- cgit v1.1 From ac8e97f8a742828daa1d9de37f6e635888f8d71e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 21 Aug 2012 16:15:53 -0700 Subject: checkpatch: add control statement test to SINGLE_STATEMENT_DO_WHILE_MACRO Commit b13edf7ff2dd ("checkpatch: add checks for do {} while (0) macro misuses") added a test that is overly simplistic for single statement macros. Macros that start with control tests should be enclosed in a do {} while (0) loop. Add the necessary control tests to the check. Signed-off-by: Joe Perches Acked-by: Andy Whitcroft Tested-by: Franz Schrober Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 913d6bd..ca05ba2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3016,7 +3016,8 @@ sub process { $herectx .= raw_line($linenr, $n) . "\n"; } - if (($stmts =~ tr/;/;/) == 1) { + if (($stmts =~ tr/;/;/) == 1 && + $stmts !~ /^\s*(if|while|for|switch)\b/) { WARN("SINGLE_STATEMENT_DO_WHILE_MACRO", "Single statement macros should not use a do {} while (0) loop\n" . "$herectx"); } -- cgit v1.1 From d46f3d86fdc9248b4a8497a7da229812a15ba670 Mon Sep 17 00:00:00 2001 From: Zhouping Liu Date: Tue, 21 Aug 2012 16:15:57 -0700 Subject: hugetlb: update hugetlbpage.txt Commit f0f57b2b1488 ("mm: move hugepage test examples to tools/testing/selftests/vm") moved map_hugetlb.c, hugepage-shm.c and hugepage-mmap.c tests into tools/testing/selftests/vm/ directory, but it didn't update hugetlbpage.txt Signed-off-by: Zhouping Liu Acked-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index f8551b3..4ac359b 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -299,11 +299,17 @@ map_hugetlb.c. 
******************************************************************* /* - * hugepage-shm: see Documentation/vm/hugepage-shm.c + * map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c */ ******************************************************************* /* - * hugepage-mmap: see Documentation/vm/hugepage-mmap.c + * hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c + */ + +******************************************************************* + +/* + * hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c */ -- cgit v1.1 From c3a5ce0416b6c172a23bc8a3760d8704d3d1535b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 21 Aug 2012 16:16:00 -0700 Subject: string: do not export memweight() to userspace Fix the following warning:

usr/include/linux/string.h:8: userspace cannot reference function or variable defined in the kernel

Signed-off-by: WANG Cong Acked-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/string.h b/include/linux/string.h index ffe0442..b917881 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -144,8 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) { return strncmp(str, prefix, strlen(prefix)) == 0; } -#endif extern size_t memweight(const void *ptr, size_t bytes); +#endif /* __KERNEL__ */ #endif /* _LINUX_STRING_H_ */ -- cgit v1.1 From 7838f994b4fceff24c343f4e26a6cf4393869579 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 21 Aug 2012 16:16:02 -0700 Subject: drivers/misc/sgi-xp/xpc_uv.c: SGI XPC fails to load when cpu 0 is out of IRQ resources On many of our larger systems, CPU 0 has had all of its IRQ resources consumed before XPC loads. Worst cases on machines with multiple 10 GigE cards and multiple IB cards have depleted the entire first socket of IRQs. This patch makes selecting the node upon which IRQs are allocated (as well as all the other GRU Message Queue structures) specifiable as a module load param and has a default behavior of searching all nodes/cpus for available resources.
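As a usage sketch (module name and node number hypothetical): loading with "modprobe xpc xpc_mq_node=1" would pin the message queue and IRQ allocations to node 1, while leaving the parameter at its default of -1 keeps the search-all-nodes behavior described above, per the module_param() added at the end of this patch.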
[akpm@linux-foundation.org: fix build: include cpu.h and module.h] Signed-off-by: Robin Holt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-xp/xpc_uv.c | 84 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 87b251a..b9e2000 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -59,6 +61,8 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv; XPC_NOTIFY_MSG_SIZE_UV) #define XPC_NOTIFY_IRQ_NAME "xpc_notify" +static int xpc_mq_node = -1; + static struct xpc_gru_mq_uv *xpc_activate_mq_uv; static struct xpc_gru_mq_uv *xpc_notify_mq_uv; @@ -109,11 +113,8 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name) #if defined CONFIG_X86_64 mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset, UV_AFFINITY_CPU); - if (mq->irq < 0) { - dev_err(xpc_part, "uv_setup_irq() returned error=%d\n", - -mq->irq); + if (mq->irq < 0) return mq->irq; - } mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset); @@ -238,8 +239,9 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, - pg_order); + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " "bytes of memory on nid=%d for GRU mq\n", mq_size, nid); @@ -1731,9 +1733,50 @@ static struct xpc_arch_operations xpc_arch_ops_uv = { .notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv, }; +static int +xpc_init_mq_node(int nid) +{ + int cpu; + + get_online_cpus(); + + for_each_cpu(cpu, cpumask_of_node(nid)) { + xpc_activate_mq_uv = + xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, nid, + XPC_ACTIVATE_IRQ_NAME, + xpc_handle_activate_IRQ_uv); + if (!IS_ERR(xpc_activate_mq_uv)) + break; + } + if (IS_ERR(xpc_activate_mq_uv)) { + put_online_cpus(); + return PTR_ERR(xpc_activate_mq_uv); + } + + for_each_cpu(cpu, cpumask_of_node(nid)) { + xpc_notify_mq_uv = + xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, nid, + XPC_NOTIFY_IRQ_NAME, + xpc_handle_notify_IRQ_uv); + if (!IS_ERR(xpc_notify_mq_uv)) + break; + } + if (IS_ERR(xpc_notify_mq_uv)) { + xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); + put_online_cpus(); + return PTR_ERR(xpc_notify_mq_uv); + } + + put_online_cpus(); + return 0; +} + int xpc_init_uv(void) { + int nid; + int ret = 0; + xpc_arch_ops = xpc_arch_ops_uv; if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) { @@ -1742,21 +1785,21 @@ xpc_init_uv(void) return -E2BIG; } - xpc_activate_mq_uv = xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, 0, - XPC_ACTIVATE_IRQ_NAME, - xpc_handle_activate_IRQ_uv); - if (IS_ERR(xpc_activate_mq_uv)) - return PTR_ERR(xpc_activate_mq_uv); + if (xpc_mq_node < 0) + for_each_online_node(nid) { + ret = xpc_init_mq_node(nid); - xpc_notify_mq_uv = xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, 0, - XPC_NOTIFY_IRQ_NAME, - xpc_handle_notify_IRQ_uv); - if (IS_ERR(xpc_notify_mq_uv)) { - xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); - return PTR_ERR(xpc_notify_mq_uv); - } + if (!ret) + break; + } + else + ret = xpc_init_mq_node(xpc_mq_node); - return 0; + if (ret < 0) + dev_err(xpc_part, "xpc_init_mq_node() returned error=%d\n", + -ret); + + return 
ret; } void @@ -1765,3 +1808,6 @@ xpc_exit_uv(void) xpc_destroy_gru_mq_uv(xpc_notify_mq_uv); xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); } + +module_param(xpc_mq_node, int, 0); +MODULE_PARM_DESC(xpc_mq_node, "Node number on which to allocate message queues."); -- cgit v1.1 From c81758fbe0fdbbc0c74b37798f55bd9c91d5c068 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 21 Aug 2012 16:16:03 -0700 Subject: mm/compaction.c: fix deferring compaction mistake Commit aff622495c9a ("vmscan: only defer compaction for failed order and higher") fixed bad deferring policy but made a mistake in checking compact_order_failed in __compact_pgdat(). So it can't update compact_order_failed with the new order. This ends up preventing correct operation of the deferral policy. This patch fixes it. Signed-off-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index e78cb96..b6984e23 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -861,7 +861,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (cc->order > 0) { int ok = zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0); - if (ok && cc->order > zone->compact_order_failed) + if (ok && cc->order >= zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. */ else if (!ok && cc->sync) -- cgit v1.1 From 5ed12f12825c6c0451d703bfe918a7fc190e2738 Mon Sep 17 00:00:00 2001 From: Ilya Shchepetkov Date: Tue, 21 Aug 2012 16:16:06 -0700 Subject: drivers/rtc/rtc-pcf2123.c: initialize dynamic sysfs attributes Dynamically allocated sysfs attributes must be initialized using sysfs_attr_init(), otherwise lockdep complains: BUG: key
not in .data! Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Ilya Shchepetkov Cc: Chris Verges Cc: Christian Pellegrin Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pcf2123.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index 8361187..13e4df6 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -43,6 +43,7 @@ #include #include #include +#include <linux/sysfs.h> #define DRV_VERSION "0.6" @@ -292,6 +293,7 @@ static int __devinit pcf2123_probe(struct spi_device *spi) pdata->rtc = rtc; for (i = 0; i < 16; i++) { + sysfs_attr_init(&pdata->regs[i].attr.attr); sprintf(pdata->regs[i].name, "%1x", i); pdata->regs[i].attr.attr.mode = S_IRUGO | S_IWUSR; pdata->regs[i].attr.attr.name = pdata->regs[i].name; -- cgit v1.1 From b121186ab1b12e2a96a945d88eae0735b4542158 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Aug 2012 16:16:08 -0700 Subject: mm: correct page->pfmemalloc to fix deactivate_slab regression Commit cfd19c5a9ecf ("mm: only set page->pfmemalloc when ALLOC_NO_WATERMARKS was used") tried to narrow down page->pfmemalloc setting, but it missed some places where pfmemalloc should be set. So, in __slab_alloc, the mismatch between pfmemalloc and ALLOC_NO_WATERMARKS causes an incorrect deactivate_slab() on our core2 server:

 64.73%  fio  [kernel.kallsyms]  [k] _raw_spin_lock
         |
         --- _raw_spin_lock
            |
            |---0.34%-- deactivate_slab
            |          __slab_alloc
            |          kmem_cache_alloc
            |          |

That causes our fio sync write performance to have a 40% regression. Move the check into get_page_from_freelist(), which resolves this issue. Signed-off-by: Alex Shi Acked-by: Mel Gorman Cc: David Miller Tested-by: Eric Dumazet Tested-by: Sage Weil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 009ac28..07f1924 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1928,6 +1928,17 @@ this_zone_full: zlc_active = 0; goto zonelist_scan; } + + if (page) + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was + * necessary to allocate the page. The expectation is + * that the caller is taking steps that will free more + * memory. The caller should avoid the page being used + * for !PFMEMALLOC purposes. + */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; } @@ -2389,14 +2400,6 @@ rebalance: zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); if (page) { - /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was - * necessary to allocate the page. The expectation is - * that the caller is taking steps that will free more - * memory. The caller should avoid the page being used - * for !PFMEMALLOC purposes. - */ - page->pfmemalloc = true; goto got_pg; } } @@ -2569,8 +2572,6 @@ retry_cpuset: page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - else - page->pfmemalloc = false; trace_mm_page_alloc(page, order, gfp_mask, migratetype); -- cgit v1.1 From 7dbfb315b2aaef0a115765946bf3026d074c33a7 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Tue, 21 Aug 2012 16:16:10 -0700 Subject: drivers/rtc/rtc-rs5c348.c: fix hour decoding in 12-hour mode Correct the offset by subtracting 20 from tm_hour before taking the modulo 12. [ "Why 20?" I hear you ask. Or at least I did.
Here's the reason why: RS5C348_BIT_PM is 32, and is - stupidly - included in the RS5C348_HOURS_MASK define. So it's really subtracting out that bit to get "hour+12". But then because it does things modulo 12, it needs to add the 12 in again afterwards anyway. This code is confused. It would be much clearer if RS5C348_HOURS_MASK just didn't include the RS5C348_BIT_PM bit at all, then it wouldn't need to do the silly subtract either. Whatever. It's all just math, the end result is the same. - Linus ] Reported-by: James Nute Tested-by: James Nute Signed-off-by: Atsushi Nemoto Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-rs5c348.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c index 77074cc..fd5c7af 100644 --- a/drivers/rtc/rtc-rs5c348.c +++ b/drivers/rtc/rtc-rs5c348.c @@ -122,9 +122,12 @@ rs5c348_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_min = bcd2bin(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK); tm->tm_hour = bcd2bin(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK); if (!pdata->rtc_24h) { - tm->tm_hour %= 12; - if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM) + if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM) { + tm->tm_hour -= 20; + tm->tm_hour %= 12; tm->tm_hour += 12; + } else + tm->tm_hour %= 12; } tm->tm_wday = bcd2bin(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK); tm->tm_mday = bcd2bin(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK); -- cgit v1.1 From 3670e7e12e582c6d67761275d148171feb7a9004 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 21 Aug 2012 16:16:11 -0700 Subject: rapidio/tsi721: fix inbound doorbell interrupt handling Make sure that no doorbell messages are left behind due to disabled interrupts during inbound doorbell processing. The most common case for this bug is loss of rionet JOIN messages in systems with three or more rionet participants and MSI or MSI-X enabled. As a result, requests for packet transfers may finish with a "destination unreachable" error message. This patch is applicable to kernel versions starting from v3.2. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 722246c..c0d6a04 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -435,6 +435,9 @@ static void tsi721_db_dpc(struct work_struct *work) " info %4.4x\n", DBELL_SID(idb.bytes), DBELL_TID(idb.bytes), DBELL_INF(idb.bytes)); } + + wr_ptr = ioread32(priv->regs + + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE; } iowrite32(rd_ptr & (IDB_QSIZE - 1), @@ -445,6 +448,10 @@ static void tsi721_db_dpc(struct work_struct *work) regval |= TSI721_SR_CHINT_IDBQRCV; iowrite32(regval, priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); + + wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE; + if (wr_ptr != rd_ptr) + schedule_work(&priv->idb_work); } /** -- cgit v1.1 From 9a9a9a7adafe62a34de8b4fb48936c1c5f9bafa5 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 21 Aug 2012 16:16:12 -0700 Subject: rapidio/tsi721: fix unused variable compiler warning Fix an unused variable compiler warning when built with the CONFIG_RAPIDIO_DEBUG option off.
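The warning class being fixed is easy to picture; a minimal, hypothetical illustration (not code from the driver itself):

#include <linux/printk.h>

static int example_probe(void)
{
	int i;		/* warning: unused variable 'i' when DEBUG is unset */
#ifdef DEBUG
	for (i = 0; i < 4; i++)
		pr_debug("resource %d\n", i);
#endif
	return 0;
}

The hunk below applies the same remedy: the declaration moves into a block scoped inside the #ifdef, so the variable only exists in builds that use it.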
This patch is applicable to kernel versions starting from v3.2 Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index c0d6a04..5d44252 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2219,7 +2219,7 @@ static int __devinit tsi721_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct tsi721_device *priv; - int i, cap; + int cap; int err; u32 regval; @@ -2239,12 +2239,15 @@ static int __devinit tsi721_probe(struct pci_dev *pdev, priv->pdev = pdev; #ifdef DEBUG + { + int i; for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n", i, (unsigned long long)pci_resource_start(pdev, i), (unsigned long)pci_resource_len(pdev, i), pci_resource_flags(pdev, i)); } + } #endif /* * Verify BAR configuration -- cgit v1.1 From de74f1cc3b1e9730d9b58580cd11361d30cd182d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Aug 2012 16:16:15 -0700 Subject: mm: have order > 0 compaction start near a pageblock with free pages Commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") introduced a caching mechanism to reduce the amount of work the free page scanner does in compaction. However, it has a problem. Consider two processes simultaneously scanning free pages:

                                        C
Process A           M     S     F
                    |---------------------------------------|
Process B           M                   FS

C is zone->compact_cached_free_pfn
S is cc->start_free_pfn
M is cc->migrate_pfn
F is cc->free_pfn

In this diagram, Process A has just reached its migrate scanner, wrapped around and updated compact_cached_free_pfn accordingly. Simultaneously, Process B finishes isolating in a block and updates compact_cached_free_pfn again to the location of its free scanner. Process A moves to "end_of_zone - one_pageblock" and runs this check:

	if (cc->order > 0 && (!cc->wrapped ||
	    zone->compact_cached_free_pfn > cc->start_free_pfn))
		pfn = min(pfn, zone->compact_cached_free_pfn);

compact_cached_free_pfn is above where it started so the free scanner skips almost the entire space it should have scanned. When there are multiple processes compacting it can end in a situation where the entire zone is not being scanned at all. Further, it is possible for two processes to ping-pong updates to compact_cached_free_pfn which is just random. Overall, the end result wrecks allocation success rates. There is not an obvious way around this problem without introducing new locking and state so this patch takes a different approach. First, it gets rid of the skip logic because it's not clear that it matters if two free scanners happen to be in the same block but with racing updates it's too easy for it to skip over blocks it should not. Second, it updates compact_cached_free_pfn in a more limited set of circumstances. If a scanner has wrapped, it updates compact_cached_free_pfn to the end of the zone. When a wrapped scanner isolates a page, it updates compact_cached_free_pfn to point to the highest pageblock it can isolate pages from. If a scanner has not wrapped when it has finished isolating pages it checks if compact_cached_free_pfn is pointing to the end of the zone. If so, the value is updated to point to the highest pageblock that pages were isolated from. This value will not be updated again until a free page scanner wraps and resets compact_cached_free_pfn.
This is not optimal and it can still race but the compact_cached_free_pfn will be pointing to or very near a pageblock with free pages. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b6984e23..bcce789 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -384,6 +384,20 @@ static bool suitable_migration_target(struct page *page) } /* + * Returns the start pfn of the last page block in a zone. This is the starting + * point for full compaction of a zone. Compaction searches for free pages from + * the end of each zone, while isolate_freepages_block scans forward inside each + * page block. + */ +static unsigned long start_free_pfn(struct zone *zone) +{ + unsigned long free_pfn; + free_pfn = zone->zone_start_pfn + zone->spanned_pages; + free_pfn &= ~(pageblock_nr_pages-1); + return free_pfn; +} + +/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -422,17 +436,6 @@ static void isolate_freepages(struct zone *zone, pfn -= pageblock_nr_pages) { unsigned long isolated; - /* - * Skip ahead if another thread is compacting in the area - * simultaneously. If we wrapped around, we can only skip - * ahead if zone->compact_cached_free_pfn also wrapped to - * above our starting point. - */ - if (cc->order > 0 && (!cc->wrapped || - zone->compact_cached_free_pfn > - cc->start_free_pfn)) - pfn = min(pfn, zone->compact_cached_free_pfn); - if (!pfn_valid(pfn)) continue; @@ -474,7 +477,15 @@ static void isolate_freepages(struct zone *zone, */ if (isolated) { high_pfn = max(high_pfn, pfn); - if (cc->order > 0) + + /* + * If the free scanner has wrapped, update + * compact_cached_free_pfn to point to the highest + * pageblock with free pages. This reduces excessive + * scanning of full pageblocks near the end of the + * zone + */ + if (cc->order > 0 && cc->wrapped) zone->compact_cached_free_pfn = high_pfn; } } @@ -484,6 +495,11 @@ static void isolate_freepages(struct zone *zone, cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; + + /* If compact_cached_free_pfn is reset then set it now */ + if (cc->order > 0 && !cc->wrapped && + zone->compact_cached_free_pfn == start_free_pfn(zone)) + zone->compact_cached_free_pfn = high_pfn; } /* @@ -570,20 +586,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return ISOLATE_SUCCESS; } -/* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. - */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone->zone_start_pfn + zone->spanned_pages; - free_pfn &= ~(pageblock_nr_pages-1); - return free_pfn; -} - static int compact_finished(struct zone *zone, struct compact_control *cc) { -- cgit v1.1 From c67fe3752abe6ab47639e2f9b836900c3dc3da84 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Aug 2012 16:16:17 -0700 Subject: mm: compaction: Abort async compaction if locks are contended or taking too long Jim Schutt reported a problem that pointed at compaction contending heavily on locks. 
The workload is straight-forward and in his own words:

	The systems in question have 24 SAS drives spread across 3 HBAs,
	running 24 Ceph OSD instances, one per drive. FWIW these servers
	are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160
	Ceph Linux clients doing dd simultaneously to a Ceph file system
	backed by 12 of these servers.

Early in the test everything looks fine

procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
  r  b  swpd   free   buff    cache     si  so    bi       bo      in     cs  us sy id wa st
 31 15  0    287216   576  38606628     0   0     2     1158       2     14   1  3 95  0  0
 27 15  0    225288   576  38583384     0   0    18  2222016  203357 134876  11 56 17 15  0
 28 17  0    219256   576  38544736     0   0    11  2305932  203141 146296  11 49 23 17  0
  6 18  0    215596   576  38552872     0   0     7  2363207  215264 166502  12 45 22 20  0
 22 18  0    226984   576  38596404     0   0     3  2445741  223114 179527  12 43 23 22  0

and then it goes to pot

procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
  r  b  swpd   free   buff    cache     si  so    bi       bo      in     cs  us sy id wa st
163  8  0    464308   576  36791368     0   0    11    22210     866    536   3 13 79  4  0
207 14  0    917752   576  36181928     0   0   712  1345376  134598  47367   7 90  1  2  0
123 12  0    685516   576  36296148     0   0   429  1386615  158494  60077   8 84  5  3  0
123 12  0    598572   576  36333728     0   0  1107  1233281  147542  62351   7 84  5  4  0
622  7  0    660768   576  36118264     0   0   557  1345548  151394  59353   7 85  4  3  0
223 11  0    283960   576  36463868     0   0    46  1107160  121846  33006   6 93  1  1  0

Note that system CPU usage is very high and the number of blocks being written out has dropped by 42%. He analysed this with perf and found

	perf record -g -a sleep 10
	perf report --sort symbol --call-graph fractal,5

 34.63%  [k] _raw_spin_lock_irqsave
         |
         |--97.30%-- isolate_freepages
         |          compaction_alloc
         |          unmap_and_move
         |          migrate_pages
         |          compact_zone
         |          compact_zone_order
         |          try_to_compact_pages
         |          __alloc_pages_direct_compact
         |          __alloc_pages_slowpath
         |          __alloc_pages_nodemask
         |          alloc_pages_vma
         |          do_huge_pmd_anonymous_page
         |          handle_mm_fault
         |          do_page_fault
         |          page_fault
         |          |
         |          |--87.39%-- skb_copy_datagram_iovec
         |          |          tcp_recvmsg
         |          |          inet_recvmsg
         |          |          sock_recvmsg
         |          |          sys_recvfrom
         |          |          system_call
         |          |          __recv
         |          |          |
         |          |           --100.00%-- (nil)
         |          |
         |           --12.61%-- memcpy
          --2.70%-- [...]

There was other data but primarily it is all showing that compaction is contended heavily on the zone->lock and zone->lru_lock. Commit b2eef8c0 ("mm: compaction: minimise the time IRQs are disabled while isolating pages for migration") noted that it was possible for migration to hold the lru_lock for an excessive amount of time. Very broadly speaking this patch expands the concept. This patch introduces compact_checklock_irqsave() to check if a lock is contended or the process needs to be scheduled. If either condition is true then async compaction is aborted and the caller is informed. The page allocator will fail a THP allocation if compaction failed due to contention. This patch also introduces compact_trylock_irqsave() which will acquire the lock only if it is not contended and the process does not need to schedule.
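Condensed from the diff that follows, the core of the new helper (a sketch with the comments trimmed; field names as in this patch's struct compact_control):

static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
				      bool locked, struct compact_control *cc)
{
	if (need_resched() || spin_is_contended(lock)) {
		if (locked) {
			spin_unlock_irqrestore(lock, *flags);
			locked = false;
		}

		/* async compaction reports the contention and backs out */
		if (!cc->sync) {
			if (cc->contended)
				*cc->contended = true;
			return false;
		}

		/* sync compaction yields and retries instead */
		cond_resched();
		if (fatal_signal_pending(current))
			return false;
	}

	if (!locked)
		spin_lock_irqsave(lock, *flags);
	return true;
}

A false return propagates up as *contended, and the allocator treats it like deferred compaction for __GFP_NO_KSWAPD (THP) requests, failing fast rather than entering direct reclaim.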
Reported-by: Jim Schutt Tested-by: Jim Schutt Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- mm/compaction.c | 100 +++++++++++++++++++++++++++++++++++---------- mm/internal.h | 1 + mm/page_alloc.c | 17 +++++--- 4 files changed, 93 insertions(+), 29 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 133ddcf..ef65814 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index bcce789..7fcd3a5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype) } /* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. Check if the process needs to be scheduled or + * if the lock is contended. For async compaction, back out in the event + * if contention is severe. For sync compaction, schedule. + * + * Returns true if the lock is held. + * Returns false if the lock is released and compaction should abort + */ +static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + bool locked, struct compact_control *cc) +{ + if (need_resched() || spin_is_contended(lock)) { + if (locked) { + spin_unlock_irqrestore(lock, *flags); + locked = false; + } + + /* async aborts if taking too long or contended */ + if (!cc->sync) { + if (cc->contended) + *cc->contended = true; + return false; + } + + cond_resched(); + if (fatal_signal_pending(current)) + return false; + } + + if (!locked) + spin_lock_irqsave(lock, *flags); + return true; +} + +static inline bool compact_trylock_irqsave(spinlock_t *lock, + unsigned long *flags, struct compact_control *cc) +{ + return compact_checklock_irqsave(lock, flags, false, cc); +} + +/* * Isolate free pages onto a private freelist. Caller must hold zone->lock. 
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free * pages inside of the pageblock (even though it may still end up isolating @@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) +static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; @@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + /* If locked we can use the interrupt unsafe versions */ + if (locked) { + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } else { + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } } /* Similar to reclaim, but different enough that they don't share logic */ @@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct list_head *migratelist = &cc->migratepages; isolate_mode_t mode = 0; struct lruvec *lruvec; + unsigned long flags; + bool locked; /* * Ensure that there are not too many pages isolated from the LRU @@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irq(&zone->lru_lock); + spin_lock_irqsave(&zone->lru_lock, flags); + locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; - bool locked = true; /* give a chance to irqs before checking need_resched() */ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; } - if (need_resched() || spin_is_contended(&zone->lru_lock)) { - if (locked) - spin_unlock_irq(&zone->lru_lock); - cond_resched(); - spin_lock_irq(&zone->lru_lock); - if (fatal_signal_pending(current)) - break; - } else if (!locked) - spin_lock_irq(&zone->lru_lock); + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked) + break; /* * migrate_pfn does not necessarily start aligned to a @@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, } } - acct_isolated(zone, cc); + acct_isolated(zone, locked, cc); - spin_unlock_irq(&zone->lru_lock); + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone, * are disabled */ isolated = 0; - spin_lock_irqsave(&zone->lock, flags); + + /* + * The zone lock must be held to isolate freepages. This + * unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. 
For async compaction do not + * spin on the lock + */ + if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) + break; if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, @@ -773,7 +829,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync) + bool sync, bool *contended) { struct compact_control cc = { .nr_freepages = 0, @@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .contended = contended, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/internal.h b/mm/internal.h index 3314f79..b8c91b3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,6 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + bool *contended; /* True if a lock was contended */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07f1924..c66fb87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { struct page *page; @@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration); + nodemask, sync_migration, + contended_compaction); current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; bool sync_migration = false; bool deferred_compaction = false; + bool contended_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2425,6 +2427,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2434,10 +2437,11 @@ rebalance: /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. 
In this is the case and the caller - * has requested the system not be heavily disrupted, fail the - * allocation now instead of entering direct reclaim + * requested a movable allocation that does not heavily disrupt the + * system then fail the allocation instead of entering direct reclaim. */ - if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + if ((deferred_compaction || contended_compaction) && + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ @@ -2508,6 +2512,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) -- cgit v1.1