From c177c81e09e517bbf75b67762cdab1b83aba6976 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 4 Jun 2014 16:05:35 -0700
Subject: hugetlb: restrict hugepage_migration_support() to x86_64

Currently hugepage migration is available for all archs which support
pmd-level hugepage, but testing is done only for x86_64 and there're
bugs for other archs.  So to avoid breaking such archs, this patch
limits the availability strictly to x86_64 until developers of other
archs get interested in enabling this feature.

Simply disabling hugepage migration on non-x86_64 archs is not enough to
fix the reported problem where sys_move_pages() hits the BUG_ON() in
follow_page(FOLL_GET), so let's fix this by checking if hugepage
migration is supported in vma_migratable().

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Miller <davem@davemloft.net>
Cc: <stable@vger.kernel.org>	[3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h   | 13 +++++--------
 include/linux/mempolicy.h |  6 ++++++
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b65166d..d0bad1a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -392,15 +392,13 @@ static inline pgoff_t basepage_index(struct page *page)
 
 extern void dissolve_free_huge_pages(unsigned long start_pfn,
 				     unsigned long end_pfn);
-int pmd_huge_support(void);
-/*
- * Currently hugepage migration is enabled only for pmd-based hugepage.
- * This function will be updated when hugepage migration is more widely
- * supported.
- */
 static inline int hugepage_migration_support(struct hstate *h)
 {
-	return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT);
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+	return huge_page_shift(h) == PMD_SHIFT;
+#else
+	return 0;
+#endif
 }
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
@@ -450,7 +448,6 @@ static inline pgoff_t basepage_index(struct page *page)
 	return page->index;
 }
 #define dissolve_free_huge_pages(s, e)	do {} while (0)
-#define pmd_huge_support()	0
 #define hugepage_migration_support(h)	0
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3c1b968..f230a97 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -175,6 +175,12 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 		return 0;
+
+#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+	if (vma->vm_flags & VM_HUGETLB)
+		return 0;
+#endif
+
 	/*
 	 * Migration allocates pages in the highest zone. If we cannot
 	 * do so then migration (at least from node to node) is not
-- 
cgit v1.1


From ac13a829f6adb674015ab399594c089990104af7 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 4 Jun 2014 16:06:27 -0700
Subject: fs/libfs.c: add generic data flush to fsync

Description by Jan Kara:
 "A lot of older filesystems don't properly flush volatile disk caches
  on fsync(2) which can lead to loss of fsynced data after power failure.

This patch makes generic_file_fsync() issue proper cache flush to fix the
problem.  Sysadmin can use /sys/devices/.../cache_type to tell the system
it should not send the cache flush."

[akpm@linux-foundation.org: nuke ifdef]
[akpm@linux-foundation.org: fix warning]
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blkdev.h | 9 +++++++++
 include/linux/fs.h     | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8aba35f..45cf6e5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1607,6 +1607,9 @@ struct block_device_operations {
 extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 				 unsigned long);
 #else /* CONFIG_BLOCK */
+
+struct block_device;
+
 /*
  * stubs for when the block layer is configured out
  */
@@ -1642,6 +1645,12 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	return false;
 }
 
+static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+				     sector_t *error_sector)
+{
+	return 0;
+}
+
 #endif /* CONFIG_BLOCK */
 
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 87803122..c3f46e4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2590,6 +2590,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 		const void __user *from, size_t count);
 
+extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
 extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
 
 extern int generic_check_addressable(unsigned, u64);
-- 
cgit v1.1


From c46a7c817e662a820373bb76b88d0ad67d6abe5d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:06:30 -0700
Subject: x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE
 levels

_PAGE_NUMA is currently an alias of _PROT_PROTNONE to trap NUMA hinting
faults on x86.  Care is taken such that _PAGE_NUMA is used only in
situations where the VMA flags distinguish between NUMA hinting faults
and prot_none faults.  This decision was x86-specific and conceptually
it is difficult requiring special casing to distinguish between PROTNONE
and NUMA ptes based on context.

Fundamentally, we only need the _PAGE_NUMA bit to tell the difference
between an entry that is really unmapped and a page that is protected
for NUMA hinting faults as if the PTE is not present then a fault will
be trapped.

Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset.
This patch shrinks the maximum possible swap size and uses the bit to
uniquely distinguish between NUMA hinting ptes and swap ptes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/pgtable.h | 8 ++++++--
 include/linux/swapops.h       | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a8015a7..53b2acc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 # define pte_accessible(mm, pte)	((void)(pte), 1)
 #endif
 
+#ifndef pte_present_nonuma
+#define pte_present_nonuma(pte) pte_present(pte)
+#endif
+
 #ifndef flush_tlb_fix_spurious_fault
 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
 #endif
@@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 static inline int pte_numa(pte_t pte)
 {
 	return (pte_flags(pte) &
-		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+		(_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
 }
 #endif
 
@@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte)
 static inline int pmd_numa(pmd_t pmd)
 {
 	return (pmd_flags(pmd) &
-		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+		(_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
 }
 #endif
 
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c0f7526..6adfb7b 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 /* check whether a pte points to a swap entry */
 static inline int is_swap_pte(pte_t pte)
 {
-	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+	return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte);
 }
 #endif
 
-- 
cgit v1.1


From 5dfb417509921eb90ee123a4d1525e8916b4ace4 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:06:38 -0700
Subject: sl[au]b: charge slabs to kmemcg explicitly

We have only a few places where we actually want to charge kmem so
instead of intruding into the general page allocation path with
__GFP_KMEMCG it's better to explictly charge kmem there.  All kmem
charges will be easier to follow that way.

This is a step towards removing __GFP_KMEMCG.  It removes __GFP_KMEMCG
from memcg caches' allocflags.  Instead it makes slab allocation path
call memcg_charge_kmem directly getting memcg to charge from the cache's
memcg params.

This also eliminates any possibility of misaccounting an allocation
going from one memcg's cache to another memcg, because now we always
charge slabs against the memcg the cache belongs to.  That's why this
patch removes the big comment to memcg_kmem_get_cache.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b569b8b..96e5d25 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -506,6 +506,9 @@ void memcg_update_array_size(int num_groups);
 struct kmem_cache *
 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
 
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size);
+
 void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
 int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
 
@@ -583,17 +586,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
  * @cachep: the original global kmem cache
  * @gfp: allocation flags.
  *
- * This function assumes that the task allocating, which determines the memcg
- * in the page allocator, belongs to the same cgroup throughout the whole
- * process.  Misacounting can happen if the task calls memcg_kmem_get_cache()
- * while belonging to a cgroup, and later on changes. This is considered
- * acceptable, and should only happen upon task migration.
- *
- * Before the cache is created by the memcg core, there is also a possible
- * imbalance: the task belongs to a memcg, but the cache being allocated from
- * is the global cache, since the child cache is not yet guaranteed to be
- * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
- * passed and the page allocator will not attempt any cgroup accounting.
+ * All memory allocated from a per-memcg cache is charged to the owner memcg.
  */
 static __always_inline struct kmem_cache *
 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
-- 
cgit v1.1


From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:06:39 -0700
Subject: mm: get rid of __GFP_KMEMCG

Currently to allocate a page that should be charged to kmemcg (e.g.
threadinfo), we pass __GFP_KMEMCG flag to the page allocator.  The page
allocated is then to be freed by free_memcg_kmem_pages.  Apart from
looking asymmetrical, this also requires intrusion to the general
allocation path.  So let's introduce separate functions that will
alloc/free pages charged to kmemcg.

The new functions are called alloc_kmem_pages and free_kmem_pages.  They
should be used when the caller actually would like to use kmalloc, but
has to fall back to the page allocator for the allocation is large.
They only differ from alloc_pages and free_pages in that besides
allocating or freeing pages they also charge them to the kmem resource
counter of the current memory cgroup.

[sfr@canb.auug.org.au: export kmalloc_order() to modules]
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h             | 10 ++++++----
 include/linux/memcontrol.h      |  2 +-
 include/linux/slab.h            | 11 +----------
 include/linux/thread_info.h     |  2 --
 include/trace/events/gfpflags.h |  1 -
 5 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 39b81dc..d382db7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -31,7 +31,6 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
-#define ___GFP_KMEMCG		0x100000u
 #define ___GFP_NOTRACK		0x200000u
 #define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
@@ -91,7 +90,6 @@ struct vm_area_struct;
 
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_KMEMCG	((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
 /*
@@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
 	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
+extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order);
+extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask,
+					  unsigned int order);
+
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
@@ -372,8 +374,8 @@ extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_cold_page(struct page *page, int cold);
 extern void free_hot_cold_page_list(struct list_head *list, int cold);
 
-extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
-extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
+extern void __free_kmem_pages(struct page *page, unsigned int order);
+extern void free_kmem_pages(unsigned long addr, unsigned int order);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 96e5d25..5155d09 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -537,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 	 * res_counter_charge_nofail, but we hope those allocations are rare,
 	 * and won't be worth the trouble.
 	 */
-	if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+	if (gfp & __GFP_NOFAIL)
 		return true;
 	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
 		return true;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 307bfbe..a6aab2c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 #include <linux/slub_def.h>
 #endif
 
-static __always_inline void *
-kmalloc_order(size_t size, gfp_t flags, unsigned int order)
-{
-	void *ret;
-
-	flags |= (__GFP_COMP | __GFP_KMEMCG);
-	ret = (void *) __get_free_pages(flags, order);
-	kmemleak_alloc(ret, size, 1, flags);
-	return ret;
-}
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
 
 #ifdef CONFIG_TRACING
 extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index cb0cec9..ff307b5 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm);
 # define THREADINFO_GFP		(GFP_KERNEL | __GFP_NOTRACK)
 #endif
 
-#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
-
 /*
  * flag set/clear/test wrappers
  * - pass TIF_xxxx constants to these functions
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 1eddbf1..d6fd8e5 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,7 +34,6 @@
 	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
 	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
-	{(unsigned long)__GFP_KMEMCG,		"GFP_KMEMCG"},		\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
 	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
-- 
cgit v1.1


From 4f115147ff802267d0aa41e361c5aa5bd933d896 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr@hp.com>
Date: Wed, 4 Jun 2014 16:06:46 -0700
Subject: mm,vmacache: add debug data

Introduce a CONFIG_DEBUG_VM_VMACACHE option to enable counting the cache
hit rate -- exported in /proc/vmstat.

Any updates to the caching scheme needs this kind of data, thus it can
save some work re-implementing the counting all the time.

Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h | 4 ++++
 include/linux/vmstat.h        | 6 ++++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 486c397..ced9234 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -80,6 +80,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NR_TLB_LOCAL_FLUSH_ALL,
 		NR_TLB_LOCAL_FLUSH_ONE,
 #endif /* CONFIG_DEBUG_TLBFLUSH */
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+		VMACACHE_FIND_CALLS,
+		VMACACHE_FIND_HITS,
+#endif
 		NR_VM_EVENT_ITEMS
 };
 
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 45c9cd1..82e7db7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -95,6 +95,12 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
 #endif
 
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+#define count_vm_vmacache_event(x) count_vm_event(x)
+#else
+#define count_vm_vmacache_event(x) do {} while (0)
+#endif
+
 #define __count_zone_vm_events(item, zone, delta) \
 		__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
 		zone_idx(zone), delta)
-- 
cgit v1.1


From 9c5a3621427da68afe6a078cadf807d2c8cc1d12 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 4 Jun 2014 16:06:50 -0700
Subject: x86: enable DMA CMA with swiotlb

The DMA Contiguous Memory Allocator support on x86 is disabled when
swiotlb config option is enabled.  So DMA CMA is always disabled on
x86_64 because swiotlb is always enabled.  This attempts to support for
DMA CMA with enabling swiotlb config option.

The contiguous memory allocator on x86 is integrated in the function
dma_generic_alloc_coherent() which is .alloc callback in nommu_dma_ops
for dma_alloc_coherent().

x86_swiotlb_alloc_coherent() which is .alloc callback in swiotlb_dma_ops
tries to allocate with dma_generic_alloc_coherent() firstly and then
swiotlb_alloc_coherent() is called as a fallback.

The main part of supporting DMA CMA with swiotlb is that changing
x86_swiotlb_free_coherent() which is .free callback in swiotlb_dma_ops
for dma_free_coherent() so that it can distinguish memory allocated by
dma_generic_alloc_coherent() from one allocated by
swiotlb_alloc_coherent() and release it with dma_generic_free_coherent()
which can handle contiguous memory.  This change requires making
is_swiotlb_buffer() global function.

This also needs to change .free callback in the dma_map_ops for amd_gart
and sta2x11, because these dma_ops are also using
dma_generic_alloc_coherent().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swiotlb.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index a5ffd32..e7a018e 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -116,4 +116,6 @@ static inline void swiotlb_free(void) { }
 #endif
 
 extern void swiotlb_print_info(void);
+extern int is_swiotlb_buffer(phys_addr_t paddr);
+
 #endif /* __LINUX_SWIOTLB_H */
-- 
cgit v1.1


From 2bfc2862c4fe38379a2fb2cfba33fad32ccb4ff4 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 4 Jun 2014 16:06:53 -0700
Subject: memblock: introduce memblock_alloc_range()

This introduces memblock_alloc_range() which allocates memblock from the
specified range of physical address.  I would like to use this function
to specify the location of CMA.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 73dc382..b660e05 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -272,6 +272,8 @@ static inline bool memblock_bottom_up(void) { return false; }
 #define MEMBLOCK_ALLOC_ANYWHERE	(~(phys_addr_t)0)
 #define MEMBLOCK_ALLOC_ACCESSIBLE	0
 
+phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
+					phys_addr_t start, phys_addr_t end);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
-- 
cgit v1.1


From 5ea3b1b2f8ad9162684431ce6188102ca4c64b7a Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 4 Jun 2014 16:06:54 -0700
Subject: cma: add placement specifier for "cma=" kernel parameter

Currently, "cma=" kernel parameter is used to specify the size of CMA,
but we can't specify where it is located.  We want to locate CMA below
4GB for devices only supporting 32-bit addressing on 64-bit systems
without iommu.

This enables to specify the placement of CMA by extending "cma=" kernel
parameter.

Examples:
 1. locate 64MB CMA below 4GB by "cma=64M@0-4G"
 2. locate 64MB CMA exact at 512MB by "cma=64M@512M"

Note that the DMA contiguous memory allocator on x86 assumes that
page_address() works for the pages to allocate.  So this change requires
to limit end address of contiguous memory area upto max_pfn_mapped to
prevent from locating it on highmem area by the argument of
dma_contiguous_reserve().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dma-contiguous.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 3b28f93..772eab5 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -88,7 +88,8 @@ static inline void dma_contiguous_set_default(struct cma *cma)
 void dma_contiguous_reserve(phys_addr_t addr_limit);
 
 int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
-				       phys_addr_t limit, struct cma **res_cma);
+				       phys_addr_t limit, struct cma **res_cma,
+				       bool fixed);
 
 /**
  * dma_declare_contiguous() - reserve area for contiguous memory handling
@@ -108,7 +109,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 {
 	struct cma *cma;
 	int ret;
-	ret = dma_contiguous_reserve_area(size, base, limit, &cma);
+	ret = dma_contiguous_reserve_area(size, base, limit, &cma, true);
 	if (ret == 0)
 		dev_set_cma_area(dev, cma);
 
@@ -136,7 +137,9 @@ static inline void dma_contiguous_set_default(struct cma *cma) { }
 static inline void dma_contiguous_reserve(phys_addr_t limit) { }
 
 static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
-				       phys_addr_t limit, struct cma **res_cma) {
+				       phys_addr_t limit, struct cma **res_cma,
+				       bool fixed)
+{
 	return -ENOSYS;
 }
 
-- 
cgit v1.1


From 02a8efeda894d3541c7143ed818b25b299504190 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 4 Jun 2014 16:06:59 -0700
Subject: include/linux/mmdebug.h: add VM_WARN_ON() and VM_WARN_ON_ONCE()

WARN_ON() and WARN_ON_ONCE(), dependent on CONFIG_DEBUG_VM

Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmdebug.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2d57efa..a3499d7 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -11,9 +11,13 @@ extern void dump_page_badflags(struct page *page, const char *reason,
 #define VM_BUG_ON(cond) BUG_ON(cond)
 #define VM_BUG_ON_PAGE(cond, page) \
 	do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0)
+#define VM_WARN_ON(cond) WARN_ON(cond)
+#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
 #else
 #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
 #endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
-- 
cgit v1.1


From e4f674229ce63dac60be0c4ddfb5ef8d1225d30d Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 4 Jun 2014 16:07:02 -0700
Subject: mm: pass VM_BUG_ON() reason to dump_page()

I recently added a patch to let folks pass a "reason" string dump_page()
which gets dumped out along with the page's data.  This essentially
saves the bug-reader a trip in to the source to figure out why we
BUG_ON()'d.

The new VM_BUG_ON_PAGE() passes in NULL for "reason".  It seems like we
might as well pass the BUG_ON() condition if we have it.  This will
bloat kernels a bit with ~160 new strings, but this is all under a
debugging option anyway.

	page:ffffea0008560280 count:1 mapcount:0 mapping:(null) index:0x0
	page flags: 0xbfffc0000000001(locked)
	page dumped because: VM_BUG_ON_PAGE(PageLocked(page))
	------------[ cut here ]------------
	kernel BUG at /home/davehans/linux.git/mm/filemap.c:464!
	invalid opcode: 0000 [#1] SMP
	CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.14.0+ #251
	Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
	...

[akpm@linux-foundation.org: include stringify.h]
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Davidlohr Bueso <davidlohr@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmdebug.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index a3499d7..edd82a1 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -1,6 +1,8 @@
 #ifndef LINUX_MM_DEBUG_H
 #define LINUX_MM_DEBUG_H 1
 
+#include <linux/stringify.h>
+
 struct page;
 
 extern void dump_page(struct page *page, const char *reason);
@@ -9,8 +11,13 @@ extern void dump_page_badflags(struct page *page, const char *reason,
 
 #ifdef CONFIG_DEBUG_VM
 #define VM_BUG_ON(cond) BUG_ON(cond)
-#define VM_BUG_ON_PAGE(cond, page) \
-	do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0)
+#define VM_BUG_ON_PAGE(cond, page)					\
+	do {								\
+		if (unlikely(cond)) {					\
+			dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\
+			BUG();						\
+		}							\
+	} while (0)
 #define VM_WARN_ON(cond) WARN_ON(cond)
 #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
 #else
-- 
cgit v1.1


From bae7f4ae14d47008a11b4358b167cb0ae186c06a Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Wed, 4 Jun 2014 16:07:08 -0700
Subject: hugetlb: add hstate_is_gigantic()

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Davidlohr Bueso <davidlohr@hp.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d0bad1a..35786ee 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -343,6 +343,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
 	return h->order + PAGE_SHIFT;
 }
 
+static inline bool hstate_is_gigantic(struct hstate *h)
+{
+	return huge_page_order(h) >= MAX_ORDER;
+}
+
 static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
 	return 1 << h->order;
-- 
cgit v1.1


From 4f9b16a64753d0bb607454347036dc997fd03b82 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:07:14 -0700
Subject: mm: disable zone_reclaim_mode by default

When it was introduced, zone_reclaim_mode made sense as NUMA distances
punished and workloads were generally partitioned to fit into a NUMA
node.  NUMA machines are now common but few of the workloads are
NUMA-aware and it's routine to see major performance degradation due to
zone_reclaim_mode being enabled but relatively few can identify the
problem.

Those that require zone_reclaim_mode are likely to be able to detect
when it needs to be enabled and tune appropriately so lets have a
sensible default for the bulk of users.

This patch (of 2):

zone_reclaim_mode causes processes to prefer reclaiming memory from
local node instead of spilling over to other nodes.  This made sense
initially when NUMA machines were almost exclusively HPC and the
workload was partitioned into nodes.  The NUMA penalties were
sufficiently high to justify reclaiming the memory.  On current machines
and workloads it is often the case that zone_reclaim_mode destroys
performance but not all users know how to detect this.  Favour the
common case and disable it by default.  Users that are sophisticated
enough to know they need zone_reclaim_mode will detect it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/topology.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 973671f..dda6ee5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -58,7 +58,8 @@ int arch_update_cpu_topology(void);
 /*
  * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
  * (in whatever arch specific measurement units returned by node_distance())
- * then switch on zone reclaim on boot.
+ * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * on nodes within this distance.
  */
 #define RECLAIM_DISTANCE 30
 #endif
-- 
cgit v1.1


From 5f7a75acdb24c7b9c436b3a0a66eec12e101d19c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:07:15 -0700
Subject: mm: page_alloc: do not cache reclaim distances

pgdat->reclaim_nodes tracks if a remote node is allowed to be reclaimed
by zone_reclaim due to its distance.  As it is expected that
zone_reclaim_mode will be rarely enabled it is unreasonable for all
machines to take a penalty.  Fortunately, the zone_reclaim_mode() path
is already slow and it is the path that takes the hit.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fac5509..c1dbe0b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -763,7 +763,6 @@ typedef struct pglist_data {
 	unsigned long node_spanned_pages; /* total size of physical page
 					     range, including holes */
 	int node_id;
-	nodemask_t reclaim_nodes;	/* Nodes allowed to reclaim from */
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
-- 
cgit v1.1


From bfc8c90139ebd049b9801a951db3b9a4a00bed9c Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:07:18 -0700
Subject: mem-hotplug: implement get/put_online_mems

kmem_cache_{create,destroy,shrink} need to get a stable value of
cpu/node online mask, because they init/destroy/access per-cpu/node
kmem_cache parts, which can be allocated or destroyed on cpu/mem
hotplug.  To protect against cpu hotplug, these functions use
{get,put}_online_cpus.  However, they do nothing to synchronize with
memory hotplug - taking the slab_mutex does not eliminate the
possibility of race as described in patch 2.

What we need there is something like get_online_cpus, but for memory.
We already have lock_memory_hotplug, which serves for the purpose, but
it's a bit of a hammer right now, because it's backed by a mutex.  As a
result, it imposes some limitations to locking order, which are not
desirable, and can't be used just like get_online_cpus.  That's why in
patch 1 I substitute it with get/put_online_mems, which work exactly
like get/put_online_cpus except they block not cpu, but memory hotplug.

[ v1 can be found at https://lkml.org/lkml/2014/4/6/68.  I NAK'ed it by
  myself, because it used an rw semaphore for get/put_online_mems,
  making them dead lock prune.  ]

This patch (of 2):

{un}lock_memory_hotplug, which is used to synchronize against memory
hotplug, is currently backed by a mutex, which makes it a bit of a
hammer - threads that only want to get a stable value of online nodes
mask won't be able to proceed concurrently.  Also, it imposes some
strong locking ordering rules on it, which narrows down the set of its
usage scenarios.

This patch introduces get/put_online_mems, which are the same as
get/put_online_cpus, but for memory hotplug, i.e.  executing a code
inside a get/put_online_mems section will guarantee a stable value of
online nodes, present pages, etc.

lock_memory_hotplug()/unlock_memory_hotplug() are removed altogether.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 14 ++++----------
 include/linux/mmzone.h         |  8 ++++----
 2 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4ca3d95..010d125 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -187,14 +187,8 @@ extern void put_page_bootmem(struct page *page);
 extern void get_page_bootmem(unsigned long ingo, struct page *page,
 			     unsigned long type);
 
-/*
- * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
- * notifier will be called under this. 2) offline/online/add/remove memory
- * will not run simultaneously.
- */
-
-void lock_memory_hotplug(void);
-void unlock_memory_hotplug(void);
+void get_online_mems(void);
+void put_online_mems(void);
 
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
@@ -232,8 +226,8 @@ static inline int try_online_node(int nid)
 	return 0;
 }
 
-static inline void lock_memory_hotplug(void) {}
-static inline void unlock_memory_hotplug(void) {}
+static inline void get_online_mems(void) {}
+static inline void put_online_mems(void) {}
 
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c1dbe0b..ae693e1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -481,9 +481,8 @@ struct zone {
 	 * give them a chance of being in the same cacheline.
 	 *
 	 * Write access to present_pages at runtime should be protected by
-	 * lock_memory_hotplug()/unlock_memory_hotplug().  Any reader who can't
-	 * tolerant drift of present_pages should hold memory hotplug lock to
-	 * get a stable value.
+	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
+	 * present_pages should get_online_mems() to get a stable value.
 	 *
 	 * Read access to managed_pages should be safe because it's unsigned
 	 * long. Write access to zone->managed_pages and totalram_pages are
@@ -765,7 +764,8 @@ typedef struct pglist_data {
 	int node_id;
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
-	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
+	struct task_struct *kswapd;	/* Protected by
+					   mem_hotplug_begin/end() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
 #ifdef CONFIG_NUMA_BALANCING
-- 
cgit v1.1


From 2329d3751b082b4fd354f334a88662d72abac52d Mon Sep 17 00:00:00 2001
From: Jianyu Zhan <nasa4836@gmail.com>
Date: Wed, 4 Jun 2014 16:07:31 -0700
Subject: mm/swap.c: clean up *lru_cache_add* functions

In mm/swap.c, __lru_cache_add() is exported, but actually there are no
users outside this file.

This patch unexports __lru_cache_add(), and makes it static.  It also
exports lru_cache_add_file(), as it is use by cifs and fuse, which can
loaded as modules.

Signed-off-by: Jianyu Zhan <nasa4836@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3507115..5a14b92 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -308,8 +308,9 @@ extern unsigned long nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void __lru_cache_add(struct page *);
 extern void lru_cache_add(struct page *);
+extern void lru_cache_add_anon(struct page *page);
+extern void lru_cache_add_file(struct page *page);
 extern void lru_add_page_tail(struct page *page, struct page *page_tail,
 			 struct lruvec *lruvec, struct list_head *head);
 extern void activate_page(struct page *);
@@ -323,22 +324,6 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-/**
- * lru_cache_add: add a page to the page lists
- * @page: the page to add
- */
-static inline void lru_cache_add_anon(struct page *page)
-{
-	ClearPageActive(page);
-	__lru_cache_add(page);
-}
-
-static inline void lru_cache_add_file(struct page *page)
-{
-	ClearPageActive(page);
-	__lru_cache_add(page);
-}
-
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
-- 
cgit v1.1


From f98bafa06a28fdfdd5c49f820f4d6560f636fc46 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 4 Jun 2014 16:07:34 -0700
Subject: memcg: kill CONFIG_MM_OWNER

CONFIG_MM_OWNER makes no sense.  It is not user-selectable, it is only
selected by CONFIG_MEMCG automatically.  So we can kill this option in
init/Kconfig and do s/CONFIG_MM_OWNER/CONFIG_MEMCG/ globally.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 2 +-
 include/linux/sched.h    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8967e20..de16272 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -406,7 +406,7 @@ struct mm_struct {
 	spinlock_t			ioctx_lock;
 	struct kioctx_table __rcu	*ioctx_table;
 #endif
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
 	/*
 	 * "owner" points to a task that is regarded as the canonical
 	 * user/owner of this mm. All of the following must be true in
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 70f67e4..2f2dd7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2967,7 +2967,7 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
 #else
@@ -2978,7 +2978,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 {
 }
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
 
 static inline unsigned long task_rlimit(const struct task_struct *tsk,
 		unsigned int limit)
-- 
cgit v1.1


From 1e32e77f95d60b121b6072e3e3a650a7f93068f9 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:07:37 -0700
Subject: memcg, slab: do not schedule cache destruction when last page goes
 away

This patchset is a part of preparations for kmemcg re-parenting.  It
targets at simplifying kmemcg work-flows and synchronization.

First, it removes async per memcg cache destruction (see patches 1, 2).
Now caches are only destroyed on memcg offline.  That means the caches
that are not empty on memcg offline will be leaked.  However, they are
already leaked, because memcg_cache_params::nr_pages normally never drops
to 0 so the destruction work is never scheduled except kmem_cache_shrink
is called explicitly.  In the future I'm planning reaping such dead caches
on vmpressure or periodically.

Second, it substitutes per memcg slab_caches_mutex's with the global
memcg_slab_mutex, which should be taken during the whole per memcg cache
creation/destruction path before the slab_mutex (see patch 3).  This
greatly simplifies synchronization among various per memcg cache
creation/destruction paths.

I'm still not quite sure about the end picture, in particular I don't know
whether we should reap dead memcgs' kmem caches periodically or try to
merge them with their parents (see https://lkml.org/lkml/2014/4/20/38 for
more details), but whichever way we choose, this set looks like a
reasonable change to me, because it greatly simplifies kmemcg work-flows
and eases further development.

This patch (of 3):

After a memcg is offlined, we mark its kmem caches that cannot be deleted
right now due to pending objects as dead by setting the
memcg_cache_params::dead flag, so that memcg_release_pages will schedule
cache destruction (memcg_cache_params::destroy) as soon as the last slab
of the cache is freed (memcg_cache_params::nr_pages drops to zero).

I guess the idea was to destroy the caches as soon as possible, i.e.
immediately after freeing the last object.  However, it just doesn't work
that way, because kmem caches always preserve some pages for the sake of
performance, so that nr_pages never gets to zero unless the cache is
shrunk explicitly using kmem_cache_shrink.  Of course, we could account
the total number of objects on the cache or check if all the slabs
allocated for the cache are empty on kmem_cache_free and schedule
destruction if so, but that would be too costly.

Thus we have a piece of code that works only when we explicitly call
kmem_cache_shrink, but complicates the whole picture a lot.  Moreover,
it's racy in fact.  For instance, kmem_cache_shrink may free the last slab
and thus schedule cache destruction before it finishes checking that the
cache is empty, which can lead to use-after-free.

So I propose to remove this async cache destruction from
memcg_release_pages, and check if the cache is empty explicitly after
calling kmem_cache_shrink instead.  This will simplify things a lot w/o
introducing any functional changes.

And regarding dead memcg caches (i.e.  those that are left hanging around
after memcg offline for they have objects), I suppose we should reap them
either periodically or on vmpressure as Glauber suggested initially.  I'm
going to implement this later.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 1 -
 include/linux/slab.h       | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5155d09..087a453 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -509,7 +509,6 @@ __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
 int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size);
 void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size);
 
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
 int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
 
 /**
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a6aab2c..905541d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -524,7 +524,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @memcg: pointer to the memcg this cache belongs to
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @dead: set to true after the memcg dies; the cache may still be around.
  * @nr_pages: number of pages that belongs to this cache.
  * @destroy: worker to be called whenever we are ready, or believe we may be
  *           ready, to destroy this cache.
@@ -540,7 +539,6 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head list;
 			struct kmem_cache *root_cache;
-			bool dead;
 			atomic_t nr_pages;
 			struct work_struct destroy;
 		};
-- 
cgit v1.1


From c67a8a685a6e9abbaf0235e084168f15a721ae39 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:07:39 -0700
Subject: memcg, slab: merge memcg_{bind,release}_pages to
 memcg_{un}charge_slab

Currently we have two pairs of kmemcg-related functions that are called on
slab alloc/free.  The first is memcg_{bind,release}_pages that count the
total number of pages allocated on a kmem cache.  The second is
memcg_{un}charge_slab that {un}charge slab pages to kmemcg resource
counter.  Let's just merge them to keep the code clean.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 087a453..d38d190 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -506,8 +506,8 @@ void memcg_update_array_size(int num_groups);
 struct kmem_cache *
 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
 
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size);
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size);
+int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
+void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
 
 int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
 
-- 
cgit v1.1


From bd67314586a3d5725e60f2f6587b4cb0f659bb67 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:07:40 -0700
Subject: memcg, slab: simplify synchronization scheme

At present, we have the following mutexes protecting data related to per
memcg kmem caches:

 - slab_mutex.  This one is held during the whole kmem cache creation
   and destruction paths.  We also take it when updating per root cache
   memcg_caches arrays (see memcg_update_all_caches).  As a result, taking
   it guarantees there will be no changes to any kmem cache (including per
   memcg).  Why do we need something else then?  The point is it is
   private to slab implementation and has some internal dependencies with
   other mutexes (get_online_cpus).  So we just don't want to rely upon it
   and prefer to introduce additional mutexes instead.

 - activate_kmem_mutex.  Initially it was added to synchronize
   initializing kmem limit (memcg_activate_kmem).  However, since we can
   grow per root cache memcg_caches arrays only on kmem limit
   initialization (see memcg_update_all_caches), we also employ it to
   protect against memcg_caches arrays relocation (e.g.  see
   __kmem_cache_destroy_memcg_children).

 - We have a convention not to take slab_mutex in memcontrol.c, but we
   want to walk over per memcg memcg_slab_caches lists there (e.g.  for
   destroying all memcg caches on offline).  So we have per memcg
   slab_caches_mutex's protecting those lists.

The mutexes are taken in the following order:

   activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex

Such a syncrhonization scheme has a number of flaws, for instance:

 - We can't call kmem_cache_{destroy,shrink} while walking over a
   memcg::memcg_slab_caches list due to locking order.  As a result, in
   mem_cgroup_destroy_all_caches we schedule the
   memcg_cache_params::destroy work shrinking and destroying the cache.

 - We don't have a mutex to synchronize per memcg caches destruction
   between memcg offline (mem_cgroup_destroy_all_caches) and root cache
   destruction (__kmem_cache_destroy_memcg_children).  Currently we just
   don't bother about it.

This patch simplifies it by substituting per memcg slab_caches_mutex's
with the global memcg_slab_mutex.  It will be held whenever a new per
memcg cache is created or destroyed, so it protects per root cache
memcg_caches arrays and per memcg memcg_slab_caches lists.  The locking
order is following:

   activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex

This allows us to call kmem_cache_{create,shrink,destroy} under the
memcg_slab_mutex.  As a result, we don't need memcg_cache_params::destroy
work any more - we can simply destroy caches while iterating over a per
memcg slab caches list.

Also using the global mutex simplifies synchronization between concurrent
per memcg caches creation/destruction, e.g.  mem_cgroup_destroy_all_caches
vs __kmem_cache_destroy_memcg_children.

The downside of this is that we substitute per-memcg slab_caches_mutex's
with a hummer-like global mutex, but since we already take either the
slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it
shouldn't hurt concurrency a lot.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 ----------
 include/linux/slab.h       |  6 ++----
 2 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d38d190..1fa2324 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,8 +497,6 @@ char *memcg_create_cache_name(struct mem_cgroup *memcg,
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
-void memcg_register_cache(struct kmem_cache *s);
-void memcg_unregister_cache(struct kmem_cache *s);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
@@ -640,14 +638,6 @@ static inline void memcg_free_cache_params(struct kmem_cache *s)
 {
 }
 
-static inline void memcg_register_cache(struct kmem_cache *s)
-{
-}
-
-static inline void memcg_unregister_cache(struct kmem_cache *s)
-{
-}
-
 static inline struct kmem_cache *
 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 905541d..ecbec9c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -116,7 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
 #ifdef CONFIG_MEMCG_KMEM
-void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *);
+struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *,
+					   struct kmem_cache *);
 #endif
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
@@ -525,8 +526,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
  * @nr_pages: number of pages that belongs to this cache.
- * @destroy: worker to be called whenever we are ready, or believe we may be
- *           ready, to destroy this cache.
  */
 struct memcg_cache_params {
 	bool is_root_cache;
@@ -540,7 +539,6 @@ struct memcg_cache_params {
 			struct list_head list;
 			struct kmem_cache *root_cache;
 			atomic_t nr_pages;
-			struct work_struct destroy;
 		};
 	};
 };
-- 
cgit v1.1


From 1b938c0827478df268d2336469ec48d400a2eb3e Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Wed, 4 Jun 2014 16:07:43 -0700
Subject: fs/buffer.c: remove block_write_full_page_endio()

The last in-tree caller of block_write_full_page_endio() was removed in
January 2013.  It's time to remove the EXPORT_SYMBOL, which leaves
block_write_full_page() as the only caller of
block_write_full_page_endio(), so inline block_write_full_page_endio()
into block_write_full_page().

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dheeraj Reddy <dheeraj.reddy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7cbf837..324329c 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -207,8 +207,6 @@ void block_invalidatepage(struct page *page, unsigned int offset,
 			  unsigned int length);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
-int block_write_full_page_endio(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc, bh_end_io_t *handler);
 int block_read_full_page(struct page*, get_block_t*);
 int block_is_partially_uptodate(struct page *page, unsigned long from,
 				unsigned long count);
-- 
cgit v1.1


From 57d998456ae8680ed446aa1993f45f4d8a9a5973 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Wed, 4 Jun 2014 16:07:45 -0700
Subject: fs/mpage.c: factor page_endio() out of mpage_end_io()

page_endio() takes care of updating all the appropriate page flags once
I/O has finished to a page.  Switch to using mapping_set_error() instead
of setting AS_EIO directly; this will handle thin-provisioned devices
correctly.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dheeraj Reddy <dheeraj.reddy@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 45598f1..718214c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -425,6 +425,8 @@ static inline void wait_on_page_writeback(struct page *page)
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
 
+void page_endio(struct page *page, int rw, int err);
+
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
-- 
cgit v1.1


From 47a191fd38ebddb1bd1510ec2bc1085c578c8868 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Wed, 4 Jun 2014 16:07:46 -0700
Subject: fs/block_dev.c: add bdev_read_page() and bdev_write_page()

A block device driver may choose to provide a rw_page operation.  These
will be called when the filesystem is attempting to do page sized I/O to
page cache pages (ie not for direct I/O).  This does preclude I/Os that
are larger than page size, so this may only be a performance gain for
some devices.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Tested-by: Dheeraj Reddy <dheeraj.reddy@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blkdev.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 45cf6e5..2f3886e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1588,6 +1588,7 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g)
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
+	int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*direct_access) (struct block_device *, sector_t,
@@ -1606,6 +1607,9 @@ struct block_device_operations {
 
 extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 				 unsigned long);
+extern int bdev_read_page(struct block_device *, sector_t, struct page *);
+extern int bdev_write_page(struct block_device *, sector_t, struct page *,
+						struct writeback_control *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit v1.1


From f7f28ca98b9a7a99fc55df2dddcf49857ab004f0 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 4 Jun 2014 16:07:57 -0700
Subject: mm: constify nmask argument to mbind()

The nmask argument to mbind() is const according to the userspace header
numaif.h, and since the kernel does indeed not modify it, it might as well
be declared const in the kernel.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syscalls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a4a0588..bfef0be 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -723,7 +723,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 				int flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				unsigned long mode,
-				unsigned long __user *nmask,
+				const unsigned long __user *nmask,
 				unsigned long maxnode,
 				unsigned flags);
 asmlinkage long sys_get_mempolicy(int __user *policy,
-- 
cgit v1.1


From 23c8902d403ef9a04cdc367d0b76a3ed6d83f5c5 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 4 Jun 2014 16:07:58 -0700
Subject: mm: constify nmask argument to set_mempolicy()

The nmask argument to set_mempolicy() is const according to the user-space
header numaif.h, and since the kernel does indeed not modify it, it might
as well be declared const in the kernel.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syscalls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bfef0be..b0881a0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -711,7 +711,7 @@ asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
 
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
 asmlinkage long sys_ioprio_get(int which, int who);
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
 				unsigned long maxnode);
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *from,
-- 
cgit v1.1


From d2ee40eae98d8a41ff27dcdd13b1b656c4c1ad00 Mon Sep 17 00:00:00 2001
From: Jianyu Zhan <nasa4836@gmail.com>
Date: Wed, 4 Jun 2014 16:08:02 -0700
Subject: mm: introdule compound_head_by_tail()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, in put_compound_page(), we have

======
if (likely(!PageTail(page))) {                  <------  (1)
        if (put_page_testzero(page)) {
                 /*
                 ¦* By the time all refcounts have been released
                 ¦* split_huge_page cannot run anymore from under us.
                 ¦*/
                 if (PageHead(page))
                         __put_compound_page(page);
                 else
                         __put_single_page(page);
         }
         return;
}

/* __split_huge_page_refcount can run under us */
page_head = compound_head(page);        <------------ (2)
======

if at (1) ,  we fail the check, this means page is *likely* a tail page.

Then at (2), as compoud_head(page) is inlined, it is :

======
static inline struct page *compound_head(struct page *page)
{
          if (unlikely(PageTail(page))) {           <----------- (3)
              struct page *head = page->first_page;

                smp_rmb();
                if (likely(PageTail(page)))
                        return head;
        }
        return page;
}
======

here, the (3) unlikely in the case is a negative hint, because it is
*likely* a tail page.  So the check (3) in this case is not good, so I
introduce a helper for this case.

So this patch introduces compound_head_by_tail() which deals with a
possible tail page(though it could be spilt by a racy thread), and make
compound_head() a wrapper on it.

This patch has no functional change, and it reduces the object
size slightly:
   text    data     bss     dec     hex  filename
  11003    1328      16   12347    303b  mm/swap.o.orig
  10971    1328      16   12315    301b  mm/swap.o.patched

I've ran "perf top -e branch-miss" to observe branch-miss in this case.
As Michael points out, it's a slow path, so only very few times this case
happens.  But I grep'ed the code base, and found there still are some
other call sites could be benifited from this helper.  And given that it
only bloating up the source by only 5 lines, but with a reduced object
size.  I still believe this helper deserves to exsit.

Signed-off-by: Jianyu Zhan <nasa4836@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d677706..3686006 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -407,20 +407,25 @@ static inline void compound_unlock_irqrestore(struct page *page,
 #endif
 }
 
+static inline struct page *compound_head_by_tail(struct page *tail)
+{
+	struct page *head = tail->first_page;
+
+	/*
+	 * page->first_page may be a dangling pointer to an old
+	 * compound page, so recheck that it is still a tail
+	 * page before returning.
+	 */
+	smp_rmb();
+	if (likely(PageTail(tail)))
+		return head;
+	return tail;
+}
+
 static inline struct page *compound_head(struct page *page)
 {
-	if (unlikely(PageTail(page))) {
-		struct page *head = page->first_page;
-
-		/*
-		 * page->first_page may be a dangling pointer to an old
-		 * compound page, so recheck that it is still a tail
-		 * page before returning.
-		 */
-		smp_rmb();
-		if (likely(PageTail(page)))
-			return head;
-	}
+	if (unlikely(PageTail(page)))
+		return compound_head_by_tail(page);
 	return page;
 }
 
-- 
cgit v1.1


From 1754e44e8291c92b9d981a6eca59f28dd25f03ab Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Wed, 4 Jun 2014 16:08:04 -0700
Subject: include/linux/bootmem.h: cleanup the comment for BOOTMEM_ flags

Use BOOTMEM_DEFAULT instead of 0 in the comment.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index db51fe4..4e2bd4c 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -58,9 +58,9 @@ extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
  * the architecture-specific code should honor this).
  *
- * If flags is 0, then the return value is always 0 (success). If
- * flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the
- * memory already was reserved.
+ * If flags is BOOTMEM_DEFAULT, then the return value is always 0 (success).
+ * If flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the memory
+ * already was reserved.
  */
 #define BOOTMEM_DEFAULT		0
 #define BOOTMEM_EXCLUSIVE	(1<<0)
-- 
cgit v1.1


From 7fe7047597cf5ebb300802494db4f407327ec94f Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 4 Jun 2014 16:08:06 -0700
Subject: mm: shrinker trace points: fix negatives

I was looking at a trace of the slab shrinkers (attachment in this comment):

	https://bugs.freedesktop.org/show_bug.cgi?id=72742#c67

and noticed that "total_scan" can go negative in some cases.  We
used to dump out the "total_scan" variable directly, but some of
the shrinker modifications along the way changed that.

This patch just dumps it out directly, again.  It doesn't make
any sense to derive it from new_nr and nr any more since there
are now other shrinkers that can be running in parallel and
mucking with those values.

Here's an example of the negative numbers in the output:

>          kswapd0-840   [000]   160.869398: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 10 new scan count 39 total_scan 29 last shrinker return val 256
>          kswapd0-840   [000]   160.869618: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 39 new scan count 102 total_scan 63 last shrinker return val 256
>          kswapd0-840   [000]   160.870031: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 102 new scan count 47 total_scan -55 last shrinker return val 768
>          kswapd0-840   [000]   160.870464: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 47 new scan count 45 total_scan -2 last shrinker return val 768
>          kswapd0-840   [000]   163.384144: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 45 new scan count 56 total_scan 11 last shrinker return val 0
>          kswapd0-840   [000]   163.384297: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 56 new scan count 15 total_scan -41 last shrinker return val 256
>          kswapd0-840   [000]   163.384414: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 15 new scan count 117 total_scan 102 last shrinker return val 0
>          kswapd0-840   [000]   163.384657: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 117 new scan count 36 total_scan -81 last shrinker return val 512
>          kswapd0-840   [000]   163.384880: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 36 new scan count 111 total_scan 75 last shrinker return val 256
>          kswapd0-840   [000]   163.385256: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 111 new scan count 34 total_scan -77 last shrinker return val 768
>          kswapd0-840   [000]   163.385598: mm_shrink_slab_end:   i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 34 new scan count 122 total_scan 88 last shrinker return val 512

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/vmscan.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 132a985..1dd5e77 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -226,9 +226,9 @@ TRACE_EVENT(mm_shrink_slab_start,
 
 TRACE_EVENT(mm_shrink_slab_end,
 	TP_PROTO(struct shrinker *shr, int shrinker_retval,
-		long unused_scan_cnt, long new_scan_cnt),
+		long unused_scan_cnt, long new_scan_cnt, long total_scan),
 
-	TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt),
+	TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
@@ -245,7 +245,7 @@ TRACE_EVENT(mm_shrink_slab_end,
 		__entry->unused_scan = unused_scan_cnt;
 		__entry->new_scan = new_scan_cnt;
 		__entry->retval = shrinker_retval;
-		__entry->total_scan = new_scan_cnt - unused_scan_cnt;
+		__entry->total_scan = total_scan;
 	),
 
 	TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
-- 
cgit v1.1


From df9024a8c5a3e031c5df26386f74ffed1b8fc095 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 4 Jun 2014 16:08:07 -0700
Subject: mm: shrinker: add nid to tracepoint output

Now that we are doing NUMA-aware shrinking, and can have shrinkers
running in parallel, or working on individual nodes, it seems like we
should also be sticking the node in the output.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/vmscan.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 1dd5e77..69590b6 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -191,6 +191,7 @@ TRACE_EVENT(mm_shrink_slab_start,
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
 		__field(void *, shrink)
+		__field(int, nid)
 		__field(long, nr_objects_to_shrink)
 		__field(gfp_t, gfp_flags)
 		__field(unsigned long, pgs_scanned)
@@ -203,6 +204,7 @@ TRACE_EVENT(mm_shrink_slab_start,
 	TP_fast_assign(
 		__entry->shr = shr;
 		__entry->shrink = shr->scan_objects;
+		__entry->nid = sc->nid;
 		__entry->nr_objects_to_shrink = nr_objects_to_shrink;
 		__entry->gfp_flags = sc->gfp_mask;
 		__entry->pgs_scanned = pgs_scanned;
@@ -212,9 +214,10 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__entry->total_scan = total_scan;
 	),
 
-	TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+	TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
 		__entry->shrink,
 		__entry->shr,
+		__entry->nid,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
 		__entry->pgs_scanned,
@@ -225,13 +228,15 @@ TRACE_EVENT(mm_shrink_slab_start,
 );
 
 TRACE_EVENT(mm_shrink_slab_end,
-	TP_PROTO(struct shrinker *shr, int shrinker_retval,
+	TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval,
 		long unused_scan_cnt, long new_scan_cnt, long total_scan),
 
-	TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan),
+	TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt,
+		total_scan),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
+		__field(int, nid)
 		__field(void *, shrink)
 		__field(long, unused_scan)
 		__field(long, new_scan)
@@ -241,6 +246,7 @@ TRACE_EVENT(mm_shrink_slab_end,
 
 	TP_fast_assign(
 		__entry->shr = shr;
+		__entry->nid = nid;
 		__entry->shrink = shr->scan_objects;
 		__entry->unused_scan = unused_scan_cnt;
 		__entry->new_scan = new_scan_cnt;
@@ -248,9 +254,10 @@ TRACE_EVENT(mm_shrink_slab_end,
 		__entry->total_scan = total_scan;
 	),
 
-	TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
+	TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
 		__entry->shrink,
 		__entry->shr,
+		__entry->nid,
 		__entry->unused_scan,
 		__entry->new_scan,
 		__entry->total_scan,
-- 
cgit v1.1


From ac7695012a6f3269acd80d6c2b2218a6769edbf3 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 4 Jun 2014 16:08:17 -0700
Subject: mm/rmap.c: make page_referenced_one() and try_to_unmap_one() static

KSM was converted to use rmap_walk() and now nobody uses these functions
outside mm/rmap.c.

Let's covert them back to static.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b66c211..9be55c7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -183,14 +183,10 @@ static inline void page_dup_rmap(struct page *page)
  */
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
-int page_referenced_one(struct page *, struct vm_area_struct *,
-	unsigned long address, void *arg);
 
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
 int try_to_unmap(struct page *, enum ttu_flags flags);
-int try_to_unmap_one(struct page *, struct vm_area_struct *,
-			unsigned long address, void *arg);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
-- 
cgit v1.1


From 3fb1c8dcfcda2f5bfb7d79d8b08bf2f04b1eed8f Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 4 Jun 2014 16:08:20 -0700
Subject: mm: update comment for DEFAULT_MAX_MAP_COUNT

With ELF extended numbering 16-bit bound is not hard limit any more.

[akpm@linux-foundation.org: fix typo]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/sysctl.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 8045a55..596a0e0 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -25,6 +25,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
  * Because the kernel adds some informative sections to a image of program at
  * generating coredump, we need some margin. The number of extra sections is
  * 1-3 now and depends on arch. We use "5" as safe margin, here.
+ *
+ * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
+ * not a hard limit any more. Although some userspace tools can be surprised by
+ * that.
  */
 #define MAPCOUNT_ELF_CORE_MARGIN	(5)
 #define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
-- 
cgit v1.1


From 073ee1c6cd11cd190f4d0da84d9b4ba79d7b9e70 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:08:23 -0700
Subject: memcg: get rid of memcg_create_cache_name

Instead of calling back to memcontrol.c from kmem_cache_create_memcg in
order to just create the name of a per memcg cache, let's allocate it in
place.  We only need to pass the memcg name to kmem_cache_create_memcg for
that - everything else can be done in slab_common.c.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 --
 include/linux/slab.h       | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1fa2324..dfc2929 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -492,8 +492,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
 
-char *memcg_create_cache_name(struct mem_cgroup *memcg,
-			      struct kmem_cache *root_cache);
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ecbec9c..86e5b26 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -117,7 +117,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			void (*)(void *));
 #ifdef CONFIG_MEMCG_KMEM
 struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *,
-					   struct kmem_cache *);
+					   struct kmem_cache *,
+					   const char *);
 #endif
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-- 
cgit v1.1


From 68711a746345c44ae00c64d8dbac6a9ce13ac54a Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 4 Jun 2014 16:08:25 -0700
Subject: mm, migration: add destination page freeing callback

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page,
however, it frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to
determine how to free destination pages.  If a caller, such as memory
compaction, builds its own freelist for migration targets, this can reuse
already freed memory instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages,
it is called when page migration fails.  If the caller passes NULL then
freeing back to the system will be handled as usual.  This patch
introduces no functional change.

Signed-off-by: David Rientjes <rientjes@google.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 84a31ad..a2901c4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
-- 
cgit v1.1


From 35979ef3393110ff3c12c6b94552208d3bdf1a36 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 4 Jun 2014 16:08:27 -0700
Subject: mm, compaction: add per-zone migration pfn cache for async compaction

Each zone has a cached migration scanner pfn for memory compaction so that
subsequent calls to memory compaction can start where the previous call
left off.

Currently, the compaction migration scanner only updates the per-zone
cached pfn when pageblocks were not skipped for async compaction.  This
creates a dependency on calling sync compaction to avoid having subsequent
calls to async compaction from scanning an enormous amount of non-MOVABLE
pageblocks each time it is called.  On large machines, this could be
potentially very expensive.

This patch adds a per-zone cached migration scanner pfn only for async
compaction.  It is updated everytime a pageblock has been scanned in its
entirety and when no pages from it were successfully isolated.  The cached
migration scanner pfn for sync compaction is updated only when called for
sync compaction.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ae693e1..10a96ee 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
-- 
cgit v1.1


From e0b9daeb453e602a95ea43853dc12d385558ce1f Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 4 Jun 2014 16:08:28 -0700
Subject: mm, compaction: embed migration mode in compact_control

We're going to want to manipulate the migration mode for compaction in the
page allocator, and currently compact_control's sync field is only a bool.

Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction
depending on the value of this bool.  Convert the bool to enum
migrate_mode and pass the migration mode in directly.  Later, we'll want
to avoid MIGRATE_SYNC_LIGHT for thp allocations in the pagefault patch to
avoid unnecessary latency.

This also alters compaction triggered from sysfs, either for the entire
system or for a node, to force MIGRATE_SYNC.

[akpm@linux-foundation.org: fix build]
[iamjoonsoo.kim@lge.com: use MIGRATE_SYNC in alloc_contig_range()]
Signed-off-by: David Rientjes <rientjes@google.com>
Suggested-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 7e1c76e..01e3132 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode mode, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
-- 
cgit v1.1


From f8c9301fa5a2a8b873c67f2a3d8230d5c13f61b7 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 4 Jun 2014 16:08:32 -0700
Subject: mm/compaction: do not count migratepages when unnecessary

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrage_pages().  The
freepages counting has become unneccessary, and it turns out that
migratepages counting is also unnecessary in most cases.

The only situation when it's needed to count cc->migratepages is when
migrate_pages() returns with a negative error code.  Otherwise, the
non-negative return value is the number of pages that were not migrated,
which is exactly the count of remaining pages in the cc->migratepages
list.

Furthermore, any non-zero count is only interesting for the tracepoint of
mm_compaction_migratepages events, because after that all remaining
unmigrated pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes
the tracepoint definition so that the manual counting is done only when
the tracepoint is enabled, and only when migrate_pages() returns a
negative error code.

Furthermore, migrate_pages() and the tracepoints won't be called when
there's nothing to migrate.  This potentially avoids some wasted cycles
and reduces the volume of uninteresting mm_compaction_migratepages events
where "nr_migrated=0 nr_failed=0".  In the stress-highalloc mmtest, this
was about 75% of the events.  The mm_compaction_isolate_migratepages event
is better for determining that nothing was isolated for migration, and
this one was just duplicating the info.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/compaction.h | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..c6814b9 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,6 +5,7 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
 #include <trace/events/gfpflags.h>
 
@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head *migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct list_head *page_lru;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+			list_for_each(page_lru, migratepages)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
-- 
cgit v1.1


From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 4 Jun 2014 16:09:53 -0700
Subject: swap: change swap_info singly-linked list to list_head

The logic controlling the singly-linked list of swap_info_struct entries
for all active, i.e.  swapon'ed, swap targets is rather complex, because:

 - it stores the entries in priority order
 - there is a pointer to the highest priority entry
 - there is a pointer to the highest priority not-full entry
 - there is a highest_priority_index variable set outside the swap_lock
 - swap entries of equal priority should be used equally

this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181
where different priority swap targets are incorrectly used equally.

That bug probably could be solved with the existing singly-linked lists,
but I think it would only add more complexity to the already difficult to
understand get_swap_page() swap_list iteration logic.

The first patch changes from a singly-linked list to a doubly-linked list
using list_heads; the highest_priority_index and related code are removed
and get_swap_page() starts each iteration at the highest priority
swap_info entry, even if it's full.  While this does introduce unnecessary
list iteration (i.e.  Schlemiel the painter's algorithm) in the case where
one or more of the highest priority entries are full, the iteration and
manipulation code is much simpler and behaves correctly re: the above bug;
and the fourth patch removes the unnecessary iteration.

The second patch adds some minor plist helper functions; nothing new
really, just functions to match existing regular list functions.  These
are used by the next two patches.

The third patch adds plist_requeue(), which is used by get_swap_page() in
the next patch - it performs the requeueing of same-priority entries
(which moves the entry to the end of its priority in the plist), so that
all equal-priority swap_info_structs get used equally.

The fourth patch converts the main list into a plist, and adds a new plist
that contains only swap_info entries that are both active and not full.
As Mel suggested using plists allows removing all the ordering code from
swap - plists handle ordering automatically.  The list naming is also
clarified now that there are two lists, with the original list changed
from swap_list_head to swap_active_head and the new list named
swap_avail_head.  A new spinlock is also added for the new list, so
swap_info entries can be added or removed from the new list immediately as
they become full or not full.

This patch (of 4):

Replace the singly-linked list tracking active, i.e.  swapon'ed,
swap_info_struct entries with a doubly-linked list using struct
list_heads.  Simplify the logic iterating and manipulating the list of
entries, especially get_swap_page(), by using standard list_head
functions, and removing the highest priority iteration logic.

The change fixes the bug:
https://lkml.org/lkml/2014/2/13/181
in which different priority swap entries after the highest priority entry
are incorrectly used equally in pairs.  The swap behavior is now as
advertised, i.e. different priority swap entries are used in order, and
equal priority swap targets are used concurrently.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Weijie Yang <weijieut@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h     | 7 +------
 include/linux/swapfile.h | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5a14b92..8bb85d6d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,8 +214,8 @@ struct percpu_cluster {
 struct swap_info_struct {
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
+	struct list_head list;		/* entry in swap list */
 	signed char	type;		/* strange name for an index */
-	signed char	next;		/* next type on the swap list */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +255,6 @@ struct swap_info_struct {
 	struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
 };
 
-struct swap_list_t {
-	int head;	/* head of priority-ordered swapfile list */
-	int next;	/* swapfile to be used next */
-};
-
 /* linux/mm/workingset.c */
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624..2eab382 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
  * want to expose them to the dozens of source files that include swap.h
  */
 extern spinlock_t swap_lock;
-extern struct swap_list_t swap_list;
+extern struct list_head swap_list_head;
 extern struct swap_info_struct *swap_info[];
 extern int try_to_unuse(unsigned int, bool, unsigned long);
 
-- 
cgit v1.1


From fd16618e12a05df79a3439d72d5ffdac5d34f3da Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 4 Jun 2014 16:09:55 -0700
Subject: lib/plist: add helper functions

Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to
define and initialize a struct plist_head.

Add plist_for_each_continue() and plist_for_each_entry_continue(),
equivalent to list_for_each_continue() and list_for_each_entry_continue(),
to iterate over a plist continuing after the current position.

Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev
and ->next, implemented by list_prev_entry() and list_next_entry(), to
access the prev/next struct plist_node entry.  These are needed because
unlike struct list_head, direct access of the prev/next struct plist_node
isn't possible; the list must be navigated via the contained struct
list_head.  e.g.  instead of accessing the prev by list_prev_entry(node,
node_list) it can be accessed by plist_prev(node).

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Weijie Yang <weijieut@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/plist.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index aa0fb39..c815491 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -98,6 +98,13 @@ struct plist_node {
 }
 
 /**
+ * PLIST_HEAD - declare and init plist_head
+ * @head:	name for struct plist_head variable
+ */
+#define PLIST_HEAD(head) \
+	struct plist_head head = PLIST_HEAD_INIT(head)
+
+/**
  * PLIST_NODE_INIT - static struct plist_node initializer
  * @node:	struct plist_node variable name
  * @__prio:	initial node priority
@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
 	 list_for_each_entry(pos, &(head)->node_list, node_list)
 
 /**
+ * plist_for_each_continue - continue iteration over the plist
+ * @pos:	the type * to use as a loop cursor
+ * @head:	the head for your list
+ *
+ * Continue to iterate over plist, continuing after the current position.
+ */
+#define plist_for_each_continue(pos, head)	\
+	 list_for_each_entry_continue(pos, &(head)->node_list, node_list)
+
+/**
  * plist_for_each_safe - iterate safely over a plist of given type
  * @pos:	the type * to use as a loop counter
  * @n:	another type * to use as temporary storage
@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
 	 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
 
 /**
+ * plist_for_each_entry_continue - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor
+ * @head:	the head for your list
+ * @m:		the name of the list_struct within the struct
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define plist_for_each_entry_continue(pos, head, m)	\
+	list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
+
+/**
  * plist_for_each_entry_safe - iterate safely over list of given type
  * @pos:	the type * to use as a loop counter
  * @n:		another type * to use as temporary storage
@@ -229,6 +258,20 @@ static inline int plist_node_empty(const struct plist_node *node)
 #endif
 
 /**
+ * plist_next - get the next entry in list
+ * @pos:	the type * to cursor
+ */
+#define plist_next(pos) \
+	list_next_entry(pos, node_list)
+
+/**
+ * plist_prev - get the prev entry in list
+ * @pos:	the type * to cursor
+ */
+#define plist_prev(pos) \
+	list_prev_entry(pos, node_list)
+
+/**
  * plist_first - return the first node (and thus, highest priority)
  * @head:	the &struct plist_head pointer
  *
-- 
cgit v1.1


From a75f232ce0fe38bd01301899ecd97ffd0254316a Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 4 Jun 2014 16:09:57 -0700
Subject: lib/plist: add plist_requeue

Add plist_requeue(), which moves the specified plist_node after all other
same-priority plist_nodes in the list.  This is essentially an optimized
plist_del() followed by plist_add().

This is needed by swap, which (with the next patch in this set) uses a
plist of available swap devices.  When a swap device (either a swap
partition or swap file) are added to the system with swapon(), the device
is added to a plist, ordered by the swap device's priority.  When swap
needs to allocate a page from one of the swap devices, it takes the page
from the first swap device on the plist, which is the highest priority
swap device.  The swap device is left in the plist until all its pages are
used, and then removed from the plist when it becomes full.

However, as described in man 2 swapon, swap must allocate pages from swap
devices with the same priority in round-robin order; to do this, on each
swap page allocation, swap uses a page from the first swap device in the
plist, and then calls plist_requeue() to move that swap device entry to
after any other same-priority swap devices.  The next swap page allocation
will again use a page from the first swap device in the plist and requeue
it, and so on, resulting in round-robin usage of equal-priority swap
devices.

Also add plist_test_requeue() test function, for use by plist_test() to
test plist_requeue() function.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Weijie Yang <weijieut@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/plist.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index c815491..8b6c970 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -141,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
 extern void plist_add(struct plist_node *node, struct plist_head *head);
 extern void plist_del(struct plist_node *node, struct plist_head *head);
 
+extern void plist_requeue(struct plist_node *node, struct plist_head *head);
+
 /**
  * plist_for_each - iterate over the plist
  * @pos:	the type * to use as a loop counter
-- 
cgit v1.1


From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 4 Jun 2014 16:09:59 -0700
Subject: swap: change swap_list_head to plist, add swap_avail_head

Originally get_swap_page() started iterating through the singly-linked
list of swap_info_structs using swap_list.next or highest_priority_index,
which both were intended to point to the highest priority active swap
target that was not full.  The first patch in this series changed the
singly-linked list to a doubly-linked list, and removed the logic to start
at the highest priority non-full entry; it starts scanning at the highest
priority entry each time, even if the entry is full.

Replace the manually ordered swap_list_head with a plist, swap_active_head.
Add a new plist, swap_avail_head.  The original swap_active_head plist
contains all active swap_info_structs, as before, while the new
swap_avail_head plist contains only swap_info_structs that are active and
available, i.e. not full.  Add a new spinlock, swap_avail_lock, to protect
the swap_avail_head list.

Mel Gorman suggested using plists since they internally handle ordering
the list entries based on priority, which is exactly what swap was doing
manually.  All the ordering code is now removed, and swap_info_struct
entries and simply added to their corresponding plist and automatically
ordered correctly.

Using a new plist for available swap_info_structs simplifies and
optimizes get_swap_page(), which no longer has to iterate over full
swap_info_structs.  Using a new spinlock for swap_avail_head plist
allows each swap_info_struct to add or remove themselves from the
plist when they become full or not-full; previously they could not
do so because the swap_info_struct->lock is held when they change
from full<->not-full, and the swap_lock protecting the main
swap_active_head must be ordered before any swap_info_struct->lock.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Weijie Yang <weijieut@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h     | 3 ++-
 include/linux/swapfile.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8bb85d6d..9155bcd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,8 @@ struct percpu_cluster {
 struct swap_info_struct {
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
-	struct list_head list;		/* entry in swap list */
+	struct plist_node list;		/* entry in swap_active_head */
+	struct plist_node avail_list;	/* entry in swap_avail_head */
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 2eab382..388293a 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
  * want to expose them to the dozens of source files that include swap.h
  */
 extern spinlock_t swap_lock;
-extern struct list_head swap_list_head;
+extern struct plist_head swap_active_head;
 extern struct swap_info_struct *swap_info[];
 extern int try_to_unuse(unsigned int, bool, unsigned long);
 
-- 
cgit v1.1


From 776ed0f0377914d1e65fed903c052e9eef3f4cc3 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 4 Jun 2014 16:10:02 -0700
Subject: memcg: cleanup kmem cache creation/destruction functions naming

Current names are rather inconsistent. Let's try to improve them.

Brief change log:

** old name **                          ** new name **

kmem_cache_create_memcg                 memcg_create_kmem_cache
memcg_kmem_create_cache                 memcg_regsiter_cache
memcg_kmem_destroy_cache                memcg_unregister_cache

kmem_cache_destroy_memcg_children       memcg_cleanup_cache_params
mem_cgroup_destroy_all_caches           memcg_unregister_all_caches

create_work                             memcg_register_cache_work
memcg_create_cache_work_func            memcg_register_cache_func
memcg_create_cache_enqueue              memcg_schedule_register_cache

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 include/linux/slab.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dfc2929..eb65d29 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -505,7 +505,7 @@ __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
 
-int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
+int __memcg_cleanup_cache_params(struct kmem_cache *s);
 
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 86e5b26..1d9abb7 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -116,7 +116,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
 #ifdef CONFIG_MEMCG_KMEM
-struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *,
+struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *,
 					   struct kmem_cache *,
 					   const char *);
 #endif
-- 
cgit v1.1


From ea5e9539abf1258f23e725cb9cb25aa74efa29eb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:07 -0700
Subject: include/linux/jump_label.h: expose the reference count

This patch exposes the jump_label reference count in preparation for the
next patch.  cpusets cares about both the jump_label being enabled and how
many users of the cpusets there currently are.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/jump_label.h | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 5c1dfb2..784304b 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -69,6 +69,10 @@ struct static_key {
 
 # include <asm/jump_label.h>
 # define HAVE_JUMP_LABEL
+#else
+struct static_key {
+	atomic_t enabled;
+};
 #endif	/* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
 
 enum jump_label_type {
@@ -79,6 +83,12 @@ enum jump_label_type {
 struct module;
 
 #include <linux/atomic.h>
+
+static inline int static_key_count(struct static_key *key)
+{
+	return atomic_read(&key->enabled);
+}
+
 #ifdef HAVE_JUMP_LABEL
 
 #define JUMP_LABEL_TYPE_FALSE_BRANCH	0UL
@@ -134,10 +144,6 @@ extern void jump_label_apply_nops(struct module *mod);
 
 #else  /* !HAVE_JUMP_LABEL */
 
-struct static_key {
-	atomic_t enabled;
-};
-
 static __always_inline void jump_label_init(void)
 {
 	static_key_initialized = true;
@@ -145,14 +151,14 @@ static __always_inline void jump_label_init(void)
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
-	if (unlikely(atomic_read(&key->enabled) > 0))
+	if (unlikely(static_key_count(key) > 0))
 		return true;
 	return false;
 }
 
 static __always_inline bool static_key_true(struct static_key *key)
 {
-	if (likely(atomic_read(&key->enabled) > 0))
+	if (likely(static_key_count(key) > 0))
 		return true;
 	return false;
 }
@@ -194,7 +200,7 @@ static inline int jump_label_apply_nops(struct module *mod)
 
 static inline bool static_key_enabled(struct static_key *key)
 {
-	return (atomic_read(&key->enabled) > 0);
+	return static_key_count(key) > 0;
 }
 
 #endif	/* _LINUX_JUMP_LABEL_H */
-- 
cgit v1.1


From 664eeddeef6539247691197c1ac124d4aa872ab6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:08 -0700
Subject: mm: page_alloc: use jump labels to avoid checking number_of_cpusets

If cpusets are not in use then we still check a global variable on every
page allocation.  Use jump labels to avoid the overhead.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index b19d3dc..ade2390 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,10 +12,31 @@
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_CPUSETS
 
-extern int number_of_cpusets;	/* How many cpusets are defined in system? */
+extern struct static_key cpusets_enabled_key;
+static inline bool cpusets_enabled(void)
+{
+	return static_key_false(&cpusets_enabled_key);
+}
+
+static inline int nr_cpusets(void)
+{
+	/* jump label reference count + the top-level cpuset */
+	return static_key_count(&cpusets_enabled_key) + 1;
+}
+
+static inline void cpuset_inc(void)
+{
+	static_key_slow_inc(&cpusets_enabled_key);
+}
+
+static inline void cpuset_dec(void)
+{
+	static_key_slow_dec(&cpusets_enabled_key);
+}
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
 
 static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 ||
+	return nr_cpusets() <= 1 ||
 		__cpuset_node_allowed_softwall(node, gfp_mask);
 }
 
 static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 ||
+	return nr_cpusets() <= 1 ||
 		__cpuset_node_allowed_hardwall(node, gfp_mask);
 }
 
@@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 
 #else /* !CONFIG_CPUSETS */
 
+static inline bool cpusets_enabled(void) { return false; }
+
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-- 
cgit v1.1


From e58469bafd0524e848c3733bc3918d854595e20f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:16 -0700
Subject: mm: page_alloc: use word-based accesses for get/set pageblock bitmaps

The test_bit operations in get/set pageblock flags are expensive.  This
patch reads the bitmap on a word basis and use shifts and masks to isolate
the bits of interest.  Similarly masks are used to set a local copy of the
bitmap and then use cmpxchg to update the bitmap if there have been no
other changes made in parallel.

In a test running dd onto tmpfs the overhead of the pageblock-related
functions went from 1.27% in profiles to 0.5%.

In addition to the performance benefits, this patch closes races that are
possible between:

a) get_ and set_pageblock_migratetype(), where get_pageblock_migratetype()
   reads part of the bits before and other part of the bits after
   set_pageblock_migratetype() has updated them.

b) set_pageblock_migratetype() and set_pageblock_skip(), where the non-atomic
   read-modify-update set bit operation in set_pageblock_skip() will cause
   lost updates to some bits changed in the set_pageblock_migratetype().

Joonsoo Kim first reported the case a) via code inspection.  Vlastimil
Babka's testing with a debug patch showed that either a) or b) occurs
roughly once per mmtests' stress-highalloc benchmark (although not
necessarily in the same pageblock).  Furthermore during development of
unrelated compaction patches, it was observed that frequent calls to
{start,undo}_isolate_page_range() the race occurs several thousands of
times and has resulted in NULL pointer dereferences in move_freepages()
and free_one_page() in places where free_list[migratetype] is
manipulated by e.g.  list_move().  Further debugging confirmed that
migratetype had invalid value of 6, causing out of bounds access to the
free_list array.

That confirmed that the race exist, although it may be extremely rare,
and currently only fatal where page isolation is performed due to
memory hot remove.  Races on pageblocks being updated by
set_pageblock_migratetype(), where both old and new migratetype are
lower MIGRATE_RESERVE, currently cannot result in an invalid value
being observed, although theoretically they may still lead to
unexpected creation or destruction of MIGRATE_RESERVE pageblocks.
Furthermore, things could get suddenly worse when memory isolation is
used more, or when new migratetypes are added.

After this patch, the race has no longer been observed in testing.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-and-tested-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h          |  6 +++++-
 include/linux/pageblock-flags.h | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 10a96ee..8ef1e3f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -75,9 +75,13 @@ enum {
 
 extern int page_group_by_mobility_disabled;
 
+#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
+#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
+
 static inline int get_pageblock_migratetype(struct page *page)
 {
-	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
+	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
+	return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
 }
 
 struct free_area {
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 2ee8cd2..c08730c 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -30,9 +30,12 @@ enum pageblock_bits {
 	PB_migrate,
 	PB_migrate_end = PB_migrate + 3 - 1,
 			/* 3 bits required for migrate types */
-#ifdef CONFIG_COMPACTION
 	PB_migrate_skip,/* If set the block is skipped by compaction */
-#endif /* CONFIG_COMPACTION */
+
+	/*
+	 * Assume the bits will always align on a word. If this assumption
+	 * changes then get/set pageblock needs updating.
+	 */
 	NR_PAGEBLOCK_BITS
 };
 
@@ -62,11 +65,33 @@ extern int pageblock_order;
 /* Forward declaration */
 struct page;
 
+unsigned long get_pageblock_flags_mask(struct page *page,
+				unsigned long end_bitidx,
+				unsigned long mask);
+void set_pageblock_flags_mask(struct page *page,
+				unsigned long flags,
+				unsigned long end_bitidx,
+				unsigned long mask);
+
 /* Declarations for getting and setting flags. See mm/page_alloc.c */
-unsigned long get_pageblock_flags_group(struct page *page,
-					int start_bitidx, int end_bitidx);
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
-					int start_bitidx, int end_bitidx);
+static inline unsigned long get_pageblock_flags_group(struct page *page,
+					int start_bitidx, int end_bitidx)
+{
+	unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
+	unsigned long mask = (1 << nr_flag_bits) - 1;
+
+	return get_pageblock_flags_mask(page, end_bitidx, mask);
+}
+
+static inline void set_pageblock_flags_group(struct page *page,
+					unsigned long flags,
+					int start_bitidx, int end_bitidx)
+{
+	unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
+	unsigned long mask = (1 << nr_flag_bits) - 1;
+
+	set_pageblock_flags_mask(page, flags, end_bitidx, mask);
+}
 
 #ifdef CONFIG_COMPACTION
 #define get_pageblock_skip(page) \
-- 
cgit v1.1


From dc4b0caff24d9b2918e9f27bc65499ee63187eba Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:17 -0700
Subject: mm: page_alloc: reduce number of times page_to_pfn is called

In the free path we calculate page_to_pfn multiple times. Reduce that.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h          |  9 +++++++--
 include/linux/pageblock-flags.h | 33 +++++++++++++--------------------
 2 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8ef1e3f..472426a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled;
 #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
 #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
 
-static inline int get_pageblock_migratetype(struct page *page)
+#define get_pageblock_migratetype(page)					\
+	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
+			PB_migrate_end, MIGRATETYPE_MASK)
+
+static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
 {
 	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
-	return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
+	return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
+					MIGRATETYPE_MASK);
 }
 
 struct free_area {
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index c08730c..2baeee1 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -65,33 +65,26 @@ extern int pageblock_order;
 /* Forward declaration */
 struct page;
 
-unsigned long get_pageblock_flags_mask(struct page *page,
+unsigned long get_pfnblock_flags_mask(struct page *page,
+				unsigned long pfn,
 				unsigned long end_bitidx,
 				unsigned long mask);
-void set_pageblock_flags_mask(struct page *page,
+
+void set_pfnblock_flags_mask(struct page *page,
 				unsigned long flags,
+				unsigned long pfn,
 				unsigned long end_bitidx,
 				unsigned long mask);
 
 /* Declarations for getting and setting flags. See mm/page_alloc.c */
-static inline unsigned long get_pageblock_flags_group(struct page *page,
-					int start_bitidx, int end_bitidx)
-{
-	unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
-	unsigned long mask = (1 << nr_flag_bits) - 1;
-
-	return get_pageblock_flags_mask(page, end_bitidx, mask);
-}
-
-static inline void set_pageblock_flags_group(struct page *page,
-					unsigned long flags,
-					int start_bitidx, int end_bitidx)
-{
-	unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
-	unsigned long mask = (1 << nr_flag_bits) - 1;
-
-	set_pageblock_flags_mask(page, flags, end_bitidx, mask);
-}
+#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
+	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
+			end_bitidx,					\
+			(1 << (end_bitidx - start_bitidx + 1)) - 1)
+#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
+	set_pfnblock_flags_mask(page, flags, page_to_pfn(page),		\
+			end_bitidx,					\
+			(1 << (end_bitidx - start_bitidx + 1)) - 1)
 
 #ifdef CONFIG_COMPACTION
 #define get_pageblock_skip(page) \
-- 
cgit v1.1


From 7aeb09f9104b760fc53c98cb7d20d06640baf9e6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:21 -0700
Subject: mm: page_alloc: use unsigned int for order in more places

X86 prefers the use of unsigned types for iterators and there is a
tendency to mix whether a signed or unsigned type if used for page order.
This converts a number of sites in mm/page_alloc.c to use unsigned int for
order where possible.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 472426a..6cbd1b6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -817,10 +817,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		int classzone_idx, int alloc_flags);
-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
-		int classzone_idx, int alloc_flags);
+bool zone_watermark_ok(struct zone *z, unsigned int order,
+		unsigned long mark, int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+		unsigned long mark, int classzone_idx, int alloc_flags);
 enum memmap_context {
 	MEMMAP_EARLY,
 	MEMMAP_HOTPLUG,
-- 
cgit v1.1


From b745bc85f21ea707e4ea1a91948055fa3e72c77b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:22 -0700
Subject: mm: page_alloc: convert hot/cold parameter and immediate callers to
 bool

cold is a bool, make it one.  Make the likely case the "if" part of the
block instead of the else as according to the optimisation manual this is
preferred.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h     | 4 ++--
 include/linux/pagemap.h | 2 +-
 include/linux/swap.h    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d382db7..454c99f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -371,8 +371,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
-extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern void free_hot_cold_page(struct page *page, bool cold);
+extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
 extern void __free_kmem_pages(struct page *page, unsigned int order);
 extern void free_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 718214c..c16fb6d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -110,7 +110,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 
 #define page_cache_get(page)		get_page(page)
 #define page_cache_release(page)	put_page(page)
-void release_pages(struct page **pages, int nr, int cold);
+void release_pages(struct page **pages, int nr, bool cold);
 
 /*
  * speculatively take a reference to a page.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 9155bcd..97cf161 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -477,7 +477,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 #define free_page_and_swap_cache(page) \
 	page_cache_release(page)
 #define free_pages_and_swap_cache(pages, nr) \
-	release_pages((pages), (nr), 0);
+	release_pages((pages), (nr), false);
 
 static inline void show_swap_cache_info(void)
 {
-- 
cgit v1.1


From 07a427884348d38a6fd56fa4d78249c407196650 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:24 -0700
Subject: mm: shmem: avoid atomic operation during shmem_getpage_gfp

shmem_getpage_gfp uses an atomic operation to set the SwapBacked field
before it's even added to the LRU or visible.  This is unnecessary as what
could it possible race against?  Use an unlocked variant.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d1fe1a7..4d4b39a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,6 +208,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
 PAGEFLAG(SavePinned, savepinned);			/* Xen */
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
+	__SETPAGEFLAG(SwapBacked, swapbacked)
 
 __PAGEFLAG(SlobFree, slob_free)
 
-- 
cgit v1.1


From 2457aec63745e235bcafb7ef312b182d8682f0fc Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 4 Jun 2014 16:10:31 -0700
Subject: mm: non-atomically mark page accessed during page cache allocation
 where possible

aops->write_begin may allocate a new page and make it visible only to have
mark_page_accessed called almost immediately after.  Once the page is
visible the atomic operations are necessary which is noticable overhead
when writing to an in-memory filesystem like tmpfs but should also be
noticable with fast storage.  The objective of the patch is to initialse
the accessed information with non-atomic operations before the page is
visible.

The bulk of filesystems directly or indirectly use
grab_cache_page_write_begin or find_or_create_page for the initial
allocation of a page cache page.  This patch adds an init_page_accessed()
helper which behaves like the first call to mark_page_accessed() but may
called before the page is visible and can be done non-atomically.

The primary APIs of concern in this care are the following and are used
by most filesystems.

	find_get_page
	find_lock_page
	find_or_create_page
	grab_cache_page_nowait
	grab_cache_page_write_begin

All of them are very similar in detail to the patch creates a core helper
pagecache_get_page() which takes a flags parameter that affects its
behavior such as whether the page should be marked accessed or not.  Then
old API is preserved but is basically a thin wrapper around this core
function.

Each of the filesystems are then updated to avoid calling
mark_page_accessed when it is known that the VM interfaces have already
done the job.  There is a slight snag in that the timing of the
mark_page_accessed() has now changed so in rare cases it's possible a page
gets to the end of the LRU as PageReferenced where as previously it might
have been repromoted.  This is expected to be rare but it's worth the
filesystem people thinking about it in case they see a problem with the
timing change.  It is also the case that some filesystems may be marking
pages accessed that previously did not but it makes sense that filesystems
have consistent behaviour in this regard.

The test case used to evaulate this is a simple dd of a large file done
multiple times with the file deleted on each iterations.  The size of the
file is 1/10th physical memory to avoid dirty page balancing.  In the
async case it will be possible that the workload completes without even
hitting the disk and will have variable results but highlight the impact
of mark_page_accessed for async IO.  The sync results are expected to be
more stable.  The exception is tmpfs where the normal case is for the "IO"
to not hit the disk.

The test machine was single socket and UMA to avoid any scheduling or NUMA
artifacts.  Throughput and wall times are presented for sync IO, only wall
times are shown for async as the granularity reported by dd and the
variability is unsuitable for comparison.  As async results were variable
do to writback timings, I'm only reporting the maximum figures.  The sync
results were stable enough to make the mean and stddev uninteresting.

The performance results are reported based on a run with no profiling.
Profile data is based on a separate run with oprofile running.

async dd
                                    3.15.0-rc3            3.15.0-rc3
                                       vanilla           accessed-v2
ext3    Max      elapsed     13.9900 (  0.00%)     11.5900 ( 17.16%)
tmpfs	Max      elapsed      0.5100 (  0.00%)      0.4900 (  3.92%)
btrfs   Max      elapsed     12.8100 (  0.00%)     12.7800 (  0.23%)
ext4	Max      elapsed     18.6000 (  0.00%)     13.3400 ( 28.28%)
xfs	Max      elapsed     12.5600 (  0.00%)      2.0900 ( 83.36%)

The XFS figure is a bit strange as it managed to avoid a worst case by
sheer luck but the average figures looked reasonable.

        samples percentage
ext3       86107    0.9783  vmlinux-3.15.0-rc4-vanilla        mark_page_accessed
ext3       23833    0.2710  vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed
ext3        5036    0.0573  vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed
ext4       64566    0.8961  vmlinux-3.15.0-rc4-vanilla        mark_page_accessed
ext4        5322    0.0713  vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed
ext4        2869    0.0384  vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed
xfs        62126    1.7675  vmlinux-3.15.0-rc4-vanilla        mark_page_accessed
xfs         1904    0.0554  vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed
xfs          103    0.0030  vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed
btrfs      10655    0.1338  vmlinux-3.15.0-rc4-vanilla        mark_page_accessed
btrfs       2020    0.0273  vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed
btrfs        587    0.0079  vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed
tmpfs      59562    3.2628  vmlinux-3.15.0-rc4-vanilla        mark_page_accessed
tmpfs       1210    0.0696  vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed
tmpfs         94    0.0054  vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed

[akpm@linux-foundation.org: don't run init_page_accessed() against an uninitialised pointer]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Tested-by: Prabhakar Lad <prabhakar.csengg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h |   1 +
 include/linux/pagemap.h    | 107 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/swap.h       |   1 +
 3 files changed, 103 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4d4b39a..2093eb7 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -198,6 +198,7 @@ struct page;	/* forward declaration */
 TESTPAGEFLAG(Locked, locked)
 PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
+	__SETPAGEFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
 PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c16fb6d..0a97b58 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -259,12 +259,109 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
 pgoff_t page_cache_prev_hole(struct address_space *mapping,
 			     pgoff_t index, unsigned long max_scan);
 
+#define FGP_ACCESSED		0x00000001
+#define FGP_LOCK		0x00000002
+#define FGP_CREAT		0x00000004
+#define FGP_WRITE		0x00000008
+#define FGP_NOFS		0x00000010
+#define FGP_NOWAIT		0x00000020
+
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+		int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
+
+/**
+ * find_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Looks up the page cache slot at @mapping & @offset.  If there is a
+ * page cache page, it is returned with an increased refcount.
+ *
+ * Otherwise, %NULL is returned.
+ */
+static inline struct page *find_get_page(struct address_space *mapping,
+					pgoff_t offset)
+{
+	return pagecache_get_page(mapping, offset, 0, 0, 0);
+}
+
+static inline struct page *find_get_page_flags(struct address_space *mapping,
+					pgoff_t offset, int fgp_flags)
+{
+	return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
+}
+
+/**
+ * find_lock_page - locate, pin and lock a pagecache page
+ * pagecache_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Looks up the page cache slot at @mapping & @offset.  If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
+ *
+ * Otherwise, %NULL is returned.
+ *
+ * find_lock_page() may sleep.
+ */
+static inline struct page *find_lock_page(struct address_space *mapping,
+					pgoff_t offset)
+{
+	return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
+}
+
+/**
+ * find_or_create_page - locate or add a pagecache page
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
+ *
+ * Looks up the page cache slot at @mapping & @offset.  If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
+ *
+ * If the page is not present, a new page is allocated using @gfp_mask
+ * and added to the page cache and the VM's LRU list.  The page is
+ * returned locked and with an increased refcount.
+ *
+ * On memory exhaustion, %NULL is returned.
+ *
+ * find_or_create_page() may sleep, even if @gfp_flags specifies an
+ * atomic allocation!
+ */
+static inline struct page *find_or_create_page(struct address_space *mapping,
+					pgoff_t offset, gfp_t gfp_mask)
+{
+	return pagecache_get_page(mapping, offset,
+					FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
+					gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
+}
+
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
+ * Same as grab_cache_page(), but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed.  This routine should
+ * be safe to call while holding the lock for another page.
+ *
+ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+ * and deadlock against the caller's locked page.
+ */
+static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
+				pgoff_t index)
+{
+	return pagecache_get_page(mapping, index,
+			FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+			mapping_gfp_mask(mapping),
+			GFP_NOFS);
+}
+
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
-struct page *find_get_page(struct address_space *mapping, pgoff_t offset);
 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset);
-struct page *find_or_create_page(struct address_space *mapping, pgoff_t index,
-				 gfp_t gfp_mask);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
 			  unsigned int nr_entries, struct page **entries,
 			  pgoff_t *indices);
@@ -287,8 +384,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
 	return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
 }
 
-extern struct page * grab_cache_page_nowait(struct address_space *mapping,
-				pgoff_t index);
 extern struct page * read_cache_page(struct address_space *mapping,
 				pgoff_t index, filler_t *filler, void *data);
 extern struct page * read_cache_page_gfp(struct address_space *mapping,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 97cf161..4348d95 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -311,6 +311,7 @@ extern void lru_add_page_tail(struct page *page, struct page *page_tail,
 			 struct lruvec *lruvec, struct list_head *head);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
+extern void init_page_accessed(struct page *page);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
-- 
cgit v1.1


From b7596fb43aa786fb3ee5015a73034fbb9e80feaa Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 4 Jun 2014 16:10:37 -0700
Subject: include/linux/gfp.h: exclude duplicate header

mmdebug.h is included twice.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 454c99f..6eb1fb3 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -6,7 +6,6 @@
 #include <linux/stddef.h>
 #include <linux/linkage.h>
 #include <linux/topology.h>
-#include <linux/mmdebug.h>
 
 struct vm_area_struct;
 
-- 
cgit v1.1


From 4be89a34609659042ef0bf883ad76388fb5251bb Mon Sep 17 00:00:00 2001
From: Jianyu Zhan <nasa4836@gmail.com>
Date: Wed, 4 Jun 2014 16:10:38 -0700
Subject: mm/vmscan.c: use DIV_ROUND_UP for calculation of zone's balance_gap
 and correct comments.

Currently, we use (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1)
/ KSWAPD_ZONE_BALANCE_GAP_RATIO to avoid a zero gap value.  It's better to
use DIV_ROUND_UP macro for neater code and clear meaning.

Besides, the gap value is calculated against the per-zone "managed pages",
not "present pages".  This patch also corrects the comment and do some
rephrasing.

Signed-off-by: Jianyu Zhan <nasa4836@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4348d95..4bdbee8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -166,10 +166,10 @@ enum {
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
 /*
- * Ratio between the present memory in the zone and the "gap" that
- * we're allowing kswapd to shrink in addition to the per-zone high
- * wmark, even for zones that already have the high wmark satisfied,
- * in order to provide better per-zone lru behavior. We are ok to
+ * Ratio between zone->managed_pages and the "gap" that above the per-zone
+ * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
+ * do not meet the (high_wmark + gap) watermark, even which already met the
+ * high_wmark, in order to provide better per-zone lru behavior. We are ok to
  * spend not more than 1% of the memory for this zone balancing "gap".
  */
 #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
-- 
cgit v1.1


From daa5ba768b9e15da8867824d2f1e8d455f1acac2 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <koct9i@gmail.com>
Date: Wed, 4 Jun 2014 16:10:52 -0700
Subject: mm/rmap.c: cleanup ttu_flags

Transform action part of ttu_flags into individiual bits.  These flags
aren't part of any uses-space visible api or even trace events.

Signed-off-by: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9be55c7..be57450 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -72,10 +72,9 @@ struct anon_vma_chain {
 };
 
 enum ttu_flags {
-	TTU_UNMAP = 0,			/* unmap mode */
-	TTU_MIGRATION = 1,		/* migration mode */
-	TTU_MUNLOCK = 2,		/* munlock mode */
-	TTU_ACTION_MASK = 0xff,
+	TTU_UNMAP = 1,			/* unmap mode */
+	TTU_MIGRATION = 2,		/* migration mode */
+	TTU_MUNLOCK = 4,		/* munlock mode */
 
 	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
 	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
-- 
cgit v1.1


From 100873d7a777b67ad35197c5a998b5e778f8bf3f Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 4 Jun 2014 16:10:56 -0700
Subject: hugetlb: rename hugepage_migration_support() to ..._supported()

We already have a function named hugepages_supported(), and the similar
name hugepage_migration_support() is a bit unconfortable, so let's rename
it hugepage_migration_supported().

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 35786ee..255cd5c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -397,7 +397,7 @@ static inline pgoff_t basepage_index(struct page *page)
 
 extern void dissolve_free_huge_pages(unsigned long start_pfn,
 				     unsigned long end_pfn);
-static inline int hugepage_migration_support(struct hstate *h)
+static inline int hugepage_migration_supported(struct hstate *h)
 {
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 	return huge_page_shift(h) == PMD_SHIFT;
@@ -453,7 +453,7 @@ static inline pgoff_t basepage_index(struct page *page)
 	return page->index;
 }
 #define dissolve_free_huge_pages(s, e)	do {} while (0)
-#define hugepage_migration_support(h)	0
+#define hugepage_migration_supported(h)	0
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 					   struct mm_struct *mm, pte_t *pte)
-- 
cgit v1.1


From 50417c55562c03e6746b13aee650c2bbb048fea3 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 4 Jun 2014 16:11:07 -0700
Subject: mm/zbud.c: make size unsigned like unique callsite

zbud_alloc is only called by zswap_frontswap_store with unsigned int len.
Change function parameter + update >= 0 check.

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Acked-by: Seth Jennings <sjennings@variantweb.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zbud.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/zbud.h b/include/linux/zbud.h
index 2571a5c..13af0d4 100644
--- a/include/linux/zbud.h
+++ b/include/linux/zbud.h
@@ -11,7 +11,7 @@ struct zbud_ops {
 
 struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
 void zbud_destroy_pool(struct zbud_pool *pool);
-int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
+int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp,
 	unsigned long *handle);
 void zbud_free(struct zbud_pool *pool, unsigned long handle);
 int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
-- 
cgit v1.1


From 2c0d259e0e580dd95dd5d2d5aa4926169228d4a0 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 4 Jun 2014 16:11:16 -0700
Subject: compiler.h: avoid sparse errors in __compiletime_error_fallback()

Usually, BUG_ON and friends aren't even evaluated in sparse, but recently
compiletime_assert_atomic_type() was added, and that now results in a
sparse warning every time it is used.

The reason turns out to be the temporary variable, after it sparse no
longer considers the value to be a constant, and results in a warning and
an error.  The error is the more annoying part of this as it suppresses
any further warnings in the same file, hiding other problems.

Unfortunately the condition cannot be simply expanded out to avoid the
temporary variable since it breaks compiletime_assert on old versions of
GCC such as GCC 4.2.4 which the latest metag compiler is based on.

Therefore #ifndef __CHECKER__ out the __compiletime_error_fallback which
uses the potentially negative size array to trigger a conditional compiler
error, so that sparse doesn't see it.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Luciano Coelho <luciano.coelho@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ee7239e..64fdfe1 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -323,9 +323,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 #endif
 #ifndef __compiletime_error
 # define __compiletime_error(message)
-# define __compiletime_error_fallback(condition) \
+/*
+ * Sparse complains of variable sized arrays due to the temporary variable in
+ * __compiletime_assert. Unfortunately we can't just expand it out to make
+ * sparse see a constant array size without breaking compiletime_assert on old
+ * versions of GCC (e.g. 4.2.4), so hide the array from sparse altogether.
+ */
+# ifndef __CHECKER__
+#  define __compiletime_error_fallback(condition) \
 	do { ((void)sizeof(char[1 - 2 * condition])); } while (0)
-#else
+# endif
+#endif
+#ifndef __compiletime_error_fallback
 # define __compiletime_error_fallback(condition) do { } while (0)
 #endif
 
-- 
cgit v1.1


From b300a4ea665f7fa44f015616ac1874deca891c5e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Wed, 4 Jun 2014 16:11:27 -0700
Subject: kernel/user.c: drop unused field 'files' from user_struct

Nobody seems uses it for a long time. Let's drop it.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2f2dd7d..611676f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -745,7 +745,6 @@ static inline int signal_group_exit(const struct signal_struct *sig)
 struct user_struct {
 	atomic_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
-	atomic_t files;		/* How many open files does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
 #ifdef CONFIG_INOTIFY_USER
 	atomic_t inotify_watches; /* How many inotify watches does this user have? */
-- 
cgit v1.1


From aac74dc495456412c4130a1167ce4beb6c1f0b38 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 4 Jun 2014 16:11:40 -0700
Subject: printk: rename printk_sched to printk_deferred

After learning we'll need some sort of deferred printk functionality in
the timekeeping core, Peter suggested we rename the printk_sched function
so it can be reused by needed subsystems.

This only changes the function name. No logic changes.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8752f75..7847301 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -128,9 +128,9 @@ asmlinkage __printf(1, 2) __cold
 int printk(const char *fmt, ...);
 
 /*
- * Special printk facility for scheduler use only, _DO_NOT_USE_ !
+ * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
  */
-__printf(1, 2) __cold int printk_sched(const char *fmt, ...);
+__printf(1, 2) __cold int printk_deferred(const char *fmt, ...);
 
 /*
  * Please don't use printk_ratelimit(), because it shares ratelimiting state
@@ -165,7 +165,7 @@ int printk(const char *s, ...)
 	return 0;
 }
 static inline __printf(1, 2) __cold
-int printk_sched(const char *s, ...)
+int printk_deferred(const char *s, ...)
 {
 	return 0;
 }
-- 
cgit v1.1


From c224815dac9c739b79050d3cc67443ff500bc478 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 4 Jun 2014 16:11:41 -0700
Subject: printk: Add printk_deferred_once

Two of the three prink_deferred uses are really printk_once style
uses, so add a printk_deferred_once macro to simplify those call
sites.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 7847301..f086d6c 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -266,9 +266,20 @@ extern asmlinkage void dump_stack(void) __cold;
 		printk(fmt, ##__VA_ARGS__);			\
 	}							\
 })
+#define printk_deferred_once(fmt, ...)				\
+({								\
+	static bool __print_once __read_mostly;			\
+								\
+	if (!__print_once) {					\
+		__print_once = true;				\
+		printk_deferred(fmt, ##__VA_ARGS__);		\
+	}							\
+})
 #else
 #define printk_once(fmt, ...)					\
 	no_printk(fmt, ##__VA_ARGS__)
+#define printk_deferred_once(fmt, ...)				\
+	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define pr_emerg_once(fmt, ...)					\
-- 
cgit v1.1


From 6e099f557d9c6797c3ee3ee7b5c8cebe543ec1cc Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 4 Jun 2014 16:11:44 -0700
Subject: Documentation: expand/clarify debug documentation

The pr_debug() and related debug print macros all differ from the normal
pr_XXX() macros, in that the normal ones print unconditionally, while
the debug macros are compiled out unless DEBUG is defined or
CONFIG_DYNAMIC_DEBUG is set.  This isn't obvious, and the only way to
find this out is either to review the actual printk.h code or to read
CodingStyle, and the message there doesn't highlight the fact.

Change Documentation/CodingStyle to clearly indicate that pr_debug() and
related debug printing macros behave differently than all other pr_XXX()
macros, and attempt to clarify when and where the different debug
printing methods might be used.

Add short comment to printk.h above the pr_XXX() macros indicating that
while these macros print unconditionally, pr_debug() does not.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Joe Perches <joe@perches.com>
Cc: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index f086d6c..37f3a65 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -210,6 +210,12 @@ extern asmlinkage void dump_stack(void) __cold;
 #define pr_fmt(fmt) fmt
 #endif
 
+/*
+ * These can be used to print at the various log levels.
+ * All of these will print unconditionally, although note that pr_debug()
+ * and other debug macros are compiled out unless either DEBUG is defined
+ * or CONFIG_DYNAMIC_DEBUG is set.
+ */
 #define pr_emerg(fmt, ...) \
 	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_alert(fmt, ...) \
-- 
cgit v1.1


From a8fe19ebfbfd90ec17c02284717238b02efb9580 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 Jun 2014 16:11:46 -0700
Subject: kernel/printk: use symbolic defines for console loglevels

... instead of naked numbers.

Stuff in sysrq.c used to set it to 8 which is supposed to mean above
default level so set it to DEBUG instead as we're terminating/killing all
tasks and we want to be verbose there.

Also, correct the check in x86_64_start_kernel which should be >= as
we're clearly issuing the string there for all debug levels, not only
the magical 10.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Joe Perches <joe@perches.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 37f3a65..319ff7e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer)
 	return buffer;
 }
 
+/* printk's without a loglevel use this.. */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
+
+/* We show everything that is MORE important than this.. */
+#define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
+#define CONSOLE_LOGLEVEL_MIN	 1 /* Minimum loglevel we let people use */
+#define CONSOLE_LOGLEVEL_QUIET	 4 /* Shhh ..., when booted with "quiet" */
+#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */
+#define CONSOLE_LOGLEVEL_DEBUG	10 /* issue debug messages */
+#define CONSOLE_LOGLEVEL_MOTORMOUTH 15	/* You can't shut this one up */
+
 extern int console_printk[];
 
 #define console_loglevel (console_printk[0])
@@ -39,13 +50,13 @@ extern int console_printk[];
 
 static inline void console_silent(void)
 {
-	console_loglevel = 0;
+	console_loglevel = CONSOLE_LOGLEVEL_SILENT;
 }
 
 static inline void console_verbose(void)
 {
 	if (console_loglevel)
-		console_loglevel = 15;
+		console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
 }
 
 struct va_format {
-- 
cgit v1.1


From 34a1b7236ad6113883f6c448d1da854cad60265e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 4 Jun 2014 16:12:19 -0700
Subject: kthreads: kill CLONE_KERNEL, change kernel_thread(kernel_init) to
 avoid CLONE_SIGHAND

1. Remove CLONE_KERNEL, it has no users and it is dangerous.

   The (old) comment says "List of flags we want to share for kernel
   threads" but this is not true, we do not want to share ->sighand by
   default. This flag can only be used if the caller is sure that both
   parent/child will never play with signals (say, allow_signal/etc).

2. Change rest_init() to clone kernel_init() without CLONE_SIGHAND.

   In this case CLONE_SIGHAND does not really hurt, and it looks like
   optimization because copy_sighand() can avoid kmem_cache_alloc().

   But in fact this only adds the minor pessimization. kernel_init()
   is going to exec the init process, and de_thread() will need to
   unshare ->sighand and do kmem_cache_alloc(sighand_cachep) anyway,
   but it needs to do more work and take tasklist_lock and siglock.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 611676f..8fcd0e6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,12 +137,6 @@ struct filename;
 #define VMACACHE_MASK (VMACACHE_SIZE - 1)
 
 /*
- * List of flags we want to share for kernel threads,
- * if only because they are not used by them anyway.
- */
-#define CLONE_KERNEL	(CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
-
-/*
  * These are the constant used to fake the fixed-point load-average
  * counting. Some notes:
  *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
-- 
cgit v1.1


From 647f010bff6795b3e85c2b5a7768c0594a049ab0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 4 Jun 2014 16:12:20 -0700
Subject: init/main.c: remove an ifdef

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/proc_fs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 608e60a..9d117f6 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -44,6 +44,10 @@ extern int remove_proc_subtree(const char *, struct proc_dir_entry *);
 
 #else /* CONFIG_PROC_FS */
 
+static inline void proc_root_init(void)
+{
+}
+
 static inline void proc_flush_task(struct task_struct *task)
 {
 }
-- 
cgit v1.1