summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-05-19 20:00:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-19 20:00:06 -0700
commita05a70db34ba24ca009e1c9cedaef26fd17d5470 (patch)
treed5d8d0c80293bed52f2103ccc56a9e09117dc983
parent03b979dd0323ace8e29a0561cd5232f73a060c09 (diff)
parent4741526b83c5d3a3d661d1896f9e7414c5730bcb (diff)
downloadop-kernel-dev-a05a70db34ba24ca009e1c9cedaef26fd17d5470.zip
op-kernel-dev-a05a70db34ba24ca009e1c9cedaef26fd17d5470.tar.gz
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - fsnotify fix - poll() timeout fix - a few scripts/ tweaks - debugobjects updates - the (small) ocfs2 queue - Minor fixes to kernel/padata.c - Maybe half of the MM queue * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits) mm, page_alloc: restore the original nodemask if the fast path allocation failed mm, page_alloc: uninline the bad page part of check_new_page() mm, page_alloc: don't duplicate code in free_pcp_prepare mm, page_alloc: defer debugging checks of pages allocated from the PCP mm, page_alloc: defer debugging checks of freed pages until a PCP drain cpuset: use static key better and convert to new API mm, page_alloc: inline pageblock lookup in page free fast paths mm, page_alloc: remove unnecessary variable from free_pcppages_bulk mm, page_alloc: pull out side effects from free_pages_check mm, page_alloc: un-inline the bad part of free_pages_check mm, page_alloc: check multiple page fields with a single branch mm, page_alloc: remove field from alloc_context mm, page_alloc: avoid looking up the first zone in a zonelist twice mm, page_alloc: shortcut watermark checks for order-0 pages mm, page_alloc: reduce cost of fair zone allocation policy retry mm, page_alloc: shorten the page allocator fast path mm, page_alloc: check once if a zone has isolated pageblocks mm, page_alloc: move __GFP_HARDWALL modifications out of the fastpath mm, page_alloc: simplify last cpupid reset mm, page_alloc: remove unnecessary initialisation from __alloc_pages_nodemask() ...
-rw-r--r--Documentation/DocBook/debugobjects.tmpl26
-rw-r--r--Documentation/kernel-parameters.txt8
-rw-r--r--Documentation/memory-hotplug.txt9
-rw-r--r--Documentation/sysctl/vm.txt14
-rw-r--r--Documentation/vm/transhuge.txt10
-rw-r--r--arch/arc/include/asm/hugepage.h2
-rw-r--r--arch/arm/include/asm/pgtable-3level.h5
-rw-r--r--arch/arm64/include/asm/pgtable.h5
-rw-r--r--arch/arm64/mm/hugetlbpage.c1
-rw-r--r--arch/metag/mm/hugetlbpage.c1
-rw-r--r--arch/mips/include/asm/pgtable.h1
-rw-r--r--arch/mips/mm/tlb-r4k.c21
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/pgtable.h1
-rw-r--r--arch/powerpc/mm/hugetlbpage.c6
-rw-r--r--arch/s390/include/asm/pgtable.h1
-rw-r--r--arch/sparc/include/asm/pgtable_64.h2
-rw-r--r--arch/tile/include/asm/pgtable.h1
-rw-r--r--arch/tile/kernel/setup.c4
-rw-r--r--arch/tile/mm/hugetlbpage.c7
-rw-r--r--arch/tile/mm/init.c2
-rw-r--r--arch/x86/include/asm/pgtable.h1
-rw-r--r--arch/x86/mm/hugetlbpage.c1
-rw-r--r--arch/x86/mm/numa.c4
-rw-r--r--drivers/block/aoe/aoecmd.c2
-rw-r--r--drivers/hwtracing/intel_th/msu.c2
-rw-r--r--drivers/media/usb/dvb-usb/dib0700_core.c2
-rw-r--r--drivers/net/ethernet/cavium/thunder/nicvf_queues.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c20
-rw-r--r--drivers/net/ethernet/qlogic/qede/qede_main.c6
-rw-r--r--drivers/scsi/bfa/bfi.h2
-rw-r--r--drivers/staging/comedi/drivers/daqboard2000.c2
-rw-r--r--fs/buffer.c10
-rw-r--r--fs/eventpoll.c12
-rw-r--r--fs/notify/fsnotify.h7
-rw-r--r--fs/notify/group.c17
-rw-r--r--fs/notify/mark.c78
-rw-r--r--fs/ocfs2/alloc.c3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c5
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/slot_map.c6
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/select.c67
-rw-r--r--include/asm-generic/pgtable.h8
-rw-r--r--include/linux/bootmem.h16
-rw-r--r--include/linux/compaction.h6
-rw-r--r--include/linux/compiler-gcc.h1
-rw-r--r--include/linux/compiler.h4
-rw-r--r--include/linux/cpuset.h42
-rw-r--r--include/linux/debugobjects.h17
-rw-r--r--include/linux/device.h12
-rw-r--r--include/linux/fsnotify_backend.h2
-rw-r--r--include/linux/huge_mm.h4
-rw-r--r--include/linux/hugetlb.h1
-rw-r--r--include/linux/hugetlb_inline.h6
-rw-r--r--include/linux/kernel.h4
-rw-r--r--include/linux/memcontrol.h6
-rw-r--r--include/linux/memory_hotplug.h6
-rw-r--r--include/linux/mempolicy.h16
-rw-r--r--include/linux/mempool.h3
-rw-r--r--include/linux/mm.h32
-rw-r--r--include/linux/mm_inline.h24
-rw-r--r--include/linux/mm_types.h14
-rw-r--r--include/linux/mmzone.h46
-rw-r--r--include/linux/nodemask.h11
-rw-r--r--include/linux/oom.h8
-rw-r--r--include/linux/padata.h5
-rw-r--r--include/linux/page-flags.h7
-rw-r--r--include/linux/page_ref.h26
-rw-r--r--include/linux/pagemap.h8
-rw-r--r--include/linux/poll.h11
-rw-r--r--include/linux/slab.h16
-rw-r--r--include/linux/slab_def.h4
-rw-r--r--include/linux/string.h2
-rw-r--r--include/linux/time64.h17
-rw-r--r--include/linux/vmstat.h6
-rw-r--r--init/Kconfig9
-rw-r--r--kernel/cpuset.c22
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/padata.c138
-rw-r--r--kernel/rcu/update.c26
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/time/hrtimer.c23
-rw-r--r--kernel/time/time.c21
-rw-r--r--kernel/time/timer.c63
-rw-r--r--kernel/workqueue.c52
-rw-r--r--lib/Makefile2
-rw-r--r--lib/debugobjects.c92
-rw-r--r--lib/nodemask.c30
-rw-r--r--lib/percpu_counter.c6
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/compaction.c158
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/highmem.c12
-rw-r--r--mm/huge_memory.c11
-rw-r--r--mm/hugetlb.c37
-rw-r--r--mm/internal.h9
-rw-r--r--mm/memcontrol.c38
-rw-r--r--mm/memory_hotplug.c21
-rw-r--r--mm/mempolicy.c63
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mmap.c5
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mremap.c47
-rw-r--r--mm/oom_kill.c112
-rw-r--r--mm/page-writeback.c8
-rw-r--r--mm/page_alloc.c885
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c130
-rw-r--r--mm/slab.c771
-rw-r--r--mm/slub.c16
-rw-r--r--mm/swap_state.c3
-rw-r--r--mm/util.c23
-rw-r--r--mm/vmscan.c27
-rw-r--r--mm/vmstat.c112
-rw-r--r--net/socket.c8
-rw-r--r--net/wireless/util.c2
-rwxr-xr-xscripts/bloat-o-meter6
-rwxr-xr-xscripts/decode_stacktrace.sh55
-rw-r--r--scripts/spelling.txt1
122 files changed, 2277 insertions, 1598 deletions
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
index 24979f6..7e4f34f 100644
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -316,8 +316,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
<para>
@@ -341,8 +341,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
<para>
@@ -359,7 +359,8 @@
statically initialized object or not. In case it is it calls
debug_object_init() and debug_object_activate() to make the
object known to the tracker and marked active. In this case
- the function should return 0 because this is not a real fixup.
+ the function should return false because this is not a real
+ fixup.
</para>
</sect1>
@@ -376,8 +377,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
</sect1>
@@ -397,8 +398,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
</sect1>
@@ -414,8 +415,8 @@
debug bucket.
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
<para>
@@ -427,7 +428,8 @@
case. The fixup function should check if this is a legitimate
case of a statically initialized object or not. In this case only
debug_object_init() should be called to make the object known to
- the tracker. Then the function should return 0 because this is not
+ the tracker. Then the function should return false because this
+ is not
a real fixup.
</para>
</sect1>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f5c3590..18d7f5b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2168,6 +2168,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
[KNL,SH] Allow user to override the default size for
per-device physically contiguous DMA buffers.
+ memhp_default_state=online/offline
+ [KNL] Set the initial state for the memory hotplug
+ onlining policy. If not specified, the default value is
+ set according to the
+ CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+ option.
+ See Documentation/memory-hotplug.txt.
+
memmap=exactmap [KNL,X86] Enable setting of an exact
E820 memory map, as specified by the user.
Such memmap=exactmap lines can be constructed based on
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 443f4b4..0d7cb95 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -261,10 +261,11 @@ it according to the policy which can be read from "auto_online_blocks" file:
% cat /sys/devices/system/memory/auto_online_blocks
-The default is "offline" which means the newly added memory is not in a
-ready-to-use state and you have to "online" the newly added memory blocks
-manually. Automatic onlining can be requested by writing "online" to
-"auto_online_blocks" file:
+The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+option. If it is disabled the default is "offline" which means the newly added
+memory is not in a ready-to-use state and you have to "online" the newly added
+memory blocks manually. Automatic onlining can be requested by writing "online"
+to "auto_online_blocks" file:
% echo online > /sys/devices/system/memory/auto_online_blocks
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fec..720355c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
- panic_on_oom
- percpu_pagelist_fraction
- stat_interval
+- stat_refresh
- swappiness
- user_reserve_kbytes
- vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
==============================================================
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
swappiness
This control is used to define how aggressive the kernel will swap
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index d9cb65c..fb0e1f2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock.
Refcounting on THP is mostly consistent with refcounting on other compound
pages:
- - get_page()/put_page() and GUP operate in head page's ->_count.
+ - get_page()/put_page() and GUP operate in head page's ->_refcount.
- - ->_count in tail pages is always zero: get_page_unless_zero() never
+ - ->_refcount in tail pages is always zero: get_page_unless_zero() never
succeed on tail pages.
- map/unmap of the pages with PTE entry increment/decrement ->_mapcount
@@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to
sum of mapcount of all sub-pages plus one (split_huge_page caller must
have reference for head page).
-split_huge_page uses migration entries to stabilize page->_count and
+split_huge_page uses migration entries to stabilize page->_refcount and
page->_mapcount.
We safe against physical memory scanners too: the only legitimate way
scanner can get reference to a page is get_page_unless_zero().
-All tail pages has zero ->_count until atomic_add(). It prevent scanner
+All tail pages has zero ->_refcount until atomic_add(). It prevent scanner
from geting reference to tail page up to the point. After the atomic_add()
-we don't care about ->_count value. We already known how many references
+we don't care about ->_refcount value. We already known how many references
with should uncharge from head page.
For head page get_page_unless_zero() will succeed and we don't mind. It's
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 7afe335..317ff77 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -61,8 +61,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd);
-#define has_transparent_hugepage() 1
-
/* Generic variants assume pgtable_t is struct page *, hence need for these */
#define __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index dc46398..fa70db7 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -281,11 +281,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
flush_pmd_entry(pmdp);
}
-static inline int has_transparent_hugepage(void)
-{
- return 1;
-}
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_PGTABLE_3LEVEL_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1910bf47..46472a9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -316,11 +316,6 @@ static inline int pmd_protnone(pmd_t pmd)
#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
-static inline int has_transparent_hugepage(void)
-{
- return 1;
-}
-
#define __pgprot_modify(prot,mask,bits) \
__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 589fd28..aa8aee7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -307,6 +307,7 @@ static __init int setup_hugepagesz(char *opt)
} else if (ps == PUD_SIZE) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
+ hugetlb_bad_size();
pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
return 0;
}
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index b38700ae..db1b7da 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -239,6 +239,7 @@ static __init int setup_hugepagesz(char *opt)
if (ps == (1 << HPAGE_SHIFT)) {
hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
} else {
+ hugetlb_bad_size();
pr_err("hugepagesz: Unsupported page size %lu M\n",
ps >> 20);
return 0;
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index e07a105..a6b611f 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -533,6 +533,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage has_transparent_hugepage
extern int has_transparent_hugepage(void);
static inline int pmd_trans_huge(pmd_t pmd)
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 5a5c7fe..e8b335c 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -405,19 +405,20 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-int __init has_transparent_hugepage(void)
+int has_transparent_hugepage(void)
{
- unsigned int mask;
- unsigned long flags;
-
- local_irq_save(flags);
- write_c0_pagemask(PM_HUGE_MASK);
- back_to_back_c0_hazard();
- mask = read_c0_pagemask();
- write_c0_pagemask(PM_DEFAULT_MASK);
+ static unsigned int mask = -1;
- local_irq_restore(flags);
+ if (mask == -1) { /* first call comes during __init */
+ unsigned long flags;
+ local_irq_save(flags);
+ write_c0_pagemask(PM_HUGE_MASK);
+ back_to_back_c0_hazard();
+ mask = read_c0_pagemask();
+ write_c0_pagemask(PM_DEFAULT_MASK);
+ local_irq_restore(flags);
+ }
return mask == PM_HUGE_MASK;
}
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 77d3ce0..8fe6f6b4 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -219,6 +219,7 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd);
+#define has_transparent_hugepage has_transparent_hugepage
extern int has_transparent_hugepage(void);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 47897a3..ee09e99 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
struct page **pages, int *nr);
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_large(pmd) 0
-#define has_transparent_hugepage() 0
#endif
pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *shift);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index d991b9e..a4a90a8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -772,8 +772,10 @@ static int __init hugepage_setup_sz(char *str)
size = memparse(str, &str);
- if (add_huge_page_size(size) != 0)
- printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+ if (add_huge_page_size(size) != 0) {
+ hugetlb_bad_size();
+ pr_err("Invalid huge page size specified(%llu)\n", size);
+ }
return 1;
}
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2f66645..18d2beb 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1223,6 +1223,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
}
+#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
return MACHINE_HAS_HPAGE ? 1 : 0;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index f089cfa..93ce0ad 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,8 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
return pte_val(pte) & _PAGE_PMD_HUGE;
}
-#define has_transparent_hugepage() 1
-
static inline pmd_t pmd_mkold(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 96cecf5..2a26cc4 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -487,7 +487,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define has_transparent_hugepage() 1
#define pmd_trans_huge pmd_huge_page
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index a992238..153020a 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void)
cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
cpu_2_node[best_cpu] = node;
cpumask_clear_cpu(best_cpu, &unbound_cpus);
- node = next_node(node, default_nodes);
- if (node == MAX_NUMNODES)
- node = first_node(default_nodes);
+ node = next_node_in(node, default_nodes);
}
/* Print out node assignments and set defaults for disabled cpus */
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index e212c64..77ceaa3 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -308,11 +308,16 @@ static bool saw_hugepagesz;
static __init int setup_hugepagesz(char *opt)
{
+ int rc;
+
if (!saw_hugepagesz) {
saw_hugepagesz = true;
memset(huge_shift, 0, sizeof(huge_shift));
}
- return __setup_hugepagesz(memparse(opt, NULL));
+ rc = __setup_hugepagesz(memparse(opt, NULL));
+ if (rc)
+ hugetlb_bad_size();
+ return rc;
}
__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index a0582b7..adce254 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
* Hacky direct set to avoid unnecessary
* lock take/release for EVERY page here.
*/
- p->_count.counter = 0;
+ p->_refcount.counter = 0;
p->_mapcount.counter = -1;
}
init_page_count(page);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f86491a..1a27396 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
}
+#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
return boot_cpu_has(X86_FEATURE_PSE);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 14a9505..2ae8584 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -165,6 +165,7 @@ static __init int setup_hugepagesz(char *opt)
} else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
+ hugetlb_bad_size();
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);
return 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f70c1ff..9c086c5 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -617,9 +617,7 @@ static void __init numa_init_array(void)
if (early_cpu_to_node(i) != NUMA_NO_NODE)
continue;
numa_set_node(i, rr);
- rr = next_node(rr, node_online_map);
- if (rr == MAX_NUMNODES)
- rr = first_node(node_online_map);
+ rr = next_node_in(rr, node_online_map);
}
}
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 437b3a8..d597e43 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -861,7 +861,7 @@ rqbiocnt(struct request *r)
* discussion.
*
* We cannot use get_page in the workaround, because it insists on a
- * positive page count as a precondition. So we use _count directly.
+ * positive page count as a precondition. So we use _refcount directly.
*/
static void
bio_pageinc(struct bio *bio)
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index d9d6022..d220914 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma)
if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
return;
- /* drop page _counts */
+ /* drop page _refcounts */
for (pg = 0; pg < msc->nr_pages; pg++) {
struct page *page = msc_buffer_get_page(msc, pg);
diff --git a/drivers/media/usb/dvb-usb/dib0700_core.c b/drivers/media/usb/dvb-usb/dib0700_core.c
index c16f999..bf890c3 100644
--- a/drivers/media/usb/dvb-usb/dib0700_core.c
+++ b/drivers/media/usb/dvb-usb/dib0700_core.c
@@ -517,7 +517,7 @@ int dib0700_download_firmware(struct usb_device *udev, const struct firmware *fw
if (nb_packet_buffer_size < 1)
nb_packet_buffer_size = 1;
- /* get the fimware version */
+ /* get the firmware version */
usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
REQUEST_GET_VERSION,
USB_TYPE_VENDOR | USB_DIR_IN, 0, 0,
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 06b819d..0ff8e60 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -23,7 +23,7 @@ static void nicvf_get_page(struct nicvf *nic)
if (!nic->rb_pageref || !nic->rb_page)
return;
- atomic_add(nic->rb_pageref, &nic->rb_page->_count);
+ page_ref_add(nic->rb_page, nic->rb_pageref);
nic->rb_pageref = 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f345679..bd94770 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
goto err_unmap;
- atomic_add(mlx5e_mpwqe_strides_per_page(rq),
- &wi->umr.dma_info[i].page->_count);
+ page_ref_add(wi->umr.dma_info[i].page,
+ mlx5e_mpwqe_strides_per_page(rq));
wi->skbs_frags[i] = 0;
}
@@ -452,8 +452,8 @@ err_unmap:
while (--i >= 0) {
dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
PCI_DMA_FROMDEVICE);
- atomic_sub(mlx5e_mpwqe_strides_per_page(rq),
- &wi->umr.dma_info[i].page->_count);
+ page_ref_sub(wi->umr.dma_info[i].page,
+ mlx5e_mpwqe_strides_per_page(rq));
put_page(wi->umr.dma_info[i].page);
}
dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
PCI_DMA_FROMDEVICE);
- atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
- &wi->umr.dma_info[i].page->_count);
+ page_ref_sub(wi->umr.dma_info[i].page,
+ mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
put_page(wi->umr.dma_info[i].page);
}
dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
*/
split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
- atomic_add(mlx5e_mpwqe_strides_per_page(rq),
- &wi->dma_info.page[i]._count);
+ page_ref_add(&wi->dma_info.page[i],
+ mlx5e_mpwqe_strides_per_page(rq));
wi->skbs_frags[i] = 0;
}
@@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
PCI_DMA_FROMDEVICE);
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
- atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
- &wi->dma_info.page[i]._count);
+ page_ref_sub(&wi->dma_info.page[i],
+ mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
put_page(&wi->dma_info.page[i]);
}
}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8114541..73dd525 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -920,7 +920,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
* network stack to take the ownership of the page
* which can be recycled multiple times by the driver.
*/
- atomic_inc(&curr_cons->data->_count);
+ page_ref_inc(curr_cons->data);
qede_reuse_page(edev, rxq, curr_cons);
}
@@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
/* Incr page ref count to reuse on allocation failure
* so that it doesn't get freed while freeing SKB.
*/
- atomic_inc(&current_bd->data->_count);
+ page_ref_inc(current_bd->data);
goto out;
}
@@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
* freeing SKB.
*/
- atomic_inc(&sw_rx_data->data->_count);
+ page_ref_inc(sw_rx_data->data);
rxq->rx_alloc_errors++;
qede_recycle_rx_bd_ring(rxq, edev,
fp_cqe->bd_num);
diff --git a/drivers/scsi/bfa/bfi.h b/drivers/scsi/bfa/bfi.h
index 97600dc..5f698d0 100644
--- a/drivers/scsi/bfa/bfi.h
+++ b/drivers/scsi/bfa/bfi.h
@@ -356,7 +356,7 @@ struct bfi_ioc_image_hdr_s {
u8 port0_mode; /* device mode for port 0 */
u8 port1_mode; /* device mode for port 1 */
u32 exec; /* exec vector */
- u32 bootenv; /* fimware boot env */
+ u32 bootenv; /* firmware boot env */
u32 rsvd_b[2];
struct bfi_ioc_fwver_s fwver;
u32 md5sum[BFI_IOC_MD5SUM_SZ];
diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c
index 57ab668..a536a15 100644
--- a/drivers/staging/comedi/drivers/daqboard2000.c
+++ b/drivers/staging/comedi/drivers/daqboard2000.c
@@ -26,7 +26,7 @@
* Much of the functionality of this driver was determined from reading
* the source code for the Windows driver.
*
- * The FPGA on the board requires fimware, which is available from
+ * The FPGA on the board requires firmware, which is available from
* http://www.comedi.org in the comedi_nonfree_firmware tarball.
*
* Configuration options: not applicable, uses PCI auto config
diff --git a/fs/buffer.c b/fs/buffer.c
index af0d9a8..754813a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -255,17 +255,17 @@ out:
*/
static void free_more_memory(void)
{
- struct zone *zone;
+ struct zoneref *z;
int nid;
wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
yield();
for_each_online_node(nid) {
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL,
- &zone);
- if (zone)
+
+ z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+ gfp_zone(GFP_NOFS), NULL);
+ if (z->zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS, NULL);
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a74a2a..10db912 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep,
return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
-static inline struct timespec ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_mstimeout(long ms)
{
- struct timespec now, ts = {
+ struct timespec64 now, ts = {
.tv_sec = ms / MSEC_PER_SEC,
.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
};
- ktime_get_ts(&now);
- return timespec_add_safe(now, ts);
+ ktime_get_ts64(&now);
+ return timespec64_add_safe(now, ts);
}
/**
@@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
ktime_t expires, *to = NULL;
if (timeout > 0) {
- struct timespec end_time = ep_set_mstimeout(timeout);
+ struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
- *to = timespec_to_ktime(end_time);
+ *to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a..0a3bc2c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
&mnt->mnt_root->d_lock);
}
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62c..3e2dd85 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
*/
void fsnotify_destroy_group(struct fsnotify_group *group)
{
- /* clear all inode marks for this group */
- fsnotify_clear_marks_by_group(group);
+ /* clear all inode marks for this group, attach them to destroy_list */
+ fsnotify_detach_group_marks(group);
- synchronize_srcu(&fsnotify_mark_srcu);
+ /*
+ * Wait for fsnotify_mark_srcu period to end and free all marks in
+ * destroy_list
+ */
+ fsnotify_mark_destroy_list();
- /* clear the notification queue of all events */
+ /*
+ * Since we have waited for fsnotify_mark_srcu in
+ * fsnotify_mark_destroy_list() there can be no outstanding event
+ * notification against this group. So clearing the notification queue
+ * of all events is reliable now.
+ */
fsnotify_flush_notify(group);
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d..d3fea0b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
}
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
*/
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return;
+ return false;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
/*
* Some groups like to know that marks are being freed. This is a
* callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
+
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+
+ return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ if (__fsnotify_free_mark(mark)) {
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+ }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
/*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
*/
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
{
- fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+ struct fsnotify_mark *mark;
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&group->marks_list)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&group->marks_list,
+ struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&group->mark_mutex);
+ __fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ }
}
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
mark->free_mark = free_mark;
}
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
fsnotify_put_mark(mark);
}
}
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+ fsnotify_mark_destroy_list();
+}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e361d1a..460c0ce 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
{
int ret;
u32 left_cpos, rec_range, trunc_range;
- int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ int is_rightmost_tree_rec = 0;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle,
memset(rec, 0, sizeof(*rec));
ocfs2_cleanup_merge(el, index);
- wants_rotate = 1;
next_free = le16_to_cpu(el->l_next_free_rec);
if (is_rightmost_tree_rec && next_free > 1) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1934abb..a8d15be 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1456,7 +1456,6 @@ static void o2hb_region_release(struct config_item *item)
static int o2hb_read_block_input(struct o2hb_region *reg,
const char *page,
- size_t count,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
@@ -1499,8 +1498,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
if (reg->hr_bdev)
return -EINVAL;
- status = o2hb_read_block_input(reg, page, count,
- &block_bytes, &block_bits);
+ status = o2hb_read_block_input(reg, page, &block_bytes,
+ &block_bits);
if (status)
return status;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 540ab5b..44d178b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -580,7 +580,7 @@ struct ocfs2_extended_slot {
/*00*/ __u8 es_valid;
__u8 es_reserved1[3];
__le32 es_node_num;
-/*10*/
+/*08*/
};
/*
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1e09592..d740799 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto bail;
- }
-bail:
ocfs2_free_slot_info(osb);
}
-
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9..3ecd445 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
/*
- * Caveats on high order pages: page->_count will only be set
+ * Caveats on high order pages: page->_refcount will only be set
* -1 on the head page; SLUB/SLQB do the same for PG_slab;
* SLOB won't set PG_slab at all on compound pages.
*/
diff --git a/fs/select.c b/fs/select.c
index 8692939..8ed9da5 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -47,7 +47,7 @@
#define MAX_SLACK (100 * NSEC_PER_MSEC)
-static long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec64 *tv)
{
long slack;
int divfactor = 1000;
@@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-u64 select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec64 *tv)
{
u64 ret;
- struct timespec now;
+ struct timespec64 now;
/*
* Realtime tasks get a slack of 0 for obvious reasons.
@@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv)
if (rt_task(current))
return 0;
- ktime_get_ts(&now);
- now = timespec_sub(*tv, now);
+ ktime_get_ts64(&now);
+ now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
if (ret < current->timer_slack_ns)
return current->timer_slack_ns;
@@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout);
/**
* poll_select_set_timeout - helper function to setup the timeout value
- * @to: pointer to timespec variable for the final timeout
+ * @to: pointer to timespec64 variable for the final timeout
* @sec: seconds (from user space)
* @nsec: nanoseconds (from user space)
*
@@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout);
*
* Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
*/
-int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
- struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+ struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
- if (!timespec_valid(&ts))
+ if (!timespec64_valid(&ts))
return -EINVAL;
/* Optimize for the zero timeout value here */
if (!sec && !nsec) {
to->tv_sec = to->tv_nsec = 0;
} else {
- ktime_get_ts(to);
- *to = timespec_add_safe(*to, ts);
+ ktime_get_ts64(to);
+ *to = timespec64_add_safe(*to, ts);
}
return 0;
}
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+static int poll_select_copy_remaining(struct timespec64 *end_time,
+ void __user *p,
int timeval, int ret)
{
+ struct timespec64 rts64;
struct timespec rts;
struct timeval rtv;
@@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
if (!end_time->tv_sec && !end_time->tv_nsec)
return ret;
- ktime_get_ts(&rts);
- rts = timespec_sub(*end_time, rts);
- if (rts.tv_sec < 0)
- rts.tv_sec = rts.tv_nsec = 0;
+ ktime_get_ts64(&rts64);
+ rts64 = timespec64_sub(*end_time, rts64);
+ if (rts64.tv_sec < 0)
+ rts64.tv_sec = rts64.tv_nsec = 0;
+
+ rts = timespec64_to_timespec(rts64);
if (timeval) {
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
- rtv.tv_sec = rts.tv_sec;
- rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ rtv.tv_sec = rts64.tv_sec;
+ rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
if (!copy_to_user(p, &rtv, sizeof(rtv)))
return ret;
@@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* I'm trying ERESTARTNOHAND which restart only when you want to.
*/
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time)
+ fd_set __user *exp, struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
@@ -622,7 +626,7 @@ out_nofds:
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
fd_set __user *, exp, struct timeval __user *, tvp)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
struct timeval tv;
int ret;
@@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 ts64, end_time, *to = NULL;
int ret;
if (tsp) {
if (copy_from_user(&ts, tsp, sizeof(ts)))
return -EFAULT;
+ ts64 = timespec_to_timespec64(ts);
to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
return -EINVAL;
}
@@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
@@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
sizeof(struct pollfd))
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
@@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block)
{
struct pollfd __user *ufds = restart_block->poll.ufds;
int nfds = restart_block->poll.nfds;
- struct timespec *to = NULL, end_time;
+ struct timespec64 *to = NULL, end_time;
int ret;
if (restart_block->poll.has_timeout) {
@@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block)
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (timeout_msecs >= 0) {
@@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
size_t, sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (tsp) {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 9401f48..d4458b6 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd)
#define io_remap_pfn_range remap_pfn_range
#endif
+#ifndef has_transparent_hugepage
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage() 1
+#else
+#define has_transparent_hugepage() 0
+#endif
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 35b22f9..f9be326 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size,
unsigned long goal);
extern void *__alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
void *__alloc_bootmem_node_high(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit);
+ unsigned long limit) __malloc;
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
void *__alloc_bootmem_low_nopanic(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
#ifdef CONFIG_NO_BOOTMEM
/* We are using top down, so it is safe to use 0 here */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index d7c8de5..242b660 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended);
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended);
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern unsigned long compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx);
+ unsigned int alloc_flags, int classzone_idx);
extern void defer_compaction(struct zone *zone, int order);
extern bool compaction_deferred(struct zone *zone, int order);
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 3d5202e..e294939 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -142,6 +142,7 @@
#if GCC_VERSION >= 30400
#define __must_check __attribute__((warn_unused_result))
+#define __malloc __attribute__((__malloc__))
#endif
#if GCC_VERSION >= 40000
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff988..793c082 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
#define __deprecated_for_modules
#endif
+#ifndef __malloc
+#define __malloc
+#endif
+
/*
* Allow us to avoid 'defined but not used' warnings on functions and data,
* as well as force them to be emitted to the assembly file.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868c..bfc204e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
#ifdef CONFIG_CPUSETS
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
static inline bool cpusets_enabled(void)
{
- return static_key_false(&cpusets_enabled_key);
+ return static_branch_unlikely(&cpusets_enabled_key);
}
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key) + 1;
+ return static_key_count(&cpusets_enabled_key.key) + 1;
}
static inline void cpuset_inc(void)
{
- static_key_slow_inc(&cpusets_enabled_key);
+ static_branch_inc(&cpusets_enabled_key);
}
static inline void cpuset_dec(void)
{
- static_key_slow_dec(&cpusets_enabled_key);
+ static_branch_dec(&cpusets_enabled_key);
}
extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
- return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+ if (cpusets_enabled())
+ return __cpuset_node_allowed(node, gfp_mask);
+ return true;
}
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
- return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+ return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+ if (cpusets_enabled())
+ return __cpuset_zone_allowed(z, gfp_mask);
+ return true;
}
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
return 1;
}
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
- return 1;
+ return true;
}
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
- return 1;
+ return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+ return true;
}
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 98ffcbd..46056cb 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,8 +38,10 @@ struct debug_obj {
* @name: name of the object typee
* @debug_hint: function returning address, which have associated
* kernel symbol, to allow identify the object
+ * @is_static_object return true if the obj is static, otherwise return false
* @fixup_init: fixup function, which is called when the init check
- * fails
+ * fails. All fixup functions must return true if fixup
+ * was successful, otherwise return false
* @fixup_activate: fixup function, which is called when the activate check
* fails
* @fixup_destroy: fixup function, which is called when the destroy check
@@ -51,12 +53,13 @@ struct debug_obj {
*/
struct debug_obj_descr {
const char *name;
- void *(*debug_hint) (void *addr);
- int (*fixup_init) (void *addr, enum debug_obj_state state);
- int (*fixup_activate) (void *addr, enum debug_obj_state state);
- int (*fixup_destroy) (void *addr, enum debug_obj_state state);
- int (*fixup_free) (void *addr, enum debug_obj_state state);
- int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
+ void *(*debug_hint)(void *addr);
+ bool (*is_static_object)(void *addr);
+ bool (*fixup_init)(void *addr, enum debug_obj_state state);
+ bool (*fixup_activate)(void *addr, enum debug_obj_state state);
+ bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
+ bool (*fixup_free)(void *addr, enum debug_obj_state state);
+ bool (*fixup_assert_init)(void *addr, enum debug_obj_state state);
};
#ifdef CONFIG_DEBUG_OBJECTS
diff --git a/include/linux/device.h b/include/linux/device.h
index b130304..ca90ad8 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
#ifdef CONFIG_DEBUG_DEVRES
extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
- int nid, const char *name);
+ int nid, const char *name) __malloc;
#define devres_alloc(release, size, gfp) \
__devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
#define devres_alloc_node(release, size, gfp, nid) \
__devres_alloc_node(release, size, gfp, nid, #release)
#else
extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
- int nid);
+ int nid) __malloc;
static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
{
return devres_alloc_node(release, size, gfp, NUMA_NO_NODE);
@@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id);
extern int devres_release_group(struct device *dev, void *id);
/* managed devm_k.alloc/kfree for device drivers */
-extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
+extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
extern __printf(3, 0)
char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
- va_list ap);
+ va_list ap) __malloc;
extern __printf(3, 4)
-char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
+char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc;
static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
{
return devm_kmalloc(dev, size, gfp | __GFP_ZERO);
@@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev,
return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO);
}
extern void devm_kfree(struct device *dev, void *p);
-extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp);
+extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
extern void *devm_kmemdup(struct device *dev, const void *src, size_t len,
gfp_t gfp);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1259e53..29f9175 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_unmount_inodes(struct super_block *sb);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d7b9e53..419fb9e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma,
- struct vm_area_struct *new_vma,
- unsigned long old_addr,
+extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d953c2..e44c578 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
+void __init hugetlb_bad_size(void);
void __init hugetlb_add_hstate(unsigned order);
struct hstate *size_to_hstate(unsigned long size);
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 2bb681f..a4e7ca0 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -5,16 +5,16 @@
#include <linux/mm.h>
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
return !!(vma->vm_flags & VM_HUGETLB);
}
#else
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
- return 0;
+ return false;
}
#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2f7775e..cc73982 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -412,9 +412,9 @@ extern __printf(3, 4)
int scnprintf(char *buf, size_t size, const char *fmt, ...);
extern __printf(3, 0)
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
-extern __printf(2, 3)
+extern __printf(2, 3) __malloc
char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern __printf(2, 0)
+extern __printf(2, 0) __malloc
char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
extern __printf(2, 0)
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1191d79..94da967 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return 0;
}
-static inline void
-mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int increment)
-{
-}
-
static inline unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index adbef58..20d8a5d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {}
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
extern void try_offline_node(int nid);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern void remove_memory(int nid, u64 start, u64 size);
#else
-static inline int is_mem_section_removable(unsigned long pfn,
+static inline bool is_mem_section_removable(unsigned long pfn,
unsigned long nr_pages)
{
- return 0;
+ return false;
}
static inline void try_offline_node(int nid) {}
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 2696c1f..4429d25 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol);
extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
/* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
+static inline bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- return 0;
+ return false;
#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
if (vma->vm_flags & VM_HUGETLB)
- return 0;
+ return false;
#endif
/*
@@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
if (vma->vm_file &&
gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
< policy_zone)
- return 0;
- return 1;
+ return false;
+ return true;
}
extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
@@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}
+static inline struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+ return NULL;
+}
+
#define vma_policy(vma) NULL
static inline int
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..b1086c9 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,7 @@
#define _LINUX_MEMPOOL_H
#include <linux/wait.h>
+#include <linux/compiler.h>
struct kmem_cache;
@@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
-extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
+extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
extern void mempool_free(void *element, mempool_t *pool);
/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 727f7997..2b97be1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr);
* On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
* is no special casing required.
*/
-static inline int is_vmalloc_addr(const void *x)
+static inline bool is_vmalloc_addr(const void *x)
{
#ifdef CONFIG_MMU
unsigned long addr = (unsigned long)x;
return addr >= VMALLOC_START && addr < VMALLOC_END;
#else
- return 0;
+ return false;
#endif
}
#ifdef CONFIG_MMU
@@ -734,7 +734,7 @@ static inline void get_page(struct page *page)
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
+ * requires to already have an elevated page->_refcount.
*/
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
page_ref_inc(page);
@@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
- int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
-
- page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
- page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+ page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
#else /* !CONFIG_NUMA_BALANCING */
@@ -1032,26 +1029,7 @@ static inline pgoff_t page_file_index(struct page *page)
return page->index;
}
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-static inline bool page_mapped(struct page *page)
-{
- int i;
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- page = compound_head(page);
- if (atomic_read(compound_mapcount_ptr(page)) >= 0)
- return true;
- if (PageHuge(page))
- return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
- if (atomic_read(&page[i]._mapcount) >= 0)
- return true;
- }
- return false;
-}
+bool page_mapped(struct page *page);
/*
* Return true only if the page has been allocated with
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 712e8c3..5bd29ba 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page)
return !PageSwapBacked(page);
}
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, int nr_pages)
+{
+ __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, int nr_pages)
+{
+#ifdef CONFIG_MEMCG
+ mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+#else
+ __update_lru_size(lruvec, lru, nr_pages);
+#endif
+}
+
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
- int nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, hpage_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
- __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
}
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
- int nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
list_del(&page->lru);
- __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
+ update_lru_size(lruvec, lru, -hpage_nr_pages(page));
}
/**
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c2d75b4..1fda9c9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -73,9 +73,9 @@ struct page {
unsigned long counters;
#else
/*
- * Keep _count separate from slub cmpxchg_double data.
- * As the rest of the double word is protected by
- * slab_lock but _count is not.
+ * Keep _refcount separate from slub cmpxchg_double
+ * data. As the rest of the double word is protected by
+ * slab_lock but _refcount is not.
*/
unsigned counters;
#endif
@@ -97,7 +97,11 @@ struct page {
};
int units; /* SLOB */
};
- atomic_t _count; /* Usage count, see below. */
+ /*
+ * Usage count, *USE WRAPPER FUNCTION*
+ * when manual accounting. See page_ref.h
+ */
+ atomic_t _refcount;
};
unsigned int active; /* SLAB */
};
@@ -248,7 +252,7 @@ struct page_frag_cache {
__u32 offset;
#endif
/* we maintain a pagecount bias, so that we dont dirty cache line
- * containing page->_count every time we allocate a fragment.
+ * containing page->_refcount every time we allocate a fragment.
*/
unsigned int pagecnt_bias;
bool pfmemalloc;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c60df92..c60db20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled;
get_pfnblock_flags_mask(page, page_to_pfn(page), \
PB_migrate_end, MIGRATETYPE_MASK)
-static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
-{
- BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
- return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
- MIGRATETYPE_MASK);
-}
-
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
@@ -747,7 +740,8 @@ extern struct mutex zonelists_mutex;
void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
bool zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags);
+ unsigned long mark, int classzone_idx,
+ unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx);
enum memmap_context {
@@ -828,10 +822,7 @@ static inline int is_highmem_idx(enum zone_type idx)
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
- int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
- return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
- (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
- zone_movable_is_highmem());
+ return is_highmem_idx(zone_idx(zone));
#else
return 0;
#endif
@@ -922,6 +913,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
#endif /* CONFIG_NUMA */
}
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+ enum zone_type highest_zoneidx,
+ nodemask_t *nodes);
+
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z - The cursor used as a starting point for the search
@@ -934,9 +929,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
* being examined. It should be advanced by one before calling
* next_zones_zonelist again.
*/
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
- nodemask_t *nodes);
+ nodemask_t *nodes)
+{
+ if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
+ return z;
+ return __next_zones_zonelist(z, highest_zoneidx, nodes);
+}
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
@@ -952,13 +952,10 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx,
- nodemask_t *nodes,
- struct zone **zone)
+ nodemask_t *nodes)
{
- struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
+ return next_zones_zonelist(zonelist->_zonerefs,
highest_zoneidx, nodes);
- *zone = zonelist_zone(z);
- return z;
}
/**
@@ -973,10 +970,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
* within a given nodemask
*/
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
- for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
+ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
+ zone; \
+ z = next_zones_zonelist(++z, highidx, nodemask), \
+ zone = zonelist_zone(z))
+
+#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+ for (zone = z->zone; \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
- zone = zonelist_zone(z)) \
+ zone = zonelist_zone(z))
+
/**
* for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 6e85889..f746e44 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
*
* int first_node(mask) Number lowest set bit, or MAX_NUMNODES
* int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask) Next node past 'node', or wrap to first,
+ * or MAX_NUMNODES
* int first_unset_node(mask) First node not set in mask, or
- * MAX_NUMNODES.
+ * MAX_NUMNODES
*
* nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set
* NODE_MASK_ALL Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed. Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
nodes_clear(*mask);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 628a432..83b9c39 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p)
extern void mark_oom_victim(struct task_struct *tsk);
+#ifdef CONFIG_MMU
+extern void try_oom_reaper(struct task_struct *tsk);
+#else
+static inline void try_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
unsigned long totalpages);
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 4386946..113ee62 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst,
extern void padata_do_serial(struct padata_priv *padata);
extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
-extern int padata_set_cpumasks(struct padata_instance *pinst,
- cpumask_var_t pcpumask,
- cpumask_var_t cbcpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
extern int padata_start(struct padata_instance *pinst);
extern void padata_stop(struct padata_instance *pinst);
extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b052aa..a61e06e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
#define PAGE_MAPPING_KSM 2
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
+static __always_inline int PageAnonHead(struct page *page)
+{
+ return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
static __always_inline int PageAnon(struct page *page)
{
page = compound_head(page);
- return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+ return PageAnonHead(page);
}
#ifdef CONFIG_KSM
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index e596d5d9..8b5e0a9 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
static inline int page_ref_count(struct page *page)
{
- return atomic_read(&page->_count);
+ return atomic_read(&page->_refcount);
}
static inline int page_count(struct page *page)
{
- return atomic_read(&compound_head(page)->_count);
+ return atomic_read(&compound_head(page)->_refcount);
}
static inline void set_page_count(struct page *page, int v)
{
- atomic_set(&page->_count, v);
+ atomic_set(&page->_refcount, v);
if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
__page_ref_set(page, v);
}
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
static inline void page_ref_add(struct page *page, int nr)
{
- atomic_add(nr, &page->_count);
+ atomic_add(nr, &page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, nr);
}
static inline void page_ref_sub(struct page *page, int nr)
{
- atomic_sub(nr, &page->_count);
+ atomic_sub(nr, &page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, -nr);
}
static inline void page_ref_inc(struct page *page)
{
- atomic_inc(&page->_count);
+ atomic_inc(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, 1);
}
static inline void page_ref_dec(struct page *page)
{
- atomic_dec(&page->_count);
+ atomic_dec(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, -1);
}
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
- int ret = atomic_sub_and_test(nr, &page->_count);
+ int ret = atomic_sub_and_test(nr, &page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
__page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
static inline int page_ref_dec_and_test(struct page *page)
{
- int ret = atomic_dec_and_test(&page->_count);
+ int ret = atomic_dec_and_test(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
__page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
static inline int page_ref_dec_return(struct page *page)
{
- int ret = atomic_dec_return(&page->_count);
+ int ret = atomic_dec_return(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
__page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
static inline int page_ref_add_unless(struct page *page, int nr, int u)
{
- int ret = atomic_add_unless(&page->_count, nr, u);
+ int ret = atomic_add_unless(&page->_refcount, nr, u);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
__page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
static inline int page_ref_freeze(struct page *page, int count)
{
- int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+ int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
__page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
VM_BUG_ON_PAGE(page_count(page) != 0, page);
VM_BUG_ON(count == 0);
- atomic_set(&page->_count, count);
+ atomic_set(&page->_refcount, count);
if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
__page_ref_unfreeze(page, count);
}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7e1ab15..fe1513f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
/*
* speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
*
* This function must be called inside the same rcu_read_lock() section as has
* been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
*
* Unless an RCU grace period has passed, the count of all pages coming out
* of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
* 2. conditionally increment refcount
* 3. check the page is still in pagecache (if no, goto 1)
*
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
* following (with tree_lock held for write):
* A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
* B. remove page from pagecache
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 9fb4f40..37b057b 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
ktime_t *expires, unsigned long slack);
-extern u64 select_estimate_accuracy(struct timespec *tv);
+extern u64 select_estimate_accuracy(struct timespec64 *tv);
static inline int poll_schedule(struct poll_wqueues *pwq, int state)
@@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
-extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
+extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time);
extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
- struct timespec *end_time);
+ struct timespec64 *end_time);
extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time);
+ fd_set __user *exp, struct timespec64 *end_time);
-extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
+ long nsec);
#endif /* _LINUX_POLL_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 508bd82..aeb3e6d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size)
}
#endif /* !CONFIG_SLOB */
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *, void *);
/*
@@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p)
}
#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
#else
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
#endif
#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node, size_t size) __assume_slab_alignment;
+ int node, size_t size) __assume_slab_alignment __malloc;
#else
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
#endif /* CONFIG_TRACING */
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#else
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9edbbf3..8694f7a 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -80,6 +80,10 @@ struct kmem_cache {
struct kasan_cache kasan_info;
#endif
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+ void *random_seq;
+#endif
+
struct kmem_cache_node *node[MAX_NUMNODES];
};
diff --git a/include/linux/string.h b/include/linux/string.h
index d3993a7..26b6f6a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new);
extern void kfree_const(const void *x);
-extern char *kstrdup(const char *s, gfp_t gfp);
+extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 367d5af..7e5d2fa 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *
# define timespec64_equal timespec_equal
# define timespec64_compare timespec_compare
# define set_normalized_timespec64 set_normalized_timespec
-# define timespec64_add_safe timespec_add_safe
# define timespec64_add timespec_add
# define timespec64_sub timespec_sub
# define timespec64_valid timespec_valid
@@ -134,15 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct
extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
-/*
- * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME_T_MAX if the returned value would be
- * smaller then either of the arguments.
- */
-extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
- const struct timespec64 rhs);
-
-
static inline struct timespec64 timespec64_add(struct timespec64 lhs,
struct timespec64 rhs)
{
@@ -224,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
#endif
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+ const struct timespec64 rhs);
+
#endif /* _LINUX_TIME64_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c..d2da8e0 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
#ifdef CONFIG_NUMA
extern unsigned long node_page_state(int node, enum zone_stat_item item);
-extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
#else
#define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl, _z, gfp) do { } while (0)
#endif /* CONFIG_NUMA */
@@ -193,6 +191,10 @@ void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
int calculate_pressure_threshold(struct zone *zone);
diff --git a/init/Kconfig b/init/Kconfig
index 0dfd09d..79a91a2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1742,6 +1742,15 @@ config SLOB
endchoice
+config SLAB_FREELIST_RANDOM
+ default n
+ depends on SLAB
+ bool "SLAB freelist randomization"
+ help
+ Randomizes the freelist order used on creating new SLABs. This
+ security feature reduces the predictability of the kernel slab
+ allocator against heap overflows.
+
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956..73e93e5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
- return 1;
+ return true;
if (node_isset(node, current->mems_allowed))
- return 1;
+ return true;
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
+ return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
- return 0;
+ return false;
if (current->flags & PF_EXITING) /* Let dying task have memory */
- return 1;
+ return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
static int cpuset_spread_node(int *rotor)
{
- int node;
-
- node = next_node(*rotor, current->mems_allowed);
- if (node == MAX_NUMNODES)
- node = first_node(current->mems_allowed);
- *rotor = node;
- return node;
+ return *rotor = next_node_in(*rotor, current->mems_allowed);
}
int cpuset_mem_spread_node(void)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3e..1c03dfb 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_STRUCT_SIZE(list_head);
VMCOREINFO_SIZE(nodemask_t);
VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, _refcount);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9..9932788 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -607,33 +607,6 @@ out_replace:
}
/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- * one is used by parallel workers and the second one
- * by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
- cpumask_var_t cbcpumask)
-{
- int err;
-
- mutex_lock(&pinst->lock);
- get_online_cpus();
-
- err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
- put_online_cpus();
- mutex_unlock(&pinst->lock);
-
- return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
-/**
* padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
* equivalent to @cpumask.
*
@@ -674,6 +647,43 @@ out:
}
EXPORT_SYMBOL(padata_set_cpumask);
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+ int err = 0;
+
+ mutex_lock(&pinst->lock);
+
+ if (pinst->flags & PADATA_INVALID)
+ err = -EINVAL;
+
+ __padata_start(pinst);
+
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+ mutex_lock(&pinst->lock);
+ __padata_stop(pinst);
+ mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd;
@@ -694,42 +704,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
return 0;
}
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_add_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd = NULL;
@@ -789,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
}
EXPORT_SYMBOL(padata_remove_cpu);
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
-{
- int err = 0;
-
- mutex_lock(&pinst->lock);
-
- if (pinst->flags & PADATA_INVALID)
- err =-EINVAL;
-
- __padata_start(pinst);
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
- mutex_lock(&pinst->lock);
- __padata_stop(pinst);
- mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
@@ -1091,7 +1028,6 @@ err_free_inst:
err:
return NULL;
}
-EXPORT_SYMBOL(padata_alloc);
/**
* padata_free - free a padata instance
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ccdc8e..3e888cd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head)
debug_object_free(head, &rcuhead_debug_descr);
}
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
{
- struct rcu_head *head = addr;
-
- switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. We just make sure that it is
- * tracked in the object tracker.
- */
- debug_object_init(head, &rcuhead_debug_descr);
- debug_object_activate(head, &rcuhead_debug_descr);
- return 0;
- default:
- return 1;
- }
+ return true;
}
/**
@@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
- .fixup_activate = rcuhead_fixup_activate,
+ .is_static_object = rcuhead_is_static_object,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b3186..2effd84 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "stat_refresh",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0600,
+ .proc_handler = vmstat_refresh,
+ },
#endif
#ifdef CONFIG_MMU
{
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa0b983..8c7392c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr)
* fixup_init is called when:
* - an active object is initialized
*/
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
@@ -342,30 +342,25 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_init(timer, &hrtimer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- WARN_ON_ONCE(1);
- return 0;
-
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
default:
- return 0;
+ return false;
}
}
@@ -373,7 +368,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
@@ -381,9 +376,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_free(timer, &hrtimer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a4064b6..667b933 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,24 @@ struct timespec timespec_add_safe(const struct timespec lhs,
return res;
}
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0).
+ * And, each timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+ const struct timespec64 rhs)
+{
+ struct timespec64 res;
+
+ set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+ res.tv_sec = TIME64_MAX;
+ res.tv_nsec = 0;
+ }
+
+ return res;
+}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 73164c3..3a95f97 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -489,11 +489,19 @@ static void *timer_debug_hint(void *addr)
return ((struct timer_list *) addr)->function;
}
+static bool timer_is_static_object(void *addr)
+{
+ struct timer_list *timer = addr;
+
+ return (timer->entry.pprev == NULL &&
+ timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
*/
-static int timer_fixup_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
@@ -501,9 +509,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_init(timer, &timer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -516,36 +524,22 @@ static void stub_timer(unsigned long data)
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
-
case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. The timer was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- if (timer->entry.pprev == NULL &&
- timer->entry.next == TIMER_ENTRY_STATIC) {
- debug_object_init(timer, &timer_debug_descr);
- debug_object_activate(timer, &timer_debug_descr);
- return 0;
- } else {
- setup_timer(timer, stub_timer, 0);
- return 1;
- }
- return 0;
+ setup_timer(timer, stub_timer, 0);
+ return true;
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
default:
- return 0;
+ return false;
}
}
@@ -553,7 +547,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int timer_fixup_free(void *addr, enum debug_obj_state state)
+static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
@@ -561,9 +555,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_free(timer, &timer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -571,32 +565,23 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
* fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
-static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (timer->entry.next == TIMER_ENTRY_STATIC) {
- /*
- * This is not really a fixup. The timer was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- debug_object_init(timer, &timer_debug_descr);
- return 0;
- } else {
- setup_timer(timer, stub_timer, 0);
- return 1;
- }
+ setup_timer(timer, stub_timer, 0);
+ return true;
default:
- return 0;
+ return false;
}
}
static struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
+ .is_static_object = timer_is_static_object,
.fixup_init = timer_fixup_init,
.fixup_activate = timer_fixup_activate,
.fixup_free = timer_fixup_free,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f5068e..e1c0e99 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,54 +433,28 @@ static void *work_debug_hint(void *addr)
return ((struct work_struct *) addr)->func;
}
-/*
- * fixup_init is called when:
- * - an active object is initialized
- */
-static int work_fixup_init(void *addr, enum debug_obj_state state)
+static bool work_is_static_object(void *addr)
{
struct work_struct *work = addr;
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- cancel_work_sync(work);
- debug_object_init(work, &work_debug_descr);
- return 1;
- default:
- return 0;
- }
+ return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}
/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * fixup_init is called when:
+ * - an active object is initialized
*/
-static int work_fixup_activate(void *addr, enum debug_obj_state state)
+static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. The work struct was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
- debug_object_init(work, &work_debug_descr);
- debug_object_activate(work, &work_debug_descr);
- return 0;
- }
- WARN_ON_ONCE(1);
- return 0;
-
case ODEBUG_STATE_ACTIVE:
- WARN_ON(1);
-
+ cancel_work_sync(work);
+ debug_object_init(work, &work_debug_descr);
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -488,7 +462,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int work_fixup_free(void *addr, enum debug_obj_state state)
+static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
@@ -496,17 +470,17 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
cancel_work_sync(work);
debug_object_free(work, &work_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
static struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
.debug_hint = work_debug_hint,
+ .is_static_object = work_is_static_object,
.fixup_init = work_fixup_init,
- .fixup_activate = work_fixup_activate,
.fixup_free = work_fixup_free,
};
diff --git a/lib/Makefile b/lib/Makefile
index 931396a..42b6918 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o md5.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o seq_buf.o nmi_backtrace.o
+ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 519b5a1..a8e1260 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
* Try to repair the damage, so we have a better chance to get useful
* debug output.
*/
-static int
-debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
+static bool
+debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state),
void * addr, enum debug_obj_state state)
{
- int fixed = 0;
-
- if (fixup)
- fixed = fixup(addr, state);
- debug_objects_fixups += fixed;
- return fixed;
+ if (fixup && fixup(addr, state)) {
+ debug_objects_fixups++;
+ return true;
+ }
+ return false;
}
static void debug_object_is_on_stack(void *addr, int onstack)
@@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
ret = debug_object_fixup(descr->fixup_activate, addr, state);
- return ret ? -EINVAL : 0;
+ return ret ? 0 : -EINVAL;
case ODEBUG_STATE_DESTROYED:
debug_print_object(obj, "activate");
@@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
/*
- * This happens when a static object is activated. We
- * let the type specific code decide whether this is
- * true or not.
+ * We are here when a static object is activated. We
+ * let the type specific code confirm whether this is
+ * true or not. if true, we just make sure that the
+ * static object is tracked in the object tracker. If
+ * not, this must be a bug, so we try to fix it up.
*/
- if (debug_object_fixup(descr->fixup_activate, addr,
- ODEBUG_STATE_NOTAVAILABLE)) {
+ if (descr->is_static_object && descr->is_static_object(addr)) {
+ /* track this static object */
+ debug_object_init(addr, descr);
+ debug_object_activate(addr, descr);
+ } else {
debug_print_object(&o, "activate");
- return -EINVAL;
+ ret = debug_object_fixup(descr->fixup_activate, addr,
+ ODEBUG_STATE_NOTAVAILABLE);
+ return ret ? 0 : -EINVAL;
}
return 0;
}
@@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
/*
- * Maybe the object is static. Let the type specific
- * code decide what to do.
+ * Maybe the object is static, and we let the type specific
+ * code confirm. Track this static object if true, else invoke
+ * fixup.
*/
- if (debug_object_fixup(descr->fixup_assert_init, addr,
- ODEBUG_STATE_NOTAVAILABLE))
+ if (descr->is_static_object && descr->is_static_object(addr)) {
+ /* Track this static object */
+ debug_object_init(addr, descr);
+ } else {
debug_print_object(&o, "assert_init");
+ debug_object_fixup(descr->fixup_assert_init, addr,
+ ODEBUG_STATE_NOTAVAILABLE);
+ }
return;
}
@@ -793,11 +805,18 @@ struct self_test {
static __initdata struct debug_obj_descr descr_type_test;
+static bool __init is_static_object(void *addr)
+{
+ struct self_test *obj = addr;
+
+ return obj->static_init;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
*/
-static int __init fixup_init(void *addr, enum debug_obj_state state)
+static bool __init fixup_init(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_init(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int __init fixup_activate(void *addr, enum debug_obj_state state)
+static bool __init fixup_activate(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (obj->static_init == 1) {
- debug_object_init(obj, &descr_type_test);
- debug_object_activate(obj, &descr_type_test);
- return 0;
- }
- return 1;
-
+ return true;
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_activate(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
* fixup_destroy is called when:
* - an active object is destroyed
*/
-static int __init fixup_destroy(void *addr, enum debug_obj_state state)
+static bool __init fixup_destroy(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_destroy(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int __init fixup_free(void *addr, enum debug_obj_state state)
+static bool __init fixup_free(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_free(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -917,6 +930,7 @@ out:
static __initdata struct debug_obj_descr descr_type_test = {
.name = "selftest",
+ .is_static_object = is_static_object,
.fixup_init = fixup_init,
.fixup_activate = fixup_activate,
.fixup_destroy = fixup_destroy,
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 0000000..e42a5bf4
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+ int ret = __next_node(node, srcp);
+
+ if (ret == MAX_NUMNODES)
+ ret = __first_node(srcp);
+ return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+ int w, bit = NUMA_NO_NODE;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+#endif
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index f051d69..72d3611 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock);
static struct debug_obj_descr percpu_counter_debug_descr;
-static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
+static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
struct percpu_counter *fbc = addr;
@@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
percpu_counter_destroy(fbc);
debug_object_free(fbc, &percpu_counter_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
diff --git a/mm/Kconfig b/mm/Kconfig
index 989f8f3..b0432b7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG
+config MEMORY_HOTPLUG_DEFAULT_ONLINE
+ bool "Online the newly added memory blocks by default"
+ default n
+ depends on MEMORY_HOTPLUG
+ help
+ This option sets the default policy setting for memory hotplug
+ onlining policy (/sys/devices/system/memory/auto_online_blocks) which
+ determines what happens to newly added memory regions. Policy setting
+ can always be changed at runtime.
+ See Documentation/memory-hotplug.txt for more information.
+
+ Say Y here if you want all hot-plugged memory blocks to appear in
+ 'online' state by default.
+ Say N here if you want the default policy to keep all hot-plugged
+ memory blocks in 'offline' state.
+
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select MEMORY_ISOLATION
@@ -268,11 +284,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
-config ZONE_DMA_FLAG
- int
- default "0" if !ZONE_DMA
- default "1"
-
config BOUNCE
bool "Enable bounce buffers"
default y
diff --git a/mm/compaction.c b/mm/compaction.c
index 8fa2540..eda3c22 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
+#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone)
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
zone->compact_cached_free_pfn =
- round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
+ pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
/*
@@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc,
LIST_HEAD(freelist);
pfn = start_pfn;
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn += isolated,
block_start_pfn = block_end_pfn,
@@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_start_pfn = pageblock_start_pfn(pfn);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
}
@@ -633,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
- struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
+ bool skip_on_failure = false;
+ unsigned long next_skip_pfn = 0;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -659,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (compact_should_abort(cc))
return 0;
+ if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+ skip_on_failure = true;
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
bool is_lru;
+ if (skip_on_failure && low_pfn >= next_skip_pfn) {
+ /*
+ * We have isolated all migration candidates in the
+ * previous order-aligned block, and did not skip it due
+ * to failure. We should migrate the pages now and
+ * hopefully succeed compaction.
+ */
+ if (nr_isolated)
+ break;
+
+ /*
+ * We failed to isolate in the previous order-aligned
+ * block. Set the new boundary to the end of the
+ * current block. Note we can't simply increase
+ * next_skip_pfn by 1 << order, as low_pfn might have
+ * been incremented by a higher number due to skipping
+ * a compound or a high-order buddy page in the
+ * previous loop iteration.
+ */
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -674,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
break;
if (!pfn_valid_within(low_pfn))
- continue;
+ goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
@@ -729,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (likely(comp_order < MAX_ORDER))
low_pfn += (1UL << comp_order) - 1;
- continue;
+ goto isolate_fail;
}
if (!is_lru)
- continue;
+ goto isolate_fail;
/*
* Migration will fail if an anonymous page is pinned in memory,
@@ -742,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
- continue;
+ goto isolate_fail;
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
@@ -753,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
- continue;
+ goto isolate_fail;
/*
* Page become compound since the non-locked check,
@@ -762,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (unlikely(PageCompound(page))) {
low_pfn += (1UL << compound_order(page)) - 1;
- continue;
+ goto isolate_fail;
}
}
@@ -770,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
- continue;
+ goto isolate_fail;
VM_BUG_ON_PAGE(PageCompound(page), page);
@@ -778,15 +811,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
- list_add(&page->lru, migratelist);
+ list_add(&page->lru, &cc->migratepages);
cc->nr_migratepages++;
nr_isolated++;
+ /*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to buddy allocator.
+ * - this is the lowest page that was isolated and likely be
+ * then freed by migration.
+ */
+ if (!cc->last_migrated_pfn)
+ cc->last_migrated_pfn = low_pfn;
+
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
+
+ continue;
+isolate_fail:
+ if (!skip_on_failure)
+ continue;
+
+ /*
+ * We have isolated some pages, but then failed. Release them
+ * instead of migrating, as we cannot form the cc->order buddy
+ * page anyway.
+ */
+ if (nr_isolated) {
+ if (locked) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
+ acct_isolated(zone, cc);
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ cc->last_migrated_pfn = 0;
+ nr_isolated = 0;
+ }
+
+ if (low_pfn < next_skip_pfn) {
+ low_pfn = next_skip_pfn - 1;
+ /*
+ * The check near the loop beginning would have updated
+ * next_skip_pfn too, but this is a bit simpler.
+ */
+ next_skip_pfn += 1UL << cc->order;
+ }
}
/*
@@ -834,10 +907,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
@@ -924,10 +997,10 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_start_pfn = pageblock_start_pfn(cc->free_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
- low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+ low_pfn = pageblock_end_pfn(cc->migrate_pfn);
/*
* Isolate free pages until enough are available to migrate the
@@ -1070,7 +1143,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
- unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1081,12 +1153,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
- block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(low_pfn);
/*
* Iterate over whole pageblocks until we find the first suitable.
@@ -1125,7 +1197,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
/* Perform the isolation */
- isolate_start_pfn = low_pfn;
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
@@ -1135,15 +1206,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that could have been isolated and
- * then freed by migration.
- */
- if (cc->nr_migratepages && !cc->last_migrated_pfn)
- cc->last_migrated_pfn = isolate_start_pfn;
-
- /*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
@@ -1251,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_CONTINUE - If compaction should run now
*/
static unsigned long __compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags,
+ int classzone_idx)
{
int fragindex;
unsigned long watermark;
@@ -1296,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
}
unsigned long compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags,
+ int classzone_idx)
{
unsigned long ret;
@@ -1343,7 +1407,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
- cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
@@ -1398,6 +1462,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = COMPACT_CONTENDED;
goto out;
}
+ /*
+ * We failed to migrate at least one page in the current
+ * order-aligned block, so skip the rest of it.
+ */
+ if (cc->direct_compaction &&
+ (cc->mode == MIGRATE_ASYNC)) {
+ cc->migrate_pfn = block_end_pfn(
+ cc->migrate_pfn - 1, cc->order);
+ /* Draining pcplists is useless in this case */
+ cc->last_migrated_pfn = 0;
+
+ }
}
check_drain:
@@ -1411,7 +1487,7 @@ check_drain:
if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
- cc->migrate_pfn & ~((1UL << cc->order) - 1);
+ block_start_pfn(cc->migrate_pfn, cc->order);
if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
@@ -1436,7 +1512,7 @@ out:
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
/* The cached pfn is always the first in a pageblock */
- free_pfn &= ~(pageblock_nr_pages-1);
+ free_pfn = pageblock_start_pfn(free_pfn);
/*
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
@@ -1456,7 +1532,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx)
{
unsigned long ret;
struct compact_control cc = {
@@ -1497,8 +1573,8 @@ int sysctl_extfrag_threshold = 500;
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended)
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended)
{
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
@@ -1526,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
- ac->classzone_idx);
+ ac_classzone_idx(ac));
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1536,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- ac->classzone_idx, alloc_flags)) {
+ ac_classzone_idx(ac), alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
diff --git a/mm/filemap.c b/mm/filemap.c
index 182b218..0169033 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
* some other bad page check should catch it later.
*/
page_mapcount_reset(page);
- atomic_sub(mapcount, &page->_count);
+ page_ref_sub(page, mapcount);
}
}
diff --git a/mm/highmem.c b/mm/highmem.c
index 123bcd3..50b4ca6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
unsigned int nr_free_highpages (void)
{
- pg_data_t *pgdat;
+ struct zone *zone;
unsigned int pages = 0;
- for_each_online_pgdat(pgdat) {
- pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
- NR_FREE_PAGES);
- if (zone_movable_is_highmem())
- pages += zone_page_state(
- &pgdat->node_zones[ZONE_MOVABLE],
- NR_FREE_PAGES);
+ for_each_populated_zone(zone) {
+ if (is_highmem(zone))
+ pages += zone_page_state(zone, NR_FREE_PAGES);
}
return pages;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b49ee12..66675ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
- unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
-
struct mm_struct *mm = vma->vm_mm;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE ||
- (new_vma->vm_flags & VM_NOHUGEPAGE))
+ old_end - old_addr < HPAGE_PMD_SIZE)
return false;
/*
@@ -3113,7 +3110,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
- * tail_page->_count is zero and not changing from under us. But
+ * tail_page->_refcount is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
* tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
@@ -3340,7 +3337,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mlocked)
lru_add_drain();
- /* Prevent deferred_split_scan() touching ->_count */
+ /* Prevent deferred_split_scan() touching ->_refcount */
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08..949d806 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
}
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->rsv_hpages) {
if (delta > spool->rsv_hpages) {
/*
* Asking for more reserves than those already taken on
@@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
if (spool->rsv_hpages + delta <= spool->min_hpages)
ret = 0;
else
@@ -937,9 +940,7 @@ err:
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
+ nid = next_node_in(nid, *nodes_allowed);
VM_BUG_ON(nid >= MAX_NUMNODES);
return nid;
@@ -1030,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
}
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
- unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+ unsigned long start_pfn, unsigned long nr_pages)
{
unsigned long i, end_pfn = start_pfn + nr_pages;
struct page *page;
@@ -1042,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
page = pfn_to_page(i);
+ if (page_zone(page) != z)
+ return false;
+
if (PageReserved(page))
return false;
@@ -1074,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
pfn = ALIGN(z->zone_start_pfn, nr_pages);
while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -2659,6 +2663,11 @@ static int __init hugetlb_init(void)
subsys_initcall(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+ parsed_valid_hugepagesz = false;
+}
+
void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
@@ -2678,8 +2687,8 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -2691,11 +2700,17 @@ static int __init hugetlb_nrpages_setup(char *s)
unsigned long *mhp;
static unsigned long *last_mhp;
+ if (!parsed_valid_hugepagesz) {
+ pr_warn("hugepages = %s preceded by "
+ "an unsupported hugepagesz, ignoring\n", s);
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
/*
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!hugetlb_max_hstate)
+ else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
diff --git a/mm/internal.h b/mm/internal.h
index b79abb6..3ac544f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
}
/*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
@@ -102,13 +102,14 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
struct alloc_context {
struct zonelist *zonelist;
nodemask_t *nodemask;
- struct zone *preferred_zone;
- int classzone_idx;
+ struct zoneref *preferred_zoneref;
int migratetype;
enum zone_type high_zoneidx;
bool spread_dirty_pages;
};
+#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
+
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
@@ -175,7 +176,7 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
- const int alloc_flags; /* alloc flags of a direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
const int classzone_idx; /* zone index of a direct compactor */
struct zone *zone;
int contended; /* Signal need_sched() or lock
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe787f5..d71d387 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1023,22 +1023,40 @@ out:
* @lru: index of lru list the page is sitting on
* @nr_pages: positive when adding or negative when removing
*
- * This function must be called when a page is added to or removed from an
- * lru list.
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
struct mem_cgroup_per_zone *mz;
unsigned long *lru_size;
+ long size;
+ bool empty;
+
+ __update_lru_size(lruvec, lru, nr_pages);
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
lru_size = mz->lru_size + lru;
- *lru_size += nr_pages;
- VM_BUG_ON((long)(*lru_size) < 0);
+ empty = list_empty(lruvec->lists + lru);
+
+ if (nr_pages < 0)
+ *lru_size += nr_pages;
+
+ size = *lru_size;
+ if (WARN_ONCE(size < 0 || empty != !size,
+ "%s(%p, %d, %d): lru_size %ld but %sempty\n",
+ __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ VM_BUG_ON(1);
+ *lru_size = 0;
+ }
+
+ if (nr_pages > 0)
+ *lru_size += nr_pages;
}
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
@@ -1257,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
mark_oom_victim(current);
+ try_oom_reaper(current);
goto unlock;
}
@@ -1389,14 +1408,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
- node = next_node(node, memcg->scan_nodes);
- if (node == MAX_NUMNODES)
- node = first_node(memcg->scan_nodes);
+ node = next_node_in(node, memcg->scan_nodes);
/*
- * We call this when we hit limit, not when pages are added to LRU.
- * No LRU may hold pages because all pages are UNEVICTABLE or
- * memcg is too small and all pages are not on LRU. In that case,
- * we use curret node.
+ * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
+ * last time it really checked all the LRUs due to rate limiting.
+ * Fallback to the current node in that case for simplicity.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aa34431..caf2a14 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -78,9 +78,24 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
+#else
+bool memhp_auto_online = true;
+#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);
+static int __init setup_memhp_default_state(char *str)
+{
+ if (!strcmp(str, "online"))
+ memhp_auto_online = true;
+ else if (!strcmp(str, "offline"))
+ memhp_auto_online = false;
+
+ return 1;
+}
+__setup("memhp_default_state=", setup_memhp_default_state);
+
void get_online_mems(void)
{
might_sleep();
@@ -1410,7 +1425,7 @@ static struct page *next_active_pageblock(struct page *page)
}
/* Checks if this range of memory is likely to be hot-removable. */
-int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
struct page *page = pfn_to_page(start_pfn);
struct page *end_page = page + nr_pages;
@@ -1418,12 +1433,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
if (!is_pageblock_removable_nolock(page))
- return 0;
+ return false;
cond_resched();
}
/* All pageblocks in the memory block are likely to be hot-removable */
- return 1;
+ return true;
}
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36cc01b..297d685 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
-#include <linux/random.h>
#include "internal.h"
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
BUG();
if (!node_isset(current->il_next, tmp)) {
- current->il_next = next_node(current->il_next, tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = first_node(tmp);
+ current->il_next = next_node_in(current->il_next, tmp);
if (current->il_next >= MAX_NUMNODES)
current->il_next = numa_node_id();
}
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
struct task_struct *me = current;
nid = me->il_next;
- next = next_node(nid, policy->v.nodes);
- if (next >= MAX_NUMNODES)
- next = first_node(policy->v.nodes);
+ next = next_node_in(nid, policy->v.nodes);
if (next < MAX_NUMNODES)
me->il_next = next;
return nid;
@@ -1744,18 +1739,18 @@ unsigned int mempolicy_slab_node(void)
return interleave_nodes(policy);
case MPOL_BIND: {
+ struct zoneref *z;
+
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
- struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[0];
- (void)first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes,
- &zone);
- return zone ? zone->node : node;
+ z = first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->v.nodes);
+ return z->zone ? z->zone->node : node;
}
default:
@@ -1763,23 +1758,25 @@ unsigned int mempolicy_slab_node(void)
}
}
-/* Do static interleaving for a VMA with known offset. */
+/*
+ * Do static interleaving for a VMA with known offset @n. Returns the n'th
+ * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * number of present nodes.
+ */
static unsigned offset_il_node(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long off)
+ struct vm_area_struct *vma, unsigned long n)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
- int c;
- int nid = NUMA_NO_NODE;
+ int i;
+ int nid;
if (!nnodes)
return numa_node_id();
- target = (unsigned int)off % nnodes;
- c = 0;
- do {
+ target = (unsigned int)n % nnodes;
+ nid = first_node(pol->v.nodes);
+ for (i = 0; i < target; i++)
nid = next_node(nid, pol->v.nodes);
- c++;
- } while (c <= target);
return nid;
}
@@ -1805,21 +1802,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
- int w, bit = NUMA_NO_NODE;
-
- w = nodes_weight(*maskp);
- if (w)
- bit = bitmap_ord_to_pos(maskp->bits,
- get_random_int() % w, MAX_NUMNODES);
- return bit;
-}
-
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -2284,7 +2266,7 @@ static void sp_free(struct sp_node *n)
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
- struct zone *zone;
+ struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
@@ -2316,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_BIND:
+
/*
* allows binding to multiple nodes.
* use current page if in policy nodemask,
@@ -2324,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
- (void)first_zones_zonelist(
+ z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes, &zone);
- polnid = zone->node;
+ &pol->v.nodes);
+ polnid = z->zone->node;
break;
default:
diff --git a/mm/migrate.c b/mm/migrate.c
index f9dfb18..53ab639 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
+ __SetPageSwapBacked(newpage);
return MIGRATEPAGE_SUCCESS;
}
@@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
+ __SetPageSwapBacked(newpage);
get_page(newpage); /* add cache reference */
if (PageSwapCache(page)) {
@@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
- SetPageSwapBacked(new_page);
+ __SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
diff --git a/mm/mmap.c b/mm/mmap.c
index bd2e1a53..fba246b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -55,10 +55,6 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
-#ifndef arch_rebalance_pgtables
-#define arch_rebalance_pgtables(addr, len) (addr)
-#endif
-
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
@@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (offset_in_page(addr))
return -EINVAL;
- addr = arch_rebalance_pgtables(addr, len);
error = security_mmap_addr(addr);
return error ? error : addr;
}
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 52687fb..5652be8 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
}
/* Returns the next zone at or below highest_zoneidx in a zonelist */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
diff --git a/mm/mremap.c b/mm/mremap.c
index 3fa0a467..9dc4999 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return pmd;
}
+static void take_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+}
+
+static void drop_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (vma->vm_file)
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
static pte_t move_soft_dirty_pte(pte_t pte)
{
/*
@@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
- struct address_space *mapping = NULL;
- struct anon_vma *anon_vma = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
@@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks) {
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- }
- if (vma->anon_vma) {
- anon_vma = vma->anon_vma;
- anon_vma_lock_write(anon_vma);
- }
- }
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
/*
* We don't have to worry about the ordering of src and dst
@@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
- if (anon_vma)
- anon_vma_unlock_write(anon_vma);
- if (mapping)
- i_mmap_unlock_write(mapping);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -193,16 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
- VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
- vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
- anon_vma_lock_write(vma->anon_vma);
- moved = move_huge_pmd(vma, new_vma, old_addr,
- new_addr, old_end,
- old_pmd, new_pmd);
+ take_rmap_locks(vma);
+ moved = move_huge_pmd(vma, old_addr, new_addr,
+ old_end, old_pmd, new_pmd);
if (need_rmap_locks)
- anon_vma_unlock_write(vma->anon_vma);
+ drop_rmap_locks(vma);
if (moved) {
need_flush = true;
continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8634958..415f7eb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly;
#define K(x) ((x) << (PAGE_SHIFT-10))
+/*
+ * task->mm can be NULL if the task is the exited group leader. So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm)
+ return t_mm == mm;
+ }
+ return false;
+}
+
+
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -491,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk)
up_read(&mm->mmap_sem);
/*
- * Clear TIF_MEMDIE because the task shouldn't be sitting on a
- * reasonably reclaimable memory anymore. OOM killer can continue
- * by selecting other victim if unmapping hasn't led to any
- * improvements. This also means that selecting this task doesn't
- * make any sense.
+ * This task can be safely ignored because we cannot do much more
+ * to release its memory.
*/
tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
- exit_oom_victim(tsk);
out:
mmput(mm);
return ret;
@@ -519,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk)
debug_show_all_locks();
}
+ /*
+ * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+ * reasonably reclaimable memory anymore or it is not a good candidate
+ * for the oom victim right now because it cannot release its memory
+ * itself nor by the oom reaper.
+ */
+ tsk->oom_reaper_list = NULL;
+ exit_oom_victim(tsk);
+
/* Drop a reference taken by wake_oom_reaper */
put_task_struct(tsk);
}
@@ -563,6 +587,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
wake_up(&oom_reaper_wait);
}
+/* Check if we can reap the given task. This has to be called with stable
+ * tsk->mm
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct task_struct *p;
+
+ if (!mm)
+ return;
+
+ /*
+ * There might be other threads/processes which are either not
+ * dying or even not killable.
+ */
+ if (atomic_read(&mm->mm_users) > 1) {
+ rcu_read_lock();
+ for_each_process(p) {
+ bool exiting;
+
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(p, tsk))
+ continue;
+ if (fatal_signal_pending(p))
+ continue;
+
+ /*
+ * If the task is exiting make sure the whole thread group
+ * is exiting and cannot acces mm anymore.
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ exiting = signal_group_exit(p->signal);
+ spin_unlock_irq(&p->sighand->siglock);
+ if (exiting)
+ continue;
+
+ /* Give up */
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+ }
+
+ wake_oom_reaper(tsk);
+}
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -653,24 +724,6 @@ void oom_killer_enable(void)
}
/*
- * task->mm can be NULL if the task is the exited group leader. So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
- struct task_struct *t;
-
- for_each_thread(p, t) {
- struct mm_struct *t_mm = READ_ONCE(t->mm);
- if (t_mm)
- return t_mm == mm;
- }
- return false;
-}
-
-/*
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
@@ -694,6 +747,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
task_lock(p);
if (p->mm && task_will_free_mem(p)) {
mark_oom_victim(p);
+ try_oom_reaper(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -873,10 +927,20 @@ bool out_of_memory(struct oom_control *oc)
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
+ try_oom_reaper(current);
return true;
}
/*
+ * The OOM killer does not compensate for IO-less reclaim.
+ * pagefault_out_of_memory lost its gfp context so we have to
+ * make sure exclude 0 mask - all other users should have at least
+ * ___GFP_DIRECT_RECLAIM to get here.
+ */
+ if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+ return true;
+
+ /*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bc5149d..3b88795 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;
+ int i;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *z = &NODE_DATA(node)->node_zones[i];
- x += zone_dirtyable_memory(z);
+ if (is_highmem(z))
+ x += zone_dirtyable_memory(z);
+ }
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069ef..5c469c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
}
#endif
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ return __pfn_to_section(pfn)->pageblock_flags;
+#else
+ return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ pfn &= (PAGES_PER_SECTION-1);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+ pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+ return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = READ_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
+}
void set_pageblock_migratetype(struct page *page, int migratetype)
{
@@ -784,17 +884,42 @@ out:
zone->free_area[order].nr_free++;
}
-static inline int free_pages_check(struct page *page)
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+ unsigned long check_flags)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
+ return false;
+
+ if (unlikely((unsigned long)page->mapping |
+ page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+ (unsigned long)page->mem_cgroup |
+#endif
+ (page->flags & check_flags)))
+ return false;
+
+ return true;
+}
+
+static void free_pages_check_bad(struct page *page)
+{
+ const char *bad_reason;
+ unsigned long bad_flags;
+
+ bad_reason = NULL;
+ bad_flags = 0;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _count";
+ bad_reason = "nonzero _refcount";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -803,16 +928,146 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
- return 1;
+ bad_page(page, bad_reason, bad_flags);
+}
+
+static inline int free_pages_check(struct page *page)
+{
+ if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
+ return 0;
+
+ /* Something has gone sideways, find it */
+ free_pages_check_bad(page);
+ return 1;
+}
+
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+ int ret = 1;
+
+ /*
+ * We rely page->lru.next never has bit 0 set, unless the page
+ * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */
+ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ ret = 0;
+ goto out;
}
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
+ if (unlikely(!PageTail(page))) {
+ bad_page(page, "PageTail not set", 0);
+ goto out;
+ }
+ if (unlikely(compound_head(page) != head_page)) {
+ bad_page(page, "compound_head not consistent", 0);
+ goto out;
+ }
+ ret = 0;
+out:
+ page->mapping = NULL;
+ clear_compound_head(page);
+ return ret;
+}
+
+static __always_inline bool free_pages_prepare(struct page *page,
+ unsigned int order, bool check_free)
+{
+ int bad = 0;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ trace_mm_page_free(page, order);
+ kmemcheck_free_shadow(page, order);
+ kasan_free_pages(page, order);
+
+ /*
+ * Check tail pages before head page information is cleared to
+ * avoid checking PageCompound for order-0 pages.
+ */
+ if (unlikely(order)) {
+ bool compound = PageCompound(page);
+ int i;
+
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+ for (i = 1; i < (1 << order); i++) {
+ if (compound)
+ bad += free_tail_pages_check(page, page + i);
+ if (unlikely(free_pages_check(page + i))) {
+ bad++;
+ continue;
+ }
+ (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ }
+ if (PageAnonHead(page))
+ page->mapping = NULL;
+ if (check_free)
+ bad += free_pages_check(page);
+ if (bad)
+ return false;
+
page_cpupid_reset_last(page);
- if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- return 0;
+ page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ reset_page_owner(page, order);
+
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE << order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
+ arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
+ kernel_map_pages(page, 1 << order, 0);
+
+ return true;
}
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, true);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+ return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, false);
+}
+
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+ return free_pages_check(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
@@ -829,15 +1084,16 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- int to_free = count;
unsigned long nr_scanned;
+ bool isolated_pageblocks;
spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
- while (to_free) {
+ while (count) {
struct page *page;
struct list_head *list;
@@ -857,7 +1113,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
- batch_free = to_free;
+ batch_free = count;
do {
int mt; /* migratetype of the to-be-freed page */
@@ -870,12 +1126,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
- if (unlikely(has_isolate_pageblock(zone)))
+ if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
+ if (bulkfree_pcp_prepare(page))
+ continue;
+
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- } while (--to_free && --batch_free && !list_empty(list));
+ } while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
}
@@ -899,56 +1158,6 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
-static int free_tail_pages_check(struct page *head_page, struct page *page)
-{
- int ret = 1;
-
- /*
- * We rely page->lru.next never has bit 0 set, unless the page
- * is PageTail(). Let's make sure that's true even for poisoned ->lru.
- */
- BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
-
- if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
- ret = 0;
- goto out;
- }
- switch (page - head_page) {
- case 1:
- /* the first tail page: ->mapping is compound_mapcount() */
- if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount", 0);
- goto out;
- }
- break;
- case 2:
- /*
- * the second tail page: ->mapping is
- * page_deferred_list().next -- ignore value.
- */
- break;
- default:
- if (page->mapping != TAIL_MAPPING) {
- bad_page(page, "corrupted mapping in tail page", 0);
- goto out;
- }
- break;
- }
- if (unlikely(!PageTail(page))) {
- bad_page(page, "PageTail not set", 0);
- goto out;
- }
- if (unlikely(compound_head(page) != head_page)) {
- bad_page(page, "compound_head not consistent", 0);
- goto out;
- }
- ret = 0;
-out:
- page->mapping = NULL;
- clear_compound_head(page);
- return ret;
-}
-
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
@@ -1022,51 +1231,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
}
}
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
- bool compound = PageCompound(page);
- int i, bad = 0;
-
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
- trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
- kasan_free_pages(page, order);
-
- if (PageAnon(page))
- page->mapping = NULL;
- bad += free_pages_check(page);
- for (i = 1; i < (1 << order); i++) {
- if (compound)
- bad += free_tail_pages_check(page, page + i);
- bad += free_pages_check(page + i);
- }
- if (bad)
- return false;
-
- reset_page_owner(page, order);
-
- if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE << order);
- debug_check_no_obj_freed(page_address(page),
- PAGE_SIZE << order);
- }
- arch_free_page(page, order);
- kernel_poison_pages(page, 1 << order, 0);
- kernel_map_pages(page, 1 << order, 0);
-
- return true;
-}
-
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
- if (!free_pages_prepare(page, order))
+ if (!free_pages_prepare(page, order, true))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1076,8 +1247,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page,
- unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1154,7 +1324,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, pfn, order);
+ return __free_pages_boot_core(page, order);
}
/*
@@ -1239,12 +1409,12 @@ static void __init deferred_free_range(struct page *page,
if (nr_pages == MAX_ORDER_NR_PAGES &&
(pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+ __free_pages_boot_core(page, MAX_ORDER-1);
return;
}
- for (i = 0; i < nr_pages; i++, page++, pfn++)
- __free_pages_boot_core(page, pfn, 0);
+ for (i = 0; i < nr_pages; i++, page++)
+ __free_pages_boot_core(page, 0);
}
/* Completion tracking for deferred_init_memmap() threads */
@@ -1477,10 +1647,7 @@ static inline void expand(struct zone *zone, struct page *page,
}
}
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
{
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
@@ -1503,11 +1670,20 @@ static inline int check_new_page(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
- return 1;
- }
- return 0;
+ bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+ if (likely(page_expected_state(page,
+ PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+ return 0;
+
+ check_new_page_bad(page);
+ return 1;
}
static inline bool free_pages_prezeroed(bool poisoned)
@@ -1516,16 +1692,48 @@ static inline bool free_pages_prezeroed(bool poisoned)
page_poisoning_enabled() && poisoned;
}
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
- int alloc_flags)
+#ifdef CONFIG_DEBUG_VM
+static bool check_pcp_refill(struct page *page)
+{
+ return false;
+}
+
+static bool check_new_pcp(struct page *page)
+{
+ return check_new_page(page);
+}
+#else
+static bool check_pcp_refill(struct page *page)
+{
+ return check_new_page(page);
+}
+static bool check_new_pcp(struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
+
+ return false;
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags)
{
int i;
bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
- if (unlikely(check_new_page(p)))
- return 1;
if (poisoned)
poisoned &= page_is_poisoned(p);
}
@@ -1557,8 +1765,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
-
- return 0;
}
/*
@@ -1980,6 +2186,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (unlikely(page == NULL))
break;
+ if (unlikely(check_pcp_refill(page)))
+ continue;
+
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
@@ -2157,6 +2366,10 @@ void mark_free_pages(struct zone *zone)
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
@@ -2187,7 +2400,7 @@ void free_hot_cold_page(struct page *page, bool cold)
unsigned long pfn = page_to_pfn(page);
int migratetype;
- if (!free_pages_prepare(page, 0))
+ if (!free_pcp_prepare(page))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2343,12 +2556,44 @@ int split_free_page(struct page *page)
}
/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+ int local_nid = numa_node_id();
+ enum zone_stat_item local_stat = NUMA_LOCAL;
+
+ if (unlikely(flags & __GFP_OTHER_NODE)) {
+ local_stat = NUMA_OTHER;
+ local_nid = preferred_zone->node;
+ }
+
+ if (z->node == local_nid) {
+ __inc_zone_state(z, NUMA_HIT);
+ __inc_zone_state(z, local_stat);
+ } else {
+ __inc_zone_state(z, NUMA_MISS);
+ __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+ }
+#endif
+}
+
+/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int alloc_flags, int migratetype)
+ gfp_t gfp_flags, unsigned int alloc_flags,
+ int migratetype)
{
unsigned long flags;
struct page *page;
@@ -2359,21 +2604,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
struct list_head *list;
local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
- goto failed;
- }
+ do {
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
+ goto failed;
+ }
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
+ if (cold)
+ page = list_last_entry(list, struct page, lru);
+ else
+ page = list_first_entry(list, struct page, lru);
+ } while (page && check_new_pcp(page));
+ __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
} else {
@@ -2384,22 +2632,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
- page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
- if (!page)
- page = __rmqueue(zone, order, migratetype);
+ do {
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
+ if (!page)
+ page = __rmqueue(zone, order, migratetype);
+ } while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
!test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
@@ -2501,12 +2751,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* to check in the allocation paths if no pages are free.
*/
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags,
+ unsigned long mark, int classzone_idx,
+ unsigned int alloc_flags,
long free_pages)
{
long min = mark;
int o;
- const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2569,12 +2820,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+ int classzone_idx, unsigned int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+ long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ /*
+ * Fast check for order-0 only. If this fails then the reserves
+ * need to be calculated. There is a corner case where the check
+ * passes but only the high-order atomic reserve are free. If
+ * the caller is !atomic then it'll uselessly search the free
+ * list. That corner case is then slower but it is harmless.
+ */
+ if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ return true;
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
+}
+
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx)
{
@@ -2630,27 +2907,24 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zonelist *zonelist = ac->zonelist;
- struct zoneref *z;
- struct page *page = NULL;
+ struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- int nr_fair_skipped = 0;
- bool zonelist_rescan;
+ bool fair_skipped = false;
+ bool apply_fair = (alloc_flags & ALLOC_FAIR);
zonelist_scan:
- zonelist_rescan = false;
-
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
+ struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
+ !__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* Distribute pages in proportion to the individual
@@ -2658,13 +2932,16 @@ zonelist_scan:
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
*/
- if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(ac->preferred_zone, zone))
- break;
+ if (apply_fair) {
if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- nr_fair_skipped++;
+ fair_skipped = true;
continue;
}
+ if (!zone_local(ac->preferred_zoneref->zone, zone)) {
+ if (fair_skipped)
+ goto reset_fair;
+ apply_fair = false;
+ }
}
/*
* When allocating a page cache page for writing, we
@@ -2696,8 +2973,8 @@ zonelist_scan:
continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- if (!zone_watermark_ok(zone, order, mark,
- ac->classzone_idx, alloc_flags)) {
+ if (!zone_watermark_fast(zone, order, mark,
+ ac_classzone_idx(ac), alloc_flags)) {
int ret;
/* Checked here to keep the fast path fast */
@@ -2706,7 +2983,7 @@ zonelist_scan:
goto try_this_zone;
if (zone_reclaim_mode == 0 ||
- !zone_allows_reclaim(ac->preferred_zone, zone))
+ !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
@@ -2720,7 +2997,7 @@ zonelist_scan:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- ac->classzone_idx, alloc_flags))
+ ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;
continue;
@@ -2728,11 +3005,10 @@ zonelist_scan:
}
try_this_zone:
- page = buffered_rmqueue(ac->preferred_zone, zone, order,
+ page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
- if (prep_new_page(page, order, gfp_mask, alloc_flags))
- goto try_this_zone;
+ prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
@@ -2753,18 +3029,13 @@ try_this_zone:
* include remote zones now, before entering the slowpath and waking
* kswapd: prefer spilling to a remote zone over swapping locally.
*/
- if (alloc_flags & ALLOC_FAIR) {
- alloc_flags &= ~ALLOC_FAIR;
- if (nr_fair_skipped) {
- zonelist_rescan = true;
- reset_alloc_batches(ac->preferred_zone);
- }
- if (nr_online_nodes > 1)
- zonelist_rescan = true;
- }
-
- if (zonelist_rescan)
+ if (fair_skipped) {
+reset_fair:
+ apply_fair = false;
+ fair_skipped = false;
+ reset_alloc_batches(ac->preferred_zoneref->zone);
goto zonelist_scan;
+ }
return NULL;
}
@@ -2872,22 +3143,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
- /* The OOM killer does not compensate for IO-less reclaim */
- if (!(gfp_mask & __GFP_FS)) {
- /*
- * XXX: Page reclaim didn't yield anything,
- * and the OOM killer can't be invoked, but
- * keep looping as per tradition.
- *
- * But do not keep looping if oom_killer_disable()
- * was already called, for the system is trying to
- * enter a quiescent state during suspend.
- */
- *did_some_progress = !oom_killer_disabled;
- goto out;
- }
if (pm_suspended_storage())
goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other request to make a forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
/* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
@@ -2917,7 +3184,7 @@ out:
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended_compaction,
bool *deferred_compaction)
{
@@ -2973,7 +3240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended_compaction,
bool *deferred_compaction)
{
@@ -3013,7 +3280,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
struct page *page = NULL;
@@ -3049,13 +3316,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->high_zoneidx, ac->nodemask)
- wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+ wakeup_kswapd(zone, order, ac_classzone_idx(ac));
}
-static inline int
+static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+ unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3116,7 +3383,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
- int alloc_flags;
+ unsigned int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
@@ -3153,17 +3420,6 @@ retry:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
- /*
- * Find the true preferred zone if the allocation is unconstrained by
- * cpusets.
- */
- if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
- struct zoneref *preferred_zoneref;
- preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, NULL, &ac->preferred_zone);
- ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
- }
-
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3278,7 +3534,7 @@ retry:
if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
/* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
goto retry;
}
@@ -3316,17 +3572,24 @@ struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
- struct zoneref *preferred_zoneref;
- struct page *page = NULL;
+ struct page *page;
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
+ .zonelist = zonelist,
.nodemask = nodemask,
.migratetype = gfpflags_to_migratetype(gfp_mask),
};
+ if (cpusets_enabled()) {
+ alloc_mask |= __GFP_HARDWALL;
+ alloc_flags |= ALLOC_CPUSET;
+ if (!ac.nodemask)
+ ac.nodemask = &cpuset_current_mems_allowed;
+ }
+
gfp_mask &= gfp_allowed_mask;
lockdep_trace_alloc(gfp_mask);
@@ -3350,49 +3613,54 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
- /* We set it here, as __alloc_pages_slowpath might have changed it */
- ac.zonelist = zonelist;
-
/* Dirty zone balancing only done in the fast path */
ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/* The preferred zone is used for statistics later */
- preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
- ac.nodemask ? : &cpuset_current_mems_allowed,
- &ac.preferred_zone);
- if (!ac.preferred_zone)
- goto out;
- ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+ ac.high_zoneidx, ac.nodemask);
+ if (!ac.preferred_zoneref) {
+ page = NULL;
+ goto no_zone;
+ }
/* First allocation attempt */
- alloc_mask = gfp_mask|__GFP_HARDWALL;
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
- if (unlikely(!page)) {
- /*
- * Runtime PM, block IO and its error handling path
- * can deadlock because I/O on the device might not
- * complete.
- */
- alloc_mask = memalloc_noio_flags(gfp_mask);
- ac.spread_dirty_pages = false;
-
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
- }
+ if (likely(page))
+ goto out;
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+ /*
+ * Runtime PM, block IO and its error handling path can deadlock
+ * because I/O on the device might not complete.
+ */
+ alloc_mask = memalloc_noio_flags(gfp_mask);
+ ac.spread_dirty_pages = false;
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ /*
+ * Restore the original nodemask if it was potentially replaced with
+ * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+ */
+ if (cpusets_enabled())
+ ac.nodemask = nodemask;
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-out:
+no_zone:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+ alloc_mask = gfp_mask;
goto retry_cpuset;
+ }
+
+out:
+ if (kmemcheck_enabled && page)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+ trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
}
@@ -3790,6 +4058,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
{
int zone_type; /* needs to be signed */
unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
@@ -3798,12 +4068,19 @@ void si_meminfo_node(struct sysinfo *val, int nid)
val->sharedram = node_page_state(nid, NR_SHMEM);
val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
- val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
- val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
- NR_FREE_PAGES);
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone->managed_pages;
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
#else
- val->totalhigh = 0;
- val->freehigh = 0;
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
#endif
val->mem_unit = PAGE_SIZE;
}
@@ -4390,13 +4667,12 @@ static void build_zonelists(pg_data_t *pgdat)
*/
int local_memory_node(int node)
{
- struct zone *zone;
+ struct zoneref *z;
- (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
- NULL,
- &zone);
- return zone->node;
+ NULL);
+ return z->zone->node;
}
#endif
@@ -6725,98 +7001,6 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
- unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- return __pfn_to_section(pfn)->pageblock_flags;
-#else
- return zone->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- pfn &= (PAGES_PER_SECTION-1);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
- pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
-{
- struct zone *zone;
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
-
- zone = page_zone(page);
- bitmap = get_pageblock_bitmap(zone, pfn);
- bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
-{
- struct zone *zone;
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
-
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
- zone = page_zone(page);
- bitmap = get_pageblock_bitmap(zone, pfn);
- bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
-
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
-
- word = READ_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
-}
-
/*
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
@@ -6864,7 +7048,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* We can't use page_count without pin a page
* because another CPU can free compound page.
* This check already skips compound tails of THP
- * because their page->_count is zero at all time.
+ * because their page->_refcount is zero at all time.
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
@@ -7177,7 +7361,8 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
*/
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c4f5682..612122b 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
return pfn;
}
+/* Caller should ensure that requested range is in a single zone */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
@@ -288,13 +289,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
* accordance with memory policy of the user process if possible. For
* now as a simple work-around, we use the next node for destination.
*/
- if (PageHuge(page)) {
- int node = next_online_node(page_to_nid(page));
- if (node == MAX_NUMNODES)
- node = first_online_node;
+ if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- }
+ next_node_in(page_to_nid(page),
+ node_online_map));
if (PageHighMem(page))
gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index ac3d8d1..792b56d 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
goto err;
/* Print information relevant to grouping pages by mobility */
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
@@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
/*
* We are safe to check buddy flag and order, because
* this is init stage and only single thread runs.
diff --git a/mm/rmap.c b/mm/rmap.c
index 307b555..8a83993 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- BUG_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
@@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page,
int nr = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- SetPageSwapBacked(page);
+ __SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
diff --git a/mm/shmem.c b/mm/shmem.c
index e684a91..e418a99 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -101,7 +101,6 @@ struct shmem_falloc {
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
- SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
@@ -122,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
static inline int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, int *fault_type)
+ struct page **pagep, enum sgp_type sgp)
{
return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), fault_type);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -169,7 +169,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
- * pages are allocated, in order to allow huge sparse files.
+ * pages are allocated, in order to allow large sparse files.
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
@@ -528,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (partial_start) {
struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+ shmem_getpage(inode, start - 1, &page, SGP_READ);
if (page) {
unsigned int top = PAGE_SIZE;
if (start > end) {
@@ -543,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
if (partial_end) {
struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ, NULL);
+ shmem_getpage(inode, end, &page, SGP_READ);
if (page) {
zero_user_segment(page, 0, partial_end);
set_page_dirty(page);
@@ -947,8 +947,7 @@ redirty:
return 0;
}
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
char buffer[64];
@@ -972,7 +971,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
}
return mpol;
}
-#endif /* CONFIG_TMPFS */
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
@@ -1008,39 +1018,17 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- page = alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+ if (page) {
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ }
/* Drop reference taken by mpol_shared_policy_lookup() */
mpol_cond_put(pvma.vm_policy);
return page;
}
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
-{
-}
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return swapin_readahead(swap, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
-
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
-static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
-{
- return NULL;
-}
-#endif
/*
* When a page is moved from swapcache to shmem filecache (either by the
@@ -1084,9 +1072,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __SetPageLocked(newpage);
SetPageUptodate(newpage);
- SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
SetPageSwapCache(newpage);
@@ -1130,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
- * entry since a page cannot live in both the swap and page cache
+ * entry since a page cannot live in both the swap and page cache.
+ *
+ * fault_mm and fault_type are only supplied by shmem_fault:
+ * otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp,
+ struct mm_struct *fault_mm, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
+ struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
@@ -1155,7 +1146,7 @@ repeat:
page = NULL;
}
- if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto unlock;
@@ -1183,14 +1174,19 @@ repeat:
*/
info = SHMEM_I(inode);
sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = fault_mm ? : current->mm;
if (swap.val) {
/* Look it up and read it in.. */
page = lookup_swap_cache(swap);
if (!page) {
- /* here we actually do the io */
- if (fault_type)
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
if (!page) {
error = -ENOMEM;
@@ -1217,7 +1213,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1275,13 +1271,10 @@ repeat:
error = -ENOMEM;
goto decused;
}
-
- __SetPageSwapBacked(page);
- __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
false);
if (error)
goto decused;
@@ -1321,12 +1314,10 @@ clear:
flush_dcache_page(page);
SetPageUptodate(page);
}
- if (sgp == SGP_DIRTY)
- set_page_dirty(page);
}
/* Perhaps the file has been truncated since we checked */
- if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
ClearPageDirty(page);
@@ -1372,6 +1363,7 @@ unlock:
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
+ gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
int error;
int ret = VM_FAULT_LOCKED;
@@ -1433,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+ error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ gfp, vma->vm_mm, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
-
- if (ret & VM_FAULT_MAJOR) {
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- }
return ret;
}
@@ -1587,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+ return shmem_getpage(inode, index, pagep, SGP_WRITE);
}
static int
@@ -1633,7 +1621,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* and even mark them dirty, so it cannot exceed the max_blocks limit.
*/
if (!iter_is_iovec(to))
- sgp = SGP_DIRTY;
+ sgp = SGP_CACHE;
index = *ppos >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
@@ -1653,14 +1641,17 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp, NULL);
+ error = shmem_getpage(inode, index, &page, sgp);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page)
+ if (page) {
+ if (sgp == SGP_CACHE)
+ set_page_dirty(page);
unlock_page(page);
+ }
/*
* We must evaluate after, since reads (unlike writes)
@@ -1766,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
error = 0;
while (spd.nr_pages < nr_pages) {
- error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ error = shmem_getpage(inode, index, &page, SGP_CACHE);
if (error)
break;
unlock_page(page);
@@ -1788,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
page = spd.pages[page_nr];
if (!PageUptodate(page) || page->mapping != mapping) {
- error = shmem_getpage(inode, index, &page,
- SGP_CACHE, NULL);
+ error = shmem_getpage(inode, index, &page, SGP_CACHE);
if (error)
break;
unlock_page(page);
@@ -2232,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC,
- NULL);
+ error = shmem_getpage(inode, index, &page, SGP_FALLOC);
if (error) {
/* Remove the !PageUptodate pages we added */
shmem_undo_range(inode,
@@ -2551,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+ error = shmem_getpage(inode, 0, &page, SGP_WRITE);
if (error) {
iput(inode);
return error;
@@ -2592,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+ error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
unlock_page(page);
@@ -3496,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
int error;
BUG_ON(mapping->a_ops != &shmem_aops);
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ gfp, NULL, NULL);
if (error)
page = ERR_PTR(error);
else
diff --git a/mm/slab.c b/mm/slab.c
index 17e2848..c11bf50 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list);
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list);
static int slab_early_init = 1;
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -421,8 +426,6 @@ static struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
};
-#define BAD_ALIEN_MAGIC 0x01020304ul
-
static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -519,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
- int node;
-
- node = next_node(cpu_to_mem(cpu), node_online_map);
- if (node == MAX_NUMNODES)
- node = first_node(node_online_map);
-
- per_cpu(slab_reap_node, cpu) = node;
+ per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+ node_online_map);
}
static void next_reap_node(void)
{
int node = __this_cpu_read(slab_reap_node);
- node = next_node(node, node_online_map);
- if (unlikely(node >= MAX_NUMNODES))
- node = first_node(node_online_map);
+ node = next_node_in(node, node_online_map);
__this_cpu_write(slab_reap_node, node);
}
@@ -644,7 +640,7 @@ static int transfer_objects(struct array_cache *to,
static inline struct alien_cache **alloc_alien_cache(int node,
int limit, gfp_t gfp)
{
- return (struct alien_cache **)BAD_ALIEN_MAGIC;
+ return NULL;
}
static inline void free_alien_cache(struct alien_cache **ac_ptr)
@@ -850,6 +846,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
}
#endif
+static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
+{
+ struct kmem_cache_node *n;
+
+ /*
+ * Set up the kmem_cache_node for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ n = get_node(cachep, node);
+ if (n) {
+ spin_lock_irq(&n->list_lock);
+ n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
+ cachep->num;
+ spin_unlock_irq(&n->list_lock);
+
+ return 0;
+ }
+
+ n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+ if (!n)
+ return -ENOMEM;
+
+ kmem_cache_node_init(n);
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+ n->free_limit =
+ (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
+
+ /*
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
+ * protection here.
+ */
+ cachep->node[node] = n;
+
+ return 0;
+}
+
/*
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
@@ -861,46 +897,82 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
*/
static int init_cache_node_node(int node)
{
+ int ret;
struct kmem_cache *cachep;
- struct kmem_cache_node *n;
- const size_t memsize = sizeof(struct kmem_cache_node);
list_for_each_entry(cachep, &slab_caches, list) {
- /*
- * Set up the kmem_cache_node for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- n = get_node(cachep, node);
- if (!n) {
- n = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!n)
- return -ENOMEM;
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
- /*
- * The kmem_cache_nodes don't come and go as CPUs
- * come and go. slab_mutex is sufficient
- * protection here.
- */
- cachep->node[node] = n;
- }
-
- spin_lock_irq(&n->list_lock);
- n->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
+ ret = init_cache_node(cachep, node, GFP_KERNEL);
+ if (ret)
+ return ret;
}
+
return 0;
}
-static inline int slabs_tofree(struct kmem_cache *cachep,
- struct kmem_cache_node *n)
+static int setup_kmem_cache_node(struct kmem_cache *cachep,
+ int node, gfp_t gfp, bool force_change)
{
- return (n->free_objects + cachep->num - 1) / cachep->num;
+ int ret = -ENOMEM;
+ struct kmem_cache_node *n;
+ struct array_cache *old_shared = NULL;
+ struct array_cache *new_shared = NULL;
+ struct alien_cache **new_alien = NULL;
+ LIST_HEAD(list);
+
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+ if (!new_alien)
+ goto fail;
+ }
+
+ if (cachep->shared) {
+ new_shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
+ if (!new_shared)
+ goto fail;
+ }
+
+ ret = init_cache_node(cachep, node, gfp);
+ if (ret)
+ goto fail;
+
+ n = get_node(cachep, node);
+ spin_lock_irq(&n->list_lock);
+ if (n->shared && force_change) {
+ free_block(cachep, n->shared->entry,
+ n->shared->avail, node, &list);
+ n->shared->avail = 0;
+ }
+
+ if (!n->shared || force_change) {
+ old_shared = n->shared;
+ n->shared = new_shared;
+ new_shared = NULL;
+ }
+
+ if (!n->alien) {
+ n->alien = new_alien;
+ new_alien = NULL;
+ }
+
+ spin_unlock_irq(&n->list_lock);
+ slabs_destroy(cachep, &list);
+
+ /*
+ * To protect lockless access to n->shared during irq disabled context.
+ * If n->shared isn't NULL in irq disabled context, accessing to it is
+ * guaranteed to be valid until irq is re-enabled, because it will be
+ * freed after synchronize_sched().
+ */
+ if (force_change)
+ synchronize_sched();
+
+fail:
+ kfree(old_shared);
+ kfree(new_shared);
+ free_alien_cache(new_alien);
+
+ return ret;
}
static void cpuup_canceled(long cpu)
@@ -967,14 +1039,13 @@ free_slab:
n = get_node(cachep, node);
if (!n)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
}
}
static int cpuup_prepare(long cpu)
{
struct kmem_cache *cachep;
- struct kmem_cache_node *n = NULL;
int node = cpu_to_mem(cpu);
int err;
@@ -993,44 +1064,9 @@ static int cpuup_prepare(long cpu)
* array caches
*/
list_for_each_entry(cachep, &slab_caches, list) {
- struct array_cache *shared = NULL;
- struct alien_cache **alien = NULL;
-
- if (cachep->shared) {
- shared = alloc_arraycache(node,
- cachep->shared * cachep->batchcount,
- 0xbaadf00d, GFP_KERNEL);
- if (!shared)
- goto bad;
- }
- if (use_alien_caches) {
- alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
- if (!alien) {
- kfree(shared);
- goto bad;
- }
- }
- n = get_node(cachep, node);
- BUG_ON(!n);
-
- spin_lock_irq(&n->list_lock);
- if (!n->shared) {
- /*
- * We are serialised from CPU_DEAD or
- * CPU_UP_CANCELLED by the cpucontrol lock
- */
- n->shared = shared;
- shared = NULL;
- }
-#ifdef CONFIG_NUMA
- if (!n->alien) {
- n->alien = alien;
- alien = NULL;
- }
-#endif
- spin_unlock_irq(&n->list_lock);
- kfree(shared);
- free_alien_cache(alien);
+ err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
+ if (err)
+ goto bad;
}
return 0;
@@ -1119,7 +1155,7 @@ static int __meminit drain_cache_node_node(int node)
if (!n)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
if (!list_empty(&n->slabs_full) ||
!list_empty(&n->slabs_partial)) {
@@ -1200,6 +1236,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
}
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
+ size_t count)
+{
+ size_t i;
+ unsigned int rand;
+
+ for (i = 0; i < count; i++)
+ list[i] = i;
+
+ /* Fisher-Yates shuffle */
+ for (i = count - 1; i > 0; i--) {
+ rand = prandom_u32_state(state);
+ rand %= (i + 1);
+ swap(list[i], list[rand]);
+ }
+}
+
+/* Create a random sequence per cache */
+static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+ unsigned int seed, count = cachep->num;
+ struct rnd_state state;
+
+ if (count < 2)
+ return 0;
+
+ /* If it fails, we will just use the global lists */
+ cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
+ if (!cachep->random_seq)
+ return -ENOMEM;
+
+ /* Get best entropy at this stage */
+ get_random_bytes_arch(&seed, sizeof(seed));
+ prandom_seed_state(&state, seed);
+
+ freelist_randomize(&state, cachep->random_seq, count);
+ return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+static void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+ kfree(cachep->random_seq);
+ cachep->random_seq = NULL;
+}
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+ return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+
/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
@@ -1212,7 +1303,7 @@ void __init kmem_cache_init(void)
sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
- if (num_possible_nodes() == 1)
+ if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++)
@@ -1781,7 +1872,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
/*
* Needed to avoid possible looping condition
- * in cache_grow()
+ * in cache_grow_begin()
*/
if (OFF_SLAB(freelist_cache))
continue;
@@ -2138,7 +2229,7 @@ done:
cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
- if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+ if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
@@ -2180,6 +2271,11 @@ static void check_irq_on(void)
BUG_ON(irqs_disabled());
}
+static void check_mutex_acquired(void)
+{
+ BUG_ON(!mutex_is_locked(&slab_mutex));
+}
+
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
@@ -2199,13 +2295,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
+#define check_mutex_acquired() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
- struct array_cache *ac,
- int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+ int node, bool free_all, struct list_head *list)
+{
+ int tofree;
+
+ if (!ac || !ac->avail)
+ return;
+
+ tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
+ if (tofree > ac->avail)
+ tofree = (ac->avail + 1) / 2;
+
+ free_block(cachep, ac->entry, tofree, node, list);
+ ac->avail -= tofree;
+ memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
+}
static void do_drain(void *arg)
{
@@ -2229,6 +2339,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
{
struct kmem_cache_node *n;
int node;
+ LIST_HEAD(list);
on_each_cpu(do_drain, cachep, 1);
check_irq_on();
@@ -2236,8 +2347,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
if (n->alien)
drain_alien_cache(cachep, n->alien);
- for_each_kmem_cache_node(cachep, node, n)
- drain_array(cachep, n, n->shared, 1, node);
+ for_each_kmem_cache_node(cachep, node, n) {
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, n->shared, node, true, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
+ }
}
/*
@@ -2288,7 +2404,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
check_irq_on();
for_each_kmem_cache_node(cachep, node, n) {
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
ret += !list_empty(&n->slabs_full) ||
!list_empty(&n->slabs_partial);
@@ -2306,6 +2422,8 @@ void __kmem_cache_release(struct kmem_cache *cachep)
int i;
struct kmem_cache_node *n;
+ cache_random_seq_destroy(cachep);
+
free_percpu(cachep->cpu_cache);
/* NUMA: free the node structures */
@@ -2412,15 +2530,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
#endif
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+ struct {
+ unsigned int pos;
+ freelist_idx_t *list;
+ unsigned int count;
+ unsigned int rand;
+ };
+ struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization methode available.
+ * return true if the pre-computed list is available, false otherwize.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+ struct kmem_cache *cachep,
+ unsigned int count)
+{
+ bool ret;
+ unsigned int rand;
+
+ /* Use best entropy available to define a random shift */
+ get_random_bytes_arch(&rand, sizeof(rand));
+
+ /* Use a random state if the pre-computed list is not available */
+ if (!cachep->random_seq) {
+ prandom_seed_state(&state->rnd_state, rand);
+ ret = false;
+ } else {
+ state->list = cachep->random_seq;
+ state->count = count;
+ state->pos = 0;
+ state->rand = rand;
+ ret = true;
+ }
+ return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+ return (state->list[state->pos++] + state->rand) % state->count;
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+ unsigned int objfreelist = 0, i, count = cachep->num;
+ union freelist_init_state state;
+ bool precomputed;
+
+ if (count < 2)
+ return false;
+
+ precomputed = freelist_state_initialize(&state, cachep, count);
+
+ /* Take a random entry as the objfreelist */
+ if (OBJFREELIST_SLAB(cachep)) {
+ if (!precomputed)
+ objfreelist = count - 1;
+ else
+ objfreelist = next_random_slot(&state);
+ page->freelist = index_to_obj(cachep, page, objfreelist) +
+ obj_offset(cachep);
+ count--;
+ }
+
+ /*
+ * On early boot, generate the list dynamically.
+ * Later use a pre-computed list for speed.
+ */
+ if (!precomputed) {
+ freelist_randomize(&state.rnd_state, page->freelist, count);
+ } else {
+ for (i = 0; i < count; i++)
+ set_free_obj(page, i, next_random_slot(&state));
+ }
+
+ if (OBJFREELIST_SLAB(cachep))
+ set_free_obj(page, cachep->num - 1, objfreelist);
+
+ return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+ struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
static void cache_init_objs(struct kmem_cache *cachep,
struct page *page)
{
int i;
void *objp;
+ bool shuffled;
cache_init_objs_debug(cachep, page);
- if (OBJFREELIST_SLAB(cachep)) {
+ /* Try to randomize the freelist if enabled */
+ shuffled = shuffle_freelist(cachep, page);
+
+ if (!shuffled && OBJFREELIST_SLAB(cachep)) {
page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
obj_offset(cachep);
}
@@ -2434,17 +2652,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
kasan_poison_object_data(cachep, objp);
}
- set_free_obj(page, i, i);
- }
-}
-
-static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
-{
- if (CONFIG_ZONE_DMA_FLAG) {
- if (flags & GFP_DMA)
- BUG_ON(!(cachep->allocflags & GFP_DMA));
- else
- BUG_ON(cachep->allocflags & GFP_DMA);
+ if (!shuffled)
+ set_free_obj(page, i, i);
}
}
@@ -2502,13 +2711,15 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, struct page *page)
+static struct page *cache_grow_begin(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
{
void *freelist;
size_t offset;
gfp_t local_flags;
+ int page_node;
struct kmem_cache_node *n;
+ struct page *page;
/*
* Be lazy and only check for valid flags here, keeping it out of the
@@ -2520,43 +2731,35 @@ static int cache_grow(struct kmem_cache *cachep,
}
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
- /* Take the node list lock to change the colour_next on this node */
check_irq_off();
- n = get_node(cachep, nodeid);
- spin_lock(&n->list_lock);
-
- /* Get colour for the slab, and cal the next value. */
- offset = n->colour_next;
- n->colour_next++;
- if (n->colour_next >= cachep->colour)
- n->colour_next = 0;
- spin_unlock(&n->list_lock);
-
- offset *= cachep->colour_off;
-
if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
/*
- * The test for missing atomic flag is performed here, rather than
- * the more obvious place, simply to reduce the critical path length
- * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
- * will eventually be caught here (where it matters).
- */
- kmem_flagcheck(cachep, flags);
-
- /*
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- if (!page)
- page = kmem_getpages(cachep, local_flags, nodeid);
+ page = kmem_getpages(cachep, local_flags, nodeid);
if (!page)
goto failed;
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
+
+ /* Get colour for the slab, and cal the next value. */
+ n->colour_next++;
+ if (n->colour_next >= cachep->colour)
+ n->colour_next = 0;
+
+ offset = n->colour_next;
+ if (offset >= cachep->colour)
+ offset = 0;
+
+ offset *= cachep->colour_off;
+
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
- local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+ local_flags & ~GFP_CONSTRAINT_MASK, page_node);
if (OFF_SLAB(cachep) && !freelist)
goto opps1;
@@ -2567,21 +2770,40 @@ static int cache_grow(struct kmem_cache *cachep,
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- check_irq_off();
- spin_lock(&n->list_lock);
- /* Make slab active. */
- list_add_tail(&page->lru, &(n->slabs_free));
- STATS_INC_GROWN(cachep);
- n->free_objects += cachep->num;
- spin_unlock(&n->list_lock);
- return 1;
+ return page;
+
opps1:
kmem_freepages(cachep, page);
failed:
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- return 0;
+ return NULL;
+}
+
+static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+{
+ struct kmem_cache_node *n;
+ void *list = NULL;
+
+ check_irq_off();
+
+ if (!page)
+ return;
+
+ INIT_LIST_HEAD(&page->lru);
+ n = get_node(cachep, page_to_nid(page));
+
+ spin_lock(&n->list_lock);
+ if (!page->active)
+ list_add_tail(&page->lru, &(n->slabs_free));
+ else
+ fixup_slab_list(cachep, n, page, &list);
+ STATS_INC_GROWN(cachep);
+ n->free_objects += cachep->num - page->active;
+ spin_unlock(&n->list_lock);
+
+ fixup_objfreelist_debug(cachep, &list);
}
#if DEBUG
@@ -2785,18 +3007,42 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
return obj;
}
+/*
+ * Slab list should be fixed up by fixup_slab_list() for existing slab
+ * or cache_grow_end() for new slab
+ */
+static __always_inline int alloc_block(struct kmem_cache *cachep,
+ struct array_cache *ac, struct page *page, int batchcount)
+{
+ /*
+ * There must be at least one object available for
+ * allocation.
+ */
+ BUG_ON(page->active >= cachep->num);
+
+ while (page->active < cachep->num && batchcount--) {
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+ }
+
+ return batchcount;
+}
+
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
- struct array_cache *ac;
+ struct array_cache *ac, *shared;
int node;
void *list = NULL;
+ struct page *page;
check_irq_off();
node = numa_mem_id();
-retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2810,16 +3056,20 @@ retry:
n = get_node(cachep, node);
BUG_ON(ac->avail > 0 || !n);
+ shared = READ_ONCE(n->shared);
+ if (!n->free_objects && (!shared || !shared->avail))
+ goto direct_grow;
+
spin_lock(&n->list_lock);
+ shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
- if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
- n->shared->touched = 1;
+ if (shared && transfer_objects(ac, shared, batchcount)) {
+ shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
- struct page *page;
/* Get slab alloc is to come from. */
page = get_first_slab(n, false);
if (!page)
@@ -2827,21 +3077,7 @@ retry:
check_spinlock_acquired(cachep);
- /*
- * The slab was either on partial or free list so
- * there must be at least one object available for
- * allocation.
- */
- BUG_ON(page->active >= cachep->num);
-
- while (page->active < cachep->num && batchcount--) {
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
-
- ac->entry[ac->avail++] = slab_get_obj(cachep, page);
- }
-
+ batchcount = alloc_block(cachep, ac, page, batchcount);
fixup_slab_list(cachep, n, page, &list);
}
@@ -2851,9 +3087,8 @@ alloc_done:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
+direct_grow:
if (unlikely(!ac->avail)) {
- int x;
-
/* Check if we can use obj in pfmemalloc slab */
if (sk_memalloc_socks()) {
void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
@@ -2862,18 +3097,19 @@ alloc_done:
return obj;
}
- x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
- /* cache_grow can reenable interrupts, then ac could change. */
+ /*
+ * cache_grow_begin() can reenable interrupts,
+ * then ac could change.
+ */
ac = cpu_cache_get(cachep);
- node = numa_mem_id();
+ if (!ac->avail && page)
+ alloc_block(cachep, ac, page, batchcount);
+ cache_grow_end(cachep, page);
- /* no objects in sight? abort */
- if (!x && ac->avail == 0)
+ if (!ac->avail)
return NULL;
-
- if (!ac->avail) /* objects refilled by interrupt? */
- goto retry;
}
ac->touched = 1;
@@ -2884,9 +3120,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
gfp_t flags)
{
might_sleep_if(gfpflags_allow_blocking(flags));
-#if DEBUG
- kmem_flagcheck(cachep, flags);
-#endif
}
#if DEBUG
@@ -2998,19 +3231,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
struct zonelist *zonelist;
- gfp_t local_flags;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
+ struct page *page;
int nid;
unsigned int cpuset_mems_cookie;
if (flags & __GFP_THISNODE)
return NULL;
- local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
@@ -3040,33 +3271,19 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- struct page *page;
-
- if (gfpflags_allow_blocking(local_flags))
- local_irq_enable();
- kmem_flagcheck(cache, flags);
- page = kmem_getpages(cache, local_flags, numa_mem_id());
- if (gfpflags_allow_blocking(local_flags))
- local_irq_disable();
+ page = cache_grow_begin(cache, flags, numa_mem_id());
+ cache_grow_end(cache, page);
if (page) {
+ nid = page_to_nid(page);
+ obj = ____cache_alloc_node(cache,
+ gfp_exact_node(flags), nid);
+
/*
- * Insert into the appropriate per node queues
+ * Another processor may allocate the objects in
+ * the slab since we are not holding any locks.
*/
- nid = page_to_nid(page);
- if (cache_grow(cache, flags, nid, page)) {
- obj = ____cache_alloc_node(cache,
- gfp_exact_node(flags), nid);
- if (!obj)
- /*
- * Another processor may allocate the
- * objects in the slab since we are
- * not holding any locks.
- */
- goto retry;
- } else {
- /* cache_grow already freed obj */
- obj = NULL;
- }
+ if (!obj)
+ goto retry;
}
}
@@ -3083,15 +3300,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
{
struct page *page;
struct kmem_cache_node *n;
- void *obj;
+ void *obj = NULL;
void *list = NULL;
- int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
n = get_node(cachep, nodeid);
BUG_ON(!n);
-retry:
check_irq_off();
spin_lock(&n->list_lock);
page = get_first_slab(n, false);
@@ -3113,18 +3328,18 @@ retry:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
- goto done;
+ return obj;
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
- if (x)
- goto retry;
-
- return fallback_alloc(cachep, flags);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+ if (page) {
+ /* This slab isn't counted yet so don't update free_objects */
+ obj = slab_get_obj(cachep, page);
+ }
+ cache_grow_end(cachep, page);
-done:
- return obj;
+ return obj ? obj : fallback_alloc(cachep, flags);
}
static __always_inline void *
@@ -3242,6 +3457,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
{
int i;
struct kmem_cache_node *n = get_node(cachep, node);
+ struct page *page;
+
+ n->free_objects += nr_objects;
for (i = 0; i < nr_objects; i++) {
void *objp;
@@ -3254,17 +3472,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
- n->free_objects++;
/* fixup slab chains */
- if (page->active == 0) {
- if (n->free_objects > n->free_limit) {
- n->free_objects -= cachep->num;
- list_add_tail(&page->lru, list);
- } else {
- list_add(&page->lru, &n->slabs_free);
- }
- } else {
+ if (page->active == 0)
+ list_add(&page->lru, &n->slabs_free);
+ else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@@ -3272,6 +3484,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
list_add_tail(&page->lru, &n->slabs_partial);
}
}
+
+ while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
+ n->free_objects -= cachep->num;
+
+ page = list_last_entry(&n->slabs_free, struct page, lru);
+ list_del(&page->lru);
+ list_add(&page->lru, list);
+ }
}
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
@@ -3645,72 +3865,19 @@ EXPORT_SYMBOL(kfree);
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
-static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
+static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
{
+ int ret;
int node;
struct kmem_cache_node *n;
- struct array_cache *new_shared;
- struct alien_cache **new_alien = NULL;
for_each_online_node(node) {
-
- if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit, gfp);
- if (!new_alien)
- goto fail;
- }
-
- new_shared = NULL;
- if (cachep->shared) {
- new_shared = alloc_arraycache(node,
- cachep->shared*cachep->batchcount,
- 0xbaadf00d, gfp);
- if (!new_shared) {
- free_alien_cache(new_alien);
- goto fail;
- }
- }
-
- n = get_node(cachep, node);
- if (n) {
- struct array_cache *shared = n->shared;
- LIST_HEAD(list);
-
- spin_lock_irq(&n->list_lock);
-
- if (shared)
- free_block(cachep, shared->entry,
- shared->avail, node, &list);
-
- n->shared = new_shared;
- if (!n->alien) {
- n->alien = new_alien;
- new_alien = NULL;
- }
- n->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
- slabs_destroy(cachep, &list);
- kfree(shared);
- free_alien_cache(new_alien);
- continue;
- }
- n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
- if (!n) {
- free_alien_cache(new_alien);
- kfree(new_shared);
+ ret = setup_kmem_cache_node(cachep, node, gfp, true);
+ if (ret)
goto fail;
- }
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
- n->shared = new_shared;
- n->alien = new_alien;
- n->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- cachep->node[node] = n;
}
+
return 0;
fail:
@@ -3752,7 +3919,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
cachep->shared = shared;
if (!prev)
- goto alloc_node;
+ goto setup_node;
for_each_online_cpu(cpu) {
LIST_HEAD(list);
@@ -3769,8 +3936,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
}
free_percpu(prev);
-alloc_node:
- return alloc_kmem_cache_node(cachep, gfp);
+setup_node:
+ return setup_kmem_cache_nodes(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3804,6 +3971,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
int shared = 0;
int batchcount = 0;
+ err = cache_random_seq_create(cachep, gfp);
+ if (err)
+ goto end;
+
if (!is_root_cache(cachep)) {
struct kmem_cache *root = memcg_root_cache(cachep);
limit = root->limit;
@@ -3857,6 +4028,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
batchcount = (limit + 1) / 2;
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
if (err)
pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
@@ -3869,29 +4041,26 @@ skip_setup:
* if drain_array() is used on the shared array.
*/
static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
- struct array_cache *ac, int force, int node)
+ struct array_cache *ac, int node)
{
LIST_HEAD(list);
- int tofree;
+
+ /* ac from n->shared can be freed if we don't hold the slab_mutex. */
+ check_mutex_acquired();
if (!ac || !ac->avail)
return;
- if (ac->touched && !force) {
+
+ if (ac->touched) {
ac->touched = 0;
- } else {
- spin_lock_irq(&n->list_lock);
- if (ac->avail) {
- tofree = force ? ac->avail : (ac->limit + 4) / 5;
- if (tofree > ac->avail)
- tofree = (ac->avail + 1) / 2;
- free_block(cachep, ac->entry, tofree, node, &list);
- ac->avail -= tofree;
- memmove(ac->entry, &(ac->entry[tofree]),
- sizeof(void *) * ac->avail);
- }
- spin_unlock_irq(&n->list_lock);
- slabs_destroy(cachep, &list);
+ return;
}
+
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, ac, node, false, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
}
/**
@@ -3929,7 +4098,7 @@ static void cache_reap(struct work_struct *w)
reap_alien(searchp, n);
- drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
+ drain_array(searchp, n, cpu_cache_get(searchp), node);
/*
* These are racy checks but it does not matter
@@ -3940,7 +4109,7 @@ static void cache_reap(struct work_struct *w)
n->next_reap = jiffies + REAPTIMEOUT_NODE;
- drain_array(searchp, n, n->shared, 0, node);
+ drain_array(searchp, n, n->shared, node);
if (n->free_touched)
n->free_touched = 0;
diff --git a/mm/slub.c b/mm/slub.c
index 4dbb109e..cf1faa4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
tmp.counters = counters_new;
/*
* page->counters can cover frozen/inuse/objects as well
- * as page->_count. If we assign to ->counters directly
- * we run the risk of losing updates to page->_count, so
+ * as page->_refcount. If we assign to ->counters directly
+ * we run the risk of losing updates to page->_refcount, so
* be careful and only assign to the fields we need.
*/
page->frozen = tmp.frozen;
@@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
* may return off node objects because partial slabs are obtained
* from other nodes and filled up.
*
- * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
- * defrag_ratio = 1000) then every (well almost) allocation will
- * first attempt to defrag slab caches on other nodes. This means
- * scanning over all nodes to look for partial slabs which may be
- * expensive if we do it every time we are trying to find a slab
+ * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
+ * (which makes defrag_ratio = 1000) then every (well almost)
+ * allocation will first attempt to defrag slab caches on other nodes.
+ * This means scanning over all nodes to look for partial slabs which
+ * may be expensive if we do it every time we are trying to find a slab
* with available objects.
*/
if (!s->remote_node_defrag_ratio ||
@@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
* s->cpu_partial is checked locklessly (see put_cpu_partial),
* so we have to make sure the change is visible.
*/
- kick_all_cpus_sync();
+ synchronize_sched();
}
flush_all(s);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 366ce35..0d457e7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
__SetPageLocked(new_page);
- SetPageSwapBacked(new_page);
+ __SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
radix_tree_preload_end();
@@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return new_page;
}
radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
__ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
diff --git a/mm/util.c b/mm/util.c
index 6cc81e7..8a1b3a1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -346,6 +346,29 @@ void *page_rmapping(struct page *page)
return __page_rmapping(page);
}
+/*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+ int i;
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ if (PageHuge(page))
+ return false;
+ for (i = 0; i < hpage_nr_pages(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 142cb61..dcfdfc1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
- * load is not satisfied before that of page->_count.
+ * load is not satisfied before that of page->_refcount.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
!list_empty(src); scan++) {
struct page *page;
- int nr_pages;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
@@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+ nr_taken += hpage_nr_pages(page);
list_move(&page->lru, dst);
- nr_taken += nr_pages;
break;
case -EBUSY:
@@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ update_lru_size(lruvec, lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
@@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&zone->lru_lock);
- reclaim_stat->recent_scanned[file] += nr_taken;
-
if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
@@ -1720,7 +1716,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
* But we had to alter page->flags anyway.
*/
@@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += nr_pages;
@@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
list_add(&page->lru, pages_to_free);
}
}
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}
@@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
- if (global_reclaim(sc))
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ update_lru_size(lruvec, lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
+ if (global_reclaim(sc))
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e43004..5b72a8a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -570,49 +570,18 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
#ifdef CONFIG_NUMA
/*
- * zonelist = the list of zones passed to the allocator
- * z = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
- if (z->zone_pgdat == preferred_zone->zone_pgdat) {
- __inc_zone_state(z, NUMA_HIT);
- } else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
- }
- if (z->node == ((flags & __GFP_OTHER_NODE) ?
- preferred_zone->node : numa_node_id()))
- __inc_zone_state(z, NUMA_LOCAL);
- else
- __inc_zone_state(z, NUMA_OTHER);
-}
-
-/*
* Determine the per node value of a stat item.
*/
unsigned long node_page_state(int node, enum zone_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
- return
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
- zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], item) +
- zone_page_state(&zones[ZONE_MOVABLE], item);
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_page_state(zones + i, item);
+
+ return count;
}
#endif
@@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!memmap_valid_within(pfn, page, zone))
continue;
+ if (page_zone(page) != zone)
+ continue;
+
mtype = get_pageblock_migratetype(page);
if (mtype < MIGRATE_TYPES)
@@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = min(block_end_pfn, end_pfn);
page = pfn_to_page(pfn);
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (PageBuddy(page)) {
pfn += (1UL << page_order(page)) - 1;
continue;
@@ -1378,6 +1354,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
static cpumask_var_t cpu_stat_off;
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+ refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ long val;
+ int err;
+ int i;
+
+ /*
+ * The regular update, every sysctl_stat_interval, may come later
+ * than expected: leaving a significant amount in per_cpu buckets.
+ * This is particularly misleading when checking a quantity of HUGE
+ * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
+ * which can equally be echo'ed to or cat'ted from (by root),
+ * can be used to update the stats just before reading them.
+ *
+ * Oh, and since global_page_state() etc. are so careful to hide
+ * transiently negative values, report an error here if any of
+ * the stats is negative, so we know to go looking for imbalance.
+ */
+ err = schedule_on_each_cpu(refresh_vm_stats);
+ if (err)
+ return err;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_stat[i]);
+ if (val < 0) {
+ switch (i) {
+ case NR_ALLOC_BATCH:
+ case NR_PAGES_SCANNED:
+ /*
+ * These are often seen to go negative in
+ * recent kernels, but not to go permanently
+ * negative. Whilst it would be nicer not to
+ * have exceptions, rooting them out would be
+ * another task, of rather low priority.
+ */
+ break;
+ default:
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
+ break;
+ }
+ }
+ }
+ if (err)
+ return err;
+ if (write)
+ *ppos += *lenp;
+ else
+ *lenp = 0;
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
static void vmstat_update(struct work_struct *w)
{
if (refresh_cpu_vm_stats(true)) {
diff --git a/net/socket.c b/net/socket.c
index e7793f5..a1bd161 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
- struct timespec end_time;
+ struct timespec64 end_time;
+ struct timespec64 timeout64;
if (timeout &&
poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
flags |= MSG_DONTWAIT;
if (timeout) {
- ktime_get_ts(timeout);
- *timeout = timespec_sub(end_time, *timeout);
+ ktime_get_ts64(&timeout64);
+ *timeout = timespec64_to_timespec(
+ timespec64_sub(end_time, timeout64));
if (timeout->tv_sec < 0) {
timeout->tv_sec = timeout->tv_nsec = 0;
break;
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 219bd19..4e809e9 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
struct skb_shared_info *sh = skb_shinfo(skb);
int page_offset;
- atomic_inc(&page->_count);
+ page_ref_inc(page);
page_offset = ptr - page_address(page);
skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
}
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index 38b64f4..0254f3b 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -32,18 +32,21 @@ old = getsizes(sys.argv[1])
new = getsizes(sys.argv[2])
grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0
delta, common = [], {}
+otot, ntot = 0, 0
for a in old:
if a in new:
common[a] = 1
for name in old:
+ otot += old[name]
if name not in common:
remove += 1
down += old[name]
delta.append((-old[name], name))
for name in new:
+ ntot += new[name]
if name not in common:
add += 1
up += new[name]
@@ -63,3 +66,6 @@ print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \
print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta"))
for d, n in delta:
if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
+
+print("Total: Before=%d, After=%d, chg %f%%" % \
+ (otot, ntot, (ntot - otot)*100/otot))
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 00d6d53c..c332684 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -2,15 +2,17 @@
# (c) 2014, Sasha Levin <sasha.levin@oracle.com>
#set -x
-if [[ $# != 2 ]]; then
+if [[ $# < 2 ]]; then
echo "Usage:"
- echo " $0 [vmlinux] [base path]"
+ echo " $0 [vmlinux] [base path] [modules path]"
exit 1
fi
vmlinux=$1
basepath=$2
+modpath=$3
declare -A cache
+declare -A modcache
parse_symbol() {
# The structure of symbol at this point is:
@@ -19,6 +21,17 @@ parse_symbol() {
# For example:
# do_basic_setup+0x9c/0xbf
+ if [[ $module == "" ]] ; then
+ local objfile=$vmlinux
+ elif [[ "${modcache[$module]+isset}" == "isset" ]]; then
+ local objfile=${modcache[$module]}
+ else
+ [[ $modpath == "" ]] && return
+ local objfile=$(find "$modpath" -name $module.ko -print -quit)
+ [[ $objfile == "" ]] && return
+ modcache[$module]=$objfile
+ fi
+
# Remove the englobing parenthesis
symbol=${symbol#\(}
symbol=${symbol%\)}
@@ -29,11 +42,11 @@ parse_symbol() {
# Use 'nm vmlinux' to figure out the base address of said symbol.
# It's actually faster to call it every time than to load it
# all into bash.
- if [[ "${cache[$name]+isset}" == "isset" ]]; then
- local base_addr=${cache[$name]}
+ if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then
+ local base_addr=${cache[$module,$name]}
else
- local base_addr=$(nm "$vmlinux" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
- cache["$name"]="$base_addr"
+ local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
+ cache[$module,$name]="$base_addr"
fi
# Let's start doing the math to get the exact address into the
# symbol. First, strip out the symbol total length.
@@ -48,12 +61,12 @@ parse_symbol() {
local address=$(printf "%x\n" "$expr")
# Pass it to addr2line to get filename and line number
- # Could get more than one result
- if [[ "${cache[$address]+isset}" == "isset" ]]; then
- local code=${cache[$address]}
+ # Could get more than one result
+ if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then
+ local code=${cache[$module,$address]}
else
- local code=$(addr2line -i -e "$vmlinux" "$address")
- cache[$address]=$code
+ local code=$(addr2line -i -e "$objfile" "$address")
+ cache[$module,$address]=$code
fi
# addr2line doesn't return a proper error code if it fails, so
@@ -105,13 +118,23 @@ handle_line() {
fi
done
- # The symbol is the last element, process it
- symbol=${words[$last]}
+ if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then
+ module=${words[$last]}
+ module=${module#\[}
+ module=${module%\]}
+ symbol=${words[$last-1]}
+ unset words[$last-1]
+ else
+ # The symbol is the last element, process it
+ symbol=${words[$last]}
+ module=
+ fi
+
unset words[$last]
parse_symbol # modifies $symbol
# Add up the line number to the symbol
- echo "${words[@]}" "$symbol"
+ echo "${words[@]}" "$symbol $module"
}
while read line; do
@@ -121,8 +144,8 @@ while read line; do
handle_line "$line"
# Is it a code line?
elif [[ $line == *Code:* ]]; then
- decode_code "$line"
- else
+ decode_code "$line"
+ else
# Nothing special in this line, show it as is
echo "$line"
fi
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 946caf3..fa79c6d 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -428,6 +428,7 @@ feautures||features
fetaure||feature
fetaures||features
fileystem||filesystem
+fimware||firmware
finanize||finalize
findn||find
finilizes||finalizes
OpenPOWER on IntegriCloud