summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-10-26 19:33:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-10-26 19:33:41 -0700
commit345671ea0f9258f410eb057b9ced9cefbbe5dc78 (patch)
treefe97ba3d27679789e6aa34e39b002ee64ce25412
parent4904008165c8a1c48602b8316139691b8c735e6e (diff)
parent22146c3ce98962436e401f7b7016a6f664c9ffb5 (diff)
downloadop-kernel-dev-345671ea0f9258f410eb057b9ced9cefbbe5dc78.zip
op-kernel-dev-345671ea0f9258f410eb057b9ced9cefbbe5dc78.tar.gz
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - a few misc things - ocfs2 updates - most of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits) hugetlbfs: dirty pages as they are added to pagecache mm: export add_swap_extent() mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS tools/testing/selftests/vm/map_fixed_noreplace.c: add test for MAP_FIXED_NOREPLACE mm: thp: relocate flush_cache_range() in migrate_misplaced_transhuge_page() mm: thp: fix mmu_notifier in migrate_misplaced_transhuge_page() mm: thp: fix MADV_DONTNEED vs migrate_misplaced_transhuge_page race condition mm/kasan/quarantine.c: make quarantine_lock a raw_spinlock_t mm/gup: cache dev_pagemap while pinning pages Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved" mm: return zero_resv_unavail optimization mm: zero remaining unavailable struct pages tools/testing/selftests/vm/gup_benchmark.c: add MAP_HUGETLB option tools/testing/selftests/vm/gup_benchmark.c: add MAP_SHARED option tools/testing/selftests/vm/gup_benchmark.c: allow user specified file tools/testing/selftests/vm/gup_benchmark.c: fix 'write' flag usage mm/gup_benchmark.c: add additional pinning methods mm/gup_benchmark.c: time put_page() mm: don't raise MEMCG_OOM event due to failed high-order allocation mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock ...
-rw-r--r--Documentation/accounting/psi.txt73
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst22
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt12
-rw-r--r--Documentation/filesystems/proc.txt4
-rw-r--r--Documentation/vm/slub.rst12
-rw-r--r--Documentation/x86/pat.txt4
-rw-r--r--arch/alpha/Kconfig2
-rw-r--r--arch/alpha/kernel/core_irongate.c4
-rw-r--r--arch/alpha/kernel/setup.c98
-rw-r--r--arch/alpha/mm/numa.c113
-rw-r--r--arch/arm/include/asm/hugetlb-3level.h32
-rw-r--r--arch/arm/include/asm/hugetlb.h33
-rw-r--r--arch/arm64/include/asm/hugetlb.h39
-rw-r--r--arch/arm64/include/asm/string.h14
-rw-r--r--arch/arm64/kernel/arm64ksyms.c7
-rw-r--r--arch/arm64/lib/memchr.S2
-rw-r--r--arch/arm64/lib/memcmp.S2
-rw-r--r--arch/arm64/lib/strchr.S2
-rw-r--r--arch/arm64/lib/strcmp.S2
-rw-r--r--arch/arm64/lib/strlen.S2
-rw-r--r--arch/arm64/lib/strncmp.S2
-rw-r--r--arch/arm64/lib/strnlen.S2
-rw-r--r--arch/arm64/lib/strrchr.S2
-rw-r--r--arch/hexagon/Kconfig3
-rw-r--r--arch/hexagon/mm/init.c20
-rw-r--r--arch/ia64/include/asm/hugetlb.h47
-rw-r--r--arch/ia64/include/asm/pgtable.h1
-rw-r--r--arch/mips/include/asm/hugetlb.h40
-rw-r--r--arch/nios2/Kconfig3
-rw-r--r--arch/nios2/kernel/prom.c17
-rw-r--r--arch/nios2/kernel/setup.c39
-rw-r--r--arch/parisc/include/asm/hugetlb.h33
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/hugetlb.h43
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h1
-rw-r--r--arch/powerpc/platforms/cell/cpufreq_spudemand.c2
-rw-r--r--arch/powerpc/platforms/cell/spufs/sched.c9
-rw-r--r--arch/s390/appldata/appldata_os.c4
-rw-r--r--arch/sh/include/asm/hugetlb.h54
-rw-r--r--arch/sparc/include/asm/hugetlb.h40
-rw-r--r--arch/um/Kconfig2
-rw-r--r--arch/um/kernel/physmem.c22
-rw-r--r--arch/unicore32/Kconfig1
-rw-r--r--arch/unicore32/mm/init.c54
-rw-r--r--arch/x86/entry/vdso/vma.c24
-rw-r--r--arch/x86/include/asm/hugetlb.h69
-rw-r--r--arch/x86/kernel/e820.c15
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--arch/xtensa/include/asm/vga.h19
-rw-r--r--block/blk-iolatency.c8
-rw-r--r--drivers/base/node.c19
-rw-r--r--drivers/cpuidle/governors/menu.c4
-rw-r--r--drivers/infiniband/hw/hfi1/mmu_rb.c1
-rw-r--r--drivers/iommu/amd_iommu_v2.c1
-rw-r--r--drivers/iommu/intel-svm.c1
-rw-r--r--drivers/misc/sgi-gru/grutlbpurge.c1
-rw-r--r--drivers/of/fdt.c11
-rw-r--r--drivers/staging/android/ion/ion_page_pool.c8
-rw-r--r--fs/cramfs/inode.c5
-rw-r--r--fs/dcache.c38
-rw-r--r--fs/iomap.c2
-rw-r--r--fs/kernfs/mount.c3
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/aops.c3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmthread.c2
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/loadavg.c3
-rw-r--r--fs/proc/meminfo.c16
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/userfaultfd.c8
-rw-r--r--include/asm-generic/hugetlb.h88
-rw-r--r--include/asm-generic/pgtable.h4
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cgroup.h15
-rw-r--r--include/linux/delayacct.h23
-rw-r--r--include/linux/hmm.h2
-rw-r--r--include/linux/huge_mm.h8
-rw-r--r--include/linux/iomap.h4
-rw-r--r--include/linux/linkage.h1
-rw-r--r--include/linux/math64.h3
-rw-r--r--include/linux/memblock.h15
-rw-r--r--include/linux/memcontrol.h15
-rw-r--r--include/linux/mm.h48
-rw-r--r--include/linux/mmu_notifier.h27
-rw-r--r--include/linux/mmzone.h4
-rw-r--r--include/linux/page-flags.h14
-rw-r--r--include/linux/pfn_t.h4
-rw-r--r--include/linux/psi.h53
-rw-r--r--include/linux/psi_types.h92
-rw-r--r--include/linux/sched.h13
-rw-r--r--include/linux/sched/loadavg.h24
-rw-r--r--include/linux/slab.h56
-rw-r--r--include/linux/swap.h15
-rw-r--r--include/trace/events/mmflags.h1
-rw-r--r--include/uapi/linux/taskstats.h6
-rw-r--r--init/Kconfig19
-rw-r--r--kernel/cgroup/cgroup.c45
-rw-r--r--kernel/debug/kdb/kdb_main.c7
-rw-r--r--kernel/delayacct.c15
-rw-r--r--kernel/fork.c59
-rw-r--r--kernel/memremap.c25
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/loadavg.c139
-rw-r--r--kernel/sched/psi.c759
-rw-r--r--kernel/sched/sched.h178
-rw-r--r--kernel/sched/stats.h86
-rw-r--r--lib/test_kasan.c70
-rw-r--r--mm/compaction.c5
-rw-r--r--mm/debug.c46
-rw-r--r--mm/filemap.c37
-rw-r--r--mm/gup.c115
-rw-r--r--mm/gup_benchmark.c37
-rw-r--r--mm/hmm.c12
-rw-r--r--mm/huge_memory.c31
-rw-r--r--mm/hugetlb.c6
-rw-r--r--mm/kasan/quarantine.c18
-rw-r--r--mm/kmemleak.c42
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c54
-rw-r--r--mm/memory.c156
-rw-r--r--mm/memory_hotplug.c146
-rw-r--r--mm/mempolicy.c35
-rw-r--r--mm/migrate.c44
-rw-r--r--mm/mmap.c96
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/mremap.c20
-rw-r--r--mm/nommu.c6
-rw-r--r--mm/page-writeback.c33
-rw-r--r--mm/page_alloc.c362
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/slab.c8
-rw-r--r--mm/slab_common.c115
-rw-r--r--mm/slub.c83
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c1
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c83
-rw-r--r--mm/util.c5
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c31
-rw-r--r--mm/vmstat.c10
-rw-r--r--mm/workingset.c135
-rw-r--r--mm/zsmalloc.c2
-rwxr-xr-xscripts/tags.sh2
-rw-r--r--tools/accounting/getdelays.c8
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile1
-rw-r--r--tools/testing/selftests/vm/gup_benchmark.c42
-rw-r--r--tools/testing/selftests/vm/map_fixed_noreplace.c206
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c134
-rw-r--r--virt/kvm/kvm_main.c1
156 files changed, 3400 insertions, 1988 deletions
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
new file mode 100644
index 0000000..b8ca28b
--- /dev/null
+++ b/Documentation/accounting/psi.txt
@@ -0,0 +1,73 @@
+================================
+PSI - Pressure Stall Information
+================================
+
+:Date: April, 2018
+:Author: Johannes Weiner <hannes@cmpxchg.org>
+
+When CPU, memory or IO devices are contended, workloads experience
+latency spikes, throughput losses, and run the risk of OOM kills.
+
+Without an accurate measure of such contention, users are forced to
+either play it safe and under-utilize their hardware resources, or
+roll the dice and frequently suffer the disruptions resulting from
+excessive overcommit.
+
+The psi feature identifies and quantifies the disruptions caused by
+such resource crunches and the time impact it has on complex workloads
+or even entire systems.
+
+Having an accurate measure of productivity losses caused by resource
+scarcity aids users in sizing workloads to hardware--or provisioning
+hardware according to workload demand.
+
+As psi aggregates this information in realtime, systems can be managed
+dynamically using techniques such as load shedding, migrating jobs to
+other systems or data centers, or strategically pausing or killing low
+priority or restartable batch jobs.
+
+This allows maximizing hardware utilization without sacrificing
+workload health or risking major disruptions such as OOM kills.
+
+Pressure interface
+==================
+
+Pressure information for each resource is exported through the
+respective file in /proc/pressure/ -- cpu, memory, and io.
+
+The format for CPU is as such:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+and for memory and IO:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+The "some" line indicates the share of time in which at least some
+tasks are stalled on a given resource.
+
+The "full" line indicates the share of time in which all non-idle
+tasks are stalled on a given resource simultaneously. In this state
+actual CPU cycles are going to waste, and a workload that spends
+extended time in this state is considered to be thrashing. This has
+severe impact on performance, and it's useful to distinguish this
+situation from a state where some tasks are stalled but the CPU is
+still doing productive work. As such, time spent in this subset of the
+stall state is tracked separately and exported in the "full" averages.
+
+The ratios are tracked as recent trends over ten, sixty, and three
+hundred second windows, which gives insight into short term events as
+well as medium and long term trends. The total absolute stall time is
+tracked and exported as well, to allow detection of latency spikes
+which wouldn't necessarily make a dent in the time averages, or to
+average trends over custom time frames.
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index caf3610..8384c68 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -966,6 +966,12 @@ All time durations are in microseconds.
$PERIOD duration. "max" for $MAX indicates no limit. If only
one number is written, $MAX is updated.
+ cpu.pressure
+ A read-only nested-key file which exists on non-root cgroups.
+
+ Shows pressure stall information for CPU. See
+ Documentation/accounting/psi.txt for details.
+
Memory
------
@@ -1127,6 +1133,10 @@ PAGE_SIZE multiple when read back.
disk readahead. For now OOM in memory cgroup kills
tasks iff shortage has happened inside page fault.
+ This event is not raised if the OOM killer is not
+ considered as an option, e.g. for failed high-order
+ allocations.
+
oom_kill
The number of processes belonging to this cgroup
killed by any kind of OOM killer.
@@ -1271,6 +1281,12 @@ PAGE_SIZE multiple when read back.
higher than the limit for an extended period of time. This
reduces the impact on the workload and memory management.
+ memory.pressure
+ A read-only nested-key file which exists on non-root cgroups.
+
+ Shows pressure stall information for memory. See
+ Documentation/accounting/psi.txt for details.
+
Usage Guidelines
~~~~~~~~~~~~~~~~
@@ -1408,6 +1424,12 @@ IO Interface Files
8:16 rbps=2097152 wbps=max riops=max wiops=max
+ io.pressure
+ A read-only nested-key file which exists on non-root cgroups.
+
+ Shows pressure stall information for IO. See
+ Documentation/accounting/psi.txt for details.
+
Writeback
~~~~~~~~~
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 47ca5cd..b90fe3b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4851,6 +4851,18 @@
This is actually a boot loader parameter; the value is
passed to the kernel using a special protocol.
+ vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y.
+ May slow down system boot speed, especially when
+ enabled on systems with a large amount of memory.
+ All options are enabled by default, and this
+ interface is meant to allow for selectively
+ enabling or disabling specific virtual memory
+ debugging features.
+
+ Available options are:
+ P Enable page structure init time poisoning
+ - Disable all of the above options
+
vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact
size of <nn>. This can be used to increase the
minimum size (128MB on x86). It can also be used to
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22b4b00..12a5e6e 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -858,6 +858,7 @@ Writeback: 0 kB
AnonPages: 861800 kB
Mapped: 280372 kB
Shmem: 644 kB
+KReclaimable: 168048 kB
Slab: 284364 kB
SReclaimable: 159856 kB
SUnreclaim: 124508 kB
@@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables
ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
with huge pages
ShmemPmdMapped: Shared memory mapped into userspace with huge pages
+KReclaimable: Kernel allocations that the kernel will attempt to reclaim
+ under memory pressure. Includes SReclaimable (below), and other
+ direct allocations with a shrinker.
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index 3a775fd..1959288 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -36,9 +36,10 @@ debugging is enabled. Format:
slub_debug=<Debug-Options>
Enable options for all slabs
-slub_debug=<Debug-Options>,<slab name>
- Enable options only for select slabs
+slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
+ Enable options only for select slabs (no spaces
+ after a comma)
Possible debug options are::
@@ -62,7 +63,12 @@ Trying to find an issue in the dentry cache? Try::
slub_debug=,dentry
-to only enable debugging on the dentry cache.
+to only enable debugging on the dentry cache. You may use an asterisk at the
+end of the slab name, in order to cover all slabs with the same prefix. For
+example, here's how you can poison the dentry cache as well as all kmalloc
+slabs:
+
+ slub_debug=P,kmalloc-*,dentry
Red zoning and tracking may realign the slab. We can just apply sanity checks
to the dentry cache with::
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
index 2a4ee63..481d8d8 100644
--- a/Documentation/x86/pat.txt
+++ b/Documentation/x86/pat.txt
@@ -90,12 +90,12 @@ pci proc | -- | -- | WC |
Advanced APIs for drivers
-------------------------
A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range,
-vm_insert_pfn
+vmf_insert_pfn
Drivers wanting to export some pages to userspace do it by using mmap
interface and a combination of
1) pgprot_noncached()
-2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn()
+2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn()
With PAT support, a new API pgprot_writecombine is being added. So, drivers can
continue to use the above sequence, with either pgprot_noncached() or
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 5b4f883..620b0a7 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -31,6 +31,8 @@ config ALPHA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
+ select HAVE_MEMBLOCK
+ select NO_BOOTMEM
help
The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index aec7572..f709866 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <asm/ptrace.h>
#include <asm/cacheflush.h>
@@ -241,8 +242,7 @@ albacore_init_arch(void)
size / 1024);
}
#endif
- reserve_bootmem_node(NODE_DATA(0), pci_mem, memtop -
- pci_mem, BOOTMEM_DEFAULT);
+ memblock_reserve(pci_mem, memtop - pci_mem);
printk("irongate_init_arch: temporarily reserving "
"region %08lx-%08lx for PCI\n", pci_mem, memtop - 1);
}
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 5576f76..4f0d944 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -30,6 +30,7 @@
#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/pci.h>
#include <linux/seq_file.h>
#include <linux/root_dev.h>
@@ -312,9 +313,7 @@ setup_memory(void *kernel_end)
{
struct memclust_struct * cluster;
struct memdesc_struct * memdesc;
- unsigned long start_kernel_pfn, end_kernel_pfn;
- unsigned long bootmap_size, bootmap_pages, bootmap_start;
- unsigned long start, end;
+ unsigned long kernel_size;
unsigned long i;
/* Find free clusters, and init and free the bootmem accordingly. */
@@ -322,6 +321,8 @@ setup_memory(void *kernel_end)
(hwrpb->mddt_offset + (unsigned long) hwrpb);
for_each_mem_cluster(memdesc, cluster, i) {
+ unsigned long end;
+
printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n",
i, cluster->usage, cluster->start_pfn,
cluster->start_pfn + cluster->numpages);
@@ -335,6 +336,9 @@ setup_memory(void *kernel_end)
end = cluster->start_pfn + cluster->numpages;
if (end > max_low_pfn)
max_low_pfn = end;
+
+ memblock_add(PFN_PHYS(cluster->start_pfn),
+ cluster->numpages << PAGE_SHIFT);
}
/*
@@ -363,87 +367,9 @@ setup_memory(void *kernel_end)
max_low_pfn = mem_size_limit;
}
- /* Find the bounds of kernel memory. */
- start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
- end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
- bootmap_start = -1;
-
- try_again:
- if (max_low_pfn <= end_kernel_pfn)
- panic("not enough memory to boot");
-
- /* We need to know how many physically contiguous pages
- we'll need for the bootmap. */
- bootmap_pages = bootmem_bootmap_pages(max_low_pfn);
-
- /* Now find a good region where to allocate the bootmap. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = start + cluster->numpages;
- if (start >= max_low_pfn)
- continue;
- if (end > max_low_pfn)
- end = max_low_pfn;
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn
- && end - end_kernel_pfn >= bootmap_pages) {
- bootmap_start = end_kernel_pfn;
- break;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (end - start >= bootmap_pages) {
- bootmap_start = start;
- break;
- }
- }
-
- if (bootmap_start == ~0UL) {
- max_low_pfn >>= 1;
- goto try_again;
- }
-
- /* Allocate the bootmap and mark the whole MM as reserved. */
- bootmap_size = init_bootmem(bootmap_start, max_low_pfn);
-
- /* Mark the free regions. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = cluster->start_pfn + cluster->numpages;
- if (start >= max_low_pfn)
- continue;
- if (end > max_low_pfn)
- end = max_low_pfn;
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn) {
- free_bootmem(PFN_PHYS(start),
- (PFN_PHYS(start_kernel_pfn)
- - PFN_PHYS(start)));
- printk("freeing pages %ld:%ld\n",
- start, start_kernel_pfn);
- start = end_kernel_pfn;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (start >= end)
- continue;
-
- free_bootmem(PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
- printk("freeing pages %ld:%ld\n", start, end);
- }
-
- /* Reserve the bootmap memory. */
- reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size,
- BOOTMEM_DEFAULT);
- printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
+ /* Reserve the kernel memory. */
+ kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS;
+ memblock_reserve(KERNEL_START_PHYS, kernel_size);
#ifdef CONFIG_BLK_DEV_INITRD
initrd_start = INITRD_START;
@@ -459,8 +385,8 @@ setup_memory(void *kernel_end)
initrd_end,
phys_to_virt(PFN_PHYS(max_low_pfn)));
} else {
- reserve_bootmem(virt_to_phys((void *)initrd_start),
- INITRD_SIZE, BOOTMEM_DEFAULT);
+ memblock_reserve(virt_to_phys((void *)initrd_start),
+ INITRD_SIZE);
}
}
#endif /* CONFIG_BLK_DEV_INITRD */
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index a9e8647..26cd925 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/swap.h>
#include <linux/initrd.h>
#include <linux/pfn.h>
@@ -59,12 +60,10 @@ setup_memory_node(int nid, void *kernel_end)
struct memclust_struct * cluster;
struct memdesc_struct * memdesc;
unsigned long start_kernel_pfn, end_kernel_pfn;
- unsigned long bootmap_size, bootmap_pages, bootmap_start;
unsigned long start, end;
unsigned long node_pfn_start, node_pfn_end;
unsigned long node_min_pfn, node_max_pfn;
int i;
- unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
int show_init = 0;
/* Find the bounds of current node */
@@ -134,24 +133,14 @@ setup_memory_node(int nid, void *kernel_end)
/* Cute trick to make sure our local node data is on local memory */
node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
#endif
- /* Quasi-mark the pg_data_t as in-use */
- node_min_pfn += node_datasz;
- if (node_min_pfn >= node_max_pfn) {
- printk(" not enough mem to reserve NODE_DATA");
- return;
- }
- NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
-
printk(" Detected node memory: start %8lu, end %8lu\n",
node_min_pfn, node_max_pfn);
DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid));
- DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
/* Find the bounds of kernel memory. */
start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
- bootmap_start = -1;
if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
panic("kernel loaded out of ram");
@@ -161,89 +150,11 @@ setup_memory_node(int nid, void *kernel_end)
has much larger alignment than 8Mb, so it's safe. */
node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
- /* We need to know how many physically contiguous pages
- we'll need for the bootmap. */
- bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
-
- /* Now find a good region where to allocate the bootmap. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = start + cluster->numpages;
-
- if (start >= node_max_pfn || end <= node_min_pfn)
- continue;
-
- if (end > node_max_pfn)
- end = node_max_pfn;
- if (start < node_min_pfn)
- start = node_min_pfn;
-
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn
- && end - end_kernel_pfn >= bootmap_pages) {
- bootmap_start = end_kernel_pfn;
- break;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (end - start >= bootmap_pages) {
- bootmap_start = start;
- break;
- }
- }
-
- if (bootmap_start == -1)
- panic("couldn't find a contiguous place for the bootmap");
-
- /* Allocate the bootmap and mark the whole MM as reserved. */
- bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
- node_min_pfn, node_max_pfn);
- DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
- bootmap_start, bootmap_size, bootmap_pages);
+ memblock_add(PFN_PHYS(node_min_pfn),
+ (node_max_pfn - node_min_pfn) << PAGE_SHIFT);
- /* Mark the free regions. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = cluster->start_pfn + cluster->numpages;
-
- if (start >= node_max_pfn || end <= node_min_pfn)
- continue;
-
- if (end > node_max_pfn)
- end = node_max_pfn;
- if (start < node_min_pfn)
- start = node_min_pfn;
-
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn) {
- free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
- (PFN_PHYS(start_kernel_pfn)
- - PFN_PHYS(start)));
- printk(" freeing pages %ld:%ld\n",
- start, start_kernel_pfn);
- start = end_kernel_pfn;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (start >= end)
- continue;
-
- free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
- printk(" freeing pages %ld:%ld\n", start, end);
- }
-
- /* Reserve the bootmap memory. */
- reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start),
- bootmap_size, BOOTMEM_DEFAULT);
- printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
+ NODE_DATA(nid)->node_start_pfn = node_min_pfn;
+ NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn;
node_set_online(nid);
}
@@ -251,6 +162,7 @@ setup_memory_node(int nid, void *kernel_end)
void __init
setup_memory(void *kernel_end)
{
+ unsigned long kernel_size;
int nid;
show_mem_layout();
@@ -262,6 +174,9 @@ setup_memory(void *kernel_end)
for (nid = 0; nid < MAX_NUMNODES; nid++)
setup_memory_node(nid, kernel_end);
+ kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS;
+ memblock_reserve(KERNEL_START_PHYS, kernel_size);
+
#ifdef CONFIG_BLK_DEV_INITRD
initrd_start = INITRD_START;
if (initrd_start) {
@@ -279,9 +194,8 @@ setup_memory(void *kernel_end)
phys_to_virt(PFN_PHYS(max_low_pfn)));
} else {
nid = kvaddr_to_nid(initrd_start);
- reserve_bootmem_node(NODE_DATA(nid),
- virt_to_phys((void *)initrd_start),
- INITRD_SIZE, BOOTMEM_DEFAULT);
+ memblock_reserve(virt_to_phys((void *)initrd_start),
+ INITRD_SIZE);
}
}
#endif /* CONFIG_BLK_DEV_INITRD */
@@ -303,9 +217,8 @@ void __init paging_init(void)
dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
for_each_online_node(nid) {
- bootmem_data_t *bdata = &bootmem_node_data[nid];
- unsigned long start_pfn = bdata->node_min_pfn;
- unsigned long end_pfn = bdata->node_low_pfn;
+ unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
+ unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages;
if (dma_local_pfn >= end_pfn - start_pfn)
zones_size[ZONE_DMA] = end_pfn - start_pfn;
diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h
index d4014fb..0d9f391 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -29,6 +29,7 @@
* ptes.
* (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
*/
+#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
pte_t retval = *ptep;
@@ -37,35 +38,4 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return retval;
}
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- ptep_clear_flush(vma, addr, ptep);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- return ptep_get_and_clear(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
-
#endif /* _ASM_ARM_HUGETLB_3LEVEL_H */
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 7d26f6c..b67256c 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -23,18 +23,8 @@
#define _ASM_ARM_HUGETLB_H
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
-
#include <asm/hugetlb-3level.h>
-
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
+#include <asm-generic/hugetlb.h>
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr, unsigned long len)
@@ -42,27 +32,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return 0;
}
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index e73f685..fb66098 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,48 +20,18 @@
#include <asm/page.h>
+#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
return READ_ONCE(*ptep);
}
-
-
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr, unsigned long len)
{
return 0;
}
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
@@ -70,20 +40,25 @@ static inline void arch_clear_hugepage_flags(struct page *page)
extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
struct page *page, int writable);
#define arch_make_huge_pte arch_make_huge_pte
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty);
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTE_CLEAR
extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz);
-#define huge_pte_clear huge_pte_clear
extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned long sz);
#define set_huge_swap_pte_at set_huge_swap_pte_at
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..03a6c256 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -16,6 +16,7 @@
#ifndef __ASM_STRING_H
#define __ASM_STRING_H
+#ifndef CONFIG_KASAN
#define __HAVE_ARCH_STRRCHR
extern char *strrchr(const char *, int c);
@@ -34,6 +35,13 @@ extern __kernel_size_t strlen(const char *);
#define __HAVE_ARCH_STRNLEN
extern __kernel_size_t strnlen(const char *, __kernel_size_t);
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
+
+#define __HAVE_ARCH_MEMCHR
+extern void *memchr(const void *, int, __kernel_size_t);
+#endif
+
#define __HAVE_ARCH_MEMCPY
extern void *memcpy(void *, const void *, __kernel_size_t);
extern void *__memcpy(void *, const void *, __kernel_size_t);
@@ -42,16 +50,10 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
extern void *memmove(void *, const void *, __kernel_size_t);
extern void *__memmove(void *, const void *, __kernel_size_t);
-#define __HAVE_ARCH_MEMCHR
-extern void *memchr(const void *, int, __kernel_size_t);
-
#define __HAVE_ARCH_MEMSET
extern void *memset(void *, int, __kernel_size_t);
extern void *__memset(void *, int, __kernel_size_t);
-#define __HAVE_ARCH_MEMCMP
-extern int memcmp(const void *, const void *, size_t);
-
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE
void memcpy_flushcache(void *dst, const void *src, size_t cnt);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..72f63a5 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,20 +44,23 @@ EXPORT_SYMBOL(__arch_copy_in_user);
EXPORT_SYMBOL(memstart_addr);
/* string / mem functions */
+#ifndef CONFIG_KASAN
EXPORT_SYMBOL(strchr);
EXPORT_SYMBOL(strrchr);
EXPORT_SYMBOL(strcmp);
EXPORT_SYMBOL(strncmp);
EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(memcmp);
+EXPORT_SYMBOL(memchr);
+#endif
+
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(__memset);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(__memmove);
-EXPORT_SYMBOL(memchr);
-EXPORT_SYMBOL(memcmp);
/* atomic bitops */
EXPORT_SYMBOL(set_bit);
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 4444c1d..0f164a4 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -30,7 +30,7 @@
* Returns:
* x0 - address of first occurrence of 'c' or 0
*/
-ENTRY(memchr)
+WEAK(memchr)
and w1, w1, #0xff
1: subs x2, x2, #1
b.mi 2f
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index 2a4e239..fb295f5 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -58,7 +58,7 @@ pos .req x11
limit_wd .req x12
mask .req x13
-ENTRY(memcmp)
+WEAK(memcmp)
cbz limit, .Lret0
eor tmp1, src1, src2
tst tmp1, #7
diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S
index dae0cf5..7c83091 100644
--- a/arch/arm64/lib/strchr.S
+++ b/arch/arm64/lib/strchr.S
@@ -29,7 +29,7 @@
* Returns:
* x0 - address of first occurrence of 'c' or 0
*/
-ENTRY(strchr)
+WEAK(strchr)
and w1, w1, #0xff
1: ldrb w2, [x0], #1
cmp w2, w1
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..7d5d153 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,7 +60,7 @@ tmp3 .req x9
zeroones .req x10
pos .req x11
-ENTRY(strcmp)
+WEAK(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 55ccc8e..8e0b142 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -56,7 +56,7 @@ pos .req x12
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
-ENTRY(strlen)
+WEAK(strlen)
mov zeroones, #REP8_01
bic src, srcin, #15
ands tmp1, srcin, #15
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..66bd145 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,7 +64,7 @@ limit_wd .req x13
mask .req x14
endloop .req x15
-ENTRY(strncmp)
+WEAK(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
mov zeroones, #REP8_01
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
index eae38da..355be04 100644
--- a/arch/arm64/lib/strnlen.S
+++ b/arch/arm64/lib/strnlen.S
@@ -59,7 +59,7 @@ limit_wd .req x14
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
-ENTRY(strnlen)
+WEAK(strnlen)
cbz limit, .Lhit_limit
mov zeroones, #REP8_01
bic src, srcin, #15
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
index f8e2784..ea84924 100644
--- a/arch/arm64/lib/strrchr.S
+++ b/arch/arm64/lib/strrchr.S
@@ -29,7 +29,7 @@
* Returns:
* x0 - address of last occurrence of 'c' or 0
*/
-ENTRY(strrchr)
+WEAK(strrchr)
mov x3, #0
and w1, w1, #0xff
1: ldrb w2, [x0], #1
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 3ef4652..7b25d7c 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -21,6 +21,9 @@ config HEXAGON
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
+ select HAVE_MEMBLOCK
+ select ARCH_DISCARD_MEMBLOCK
+ select NO_BOOTMEM
select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
select GENERIC_IOMAP
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 1495d45..d789b9c 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <asm/atomic.h>
#include <linux/highmem.h>
#include <asm/tlb.h>
@@ -176,7 +177,6 @@ size_t hexagon_coherent_pool_size = (size_t) (DMA_RESERVE << 22);
void __init setup_arch_memory(void)
{
- int bootmap_size;
/* XXX Todo: this probably should be cleaned up */
u32 *segtable = (u32 *) &swapper_pg_dir[0];
u32 *segtable_end;
@@ -195,18 +195,22 @@ void __init setup_arch_memory(void)
bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) &
~((BIG_KERNEL_PAGE_SIZE) - 1));
+ memblock_add(PHYS_OFFSET,
+ (bootmem_lastpg - ARCH_PFN_OFFSET) << PAGE_SHIFT);
+
+ /* Reserve kernel text/data/bss */
+ memblock_reserve(PHYS_OFFSET,
+ (bootmem_startpg - ARCH_PFN_OFFSET) << PAGE_SHIFT);
/*
* Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached)
* memory allocation
*/
-
max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES);
min_low_pfn = ARCH_PFN_OFFSET;
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn);
+ memblock_reserve(PFN_PHYS(max_low_pfn), DMA_RESERVED_BYTES);
printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg);
printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg);
- printk(KERN_INFO "bootmap_size: %d\n", bootmap_size);
printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn);
printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn);
@@ -257,14 +261,6 @@ void __init setup_arch_memory(void)
#endif
/*
- * Free all the memory that wasn't taken up by the bootmap, the DMA
- * reserve, or kernel itself.
- */
- free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size,
- PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size -
- DMA_RESERVED_BYTES);
-
- /*
* The bootmem allocator seemingly just lives to feed memory
* to the paging system
*/
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index 74d2a55..36cc039 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -3,13 +3,13 @@
#define _ASM_IA64_HUGETLB_H
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
-
+#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
+#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len);
@@ -21,53 +21,16 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
}
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- return ptep_get_and_clear(mm, addr, ptep);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
-
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
+#include <asm-generic/hugetlb.h>
+
#endif /* _ASM_IA64_HUGETLB_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 1658277..b1e7468e 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr;
# ifdef CONFIG_VIRTUAL_MEM_MAP
/* arch mem_map init routine is needed due to holes in a virtual mem_map */
-# define __HAVE_ARCH_MEMMAP_INIT
extern void memmap_init (unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn);
# endif /* CONFIG_VIRTUAL_MEM_MAP */
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index 982bc06..425bb6f 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -10,8 +10,6 @@
#define __ASM_HUGETLB_H
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
-
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr,
@@ -20,6 +18,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return 0;
}
+#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr,
unsigned long len)
@@ -38,21 +37,7 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr,
- unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
@@ -64,29 +49,21 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma)));
}
+#define __HAVE_ARCH_HUGE_PTE_NONE
static inline int huge_pte_none(pte_t pte)
{
unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL;
return !val || (val == (unsigned long)invalid_pte_table);
}
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep, pte_t pte,
@@ -105,13 +82,10 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
+#include <asm-generic/hugetlb.h>
+
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 0396569..2df0c57 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -23,6 +23,9 @@ config NIOS2
select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT
select CPU_NO_EFFICIENT_FFS
+ select HAVE_MEMBLOCK
+ select ARCH_DISCARD_MEMBLOCK
+ select NO_BOOTMEM
config GENERIC_CSUM
def_bool y
diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c
index 8d7446a..a6d4f75 100644
--- a/arch/nios2/kernel/prom.c
+++ b/arch/nios2/kernel/prom.c
@@ -32,23 +32,6 @@
#include <asm/sections.h>
-void __init early_init_dt_add_memory_arch(u64 base, u64 size)
-{
- u64 kernel_start = (u64)virt_to_phys(_text);
-
- if (!memory_size &&
- (kernel_start >= base) && (kernel_start < (base + size)))
- memory_size = size;
-
-}
-
-int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
- bool nomap)
-{
- reserve_bootmem(base, size, BOOTMEM_DEFAULT);
- return 0;
-}
-
void __init early_init_devtree(void *params)
{
__be32 *dtb = (u32 *)__dtb_start;
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index 926a02b..2d0011d 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -17,6 +17,7 @@
#include <linux/sched/task.h>
#include <linux/console.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/initrd.h>
#include <linux/of_fdt.h>
#include <linux/screen_info.h>
@@ -143,10 +144,12 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6,
void __init setup_arch(char **cmdline_p)
{
- int bootmap_size;
+ int dram_start;
console_verbose();
+ dram_start = memblock_start_of_DRAM();
+ memory_size = memblock_phys_mem_size();
memory_start = PAGE_ALIGN((unsigned long)__pa(_end));
memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size;
@@ -163,39 +166,11 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = PFN_DOWN(memory_end);
max_mapnr = max_low_pfn;
- /*
- * give all the memory to the bootmap allocator, tell it to put the
- * boot mem_map at the start of memory
- */
- pr_debug("init_bootmem_node(?,%#lx, %#x, %#lx)\n",
- min_low_pfn, PFN_DOWN(PHYS_OFFSET), max_low_pfn);
- bootmap_size = init_bootmem_node(NODE_DATA(0),
- min_low_pfn, PFN_DOWN(PHYS_OFFSET),
- max_low_pfn);
-
- /*
- * free the usable memory, we have to make sure we do not free
- * the bootmem bitmap so we then reserve it after freeing it :-)
- */
- pr_debug("free_bootmem(%#lx, %#lx)\n",
- memory_start, memory_end - memory_start);
- free_bootmem(memory_start, memory_end - memory_start);
-
- /*
- * Reserve the bootmem bitmap itself as well. We do this in two
- * steps (first step was init_bootmem()) because this catches
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- *
- * Arguments are start, size
- */
- pr_debug("reserve_bootmem(%#lx, %#x)\n", memory_start, bootmap_size);
- reserve_bootmem(memory_start, bootmap_size, BOOTMEM_DEFAULT);
-
+ memblock_reserve(dram_start, memory_start - dram_start);
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start) {
- reserve_bootmem(virt_to_phys((void *)initrd_start),
- initrd_end - initrd_start, BOOTMEM_DEFAULT);
+ memblock_reserve(virt_to_phys((void *)initrd_start),
+ initrd_end - initrd_start);
}
#endif /* CONFIG_BLK_DEV_INITRD */
diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h
index 58e0f46..7cb595d 100644
--- a/arch/parisc/include/asm/hugetlb.h
+++ b/arch/parisc/include/asm/hugetlb.h
@@ -3,12 +3,12 @@
#define _ASM_PARISC64_HUGETLB_H
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
-
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
@@ -22,6 +22,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
*/
+#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
@@ -32,43 +33,25 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty);
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
+#include <asm-generic/hugetlb.h>
+
#endif /* _ASM_PARISC64_HUGETLB_H */
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index e61dd3a..c21d337 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -311,12 +311,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
{
pte_update(ptep, _PAGE_RW, 0);
}
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
pte_t *ptep, pte_t entry,
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb5dd40..c4a726c 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -426,6 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0);
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 2d00cc5..383da1a 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -4,7 +4,6 @@
#ifdef CONFIG_HUGETLB_PAGE
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
extern struct kmem_cache *hugepte_cache;
@@ -110,31 +109,12 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
#endif
+#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
@@ -145,6 +125,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
#endif
}
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -153,29 +134,17 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
flush_hugetlb_page(vma, addr);
}
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty);
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
+#include <asm-generic/hugetlb.h>
+
#else /* ! CONFIG_HUGETLB_PAGE */
static inline void flush_hugetlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index f7b129a..3ffb0ff 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -300,12 +300,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_update(ptep, clr, set);
}
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
pte_t *ptep, pte_t entry,
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index dc6bb9da..67421f7 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -275,6 +275,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
index 882944c..5d8e8b6 100644
--- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c
+++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
@@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info)
cpu = info->policy->cpu;
busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus);
- CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1);
+ info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1);
pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n",
cpu, busy_spus, info->busy_spus);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index c9ef3c5..9fcccb4 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -987,9 +987,9 @@ static void spu_calc_load(void)
unsigned long active_tasks; /* fixed-point */
active_tasks = count_active_contexts() * FIXED_1;
- CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
+ spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks);
+ spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks);
+ spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks);
}
static void spusched_wake(struct timer_list *unused)
@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx,
}
}
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
static int show_spu_loadavg(struct seq_file *s, void *private)
{
int a, b, c;
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 433a994..54f3756 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -25,10 +25,6 @@
#include "appldata.h"
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
/*
* OS data
*
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 735939c..6f025fe 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -4,8 +4,6 @@
#include <asm/cacheflush.h>
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
-
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr,
@@ -17,6 +15,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
*/
+#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
@@ -27,62 +26,17 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- return ptep_get_and_clear(mm, addr, ptep);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
-
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
}
+#include <asm-generic/hugetlb.h>
+
#endif /* _ASM_SH_HUGETLB_H */
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 300557c..3963f80 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -3,7 +3,6 @@
#define _ASM_SPARC64_HUGETLB_H
#include <asm/page.h>
-#include <asm-generic/hugetlb.h>
#ifdef CONFIG_HUGETLB_PAGE
struct pud_huge_patch_entry {
@@ -13,9 +12,11 @@ struct pud_huge_patch_entry {
extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end;
#endif
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
@@ -25,37 +26,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return 0;
}
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
@@ -63,6 +40,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty)
@@ -75,17 +53,15 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
+#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
+#include <asm-generic/hugetlb.h>
+
#endif /* _ASM_SPARC64_HUGETLB_H */
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 6b99389..10c15b8 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -12,6 +12,8 @@ config UML
select HAVE_UID16
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_DEBUG_KMEMLEAK
+ select HAVE_MEMBLOCK
+ select NO_BOOTMEM
select GENERIC_IRQ_SHOW
select GENERIC_CPU_DEVICES
select GENERIC_CLOCKEVENTS
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index f02596e..296a91a 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
#include <asm/page.h>
@@ -80,28 +81,23 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
unsigned long len, unsigned long long highmem)
{
unsigned long reserve = reserve_end - start;
- unsigned long pfn = PFN_UP(__pa(reserve_end));
- unsigned long delta = (len - reserve) >> PAGE_SHIFT;
- unsigned long offset, bootmap_size;
- long map_size;
+ long map_size = len - reserve;
int err;
- offset = uml_reserved - uml_physmem;
- map_size = len - offset;
if(map_size <= 0) {
os_warn("Too few physical memory! Needed=%lu, given=%lu\n",
- offset, len);
+ reserve, len);
exit(1);
}
physmem_fd = create_mem_file(len + highmem);
- err = os_map_memory((void *) uml_reserved, physmem_fd, offset,
+ err = os_map_memory((void *) reserve_end, physmem_fd, reserve,
map_size, 1, 1, 1);
if (err < 0) {
os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p "
"failed - errno = %d\n", map_size,
- (void *) uml_reserved, err);
+ (void *) reserve_end, err);
exit(1);
}
@@ -113,9 +109,11 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE);
os_fsync_file(physmem_fd);
- bootmap_size = init_bootmem(pfn, pfn + delta);
- free_bootmem(__pa(reserve_end) + bootmap_size,
- len - bootmap_size - reserve);
+ memblock_add(__pa(start), len + highmem);
+ memblock_reserve(__pa(start), reserve);
+
+ min_low_pfn = PFN_UP(__pa(reserve_end));
+ max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT);
}
int phys_mapping(unsigned long phys, unsigned long long *offset_out)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 3a3b40f..0c5111b 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -6,6 +6,7 @@ config UNICORE32
select ARCH_MIGHT_HAVE_PC_SERIO
select DMA_DIRECT_OPS
select HAVE_MEMBLOCK
+ select NO_BOOTMEM
select HAVE_GENERIC_DMA_COHERENT
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 5f72a8d..8f8699e 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -84,58 +84,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low,
}
}
-static void __init uc32_bootmem_init(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- struct memblock_region *reg;
- unsigned int boot_pages;
- phys_addr_t bitmap;
- pg_data_t *pgdat;
-
- /*
- * Allocate the bootmem bitmap page. This must be in a region
- * of memory which has already been mapped.
- */
- boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
- bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
- __pfn_to_phys(end_pfn));
-
- /*
- * Initialise the bootmem allocator, handing the
- * memory banks over to bootmem.
- */
- node_set_online(0);
- pgdat = NODE_DATA(0);
- init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
-
- /* Free the lowmem regions from memblock into bootmem. */
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
- if (end >= end_pfn)
- end = end_pfn;
- if (start >= end)
- break;
-
- free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
- }
-
- /* Reserve the lowmem memblock reserved regions in bootmem. */
- for_each_memblock(reserved, reg) {
- unsigned long start = memblock_region_reserved_base_pfn(reg);
- unsigned long end = memblock_region_reserved_end_pfn(reg);
-
- if (end >= end_pfn)
- end = end_pfn;
- if (start >= end)
- break;
-
- reserve_bootmem(__pfn_to_phys(start),
- (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
- }
-}
-
static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low,
unsigned long max_high)
{
@@ -232,7 +180,7 @@ void __init bootmem_init(void)
find_limits(&min, &max_low, &max_high);
- uc32_bootmem_init(min, max_low);
+ node_set_online(0);
/*
* Sparsemem tries to allocate bootmem in memory_present(),
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 3f9d43f..7eb8785 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -39,7 +39,7 @@ void __init init_vdso_image(const struct vdso_image *image)
struct linux_binprm;
-static int vdso_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
@@ -84,12 +84,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
-static int vvar_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
long sym_offset;
- int ret = -EFAULT;
if (!image)
return VM_FAULT_SIGBUS;
@@ -108,29 +107,24 @@ static int vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
if (sym_offset == image->sym_vvar_page) {
- ret = vm_insert_pfn(vma, vmf->address,
- __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+ return vmf_insert_pfn(vma, vmf->address,
+ __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
pvclock_get_pvti_cpu0_va();
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
- ret = vm_insert_pfn_prot(
- vma,
- vmf->address,
- __pa(pvti) >> PAGE_SHIFT,
- pgprot_decrypted(vma->vm_page_prot));
+ return vmf_insert_pfn_prot(vma, vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
}
} else if (sym_offset == image->sym_hvclock_page) {
struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
- ret = vm_insert_pfn(vma, vmf->address,
- vmalloc_to_pfn(tsc_pg));
+ return vmf_insert_pfn(vma, vmf->address,
+ vmalloc_to_pfn(tsc_pg));
}
- if (ret == 0 || ret == -EBUSY)
- return VM_FAULT_NOPAGE;
-
return VM_FAULT_SIGBUS;
}
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 5ed826d..7469d32 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -13,75 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return 0;
}
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- return ptep_get_and_clear(mm, addr, ptep);
-}
-
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- ptep_clear_flush(vma, addr, ptep);
-}
-
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
-
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c88c23c..d1f25c8 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1248,7 +1248,6 @@ void __init e820__memblock_setup(void)
{
int i;
u64 end;
- u64 addr = 0;
/*
* The bootstrap memblock region count maximum is 128 entries
@@ -1265,21 +1264,13 @@ void __init e820__memblock_setup(void)
struct e820_entry *entry = &e820_table->entries[i];
end = entry->addr + entry->size;
- if (addr < entry->addr)
- memblock_reserve(addr, entry->addr - addr);
- addr = end;
if (end != (resource_size_t)end)
continue;
- /*
- * all !E820_TYPE_RAM ranges (including gap ranges) are put
- * into memblock.reserved to make sure that struct pages in
- * such regions are not left uninitialized after bootup.
- */
if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
- memblock_reserve(entry->addr, entry->size);
- else
- memblock_add(entry->addr, entry->size);
+ continue;
+
+ memblock_add(entry->addr, entry->size);
}
/* Throw away partial pages: */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 82c75643..3310ade 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -26,5 +26,6 @@ generic-y += rwsem.h
generic-y += sections.h
generic-y += topology.h
generic-y += trace_clock.h
+generic-y += vga.h
generic-y += word-at-a-time.h
generic-y += xor.h
diff --git a/arch/xtensa/include/asm/vga.h b/arch/xtensa/include/asm/vga.h
deleted file mode 100644
index 1fd8cab..0000000
--- a/arch/xtensa/include/asm/vga.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * include/asm-xtensa/vga.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2001 - 2005 Tensilica Inc.
- */
-
-#ifndef _XTENSA_VGA_H
-#define _XTENSA_VGA_H
-
-#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x)
-
-#define vga_readb(x) (*(x))
-#define vga_writeb(x,y) (*(y) = (x))
-
-#endif
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 35c48d7..28f80d2 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -153,7 +153,7 @@ struct iolatency_grp {
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
* These are the constants used to fake the fixed-point moving average
- * calculation just like load average. The call to CALC_LOAD folds
+ * calculation just like load average. The call to calc_load() folds
* (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
* window size is bucketed to try to approximately calculate average
* latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
@@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
return;
/*
- * CALC_LOAD takes in a number stored in fixed point representation.
+ * calc_load() takes in a number stored in fixed point representation.
* Because we are using this for IO time in ns, the values stored
* are significantly larger than the FIXED_1 denominator (2048).
* Therefore, rounding errors in the calculation are negligible and
@@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
div64_u64(iolat->cur_win_nsec,
BLKIOLATENCY_EXP_BUCKET_SIZE));
- CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean);
+ iolat->lat_avg = calc_load(iolat->lat_avg,
+ iolatency_exp_factors[exp_idx],
+ stat->rqs.mean);
}
static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 1ac4c36..86d6cd9 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -67,8 +67,11 @@ static ssize_t node_read_meminfo(struct device *dev,
int nid = dev->id;
struct pglist_data *pgdat = NODE_DATA(nid);
struct sysinfo i;
+ unsigned long sreclaimable, sunreclaimable;
si_meminfo_node(&i, nid);
+ sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
+ sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
n = sprintf(buf,
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
@@ -118,6 +121,7 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d NFS_Unstable: %8lu kB\n"
"Node %d Bounce: %8lu kB\n"
"Node %d WritebackTmp: %8lu kB\n"
+ "Node %d KReclaimable: %8lu kB\n"
"Node %d Slab: %8lu kB\n"
"Node %d SReclaimable: %8lu kB\n"
"Node %d SUnreclaim: %8lu kB\n"
@@ -138,20 +142,21 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) +
- node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)),
- nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)),
+ nid, K(sreclaimable +
+ node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
+ nid, K(sreclaimable + sunreclaimable),
+ nid, K(sreclaimable),
+ nid, K(sunreclaimable)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)),
+ ,
nid, K(node_page_state(pgdat, NR_ANON_THPS) *
HPAGE_PMD_NR),
nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
HPAGE_PMD_NR),
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
- HPAGE_PMD_NR));
-#else
- nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)));
+ HPAGE_PMD_NR)
#endif
+ );
n += hugetlb_report_node_meminfo(nid, buf + n);
return n;
}
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 575a68f..7197960 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -130,10 +130,6 @@ struct menu_device {
int interval_ptr;
};
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
static inline int get_loadavg(unsigned long load)
{
return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index e1c7996..475b769 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler,
static void handle_remove(struct work_struct *work);
static const struct mmu_notifier_ops mn_opts = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = mmu_notifier_range_start,
};
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 58da65d..fd55223 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops iommu_mn = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = mn_release,
.clear_flush_young = mn_clear_flush_young,
.invalidate_range = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 4a03e50..db301ef 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops intel_mmuops = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = intel_mm_release,
.change_pte = intel_change_pte,
.invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index be28f05..03b49d5 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops gru_mmuops = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = gru_invalidate_range_start,
.invalidate_range_end = gru_invalidate_range_end,
.release = gru_release,
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 800ad25..76c83c1 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1127,12 +1127,13 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
{
const u64 phys_offset = MIN_MEMBLOCK_ADDR;
+ if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
+ pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
+ base, base + size);
+ return;
+ }
+
if (!PAGE_ALIGNED(base)) {
- if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
- pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
- base, base + size);
- return;
- }
size -= PAGE_SIZE - (base & ~PAGE_MASK);
base = PAGE_ALIGN(base);
}
diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c
index 9bc56eb..0d2a959 100644
--- a/drivers/staging/android/ion/ion_page_pool.c
+++ b/drivers/staging/android/ion/ion_page_pool.c
@@ -33,8 +33,8 @@ static void ion_page_pool_add(struct ion_page_pool *pool, struct page *page)
pool->low_count++;
}
- mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES,
- (1 << (PAGE_SHIFT + pool->order)));
+ mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
+ 1 << pool->order);
mutex_unlock(&pool->mutex);
}
@@ -53,8 +53,8 @@ static struct page *ion_page_pool_remove(struct ion_page_pool *pool, bool high)
}
list_del(&page->lru);
- mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES,
- -(1 << (PAGE_SHIFT + pool->order)));
+ mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
+ -(1 << pool->order));
return page;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f408994..0c35e62 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
int i;
vma->vm_flags |= VM_MIXEDMAP;
for (i = 0; i < pages && !ret; i++) {
+ vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ if (vmf & VM_FAULT_ERROR)
+ ret = vm_fault_to_errno(vmf, 0);
}
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d8..c2e443f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
-static void __d_free_external_name(struct rcu_head *head)
-{
- struct external_name *name = container_of(head, struct external_name,
- u.head);
-
- mod_node_page_state(page_pgdat(virt_to_page(name)),
- NR_INDIRECTLY_RECLAIMABLE_BYTES,
- -ksize(name));
-
- kfree(name);
-}
-
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
-
- __d_free_external_name(&external_name(dentry)->u.head);
-
+ kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
}
@@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name)
struct external_name *p;
p = container_of(name->name, struct external_name, name[0]);
if (unlikely(atomic_dec_and_test(&p->u.count)))
- call_rcu(&p->u.head, __d_free_external_name);
+ kfree_rcu(p, u.head);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
@@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate);
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
- struct external_name *ext = NULL;
struct dentry *dentry;
char *dname;
int err;
@@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_iname;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
-
- ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT);
- if (!ext) {
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT |
+ __GFP_RECLAIMABLE);
+ if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
- atomic_set(&ext->u.count, 1);
- dname = ext->name;
+ atomic_set(&p->u.count, 1);
+ dname = p->name;
} else {
dname = dentry->d_iname;
}
@@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
}
}
- if (unlikely(ext)) {
- pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
- mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
- ksize(ext));
- }
-
this_cpu_inc(nr_dentry);
return dentry;
@@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
dentry->d_name.hash_len = target->d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
- call_rcu(&old_name->u.head, __d_free_external_name);
+ kfree_rcu(old_name, u.head);
}
/*
diff --git a/fs/iomap.c b/fs/iomap.c
index ec15cf2..90c2feb 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
return length;
}
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index ff2716f..fdf527b 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
sb->s_export_op = &kernfs_export_ops;
sb->s_time_gran = 1;
+ /* sysfs dentries and inodes don't require IO to create */
+ sb->s_shrink.seeks = 0;
+
/* get root inode, initialize and unlock it */
mutex_lock(&kernfs_mutex);
inode = kernfs_get_inode(sb, info->root->kn);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a342f00..d1cbb27 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle,
* rightmost extent list.
*/
if (path->p_tree_depth) {
- struct ocfs2_extent_block *eb;
-
ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
@@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle,
mlog_errno(ret);
goto out;
}
-
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
}
if (rec->e_cpos == split_rec->e_cpos &&
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 302cd7c..da578ad 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1392,8 +1392,7 @@ retry:
unlock:
spin_unlock(&oi->ip_lock);
out:
- if (new)
- kfree(new);
+ kfree(new);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 9b984ca..1d6dc84 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
char *buf;
- buf = (char *) get_zeroed_page(GFP_NOFS);
+ buf = (char *) get_zeroed_page(GFP_ATOMIC);
if (buf) {
dump_mle(mle, buf, PAGE_SIZE - 1);
free_page((unsigned long)buf);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 838a06d..074d5de 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
/* don't shuffle secondary queues */
- if ((res->owner == dlm->node_num)) {
+ if (res->owner == dlm->node_num) {
if (res->state & (DLM_LOCK_RES_MIGRATING |
DLM_LOCK_RES_BLOCK_DIRTY))
return;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee14..1114ef0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
- struct ocfs2_refcount_block *rb;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
struct ocfs2_refcount_tree *ref_tree;
@@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
mlog_errno(ret);
goto out;
}
- rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
&ref_tree->rf_ci, ref_root_bh,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fc5306a..5792f9e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
*/
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink.seeks = 0;
+
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
if (!root_inode) {
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index d066947..8468bae 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -10,9 +10,6 @@
#include <linux/seqlock.h>
#include <linux/time.h>
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
static int loadavg_proc_show(struct seq_file *m, void *v)
{
unsigned long avnrun[3];
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index edda898..568d90e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
long cached;
long available;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long sreclaimable, sunreclaim;
int lru;
si_meminfo(&i);
@@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
+ sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
+ sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Mapped: ",
global_node_page_state(NR_FILE_MAPPED));
show_val_kb(m, "Shmem: ", i.sharedram);
- show_val_kb(m, "Slab: ",
- global_node_page_state(NR_SLAB_RECLAIMABLE) +
- global_node_page_state(NR_SLAB_UNRECLAIMABLE));
-
- show_val_kb(m, "SReclaimable: ",
- global_node_page_state(NR_SLAB_RECLAIMABLE));
- show_val_kb(m, "SUnreclaim: ",
- global_node_page_state(NR_SLAB_UNRECLAIMABLE));
+ show_val_kb(m, "KReclaimable: ", sreclaimable +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
+ show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
+ show_val_kb(m, "SReclaimable: ", sreclaimable);
+ show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
global_zone_page_state(NR_KERNEL_STACK_KB));
show_val_kb(m, "PageTables: ",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64..a027473 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
smaps_walk.private = mss;
#ifdef CONFIG_SHMEM
+ /* In case of smaps_rollup, reset the value from previous vma */
+ mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
!(vma->vm_flags & VM_WRITE)) {
- mss->swap = shmem_swapped;
+ mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
smaps_walk.pte_hole = smaps_pte_hole;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bfa0ec6..356d2b85 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
struct userfaultfd_ctx *fork_nctx = NULL;
/* always take the fd_wqh lock before the fault_pending_wqh lock */
- spin_lock(&ctx->fd_wqh.lock);
+ spin_lock_irq(&ctx->fd_wqh.lock);
__add_wait_queue(&ctx->fd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
ret = -EAGAIN;
break;
}
- spin_unlock(&ctx->fd_wqh.lock);
+ spin_unlock_irq(&ctx->fd_wqh.lock);
schedule();
- spin_lock(&ctx->fd_wqh.lock);
+ spin_lock_irq(&ctx->fd_wqh.lock);
}
__remove_wait_queue(&ctx->fd_wqh, &wait);
__set_current_state(TASK_RUNNING);
- spin_unlock(&ctx->fd_wqh.lock);
+ spin_unlock_irq(&ctx->fd_wqh.lock);
if (!ret && msg->event == UFFD_EVENT_FORK) {
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 9d0cde8..71d7b77 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -32,7 +32,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
-#ifndef huge_pte_clear
+#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
{
@@ -40,4 +40,90 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
}
#endif
+#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ free_pgd_range(tlb, addr, end, floor, ceiling);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ set_pte_at(mm, addr, ptep, pte);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ return ptep_get_and_clear(mm, addr, ptep);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
+static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ ptep_clear_flush(vma, addr, ptep);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTE_NONE
+static inline int huge_pte_none(pte_t pte)
+{
+ return pte_none(pte);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
+static inline pte_t huge_pte_wrprotect(pte_t pte)
+{
+ return pte_wrprotect(pte);
+}
+#endif
+
+#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
+static inline int prepare_hugepage_range(struct file *file,
+ unsigned long addr, unsigned long len)
+{
+ struct hstate *h = hstate_file(file);
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (addr & ~huge_page_mask(h))
+ return -EINVAL;
+
+ return 0;
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ ptep_set_wrprotect(mm, addr, ptep);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
+static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+ return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+}
+#endif
+
+#ifndef __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+ return *ptep;
+}
+#endif
+
#endif /* _ASM_GENERIC_HUGETLB_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 88ebc61..5657a20 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
/*
* Interfaces that can be used by architecture code to keep track of
* memory type of pfn mappings specified by the remap_pfn_range,
- * vm_insert_pfn.
+ * vmf_insert_pfn.
*/
/*
@@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
/*
* track_pfn_insert is called when a _new_ single pfn is established
- * by vm_insert_pfn().
+ * by vmf_insert_pfn().
*/
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
pfn_t pfn)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 22254c1..5e1694f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
+#include <linux/psi_types.h>
#ifdef CONFIG_CGROUPS
@@ -436,6 +437,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;
+ /* used to track pressure stalls */
+ struct psi_group psi;
+
/* used to store eBPF programs */
struct cgroup_bpf bpf;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b622d66..9968332 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return &cgrp->psi;
+}
+
static inline void cgroup_init_kthreadd(void)
{
/*
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
return NULL;
}
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ return NULL;
+}
+
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return NULL;
+}
+
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 31c865d..577d1b2 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -57,7 +57,12 @@ struct task_delay_info {
u64 freepages_start;
u64 freepages_delay; /* wait for memory reclaim */
+
+ u64 thrashing_start;
+ u64 thrashing_delay; /* wait for thrashing page */
+
u32 freepages_count; /* total count of memory reclaim */
+ u32 thrashing_count; /* total count of thrash waits */
};
#endif
@@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
+extern void __delayacct_thrashing_start(void);
+extern void __delayacct_thrashing_end(void);
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
@@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void)
__delayacct_freepages_end();
}
+static inline void delayacct_thrashing_start(void)
+{
+ if (current->delays)
+ __delayacct_thrashing_start();
+}
+
+static inline void delayacct_thrashing_end(void)
+{
+ if (current->delays)
+ __delayacct_thrashing_end();
+}
+
#else
static inline void delayacct_set_flag(int flag)
{}
@@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void)
{}
static inline void delayacct_freepages_end(void)
{}
+static inline void delayacct_thrashing_start(void)
+{}
+static inline void delayacct_thrashing_end(void)
+{}
#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4c92e3b..dde9470 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -107,7 +107,7 @@ enum hmm_pfn_flag_e {
* HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
* HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
* HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
- * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
* be mirrored by a device, because the entry will never have HMM_PFN_VALID
* set and the pfn value is undefined.
*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fdcb459..4663ee9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page)
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags);
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags);
+ pud_t *pud, int flags, struct dev_pagemap **pgmap);
extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm)
}
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmd, int flags)
+ unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
return NULL;
}
static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
- unsigned long addr, pud_t *pud, int flags)
+ unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
return NULL;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3555d54..9a42581 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -6,6 +6,7 @@
#include <linux/bitmap.h>
#include <linux/mm.h>
#include <linux/types.h>
+#include <linux/mm_types.h>
struct address_space;
struct fiemap_extent_info;
@@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops);
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
+ const struct iomap_ops *ops);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
loff_t start, loff_t len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index d7618c4..7c47b1a 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -90,6 +90,7 @@
#ifndef WEAK
#define WEAK(name) \
.weak name ASM_NL \
+ ALIGN ASM_NL \
name:
#endif
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 837f2f2..bb2c84a 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
+#define DIV64_U64_ROUND_UP(ll, d) \
+ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
+
#endif /* _LINUX_MATH64_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5169205..2acdd04 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
-/**
- * for_each_resv_unavail_range - iterate through reserved and unavailable memory
- * @i: u64 used as loop variable
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- *
- * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
- * Available as soon as memblock is initialized.
- * Note: because this memory does not belong to any physical node, flags and
- * nid arguments do not make sense and thus not exported as arguments.
- */
-#define for_each_resv_unavail_range(i, p_start, p_end) \
- for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
- NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
-
static inline void memblock_set_region_flags(struct memblock_region *r,
enum memblock_flags flags)
{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 652f602..7ab2120 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie {
struct mem_cgroup_id {
int id;
- atomic_t ref;
+ refcount_t ref;
};
/*
@@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg);
+
+#ifdef CONFIG_MEMCG_KMEM
int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
void memcg_kmem_uncharge(struct page *page, int order);
-#ifdef CONFIG_MEMCG_KMEM
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id);
extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
int nid, int shrinker_id);
#else
+
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
diff --git a/include/linux/mm.h b/include/linux/mm.h
index daa2b8f..1e52b8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
+extern void memmap_init_zone_device(struct zone *, unsigned long,
+ unsigned long, struct dev_pagemap *);
#else
static inline bool is_zone_device_page(const struct page *page)
{
@@ -2304,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
struct list_head *uf);
+extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
+ struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
@@ -2502,11 +2506,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
@@ -2525,32 +2529,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
-{
- int err = vm_insert_mixed(vma, addr, pfn);
-
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
-
- return VM_FAULT_NOPAGE;
-}
-
-static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn)
-{
- int err = vm_insert_pfn(vma, addr, pfn);
-
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
-
- return VM_FAULT_NOPAGE;
-}
-
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
@@ -2558,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err)
return VM_FAULT_SIGBUS;
}
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int foll_flags,
- unsigned int *page_mask);
-
-static inline struct page *follow_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int foll_flags)
-{
- unsigned int unused_page_mask;
- return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
-}
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags);
#define FOLL_WRITE 0x01 /* check pte is writable */
#define FOLL_TOUCH 0x02 /* mark page accessed */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 133ba78..9893a64 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -2,7 +2,6 @@
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
-#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
@@ -11,9 +10,6 @@
struct mmu_notifier;
struct mmu_notifier_ops;
-/* mmu_notifier_ops flags */
-#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
-
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -31,15 +27,6 @@ struct mmu_notifier_mm {
struct mmu_notifier_ops {
/*
- * Flags to specify behavior of callbacks for this MMU notifier.
- * Used to determine which context an operation may be called.
- *
- * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
- * block
- */
- int flags;
-
- /*
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
* freed. This can run concurrently with other mmu notifier
@@ -153,7 +140,9 @@ struct mmu_notifier_ops {
*
* If blockable argument is set to false then the callback cannot
* sleep and has to return with -EAGAIN. 0 should be returned
- * otherwise.
+ * otherwise. Please note that if invalidate_range_start approves
+ * a non-blocking behavior then the same applies to
+ * invalidate_range_end.
*
*/
int (*invalidate_range_start)(struct mmu_notifier *mn,
@@ -181,10 +170,6 @@ struct mmu_notifier_ops {
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
- *
- * If this callback cannot block, and invalidate_range_{start,end}
- * cannot block, mmu_notifier_ops.flags should have
- * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
@@ -239,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
-extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -493,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
-static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
-{
- return false;
-}
-
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d4b0c79..9f0cacc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -161,8 +161,10 @@ enum node_stat_item {
NR_SLAB_UNRECLAIMABLE,
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
+ WORKINGSET_NODES,
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
+ WORKINGSET_RESTORE,
WORKINGSET_NODERECLAIM,
NR_ANON_MAPPED, /* Mapped anonymous pages */
NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
@@ -180,7 +182,7 @@ enum node_stat_item {
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
- NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */
+ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_VM_NODE_STAT_ITEMS
};
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74bee8c..50ce1bd 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -69,13 +69,14 @@
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
- PG_error,
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
+ PG_workingset,
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
+ PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
@@ -162,6 +163,14 @@ static inline int PagePoisoned(const struct page *page)
return page->flags == PAGE_POISON_PATTERN;
}
+#ifdef CONFIG_DEBUG_VM
+void page_init_poison(struct page *page, size_t size);
+#else
+static inline void page_init_poison(struct page *page, size_t size)
+{
+}
+#endif
+
/*
* Page flags policies wrt compound pages
*
@@ -280,6 +289,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD)
+PAGEFLAG(Workingset, workingset, PF_HEAD)
+ TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
@@ -292,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
__CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
__CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
__SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 21713dc..7bb7785 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -9,8 +9,10 @@
* PFN_SG_LAST - pfn references a page and is the last scatterlist entry
* PFN_DEV - pfn is not covered by system memmap by default
* PFN_MAP - pfn has a dynamic page mapping established by a device driver
+ * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not
+ * get_user_pages
*/
-#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
+#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2))
#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
diff --git a/include/linux/psi.h b/include/linux/psi.h
new file mode 100644
index 0000000..8e0725a
--- /dev/null
+++ b/include/linux/psi.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_PSI_H
+#define _LINUX_PSI_H
+
+#include <linux/psi_types.h>
+#include <linux/sched.h>
+
+struct seq_file;
+struct css_set;
+
+#ifdef CONFIG_PSI
+
+extern bool psi_disabled;
+
+void psi_init(void);
+
+void psi_task_change(struct task_struct *task, int clear, int set);
+
+void psi_memstall_tick(struct task_struct *task, int cpu);
+void psi_memstall_enter(unsigned long *flags);
+void psi_memstall_leave(unsigned long *flags);
+
+int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgrp);
+void psi_cgroup_free(struct cgroup *cgrp);
+void cgroup_move_task(struct task_struct *p, struct css_set *to);
+#endif
+
+#else /* CONFIG_PSI */
+
+static inline void psi_init(void) {}
+
+static inline void psi_memstall_enter(unsigned long *flags) {}
+static inline void psi_memstall_leave(unsigned long *flags) {}
+
+#ifdef CONFIG_CGROUPS
+static inline int psi_cgroup_alloc(struct cgroup *cgrp)
+{
+ return 0;
+}
+static inline void psi_cgroup_free(struct cgroup *cgrp)
+{
+}
+static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
+{
+ rcu_assign_pointer(p->cgroups, to);
+}
+#endif
+
+#endif /* CONFIG_PSI */
+
+#endif /* _LINUX_PSI_H */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
new file mode 100644
index 0000000..2cf422d
--- /dev/null
+++ b/include/linux/psi_types.h
@@ -0,0 +1,92 @@
+#ifndef _LINUX_PSI_TYPES_H
+#define _LINUX_PSI_TYPES_H
+
+#include <linux/seqlock.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_PSI
+
+/* Tracked task states */
+enum psi_task_count {
+ NR_IOWAIT,
+ NR_MEMSTALL,
+ NR_RUNNING,
+ NR_PSI_TASK_COUNTS,
+};
+
+/* Task state bitmasks */
+#define TSK_IOWAIT (1 << NR_IOWAIT)
+#define TSK_MEMSTALL (1 << NR_MEMSTALL)
+#define TSK_RUNNING (1 << NR_RUNNING)
+
+/* Resources that workloads could be stalled on */
+enum psi_res {
+ PSI_IO,
+ PSI_MEM,
+ PSI_CPU,
+ NR_PSI_RESOURCES,
+};
+
+/*
+ * Pressure states for each resource:
+ *
+ * SOME: Stalled tasks & working tasks
+ * FULL: Stalled tasks & no working tasks
+ */
+enum psi_states {
+ PSI_IO_SOME,
+ PSI_IO_FULL,
+ PSI_MEM_SOME,
+ PSI_MEM_FULL,
+ PSI_CPU_SOME,
+ /* Only per-CPU, to weigh the CPU in the global average: */
+ PSI_NONIDLE,
+ NR_PSI_STATES,
+};
+
+struct psi_group_cpu {
+ /* 1st cacheline updated by the scheduler */
+
+ /* Aggregator needs to know of concurrent changes */
+ seqcount_t seq ____cacheline_aligned_in_smp;
+
+ /* States of the tasks belonging to this group */
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
+
+ /* Period time sampling buckets for each state of interest (ns) */
+ u32 times[NR_PSI_STATES];
+
+ /* Time of last task change in this group (rq_clock) */
+ u64 state_start;
+
+ /* 2nd cacheline updated by the aggregator */
+
+ /* Delta detection against the sampling buckets */
+ u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
+};
+
+struct psi_group {
+ /* Protects data updated during an aggregation */
+ struct mutex stat_lock;
+
+ /* Per-cpu task state & time tracking */
+ struct psi_group_cpu __percpu *pcpu;
+
+ /* Periodic aggregation state */
+ u64 total_prev[NR_PSI_STATES - 1];
+ u64 last_update;
+ u64 next_update;
+ struct delayed_work clock_work;
+
+ /* Total stall times and sampled pressure averages */
+ u64 total[NR_PSI_STATES - 1];
+ unsigned long avg[NR_PSI_STATES - 1][3];
+};
+
+#else /* CONFIG_PSI */
+
+struct psi_group { };
+
+#endif /* CONFIG_PSI */
+
+#endif /* _LINUX_PSI_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index adfb3f9..8f8a541 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,6 +25,7 @@
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/signal_types.h>
+#include <linux/psi_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
@@ -706,6 +707,10 @@ struct task_struct {
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_PSI
+ unsigned sched_psi_wake_requeue:1;
+#endif
+
/* Force alignment to the next boundary: */
unsigned :0;
@@ -719,9 +724,6 @@ struct task_struct {
#endif
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
-#ifdef CONFIG_MEMCG_KMEM
- unsigned memcg_kmem_skip_account:1;
-#endif
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
@@ -965,6 +967,10 @@ struct task_struct {
kernel_siginfo_t *last_siginfo;
struct task_io_accounting ioac;
+#ifdef CONFIG_PSI
+ /* Pressure stall state */
+ unsigned int psi_flags;
+#endif
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
@@ -1391,6 +1397,7 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
+#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 80bc84b..4859bea 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -22,10 +22,26 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
-#define CALC_LOAD(load,exp,n) \
- load *= exp; \
- load += n*(FIXED_1-exp); \
- load >>= FSHIFT;
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static inline unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
+}
+
+extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n);
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
extern void calc_global_load(unsigned long ticks);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ed9cbdd..918f374 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -295,12 +295,43 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
#define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \
(KMALLOC_MIN_SIZE) : 16)
+/*
+ * Whenever changing this, take care of that kmalloc_type() and
+ * create_kmalloc_caches() still work as intended.
+ */
+enum kmalloc_cache_type {
+ KMALLOC_NORMAL = 0,
+ KMALLOC_RECLAIM,
+#ifdef CONFIG_ZONE_DMA
+ KMALLOC_DMA,
+#endif
+ NR_KMALLOC_TYPES
+};
+
#ifndef CONFIG_SLOB
-extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
+extern struct kmem_cache *
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
+
+static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
+{
+ int is_dma = 0;
+ int type_dma = 0;
+ int is_reclaimable;
+
#ifdef CONFIG_ZONE_DMA
-extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
+ is_dma = !!(flags & __GFP_DMA);
+ type_dma = is_dma * KMALLOC_DMA;
#endif
+ is_reclaimable = !!(flags & __GFP_RECLAIMABLE);
+
+ /*
+ * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return
+ * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE
+ */
+ return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM;
+}
+
/*
* Figure out which kmalloc slab an allocation of a certain size
* belongs to.
@@ -501,18 +532,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
+#ifndef CONFIG_SLOB
+ unsigned int index;
+#endif
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
- if (!(flags & GFP_DMA)) {
- unsigned int index = kmalloc_index(size);
+ index = kmalloc_index(size);
- if (!index)
- return ZERO_SIZE_PTR;
+ if (!index)
+ return ZERO_SIZE_PTR;
- return kmem_cache_alloc_trace(kmalloc_caches[index],
- flags, size);
- }
+ return kmem_cache_alloc_trace(
+ kmalloc_caches[kmalloc_type(flags)][index],
+ flags, size);
#endif
}
return __kmalloc(size, flags);
@@ -542,13 +575,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
if (__builtin_constant_p(size) &&
- size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
+ size <= KMALLOC_MAX_CACHE_SIZE) {
unsigned int i = kmalloc_index(size);
if (!i)
return ZERO_SIZE_PTR;
- return kmem_cache_alloc_node_trace(kmalloc_caches[i],
+ return kmem_cache_alloc_node_trace(
+ kmalloc_caches[kmalloc_type(flags)][i],
flags, node, size);
}
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8e2c11e..38195f5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -167,13 +167,14 @@ enum {
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
SWP_BLKDEV = (1 << 6), /* its a block device */
- SWP_FILE = (1 << 7), /* set after swap_activate success */
- SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */
- SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
- SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
- SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
+ SWP_FS = (1 << 8), /* swap file goes through fs */
+ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
+ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
+ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
+ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
/* add others here before... */
- SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -296,7 +297,7 @@ struct vma_swap_readahead {
/* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page);
-bool workingset_refault(void *shadow);
+void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
/* Do not use directly, use workingset_lookup_update */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a81cffb..a1675d4 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -88,6 +88,7 @@
{1UL << PG_dirty, "dirty" }, \
{1UL << PG_lru, "lru" }, \
{1UL << PG_active, "active" }, \
+ {1UL << PG_workingset, "workingset" }, \
{1UL << PG_slab, "slab" }, \
{1UL << PG_owner_priv_1, "owner_priv_1" }, \
{1UL << PG_arch_1, "arch_1" }, \
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index b7aa7bb..5e8ca16 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
*/
-#define TASKSTATS_VERSION 8
+#define TASKSTATS_VERSION 9
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */
@@ -164,6 +164,10 @@ struct taskstats {
/* Delay waiting for memory reclaim */
__u64 freepages_count;
__u64 freepages_delay_total;
+
+ /* Delay waiting for thrashing page */
+ __u64 thrashing_count;
+ __u64 thrashing_delay_total;
};
diff --git a/init/Kconfig b/init/Kconfig
index 317d5cc..a4112e9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -490,6 +490,25 @@ config TASK_IO_ACCOUNTING
Say N if unsure.
+config PSI
+ bool "Pressure stall information tracking"
+ help
+ Collect metrics that indicate how overcommitted the CPU, memory,
+ and IO capacity are in the system.
+
+ If you say Y here, the kernel will create /proc/pressure/ with the
+ pressure statistics files cpu, memory, and io. These will indicate
+ the share of walltime in which some or all tasks in the system are
+ delayed due to contention of the respective resource.
+
+ In kernels with cgroup support, cgroups (cgroup2 only) will
+ have cpu.pressure, memory.pressure, and io.pressure files,
+ which aggregate pressure stalls for the grouped tasks only.
+
+ For more details see Documentation/accounting/psi.txt.
+
+ Say N if unsure.
+
endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4c1cf09..8b79318 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/sched/cputime.h>
+#include <linux/psi.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
- rcu_assign_pointer(task->cgroups, to_cset);
+ cgroup_move_task(task, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
@@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}
+#ifdef CONFIG_PSI
+static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+}
+static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+}
+static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+}
+#endif
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
+#ifdef CONFIG_PSI
+ {
+ .name = "io.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_io_pressure_show,
+ },
+ {
+ .name = "memory.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_memory_pressure_show,
+ },
+ {
+ .name = "cpu.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_cpu_pressure_show,
+ },
+#endif
{ } /* terminate */
};
@@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ psi_cgroup_free(cgrp);
if (cgroup_on_dfl(cgrp))
cgroup_rstat_exit(cgrp);
kfree(cgrp);
@@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
- ret = cgroup_bpf_inherit(cgrp);
+
+ ret = psi_cgroup_alloc(cgrp);
if (ret)
goto out_idr_free;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_psi_free;
+
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
return cgrp;
+out_psi_free:
+ psi_cgroup_free(cgrp);
out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2ddfce8..bb4fe4e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv)
}
kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
- /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
-#undef LOAD_INT
-#undef LOAD_FRAC
+
/* Display in kilobytes */
#define K(x) ((x) << (PAGE_SHIFT - 10))
kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ca8ac28..2a12b98 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
+ tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
+ d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
+ d->thrashing_count += tsk->delays->thrashing_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -169,3 +172,15 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
+void __delayacct_thrashing_start(void)
+{
+ current->delays->thrashing_start = ktime_get_ns();
+}
+
+void __delayacct_thrashing_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->thrashing_start,
+ &current->delays->thrashing_delay,
+ &current->delays->thrashing_count);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index f0b5847..8f82a3b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
return s->addr;
}
+ /*
+ * Allocated stacks are cached and later reused by new threads,
+ * so memcg accounting is performed manually on assigning/releasing
+ * stacks to tasks. Drop __GFP_ACCOUNT.
+ */
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
+ THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
@@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
- if (task_stack_vm_area(tsk)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ if (vm) {
int i;
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ -(int)(PAGE_SIZE / 1024));
+
+ memcg_kmem_uncharge(vm->pages[i], 0);
+ }
+
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
NULL, tsk->stack_vm_area) != NULL)
@@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
NR_KERNEL_STACK_KB,
PAGE_SIZE / 1024 * account);
}
-
- /* All stack pages belong to the same memcg. */
- mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
@@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+#ifdef CONFIG_VMAP_STACK
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int ret;
+
+ if (vm) {
+ int i;
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ /*
+ * If memcg_kmem_charge() fails, page->mem_cgroup
+ * pointer is NULL, and both memcg_kmem_uncharge()
+ * and mod_memcg_page_state() in free_thread_stack()
+ * will ignore this page. So it's safe.
+ */
+ ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ return ret;
+
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024);
+ }
+ }
+#endif
+ return 0;
+}
+
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(tsk->state != TASK_DEAD))
@@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!stack)
goto free_tsk;
+ if (memcg_charge_kernel_stack(tsk))
+ goto free_stack;
+
stack_vm_area = task_stack_vm_area(tsk);
err = arch_dup_task_struct(tsk, orig);
@@ -1779,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process(
p->default_timer_slack_ns = current->timer_slack_ns;
+#ifdef CONFIG_PSI
+ p->psi_flags = 0;
+#endif
+
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5b8600d..620fc4d 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
struct vmem_altmap *altmap = pgmap->altmap_valid ?
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
- unsigned long pfn, pgoff, order;
+ struct dev_pagemap *conflict_pgmap;
pgprot_t pgprot = PAGE_KERNEL;
+ unsigned long pgoff, order;
int error, nid, is_ram;
- struct dev_pagemap *conflict_pgmap;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
if (error)
goto err_add_memory;
- for_each_device_pfn(pfn, pgmap) {
- struct page *page = pfn_to_page(pfn);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer. It is a bug if a ZONE_DEVICE page is ever
- * freed or placed on a driver-private list. Seed the
- * storage with LIST_POISON* values.
- */
- list_del(&page->lru);
- page->pgmap = pgmap;
- percpu_ref_get(pgmap->ref);
- }
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
devm_add_action(dev, devm_memremap_pages_release, pgmap);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe1834..21fb5a5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_PSI) += psi.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e696b0..fd2fce8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE))
+ if (!(flags & ENQUEUE_RESTORE)) {
sched_info_queued(rq, p);
+ psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ }
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE))
+ if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeued(rq, p);
+ psi_dequeue(p, flags & DEQUEUE_SLEEP);
+ }
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
@@ -3051,6 +3056,7 @@ void scheduler_tick(void)
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
+ psi_task_tick(rq);
rq_unlock(rq, &rf);
@@ -4933,9 +4939,7 @@ static void do_sched_yield(void)
struct rq_flags rf;
struct rq *rq;
- local_irq_disable();
- rq = this_rq();
- rq_lock(rq, &rf);
+ rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
@@ -6069,6 +6073,8 @@ void __init sched_init(void)
init_schedstats();
+ psi_init();
+
scheduler_running = 1;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a171c12..28a5165 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
return delta;
}
-/*
- * a1 = a0 * e + a * (1 - e)
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
*/
static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
- unsigned long newload;
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+ }
- newload = load * exp + active * (FIXED_1 - exp);
- if (active >= load)
- newload += FIXED_1-1;
+ return result;
+}
- return newload / FIXED_1;
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void)
return delta;
}
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
- unsigned long result = 1UL << frac_bits;
-
- if (n) {
- for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
- }
-
- return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- * = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- * ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- * = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- * n 1 - x^(n+1)
- * S_n := \Sum x^i = -------------
- * i=0 1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
- unsigned long active, unsigned int n)
-{
- return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
/*
* NO_HZ can leave us missing all per-CPU ticks calling
* calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
new file mode 100644
index 0000000..7cdecfc
--- /dev/null
+++ b/kernel/sched/psi.c
@@ -0,0 +1,759 @@
+/*
+ * Pressure stall information for CPU, memory and IO
+ *
+ * Copyright (c) 2018 Facebook, Inc.
+ * Author: Johannes Weiner <hannes@cmpxchg.org>
+ *
+ * When CPU, memory and IO are contended, tasks experience delays that
+ * reduce throughput and introduce latencies into the workload. Memory
+ * and IO contention, in addition, can cause a full loss of forward
+ * progress in which the CPU goes idle.
+ *
+ * This code aggregates individual task delays into resource pressure
+ * metrics that indicate problems with both workload health and
+ * resource utilization.
+ *
+ * Model
+ *
+ * The time in which a task can execute on a CPU is our baseline for
+ * productivity. Pressure expresses the amount of time in which this
+ * potential cannot be realized due to resource contention.
+ *
+ * This concept of productivity has two components: the workload and
+ * the CPU. To measure the impact of pressure on both, we define two
+ * contention states for a resource: SOME and FULL.
+ *
+ * In the SOME state of a given resource, one or more tasks are
+ * delayed on that resource. This affects the workload's ability to
+ * perform work, but the CPU may still be executing other tasks.
+ *
+ * In the FULL state of a given resource, all non-idle tasks are
+ * delayed on that resource such that nobody is advancing and the CPU
+ * goes idle. This leaves both workload and CPU unproductive.
+ *
+ * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ *
+ * SOME = nr_delayed_tasks != 0
+ * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *
+ * The percentage of wallclock time spent in those compound stall
+ * states gives pressure numbers between 0 and 100 for each resource,
+ * where the SOME percentage indicates workload slowdowns and the FULL
+ * percentage indicates reduced CPU utilization:
+ *
+ * %SOME = time(SOME) / period
+ * %FULL = time(FULL) / period
+ *
+ * Multiple CPUs
+ *
+ * The more tasks and available CPUs there are, the more work can be
+ * performed concurrently. This means that the potential that can go
+ * unrealized due to resource contention *also* scales with non-idle
+ * tasks and CPUs.
+ *
+ * Consider a scenario where 257 number crunching tasks are trying to
+ * run concurrently on 256 CPUs. If we simply aggregated the task
+ * states, we would have to conclude a CPU SOME pressure number of
+ * 100%, since *somebody* is waiting on a runqueue at all
+ * times. However, that is clearly not the amount of contention the
+ * workload is experiencing: only one out of 256 possible exceution
+ * threads will be contended at any given time, or about 0.4%.
+ *
+ * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
+ * given time *one* of the tasks is delayed due to a lack of memory.
+ * Again, looking purely at the task state would yield a memory FULL
+ * pressure number of 0%, since *somebody* is always making forward
+ * progress. But again this wouldn't capture the amount of execution
+ * potential lost, which is 1 out of 4 CPUs, or 25%.
+ *
+ * To calculate wasted potential (pressure) with multiple processors,
+ * we have to base our calculation on the number of non-idle tasks in
+ * conjunction with the number of available CPUs, which is the number
+ * of potential execution threads. SOME becomes then the proportion of
+ * delayed tasks to possibe threads, and FULL is the share of possible
+ * threads that are unproductive due to delays:
+ *
+ * threads = min(nr_nonidle_tasks, nr_cpus)
+ * SOME = min(nr_delayed_tasks / threads, 1)
+ * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *
+ * For the 257 number crunchers on 256 CPUs, this yields:
+ *
+ * threads = min(257, 256)
+ * SOME = min(1 / 256, 1) = 0.4%
+ * FULL = (256 - min(257, 256)) / 256 = 0%
+ *
+ * For the 1 out of 4 memory-delayed tasks, this yields:
+ *
+ * threads = min(4, 4)
+ * SOME = min(1 / 4, 1) = 25%
+ * FULL = (4 - min(3, 4)) / 4 = 25%
+ *
+ * [ Substitute nr_cpus with 1, and you can see that it's a natural
+ * extension of the single-CPU model. ]
+ *
+ * Implementation
+ *
+ * To assess the precise time spent in each such state, we would have
+ * to freeze the system on task changes and start/stop the state
+ * clocks accordingly. Obviously that doesn't scale in practice.
+ *
+ * Because the scheduler aims to distribute the compute load evenly
+ * among the available CPUs, we can track task state locally to each
+ * CPU and, at much lower frequency, extrapolate the global state for
+ * the cumulative stall times and the running averages.
+ *
+ * For each runqueue, we track:
+ *
+ * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
+ *
+ * and then periodically aggregate:
+ *
+ * tNONIDLE = sum(tNONIDLE[i])
+ *
+ * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
+ * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
+ *
+ * %SOME = tSOME / period
+ * %FULL = tFULL / period
+ *
+ * This gives us an approximation of pressure that is practical
+ * cost-wise, yet way more sensitive and accurate than periodic
+ * sampling of the aggregate task states would be.
+ */
+
+#include <linux/sched/loadavg.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/seqlock.h>
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/psi.h>
+#include "sched.h"
+
+static int psi_bug __read_mostly;
+
+bool psi_disabled __read_mostly;
+core_param(psi_disabled, psi_disabled, bool, 0644);
+
+/* Running averages - we need to be higher-res than loadavg */
+#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
+#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
+#define EXP_60s 1981 /* 1/exp(2s/60s) */
+#define EXP_300s 2034 /* 1/exp(2s/300s) */
+
+/* Sampling frequency in nanoseconds */
+static u64 psi_period __read_mostly;
+
+/* System-level pressure and stall tracking */
+static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
+static struct psi_group psi_system = {
+ .pcpu = &system_group_pcpu,
+};
+
+static void psi_update_work(struct work_struct *work);
+
+static void group_init(struct psi_group *group)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+ group->next_update = sched_clock() + psi_period;
+ INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
+ mutex_init(&group->stat_lock);
+}
+
+void __init psi_init(void)
+{
+ if (psi_disabled)
+ return;
+
+ psi_period = jiffies_to_nsecs(PSI_FREQ);
+ group_init(&psi_system);
+}
+
+static bool test_state(unsigned int *tasks, enum psi_states state)
+{
+ switch (state) {
+ case PSI_IO_SOME:
+ return tasks[NR_IOWAIT];
+ case PSI_IO_FULL:
+ return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+ case PSI_MEM_SOME:
+ return tasks[NR_MEMSTALL];
+ case PSI_MEM_FULL:
+ return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+ case PSI_CPU_SOME:
+ return tasks[NR_RUNNING] > 1;
+ case PSI_NONIDLE:
+ return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
+ tasks[NR_RUNNING];
+ default:
+ return false;
+ }
+}
+
+static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+{
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
+ u64 now, state_start;
+ unsigned int seq;
+ int s;
+
+ /* Snapshot a coherent view of the CPU state */
+ do {
+ seq = read_seqcount_begin(&groupc->seq);
+ now = cpu_clock(cpu);
+ memcpy(times, groupc->times, sizeof(groupc->times));
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ state_start = groupc->state_start;
+ } while (read_seqcount_retry(&groupc->seq, seq));
+
+ /* Calculate state time deltas against the previous snapshot */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ u32 delta;
+ /*
+ * In addition to already concluded states, we also
+ * incorporate currently active states on the CPU,
+ * since states may last for many sampling periods.
+ *
+ * This way we keep our delta sampling buckets small
+ * (u32) and our reported pressure close to what's
+ * actually happening.
+ */
+ if (test_state(tasks, s))
+ times[s] += now - state_start;
+
+ delta = times[s] - groupc->times_prev[s];
+ groupc->times_prev[s] = times[s];
+
+ times[s] = delta;
+ }
+}
+
+static void calc_avgs(unsigned long avg[3], int missed_periods,
+ u64 time, u64 period)
+{
+ unsigned long pct;
+
+ /* Fill in zeroes for periods of no activity */
+ if (missed_periods) {
+ avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
+ avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
+ avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
+ }
+
+ /* Sample the most recent active period */
+ pct = div_u64(time * 100, period);
+ pct *= FIXED_1;
+ avg[0] = calc_load(avg[0], EXP_10s, pct);
+ avg[1] = calc_load(avg[1], EXP_60s, pct);
+ avg[2] = calc_load(avg[2], EXP_300s, pct);
+}
+
+static bool update_stats(struct psi_group *group)
+{
+ u64 deltas[NR_PSI_STATES - 1] = { 0, };
+ unsigned long missed_periods = 0;
+ unsigned long nonidle_total = 0;
+ u64 now, expires, period;
+ int cpu;
+ int s;
+
+ mutex_lock(&group->stat_lock);
+
+ /*
+ * Collect the per-cpu time buckets and average them into a
+ * single time sample that is normalized to wallclock time.
+ *
+ * For averaging, each CPU is weighted by its non-idle time in
+ * the sampling period. This eliminates artifacts from uneven
+ * loading, or even entirely idle CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ u32 times[NR_PSI_STATES];
+ u32 nonidle;
+
+ get_recent_times(group, cpu, times);
+
+ nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
+ nonidle_total += nonidle;
+
+ for (s = 0; s < PSI_NONIDLE; s++)
+ deltas[s] += (u64)times[s] * nonidle;
+ }
+
+ /*
+ * Integrate the sample into the running statistics that are
+ * reported to userspace: the cumulative stall times and the
+ * decaying averages.
+ *
+ * Pressure percentages are sampled at PSI_FREQ. We might be
+ * called more often when the user polls more frequently than
+ * that; we might be called less often when there is no task
+ * activity, thus no data, and clock ticks are sporadic. The
+ * below handles both.
+ */
+
+ /* total= */
+ for (s = 0; s < NR_PSI_STATES - 1; s++)
+ group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+
+ /* avgX= */
+ now = sched_clock();
+ expires = group->next_update;
+ if (now < expires)
+ goto out;
+ if (now - expires > psi_period)
+ missed_periods = div_u64(now - expires, psi_period);
+
+ /*
+ * The periodic clock tick can get delayed for various
+ * reasons, especially on loaded systems. To avoid clock
+ * drift, we schedule the clock in fixed psi_period intervals.
+ * But the deltas we sample out of the per-cpu buckets above
+ * are based on the actual time elapsing between clock ticks.
+ */
+ group->next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->last_update + (missed_periods * psi_period));
+ group->last_update = now;
+
+ for (s = 0; s < NR_PSI_STATES - 1; s++) {
+ u32 sample;
+
+ sample = group->total[s] - group->total_prev[s];
+ /*
+ * Due to the lockless sampling of the time buckets,
+ * recorded time deltas can slip into the next period,
+ * which under full pressure can result in samples in
+ * excess of the period length.
+ *
+ * We don't want to report non-sensical pressures in
+ * excess of 100%, nor do we want to drop such events
+ * on the floor. Instead we punt any overage into the
+ * future until pressure subsides. By doing this we
+ * don't underreport the occurring pressure curve, we
+ * just report it delayed by one period length.
+ *
+ * The error isn't cumulative. As soon as another
+ * delta slips from a period P to P+1, by definition
+ * it frees up its time T in P.
+ */
+ if (sample > period)
+ sample = period;
+ group->total_prev[s] += sample;
+ calc_avgs(group->avg[s], missed_periods, sample, period);
+ }
+out:
+ mutex_unlock(&group->stat_lock);
+ return nonidle_total;
+}
+
+static void psi_update_work(struct work_struct *work)
+{
+ struct delayed_work *dwork;
+ struct psi_group *group;
+ bool nonidle;
+
+ dwork = to_delayed_work(work);
+ group = container_of(dwork, struct psi_group, clock_work);
+
+ /*
+ * If there is task activity, periodically fold the per-cpu
+ * times and feed samples into the running averages. If things
+ * are idle and there is no data to process, stop the clock.
+ * Once restarted, we'll catch up the running averages in one
+ * go - see calc_avgs() and missed_periods.
+ */
+
+ nonidle = update_stats(group);
+
+ if (nonidle) {
+ unsigned long delay = 0;
+ u64 now;
+
+ now = sched_clock();
+ if (group->next_update > now)
+ delay = nsecs_to_jiffies(group->next_update - now) + 1;
+ schedule_delayed_work(dwork, delay);
+ }
+}
+
+static void record_times(struct psi_group_cpu *groupc, int cpu,
+ bool memstall_tick)
+{
+ u32 delta;
+ u64 now;
+
+ now = cpu_clock(cpu);
+ delta = now - groupc->state_start;
+ groupc->state_start = now;
+
+ if (test_state(groupc->tasks, PSI_IO_SOME)) {
+ groupc->times[PSI_IO_SOME] += delta;
+ if (test_state(groupc->tasks, PSI_IO_FULL))
+ groupc->times[PSI_IO_FULL] += delta;
+ }
+
+ if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+ groupc->times[PSI_MEM_SOME] += delta;
+ if (test_state(groupc->tasks, PSI_MEM_FULL))
+ groupc->times[PSI_MEM_FULL] += delta;
+ else if (memstall_tick) {
+ u32 sample;
+ /*
+ * Since we care about lost potential, a
+ * memstall is FULL when there are no other
+ * working tasks, but also when the CPU is
+ * actively reclaiming and nothing productive
+ * could run even if it were runnable.
+ *
+ * When the timer tick sees a reclaiming CPU,
+ * regardless of runnable tasks, sample a FULL
+ * tick (or less if it hasn't been a full tick
+ * since the last state change).
+ */
+ sample = min(delta, (u32)jiffies_to_nsecs(1));
+ groupc->times[PSI_MEM_FULL] += sample;
+ }
+ }
+
+ if (test_state(groupc->tasks, PSI_CPU_SOME))
+ groupc->times[PSI_CPU_SOME] += delta;
+
+ if (test_state(groupc->tasks, PSI_NONIDLE))
+ groupc->times[PSI_NONIDLE] += delta;
+}
+
+static void psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set)
+{
+ struct psi_group_cpu *groupc;
+ unsigned int t, m;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ /*
+ * First we assess the aggregate resource states this CPU's
+ * tasks have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
+ *
+ * Then we update the task counts according to the state
+ * change requested through the @clear and @set bits.
+ */
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, cpu, false);
+
+ for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
+ if (!(m & (1 << t)))
+ continue;
+ if (groupc->tasks[t] == 0 && !psi_bug) {
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+ cpu, t, groupc->tasks[0],
+ groupc->tasks[1], groupc->tasks[2],
+ clear, set);
+ psi_bug = 1;
+ }
+ groupc->tasks[t]--;
+ }
+
+ for (t = 0; set; set &= ~(1 << t), t++)
+ if (set & (1 << t))
+ groupc->tasks[t]++;
+
+ write_seqcount_end(&groupc->seq);
+
+ if (!delayed_work_pending(&group->clock_work))
+ schedule_delayed_work(&group->clock_work, PSI_FREQ);
+}
+
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+{
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgroup = NULL;
+
+ if (!*iter)
+ cgroup = task->cgroups->dfl_cgrp;
+ else if (*iter == &psi_system)
+ return NULL;
+ else
+ cgroup = cgroup_parent(*iter);
+
+ if (cgroup && cgroup_parent(cgroup)) {
+ *iter = cgroup;
+ return cgroup_psi(cgroup);
+ }
+#else
+ if (*iter)
+ return NULL;
+#endif
+ *iter = &psi_system;
+ return &psi_system;
+}
+
+void psi_task_change(struct task_struct *task, int clear, int set)
+{
+ int cpu = task_cpu(task);
+ struct psi_group *group;
+ void *iter = NULL;
+
+ if (!task->pid)
+ return;
+
+ if (((task->psi_flags & set) ||
+ (task->psi_flags & clear) != clear) &&
+ !psi_bug) {
+ printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
+ task->pid, task->comm, cpu,
+ task->psi_flags, clear, set);
+ psi_bug = 1;
+ }
+
+ task->psi_flags &= ~clear;
+ task->psi_flags |= set;
+
+ while ((group = iterate_groups(task, &iter)))
+ psi_group_change(group, cpu, clear, set);
+}
+
+void psi_memstall_tick(struct task_struct *task, int cpu)
+{
+ struct psi_group *group;
+ void *iter = NULL;
+
+ while ((group = iterate_groups(task, &iter))) {
+ struct psi_group_cpu *groupc;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+ write_seqcount_begin(&groupc->seq);
+ record_times(groupc, cpu, true);
+ write_seqcount_end(&groupc->seq);
+ }
+}
+
+/**
+ * psi_memstall_enter - mark the beginning of a memory stall section
+ * @flags: flags to handle nested sections
+ *
+ * Marks the calling task as being stalled due to a lack of memory,
+ * such as waiting for a refault or performing reclaim.
+ */
+void psi_memstall_enter(unsigned long *flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (psi_disabled)
+ return;
+
+ *flags = current->flags & PF_MEMSTALL;
+ if (*flags)
+ return;
+ /*
+ * PF_MEMSTALL setting & accounting needs to be atomic wrt
+ * changes to the task's scheduling state, otherwise we can
+ * race with CPU migration.
+ */
+ rq = this_rq_lock_irq(&rf);
+
+ current->flags |= PF_MEMSTALL;
+ psi_task_change(current, 0, TSK_MEMSTALL);
+
+ rq_unlock_irq(rq, &rf);
+}
+
+/**
+ * psi_memstall_leave - mark the end of an memory stall section
+ * @flags: flags to handle nested memdelay sections
+ *
+ * Marks the calling task as no longer stalled due to lack of memory.
+ */
+void psi_memstall_leave(unsigned long *flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (psi_disabled)
+ return;
+
+ if (*flags)
+ return;
+ /*
+ * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+ * changes to the task's scheduling state, otherwise we could
+ * race with CPU migration.
+ */
+ rq = this_rq_lock_irq(&rf);
+
+ current->flags &= ~PF_MEMSTALL;
+ psi_task_change(current, TSK_MEMSTALL, 0);
+
+ rq_unlock_irq(rq, &rf);
+}
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgroup)
+{
+ if (psi_disabled)
+ return 0;
+
+ cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi.pcpu)
+ return -ENOMEM;
+ group_init(&cgroup->psi);
+ return 0;
+}
+
+void psi_cgroup_free(struct cgroup *cgroup)
+{
+ if (psi_disabled)
+ return;
+
+ cancel_delayed_work_sync(&cgroup->psi.clock_work);
+ free_percpu(cgroup->psi.pcpu);
+}
+
+/**
+ * cgroup_move_task - move task to a different cgroup
+ * @task: the task
+ * @to: the target css_set
+ *
+ * Move task to a new cgroup and safely migrate its associated stall
+ * state between the different groups.
+ *
+ * This function acquires the task's rq lock to lock out concurrent
+ * changes to the task's scheduling state and - in case the task is
+ * running - concurrent changes to its stall state.
+ */
+void cgroup_move_task(struct task_struct *task, struct css_set *to)
+{
+ bool move_psi = !psi_disabled;
+ unsigned int task_flags = 0;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (move_psi) {
+ rq = task_rq_lock(task, &rf);
+
+ if (task_on_rq_queued(task))
+ task_flags = TSK_RUNNING;
+ else if (task->in_iowait)
+ task_flags = TSK_IOWAIT;
+
+ if (task->flags & PF_MEMSTALL)
+ task_flags |= TSK_MEMSTALL;
+
+ if (task_flags)
+ psi_task_change(task, task_flags, 0);
+ }
+
+ /*
+ * Lame to do this here, but the scheduler cannot be locked
+ * from the outside, so we move cgroups from inside sched/.
+ */
+ rcu_assign_pointer(task->cgroups, to);
+
+ if (move_psi) {
+ if (task_flags)
+ psi_task_change(task, 0, task_flags);
+
+ task_rq_unlock(rq, task, &rf);
+ }
+}
+#endif /* CONFIG_CGROUPS */
+
+int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
+{
+ int full;
+
+ if (psi_disabled)
+ return -EOPNOTSUPP;
+
+ update_stats(group);
+
+ for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+ unsigned long avg[3];
+ u64 total;
+ int w;
+
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+
+ seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+ full ? "full" : "some",
+ LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+ LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+ LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+ total);
+ }
+
+ return 0;
+}
+
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_cpu_show, NULL);
+}
+
+static const struct file_operations psi_io_fops = {
+ .open = psi_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations psi_memory_fops = {
+ .open = psi_memory_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations psi_cpu_fops = {
+ .open = psi_cpu_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init psi_proc_init(void)
+{
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_fops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ return 0;
+}
+module_init(psi_proc_init);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b8c0077..618577f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -54,6 +54,7 @@
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
+#include <linux/psi.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
@@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu);
#ifdef CONFIG_CGROUP_SCHED
#include <linux/cgroup.h>
+#include <linux/psi.h>
struct cfs_rq;
struct rt_rq;
@@ -957,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+extern void update_rq_clock(struct rq *rq);
+
static inline u64 __rq_clock_broken(struct rq *rq)
{
return READ_ONCE(rq->clock);
@@ -1075,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
#endif
}
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock);
+
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock);
+
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+}
+
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline struct rq *
+this_rq_lock_irq(struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, rf);
+ return rq;
+}
+
#ifdef CONFIG_NUMA
enum numa_topology_type {
NUMA_DIRECT,
@@ -1717,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
-extern void update_rq_clock(struct rq *rq);
-
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1783,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
#endif
#endif
-struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(rq->lock);
-
-struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock);
-
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
-}
-
-static inline void
-rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
- rq_pin_lock(rq, rf);
-}
-
-static inline void
-rq_lock_irq(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock_irq(&rq->lock);
- rq_pin_lock(rq, rf);
-}
-
-static inline void
-rq_lock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, rf);
-}
-
-static inline void
-rq_relock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
-}
-
-static inline void
-rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
-}
-
-static inline void
-rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
-}
-
-static inline void
-rq_unlock(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
-}
-
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8aea199..4904c46 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
# define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_PSI
+/*
+ * PSI tracks state that persists across sleeps, such as iowaits and
+ * memory stalls. As a result, it has to distinguish between sleeps,
+ * where a task's runnable state changes, and requeues, where a task
+ * and its state are being moved between CPUs and runqueues.
+ */
+static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+{
+ int clear = 0, set = TSK_RUNNING;
+
+ if (psi_disabled)
+ return;
+
+ if (!wakeup || p->sched_psi_wake_requeue) {
+ if (p->flags & PF_MEMSTALL)
+ set |= TSK_MEMSTALL;
+ if (p->sched_psi_wake_requeue)
+ p->sched_psi_wake_requeue = 0;
+ } else {
+ if (p->in_iowait)
+ clear |= TSK_IOWAIT;
+ }
+
+ psi_task_change(p, clear, set);
+}
+
+static inline void psi_dequeue(struct task_struct *p, bool sleep)
+{
+ int clear = TSK_RUNNING, set = 0;
+
+ if (psi_disabled)
+ return;
+
+ if (!sleep) {
+ if (p->flags & PF_MEMSTALL)
+ clear |= TSK_MEMSTALL;
+ } else {
+ if (p->in_iowait)
+ set |= TSK_IOWAIT;
+ }
+
+ psi_task_change(p, clear, set);
+}
+
+static inline void psi_ttwu_dequeue(struct task_struct *p)
+{
+ if (psi_disabled)
+ return;
+ /*
+ * Is the task being migrated during a wakeup? Make sure to
+ * deregister its sleep-persistent psi states from the old
+ * queue, and let psi_enqueue() know it has to requeue.
+ */
+ if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
+ struct rq_flags rf;
+ struct rq *rq;
+ int clear = 0;
+
+ if (p->in_iowait)
+ clear |= TSK_IOWAIT;
+ if (p->flags & PF_MEMSTALL)
+ clear |= TSK_MEMSTALL;
+
+ rq = __task_rq_lock(p, &rf);
+ psi_task_change(p, clear, 0);
+ p->sched_psi_wake_requeue = 1;
+ __task_rq_unlock(rq, &rf);
+ }
+}
+
+static inline void psi_task_tick(struct rq *rq)
+{
+ if (psi_disabled)
+ return;
+
+ if (unlikely(rq->curr->flags & PF_MEMSTALL))
+ psi_memstall_tick(rq->curr, cpu_of(rq));
+}
+#else /* CONFIG_PSI */
+static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
+static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_task_tick(struct rq *rq) {}
+#endif /* CONFIG_PSI */
+
#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index ec65710..51b7840 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -579,6 +579,73 @@ static noinline void __init kmem_cache_invalid_free(void)
kmem_cache_destroy(cache);
}
+static noinline void __init kasan_memchr(void)
+{
+ char *ptr;
+ size_t size = 24;
+
+ pr_info("out-of-bounds in memchr\n");
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!ptr)
+ return;
+
+ memchr(ptr, '1', size + 1);
+ kfree(ptr);
+}
+
+static noinline void __init kasan_memcmp(void)
+{
+ char *ptr;
+ size_t size = 24;
+ int arr[9];
+
+ pr_info("out-of-bounds in memcmp\n");
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!ptr)
+ return;
+
+ memset(arr, 0, sizeof(arr));
+ memcmp(ptr, arr, size+1);
+ kfree(ptr);
+}
+
+static noinline void __init kasan_strings(void)
+{
+ char *ptr;
+ size_t size = 24;
+
+ pr_info("use-after-free in strchr\n");
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!ptr)
+ return;
+
+ kfree(ptr);
+
+ /*
+ * Try to cause only 1 invalid access (less spam in dmesg).
+ * For that we need ptr to point to zeroed byte.
+ * Skip metadata that could be stored in freed object so ptr
+ * will likely point to zeroed byte.
+ */
+ ptr += 16;
+ strchr(ptr, '1');
+
+ pr_info("use-after-free in strrchr\n");
+ strrchr(ptr, '1');
+
+ pr_info("use-after-free in strcmp\n");
+ strcmp(ptr, "2");
+
+ pr_info("use-after-free in strncmp\n");
+ strncmp(ptr, "2", 1);
+
+ pr_info("use-after-free in strlen\n");
+ strlen(ptr);
+
+ pr_info("use-after-free in strnlen\n");
+ strnlen(ptr, 1);
+}
+
static int __init kmalloc_tests_init(void)
{
/*
@@ -618,6 +685,9 @@ static int __init kmalloc_tests_init(void)
use_after_scope_test();
kmem_cache_double_free();
kmem_cache_invalid_free();
+ kasan_memchr();
+ kasan_memcmp();
+ kasan_strings();
kasan_restore_multi_shot(multishot);
diff --git a/mm/compaction.c b/mm/compaction.c
index faca45e..7c60747 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -22,6 +22,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
+#include <linux/psi.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -2068,11 +2069,15 @@ static int kcompactd(void *p)
pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
+ unsigned long pflags;
+
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
wait_event_freezable(pgdat->kcompactd_wait,
kcompactd_work_requested(pgdat));
+ psi_memstall_enter(&pflags);
kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
}
return 0;
diff --git a/mm/debug.c b/mm/debug.c
index bd10aad..cdacba1 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -13,6 +13,7 @@
#include <trace/events/mmflags.h>
#include <linux/migrate.h>
#include <linux/page_owner.h>
+#include <linux/ctype.h>
#include "internal.h"
@@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm)
);
}
+static bool page_init_poisoning __read_mostly = true;
+
+static int __init setup_vm_debug(char *str)
+{
+ bool __page_init_poisoning = true;
+
+ /*
+ * Calling vm_debug with no arguments is equivalent to requesting
+ * to enable all debugging options we can control.
+ */
+ if (*str++ != '=' || !*str)
+ goto out;
+
+ __page_init_poisoning = false;
+ if (*str == '-')
+ goto out;
+
+ while (*str) {
+ switch (tolower(*str)) {
+ case'p':
+ __page_init_poisoning = true;
+ break;
+ default:
+ pr_err("vm_debug option '%c' unknown. skipped\n",
+ *str);
+ }
+
+ str++;
+ }
+out:
+ if (page_init_poisoning && !__page_init_poisoning)
+ pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
+
+ page_init_poisoning = __page_init_poisoning;
+
+ return 1;
+}
+__setup("vm_debug", setup_vm_debug);
+
+void page_init_poison(struct page *page, size_t size)
+{
+ if (page_init_poisoning)
+ memset(page, PAGE_POISON_PATTERN, size);
+}
+EXPORT_SYMBOL_GPL(page_init_poison);
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f2..3968da1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -36,6 +36,8 @@
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
+#include <linux/delayacct.h>
+#include <linux/psi.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -915,12 +917,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- if (!(gfp_mask & __GFP_WRITE) &&
- shadow && workingset_refault(shadow)) {
- SetPageActive(page);
- workingset_activation(page);
- } else
- ClearPageActive(page);
+ WARN_ON_ONCE(PageActive(page));
+ if (!(gfp_mask & __GFP_WRITE) && shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
}
return ret;
@@ -1076,8 +1075,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ unsigned long pflags;
int ret = 0;
+ if (bit_nr == PG_locked &&
+ !PageUptodate(page) && PageWorkingset(page)) {
+ if (!PageSwapBacked(page))
+ delayacct_thrashing_start();
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
init_wait(wait);
wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
@@ -1116,6 +1125,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
finish_wait(q, wait);
+ if (thrashing) {
+ if (!PageSwapBacked(page))
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+
/*
* A signal could leave PageWaiters set. Clearing it here if
* !waitqueue_active would be possible (by open-coding finish_wait),
@@ -2581,9 +2596,7 @@ no_cached_page:
* system is low on memory, or a problem occurs while trying
* to schedule I/O.
*/
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
+ return vmf_error(error);
page_not_uptodate:
/*
@@ -2748,9 +2761,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
#else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
- return -ENOSYS;
+ return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
@@ -3012,7 +3025,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
- pos + iov_iter_count(from)))
+ pos + write_len))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
diff --git a/mm/gup.c b/mm/gup.c
index 1abc8b4..841d7ef 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -20,6 +20,11 @@
#include "internal.h"
+struct follow_page_context {
+ struct dev_pagemap *pgmap;
+ unsigned int page_mask;
+};
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags)
+ unsigned long address, pmd_t *pmd, unsigned int flags,
+ struct dev_pagemap **pgmap)
{
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -116,8 +121,8 @@ retry:
* Only return device mapping pages in the FOLL_GET case since
* they are only valid while holding the pgmap reference.
*/
- pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
- if (pgmap)
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
+ if (*pgmap)
page = pte_page(pte);
else
goto no_page;
@@ -152,15 +157,8 @@ retry:
goto retry;
}
- if (flags & FOLL_GET) {
+ if (flags & FOLL_GET)
get_page(page);
-
- /* drop the pgmap reference now that we hold the page */
- if (pgmap) {
- put_dev_pagemap(pgmap);
- pgmap = NULL;
- }
- }
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -210,7 +208,8 @@ no_page:
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
pmd_t *pmd, pmdval;
spinlock_t *ptl;
@@ -258,13 +257,13 @@ retry:
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags);
+ page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(pmdval)))
- return follow_page_pte(vma, address, pmd, flags);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
@@ -284,7 +283,7 @@ retry_locked:
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
if (flags & FOLL_SPLIT) {
int ret;
@@ -307,18 +306,18 @@ retry_locked:
}
return ret ? ERR_PTR(ret) :
- follow_page_pte(vma, address, pmd, flags);
+ follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
+ ctx->page_mask = HPAGE_PMD_NR - 1;
return page;
}
-
static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned long address, p4d_t *p4dp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
pud_t *pud;
spinlock_t *ptl;
@@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
}
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags);
+ page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
@@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
- return follow_pmd_mask(vma, address, pud, flags, page_mask);
+ return follow_pmd_mask(vma, address, pud, flags, ctx);
}
-
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned long address, pgd_t *pgdp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
p4d_t *p4d;
struct page *page;
@@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
- return follow_pud_mask(vma, address, p4d, flags, page_mask);
+ return follow_pud_mask(vma, address, p4d, flags, ctx);
}
/**
@@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
*/
struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+ struct follow_page_context *ctx)
{
pgd_t *pgd;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
- *page_mask = 0;
+ ctx->page_mask = 0;
/* make this handle hugepd */
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
@@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags);
}
- return follow_p4d_mask(vma, address, pgd, flags, page_mask);
+ return follow_p4d_mask(vma, address, pgd, flags, ctx);
+}
+
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
+{
+ struct follow_page_context ctx = { NULL };
+ struct page *page;
+
+ page = follow_page_mask(vma, address, foll_flags, &ctx);
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
- long i = 0;
- unsigned int page_mask;
+ long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
+ struct follow_page_context ctx = { NULL };
if (!nr_pages)
return 0;
@@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pages ? &pages[i] : NULL);
if (ret)
return i ? : ret;
- page_mask = 0;
+ ctx.page_mask = 0;
goto next_page;
}
- if (!vma || check_vma_flags(vma, gup_flags))
- return i ? : -EFAULT;
+ if (!vma || check_vma_flags(vma, gup_flags)) {
+ ret = -EFAULT;
+ goto out;
+ }
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
@@ -709,23 +722,26 @@ retry:
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -ERESTARTSYS;
+ goto out;
+ }
cond_resched();
- page = follow_page_mask(vma, start, foll_flags, &page_mask);
+
+ page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking);
switch (ret) {
case 0:
goto retry;
+ case -EBUSY:
+ ret = 0;
+ /* FALLTHRU */
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
- return i ? i : ret;
- case -EBUSY:
- return i;
+ goto out;
case -ENOENT:
goto next_page;
}
@@ -737,27 +753,31 @@ retry:
*/
goto next_page;
} else if (IS_ERR(page)) {
- return i ? i : PTR_ERR(page);
+ ret = PTR_ERR(page);
+ goto out;
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
- page_mask = 0;
+ ctx.page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
- page_mask = 0;
+ ctx.page_mask = 0;
}
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
- return i;
+out:
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return i ? i : ret;
}
static bool vma_permits_fault(struct vm_area_struct *vma,
@@ -1780,12 +1800,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- unsigned long addr, len, end;
+ unsigned long len, end;
unsigned long flags;
int nr = 0;
start &= PAGE_MASK;
- addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
@@ -1807,7 +1826,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages, write)) {
local_irq_save(flags);
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(start, end, write, pages, &nr);
local_irq_restore(flags);
}
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 7405c9d8..debf113 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,13 +6,17 @@
#include <linux/debugfs.h>
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
struct gup_benchmark {
- __u64 delta_usec;
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
__u64 addr;
__u64 size;
__u32 nr_pages_per_call;
__u32 flags;
+ __u64 expansion[10]; /* For future use */
};
static int __gup_benchmark_ioctl(unsigned int cmd,
@@ -41,21 +45,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
- nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i);
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ nr = get_user_pages_fast(addr, nr, gup->flags & 1,
+ pages + i);
+ break;
+ case GUP_LONGTERM_BENCHMARK:
+ nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
+ pages + i, NULL);
+ break;
+ case GUP_BENCHMARK:
+ nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
+ NULL);
+ break;
+ default:
+ return -1;
+ }
+
if (nr <= 0)
break;
i += nr;
}
end_time = ktime_get();
- gup->delta_usec = ktime_us_delta(end_time, start_time);
+ gup->get_delta_usec = ktime_us_delta(end_time, start_time);
gup->size = addr - gup->addr;
+ start_time = ktime_get();
for (i = 0; i < nr_pages; i++) {
if (!pages[i])
break;
put_page(pages[i]);
}
+ end_time = ktime_get();
+ gup->put_delta_usec = ktime_us_delta(end_time, start_time);
kvfree(pages);
return 0;
@@ -67,8 +90,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
struct gup_benchmark gup;
int ret;
- if (cmd != GUP_FAST_BENCHMARK)
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_LONGTERM_BENCHMARK:
+ case GUP_BENCHMARK:
+ break;
+ default:
return -EINVAL;
+ }
if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
return -EFAULT;
diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49..774d684 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
resource_size_t key, align_start, align_size, align_end;
struct device *device = devmem->device;
int ret, nid, is_ram;
- unsigned long pfn;
align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
align_size = ALIGN(devmem->resource->start +
@@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
align_size >> PAGE_SHIFT, NULL);
mem_hotplug_done();
- for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
- struct page *page = pfn_to_page(pfn);
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, &devmem->pagemap);
- page->pgmap = &devmem->pagemap;
- }
return 0;
error_add_memory:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index deed97f..25ef59b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
@@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pud_lockptr(mm, pud));
@@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
@@ -1562,8 +1558,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
* We are not sure a pending tlb flush here is for a huge page
* mapping or not. Hence use the tlb range variant
*/
- if (mm_tlb_flush_pending(vma->vm_mm))
+ if (mm_tlb_flush_pending(vma->vm_mm)) {
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * change_huge_pmd() released the pmd lock before
+ * invalidating the secondary MMUs sharing the primary
+ * MMU pagetables (with ->invalidate_range()). The
+ * mmu_notifier_invalidate_range_end() (which
+ * internally calls ->invalidate_range()) in
+ * change_pmd_range() will run after us, so we can't
+ * rely on it here and we need an explicit invalidate.
+ */
+ mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ }
/*
* Migrate the THP to the requested node, returns with page unlocked
@@ -2369,6 +2377,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
+ (1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
(1L << PG_dirty)));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5c390f5..7b5c0ad 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3690,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return err;
ClearPagePrivate(page);
+ /*
+ * set page dirty so that it will not be removed from cache/file
+ * by non-hugetlbfs specific code paths.
+ */
+ set_page_dirty(page);
+
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 3a8ddf8..b209dba 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -103,7 +103,7 @@ static int quarantine_head;
static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
-static DEFINE_SPINLOCK(quarantine_lock);
+static DEFINE_RAW_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
@@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- spin_lock(&quarantine_lock);
+ raw_spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
@@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
- spin_unlock(&quarantine_lock);
+ raw_spin_unlock(&quarantine_lock);
}
local_irq_restore(flags);
@@ -230,7 +230,7 @@ void quarantine_reduce(void)
* expected case).
*/
srcu_idx = srcu_read_lock(&remove_cache_srcu);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
@@ -254,7 +254,7 @@ void quarantine_reduce(void)
quarantine_head = 0;
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
srcu_read_unlock(&remove_cache_srcu, srcu_idx);
@@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache)
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
/* Scanning whole quarantine can take a while. */
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
cond_resched();
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 17dd883..4f7e4b5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -86,6 +86,7 @@
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
@@ -181,6 +182,7 @@ struct kmemleak_object {
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
/* number of bytes to print at a time (1, 2, 4, 8) */
@@ -235,6 +237,9 @@ static int kmemleak_skip_disable;
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
+static bool kmemleak_verbose;
+module_param_named(verbose, kmemleak_verbose, bool, 0600);
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
* kernel allocator. However, both the kernel allocator and kmemleak may
@@ -299,6 +304,25 @@ static void kmemleak_disable(void);
kmemleak_disable(); \
} while (0)
+#define warn_or_seq_printf(seq, fmt, ...) do { \
+ if (seq) \
+ seq_printf(seq, fmt, ##__VA_ARGS__); \
+ else \
+ pr_warn(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
+ int rowsize, int groupsize, const void *buf,
+ size_t len, bool ascii)
+{
+ if (seq)
+ seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
+ buf, len, ascii);
+ else
+ print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
+ rowsize, groupsize, buf, len, ascii);
+}
+
/*
* Printing of the objects hex dump to the seq file. The number of lines to be
* printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
@@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq,
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
- seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
- seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
kasan_enable_current();
}
@@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq,
int i;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
- seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+ warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
object->pointer, object->size);
- seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
object->comm, object->pid, object->jiffies,
msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
- seq_printf(seq, " backtrace:\n");
+ warn_or_seq_printf(seq, " backtrace:\n");
for (i = 0; i < object->trace_len; i++) {
void *ptr = (void *)object->trace[i];
- seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
}
}
@@ -1598,6 +1622,10 @@ static void kmemleak_scan(void)
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
+
+ if (kmemleak_verbose)
+ print_unreferenced(NULL, object);
+
new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
diff --git a/mm/memblock.c b/mm/memblock.c
index 2379444..a853150 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw(
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
-#ifdef CONFIG_DEBUG_VM
if (ptr && size > 0)
- memset(ptr, PAGE_POISON_PATTERN, size);
-#endif
+ page_init_poison(ptr, size);
+
return ptr;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e79cb59..10a9b55 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
+ memcg_memory_event(memcg, MEMCG_OOM);
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
@@ -2250,8 +2252,6 @@ retry:
if (fatal_signal_pending(current))
goto force;
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
-
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
@@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;
@@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
queue_work(memcg_kmem_cache_wq, &cw->work);
}
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- /*
- * We need to stop accounting when we kmalloc, because if the
- * corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_kmem_cache_create will recurse.
- *
- * However, it is better to enclose the whole function. Depending on
- * the debugging options enabled, INIT_WORK(), for instance, can
- * trigger an allocation. This too, will make us recurse. Because at
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
- * the safest choice is to do it like this, wrapping the whole function.
- */
- current->memcg_kmem_skip_account = 1;
- __memcg_schedule_kmem_cache_create(memcg, cachep);
- current->memcg_kmem_skip_account = 0;
-}
-
static inline bool memcg_kmem_bypass(void)
{
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
@@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (memcg_kmem_bypass())
return cachep;
- if (current->memcg_kmem_skip_account)
- return cachep;
-
memcg = get_mem_cgroup_from_current();
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
@@ -4321,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
- atomic_add(n, &memcg->id.ref);
+ refcount_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
@@ -4545,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
}
/* Online state pins memcg ID, memcg ID pins CSS */
- atomic_set(&memcg->id.ref, 1);
+ refcount_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
@@ -4573,6 +4549,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
@@ -5595,6 +5573,13 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+ seq_printf(m, "workingset_refault %lu\n",
+ acc.stat[WORKINGSET_REFAULT]);
+ seq_printf(m, "workingset_activate %lu\n",
+ acc.stat[WORKINGSET_ACTIVATE]);
+ seq_printf(m, "workingset_nodereclaim %lu\n",
+ acc.stat[WORKINGSET_NODERECLAIM]);
+
seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
acc.events[PGSCAN_DIRECT]);
@@ -5605,13 +5590,6 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
- seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
- seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
- seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
return 0;
}
@@ -6377,7 +6355,7 @@ subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
diff --git a/mm/memory.c b/mm/memory.c
index 21a5e6e..0721395 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1520,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
- int retval;
pte_t *pte, entry;
spinlock_t *ptl;
- retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out;
- retval = -EBUSY;
+ return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
@@ -1565,56 +1562,32 @@ out_mkwrite:
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
- retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
-out:
- return retval;
-}
-
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_insert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- *
- * vma cannot be a COW mapping.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
-{
- return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+ return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL(vm_insert_pfn);
/**
- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers to
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
- * cow mappings. In general, using multiple vmas is preferable;
- * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * COW mappings. In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
*/
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
- int ret;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1628,19 +1601,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
+}
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
- return ret;
+/**
+ * vmf_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return the result of this function.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
-EXPORT_SYMBOL(vm_insert_pfn_prot);
+EXPORT_SYMBOL(vmf_insert_pfn);
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
@@ -1656,20 +1654,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
return false;
}
-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, bool mkwrite)
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
+ int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -1688,36 +1687,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- return insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot);
+ } else {
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
}
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
-
}
-EXPORT_SYMBOL(vm_insert_mixed);
+EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of PTE failed because someone else already added a
* different entry in the mean time, we treat that as success as we assume
* the same entry was actually inserted.
*/
-
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- int err;
-
- err = __vm_insert_mixed(vma, addr, pfn, true);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_NOPAGE;
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -3498,10 +3496,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
- /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
- if (!vma->vm_ops->fault)
- ret = VM_FAULT_SIGBUS;
- else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ /*
+ * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+ */
+ if (!vma->vm_ops->fault) {
+ /*
+ * If we find a migration pmd entry or a none pmd entry, which
+ * should never happen, return SIGBUS
+ */
+ if (unlikely(!pmd_present(*vmf->pmd)))
+ ret = VM_FAULT_SIGBUS;
+ else {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+ vmf->pmd,
+ vmf->address,
+ &vmf->ptl);
+ /*
+ * Make sure this is not a temporary clearing of pte
+ * by holding ptl and checking again. A R/M/W update
+ * of pte involves: take ptl, clearing the pte so that
+ * we don't have concurrent modification by hardware
+ * followed by an update.
+ */
+ if (unlikely(pte_none(*vmf->pte)))
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = VM_FAULT_NOPAGE;
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
+ } else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);
else if (!(vma->vm_flags & VM_SHARED))
ret = do_cow_fault(vmf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 38d94b7..7e6509a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
{
int nid = zone_to_nid(zone);
- enum zone_type zone_last = ZONE_NORMAL;
- /*
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_NORMAL,
- * set zone_last to ZONE_NORMAL.
- *
- * If we don't have HIGHMEM nor movable node,
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
- */
- if (N_MEMORY == N_NORMAL_MEMORY)
- zone_last = ZONE_MOVABLE;
+ arg->status_change_nid = -1;
+ arg->status_change_nid_normal = -1;
+ arg->status_change_nid_high = -1;
- /*
- * if the memory to be online is in a zone of 0...zone_last, and
- * the zones of 0...zone_last don't have memory before online, we will
- * need to set the node to node_states[N_NORMAL_MEMORY] after
- * the memory is online.
- */
- if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+ if (!node_state(nid, N_MEMORY))
+ arg->status_change_nid = nid;
+ if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
- else
- arg->status_change_nid_normal = -1;
-
#ifdef CONFIG_HIGHMEM
- /*
- * If we have movable node, node_states[N_HIGH_MEMORY]
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
- * set zone_last to ZONE_HIGHMEM.
- *
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_MOVABLE,
- * set zone_last to ZONE_MOVABLE.
- */
- zone_last = ZONE_HIGHMEM;
- if (N_MEMORY == N_HIGH_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
+ if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
- else
- arg->status_change_nid_high = -1;
-#else
- arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
-
- /*
- * if the node don't have memory befor online, we will need to
- * set the node to node_states[N_MEMORY] after the memory
- * is online.
- */
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
- else
- arg->status_change_nid = -1;
}
static void node_states_set_node(int node, struct memory_notify *arg)
@@ -753,7 +710,8 @@ static void node_states_set_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_high >= 0)
node_set_state(node, N_HIGH_MEMORY);
- node_set_state(node, N_MEMORY);
+ if (arg->status_change_nid >= 0)
+ node_set_state(node, N_MEMORY);
}
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
@@ -1505,75 +1463,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long present_pages = 0;
- enum zone_type zt, zone_last = ZONE_NORMAL;
+ enum zone_type zt;
- /*
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_NORMAL,
- * set zone_last to ZONE_NORMAL.
- *
- * If we don't have HIGHMEM nor movable node,
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
- */
- if (N_MEMORY == N_NORMAL_MEMORY)
- zone_last = ZONE_MOVABLE;
+ arg->status_change_nid = -1;
+ arg->status_change_nid_normal = -1;
+ arg->status_change_nid_high = -1;
/*
- * check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is in a zone of 0...zone_last,
- * and it is the last present memory, 0...zone_last will
- * become empty after offline , thus we can determind we will
- * need to clear the node from node_states[N_NORMAL_MEMORY].
+ * Check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offline is within the range
+ * [0..ZONE_NORMAL], and it is the last present memory there,
+ * the zones in that range will become empty after the offlining,
+ * thus we can determine that we need to clear the node from
+ * node_states[N_NORMAL_MEMORY].
*/
- for (zt = 0; zt <= zone_last; zt++)
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
- else
- arg->status_change_nid_normal = -1;
#ifdef CONFIG_HIGHMEM
/*
- * If we have movable node, node_states[N_HIGH_MEMORY]
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
- * set zone_last to ZONE_HIGHMEM.
- *
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_MOVABLE,
- * set zone_last to ZONE_MOVABLE.
+ * node_states[N_HIGH_MEMORY] contains nodes which
+ * have normal memory or high memory.
+ * Here we add the present_pages belonging to ZONE_HIGHMEM.
+ * If the zone is within the range of [0..ZONE_HIGHMEM), and
+ * we determine that the zones in that range become empty,
+ * we need to clear the node for N_HIGH_MEMORY.
*/
- zone_last = ZONE_HIGHMEM;
- if (N_MEMORY == N_HIGH_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- for (; zt <= zone_last; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+ if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
arg->status_change_nid_high = zone_to_nid(zone);
- else
- arg->status_change_nid_high = -1;
-#else
- arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
/*
- * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ * We have accounted the pages from [0..ZONE_NORMAL), and
+ * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
+ * as well.
+ * Here we count the possible pages from ZONE_MOVABLE.
+ * If after having accounted all the pages, we see that the nr_pages
+ * to be offlined is over or equal to the accounted pages,
+ * we know that the node will become empty, and so, we can clear
+ * it for N_MEMORY as well.
*/
- zone_last = ZONE_MOVABLE;
+ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
- /*
- * check whether node_states[N_HIGH_MEMORY] will be changed
- * If we try to offline the last present @nr_pages from the node,
- * we can determind we will need to clear the node from
- * node_states[N_HIGH_MEMORY].
- */
- for (; zt <= zone_last; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
if (nr_pages >= present_pages)
arg->status_change_nid = zone_to_nid(zone);
- else
- arg->status_change_nid = -1;
}
static void node_states_clear_node(int node, struct memory_notify *arg)
@@ -1581,12 +1517,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
- if ((N_MEMORY != N_NORMAL_MEMORY) &&
- (arg->status_change_nid_high >= 0))
+ if (arg->status_change_nid_high >= 0)
node_clear_state(node, N_HIGH_MEMORY);
- if ((N_MEMORY != N_HIGH_MEMORY) &&
- (arg->status_change_nid >= 0))
+ if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index da858f7..cfd26d7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
+ int locked = 1;
+ err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
}
+ if (locked)
+ up_read(&mm->mmap_sem);
return err;
}
@@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(addr);
+ /*
+ * Take a refcount on the mpol, lookup_node()
+ * wil drop the mmap_sem, so after calling
+ * lookup_node() only "pol" remains valid, "vma"
+ * is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
@@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
out:
mpol_cond_put(pol);
if (vma)
- up_read(&current->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
return err;
}
@@ -2697,12 +2711,11 @@ static const char * const policy_modes[] =
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
- unsigned short mode;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int err = 1;
+ int err = 1, mode;
if (nodelist) {
/* NUL-terminate mode or flags string */
@@ -2717,12 +2730,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode < MPOL_MAX; mode++) {
- if (!strcmp(str, policy_modes[mode])) {
- break;
- }
- }
- if (mode >= MPOL_MAX)
+ mode = match_string(policy_modes, MPOL_MAX, str);
+ if (mode < 0)
goto out;
switch (mode) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 84381b5..b6700f2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
+ if (PageWorkingset(page))
+ SetPageWorkingset(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
@@ -1973,8 +1975,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int isolated = 0;
struct page *new_page = NULL;
int page_lru = page_is_file_cache(page);
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
@@ -1997,15 +1998,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
new_page->index = page->index;
+ /* flush the cache before copying using the kernel virtual address */
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
migrate_page_copy(new_page, page);
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -2029,16 +2030,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
/*
- * Clear the old entry under pagetable lock and establish the new PTE.
- * Any parallel GUP will either observe the old page blocking on the
- * page lock, block on the page table lock or observe the new page.
- * The SetPageUptodate on the new page and page_add_new_anon_rmap
- * guarantee the copy is visible before the pagetable update.
+ * Overwrite the old entry under pagetable lock and establish
+ * the new PTE. Any parallel GUP will either observe the old
+ * page blocking on the page lock, block on the page table
+ * lock or observe the new page. The SetPageUptodate on the
+ * new page and page_add_new_anon_rmap guarantee the copy is
+ * visible before the pagetable update.
*/
- flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start, true);
- pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ page_add_anon_rmap(new_page, vma, start, true);
+ /*
+ * At this point the pmd is numa/protnone (i.e. non present) and the TLB
+ * has already been flushed globally. So no TLB can be currently
+ * caching this non present pmd mapping. There's no need to clear the
+ * pmd before doing set_pmd_at(), nor to flush the TLB after
+ * set_pmd_at(). Clearing the pmd here would introduce a race
+ * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
+ * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
+ * pmd.
+ */
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
page_ref_unfreeze(page, 2);
@@ -2047,11 +2058,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
/* Take an "isolate" reference and put new page on the LRU. */
get_page(new_page);
@@ -2075,7 +2081,7 @@ out_fail:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
spin_unlock(ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index f7cd9cb..6c04292 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
- unsigned long newbrk, oldbrk;
+ unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
+ bool downgraded = false;
LIST_HEAD(uf);
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
+ origbrk = mm->brk;
+
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
@@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
- if (oldbrk == newbrk)
- goto set_brk;
+ if (oldbrk == newbrk) {
+ mm->brk = brk;
+ goto success;
+ }
- /* Always allow shrinking brk. */
+ /*
+ * Always allow shrinking brk.
+ * __do_munmap() may downgrade mmap_sem to read.
+ */
if (brk <= mm->brk) {
- if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
- goto set_brk;
- goto out;
+ int ret;
+
+ /*
+ * mm->brk must to be protected by write mmap_sem so update it
+ * before downgrading mmap_sem. When __do_munmap() fails,
+ * mm->brk will be restored from origbrk.
+ */
+ mm->brk = brk;
+ ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
+ if (ret < 0) {
+ mm->brk = origbrk;
+ goto out;
+ } else if (ret == 1) {
+ downgraded = true;
+ }
+ goto success;
}
/* Check against existing mmap mappings. */
@@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
goto out;
-
-set_brk:
mm->brk = brk;
+
+success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- up_write(&mm->mmap_sem);
+ if (downgraded)
+ up_read(&mm->mmap_sem);
+ else
+ up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
- retval = mm->brk;
+ retval = origbrk;
up_write(&mm->mmap_sem);
return retval;
}
@@ -2687,8 +2711,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf)
+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf, bool downgrade)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
@@ -2770,25 +2794,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
+
tmp = tmp->vm_next;
}
}
- /*
- * Remove the vma's, and unmap the actual pages
- */
+ /* Detach vmas from rbtree */
detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
+ /*
+ * mpx unmap needs to be called with mmap_sem held for write.
+ * It is safe to call it before unmap_region().
+ */
arch_unmap(mm, vma, start, end);
+ if (downgrade)
+ downgrade_write(&mm->mmap_sem);
+
+ unmap_region(mm, vma, prev, start, end);
+
/* Fix up all other VM information */
remove_vma_list(mm, vma);
- return 0;
+ return downgrade ? 1 : 0;
}
-int vm_munmap(unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf)
+{
+ return __do_munmap(mm, start, len, uf, false);
+}
+
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
int ret;
struct mm_struct *mm = current->mm;
@@ -2797,17 +2834,32 @@ int vm_munmap(unsigned long start, size_t len)
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = do_munmap(mm, start, len, &uf);
- up_write(&mm->mmap_sem);
+ ret = __do_munmap(mm, start, len, &uf, downgrade);
+ /*
+ * Returning 1 indicates mmap_sem is downgraded.
+ * But 1 is not legal return value of vm_munmap() and munmap(), reset
+ * it to 0 before return.
+ */
+ if (ret == 1) {
+ up_read(&mm->mmap_sem);
+ ret = 0;
+ } else
+ up_write(&mm->mmap_sem);
+
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+int vm_munmap(unsigned long start, size_t len)
+{
+ return __vm_munmap(start, len, false);
+}
EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
profile_munmap(addr);
- return vm_munmap(addr, len);
+ return __vm_munmap(addr, len, true);
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 82bb1a9..5119ff8 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
-/*
- * Must be called while holding mm->mmap_sem for either read or write.
- * The result is guaranteed to be valid until mm->mmap_sem is dropped.
- */
-bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
-{
- struct mmu_notifier *mn;
- int id;
- bool ret = false;
-
- WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
-
- if (!mm_has_notifiers(mm))
- return ret;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (!mn->ops->invalidate_range &&
- !mn->ops->invalidate_range_start &&
- !mn->ops->invalidate_range_end)
- continue;
-
- if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = true;
- break;
- }
- }
- srcu_read_unlock(&srcu, id);
- return ret;
-}
-
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/mm/mremap.c b/mm/mremap.c
index a9617e7..7f9f918 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
@@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * do_munmap does all the needed commit accounting
+ * __do_munmap does all the needed commit accounting, and
+ * downgrades mmap_sem to read if so directed.
*/
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
- if (ret && old_len != new_len)
+ int retval;
+
+ retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (retval < 0 && old_len != new_len) {
+ ret = retval;
goto out;
+ /* Returning 1 indicates mmap_sem is downgraded to read. */
+ } else if (retval == 1)
+ downgraded = true;
ret = addr;
goto out;
}
@@ -627,7 +636,10 @@ out:
vm_unacct_memory(charged);
locked = 0;
}
- up_write(&current->mm->mmap_sem);
+ if (downgraded)
+ up_read(&current->mm->mmap_sem);
+ else
+ up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
userfaultfd_unmap_complete(mm, &uf_unmap_early);
diff --git a/mm/nommu.c b/mm/nommu.c
index e4aac33..749276b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return ret;
}
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
{
- *page_mask = 0;
return NULL;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 84ae9bf..439a304 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2149,6 +2149,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2162,7 +2169,6 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
int tag;
@@ -2170,23 +2176,17 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
@@ -2272,17 +2272,14 @@ continue_unlock:
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * back the index back to the start of the file for the next
+ * time we are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2ef1c1..863d46d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
+#include <linux/psi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -306,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
/* Always populate low zones for address-constrained allocations */
- if (zone_end < pgdat_end_pfn(pgdat))
- return true;
- (*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- pgdat->first_deferred_pfn = pfn;
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
+ nr_initialised++;
+ if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
}
-
- return true;
+ return false;
}
#else
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -331,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- return true;
+ return false;
}
#endif
@@ -1231,7 +1239,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
- SetPageReserved(page);
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
}
@@ -2015,10 +2028,6 @@ static int move_freepages(struct zone *zone,
pfn_valid(page_to_pfn(end_page)) &&
page_zone(start_page) != page_zone(end_page));
#endif
-
- if (num_movable)
- *num_movable = 0;
-
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -2058,6 +2067,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
+ if (num_movable)
+ *num_movable = 0;
+
start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
@@ -3366,26 +3378,12 @@ try_this_zone:
return NULL;
}
-/*
- * Large machines with many possible nodes should not always dump per-node
- * meminfo in irq context.
- */
-static inline bool should_suppress_show_mem(void)
-{
- bool ret = false;
-
-#if NODES_SHIFT > 8
- ret = in_interrupt();
-#endif
- return ret;
-}
-
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
+ if (!__ratelimit(&show_mem_rs))
return;
/*
@@ -3549,15 +3547,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
+ unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
return NULL;
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
@@ -3756,11 +3759,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
+ unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
+ psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
@@ -3772,6 +3777,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
current->reclaim_state = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
+ psi_memstall_leave(&pflags);
cond_resched();
@@ -3922,6 +3928,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
{
struct zone *zone;
struct zoneref *z;
+ bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +3992,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
}
}
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that
- * a particular WQ is congested if the worker thread is
- * looping without ever sleeping. Therefore we have to
- * do a short sleep here rather than calling
- * cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
- return true;
+ ret = true;
+ goto out;
}
}
- return false;
+out:
+ /*
+ * Memory allocation/reclaim might be called from a WQ context and the
+ * current implementation of the WQ concurrency control doesn't
+ * recognize that a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+ return ret;
}
static inline bool
@@ -4701,6 +4707,7 @@ long si_mem_available(void)
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
struct zone *zone;
int lru;
@@ -4726,19 +4733,13 @@ long si_mem_available(void)
available += pagecache;
/*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
*/
- available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
- wmark_low);
-
- /*
- * Part of the kernel memory, which can be released under memory
- * pressure.
- */
- available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
- PAGE_SHIFT;
+ reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
available = 0;
@@ -5449,6 +5450,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -5458,67 +5483,118 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
- unsigned long end_pfn = start_pfn + size;
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long pfn;
- unsigned long nr_initialised = 0;
+ unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- struct memblock_region *r = NULL, *tmp;
-#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
+#ifdef CONFIG_ZONE_DEVICE
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
- * memory
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
*/
- if (altmap && start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
-
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
- break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Check given memblock attribute by firmware which can affect
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
- * mirrored, it's an overlapped memmap init. skip it.
- */
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, tmp)
- if (pfn < memblock_region_memory_end_pfn(tmp))
- break;
- r = tmp;
- }
- if (pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- /* already initialized as NORMAL */
- pfn = memblock_region_memory_end_pfn(r);
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
continue;
- }
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, end_pfn))
+ break;
}
-#endif
-not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
- SetPageReserved(page);
+ __SetPageReserved(page);
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ */
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+ }
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long size,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long zone_idx = zone_idx(zone);
+ unsigned long start = jiffies;
+ int nid = pgdat->node_id;
+
+ if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+ return;
+
+ /*
+ * The call to memmap_init_zone should have already taken care
+ * of the pages reserved for the memmap, so we can just jump to
+ * the end of that region and start processing the device pages.
+ */
+ if (pgmap->altmap_valid) {
+ struct vmem_altmap *altmap = &pgmap->altmap;
+
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ size = end_pfn - start_pfn;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer and hmm_data. It is a bug if a ZONE_DEVICE
+ * page is ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->hmm_data = 0;
/*
* Mark the block movable so that blocks are reserved for
@@ -5540,8 +5616,12 @@ not_early:
cond_resched();
}
}
+
+ pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+ size, jiffies_to_msecs(jiffies - start));
}
+#endif
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
@@ -5551,10 +5631,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
static int zone_batchsize(struct zone *zone)
{
@@ -6428,45 +6509,65 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
}
#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
+
+/*
+ * Zero all valid struct pages in range [spfn, epfn), return number of struct
+ * pages zeroed
+ */
+static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ return pgcnt;
+}
+
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
+ *
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+ * layout is manually configured via memmap=.
*/
void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
- unsigned long pfn;
u64 i, pgcnt;
+ phys_addr_t next = 0;
/*
- * Loop through ranges that are reserved, but do not have reported
- * physical memory backing.
+ * Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_resv_unavail_range(i, &start, &end) {
- for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- mm_zero_struct_page(pfn_to_page(pfn));
- pgcnt++;
- }
+ for_each_mem_range(i, &memblock.memory, NULL,
+ NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (next < start)
+ pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ next = end;
}
+ pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
/*
* Struct pages that do not have backing memory. This could be because
* firmware is using some of this memory, or for some other reasons.
- * Once memblock is changed so such behaviour is not allowed: i.e.
- * list of "reserved" memory must be a subset of list of "memory", then
- * this code can be removed.
*/
if (pgcnt)
- pr_info("Reserved but unavailable: %lld pages", pgcnt);
+ pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
@@ -6803,15 +6904,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
{
enum zone_type zone_type;
- if (N_MEMORY == N_NORMAL_MEMORY)
- return;
-
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
- node_set_state(nid, N_HIGH_MEMORY);
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
- zone_type <= ZONE_NORMAL)
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 573d366..a451ffa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/slab.c b/mm/slab.c
index aa76a70..2a5654b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
+ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
kmalloc_info[INDEX_NODE].name,
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
0, kmalloc_size(INDEX_NODE));
@@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void)
for_each_online_node(nid) {
init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
- init_list(kmalloc_caches[INDEX_NODE],
+ init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
&init_kmem_cache_node[SIZE_NODE + nid], nid);
}
}
@@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
struct kmem_cache *cachep;
void *ret;
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
@@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
struct kmem_cache *cachep;
void *ret;
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fea3376..7eb8dc1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
return s;
}
-struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+struct kmem_cache *
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);
-#ifdef CONFIG_ZONE_DMA
-struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
-EXPORT_SYMBOL(kmalloc_dma_caches);
-#endif
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -1027,25 +1023,20 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
unsigned int index;
- if (unlikely(size > KMALLOC_MAX_SIZE)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
if (size <= 192) {
if (!size)
return ZERO_SIZE_PTR;
index = size_index[size_index_elem(size)];
- } else
+ } else {
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ WARN_ON(1);
+ return NULL;
+ }
index = fls(size - 1);
+ }
-#ifdef CONFIG_ZONE_DMA
- if (unlikely((flags & GFP_DMA)))
- return kmalloc_dma_caches[index];
-
-#endif
- return kmalloc_caches[index];
+ return kmalloc_caches[kmalloc_type(flags)][index];
}
/*
@@ -1059,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
{"kmalloc-16", 16}, {"kmalloc-32", 32},
{"kmalloc-64", 64}, {"kmalloc-128", 128},
{"kmalloc-256", 256}, {"kmalloc-512", 512},
- {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
- {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
- {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
- {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
- {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
- {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
- {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
- {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
- {"kmalloc-67108864", 67108864}
+ {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
+ {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
+ {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
+ {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
+ {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
+ {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
+ {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
+ {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
+ {"kmalloc-64M", 67108864}
};
/*
@@ -1117,9 +1108,36 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
+static const char *
+kmalloc_cache_name(const char *prefix, unsigned int size)
+{
+
+ static const char units[3] = "\0kM";
+ int idx = 0;
+
+ while (size >= 1024 && (size % 1024 == 0)) {
+ size /= 1024;
+ idx++;
+ }
+
+ return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
+}
+
+static void __init
+new_kmalloc_cache(int idx, int type, slab_flags_t flags)
{
- kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
+ const char *name;
+
+ if (type == KMALLOC_RECLAIM) {
+ flags |= SLAB_RECLAIM_ACCOUNT;
+ name = kmalloc_cache_name("kmalloc-rcl",
+ kmalloc_info[idx].size);
+ BUG_ON(!name);
+ } else {
+ name = kmalloc_info[idx].name;
+ }
+
+ kmalloc_caches[type][idx] = create_kmalloc_cache(name,
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
}
@@ -1131,21 +1149,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
*/
void __init create_kmalloc_caches(slab_flags_t flags)
{
- int i;
+ int i, type;
- for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
- if (!kmalloc_caches[i])
- new_kmalloc_cache(i, flags);
+ for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+ if (!kmalloc_caches[type][i])
+ new_kmalloc_cache(i, type, flags);
- /*
- * Caches that are not of the two-to-the-power-of size.
- * These have to be created immediately after the
- * earlier power of two caches
- */
- if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
- new_kmalloc_cache(1, flags);
- if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
- new_kmalloc_cache(2, flags);
+ /*
+ * Caches that are not of the two-to-the-power-of size.
+ * These have to be created immediately after the
+ * earlier power of two caches
+ */
+ if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
+ !kmalloc_caches[type][1])
+ new_kmalloc_cache(1, type, flags);
+ if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
+ !kmalloc_caches[type][2])
+ new_kmalloc_cache(2, type, flags);
+ }
}
/* Kmalloc array is now usable */
@@ -1153,16 +1175,15 @@ void __init create_kmalloc_caches(slab_flags_t flags)
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
+ struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
unsigned int size = kmalloc_size(i);
- char *n = kasprintf(GFP_NOWAIT,
- "dma-kmalloc-%u", size);
+ const char *n = kmalloc_cache_name("dma-kmalloc", size);
BUG_ON(!n);
- kmalloc_dma_caches[i] = create_kmalloc_cache(n,
- size, SLAB_CACHE_DMA | flags, 0, 0);
+ kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
+ n, size, SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
diff --git a/mm/slub.c b/mm/slub.c
index 8da34a8..e3629cd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1276,16 +1276,54 @@ out:
__setup("slub_debug", setup_slub_debug);
+/*
+ * kmem_cache_flags - apply debugging options to the cache
+ * @object_size: the size of an object without meta data
+ * @flags: flags to set
+ * @name: name of the cache
+ * @ctor: constructor function
+ *
+ * Debug option(s) are applied to @flags. In addition to the debug
+ * option(s), if a slab name (or multiple) is specified i.e.
+ * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * then only the select slabs will receive the debug option(s).
+ */
slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
- /*
- * Enable debugging if selected on the kernel commandline.
- */
- if (slub_debug && (!slub_debug_slabs || (name &&
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
- flags |= slub_debug;
+ char *iter;
+ size_t len;
+
+ /* If slub_debug = 0, it folds into the if conditional. */
+ if (!slub_debug_slabs)
+ return flags | slub_debug;
+
+ len = strlen(name);
+ iter = slub_debug_slabs;
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchr(iter, ',');
+ if (!end)
+ end = iter + strlen(iter);
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
+
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= slub_debug;
+ break;
+ }
+
+ if (!*end)
+ break;
+ iter = end + 1;
+ }
return flags;
}
@@ -3621,9 +3659,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
- sizeof(long),
- GFP_ATOMIC);
+ unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, text, s->name);
@@ -3638,7 +3674,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
slab_unlock(page);
- kfree(map);
+ bitmap_free(map);
#endif
}
@@ -4411,10 +4447,8 @@ static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map)
return -ENOMEM;
@@ -4422,7 +4456,7 @@ static long validate_slab_cache(struct kmem_cache *s)
flush_all(s);
for_each_kmem_cache_node(s, node, n)
count += validate_slab_node(s, n, map);
- kfree(map);
+ bitmap_free(map);
return count;
}
/*
@@ -4573,14 +4607,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
- kfree(map);
+ bitmap_free(map);
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4646,7 +4678,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
- kfree(map);
+ bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
@@ -4657,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
static void __init resiliency_test(void)
{
u8 *p;
+ int type = KMALLOC_NORMAL;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
@@ -4669,7 +4702,7 @@ static void __init resiliency_test(void)
pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
p + 16);
- validate_slab_cache(kmalloc_caches[4]);
+ validate_slab_cache(kmalloc_caches[type][4]);
/* Hmmm... The next two are dangerous */
p = kzalloc(32, GFP_KERNEL);
@@ -4678,33 +4711,33 @@ static void __init resiliency_test(void)
p);
pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[5]);
+ validate_slab_cache(kmalloc_caches[type][5]);
p = kzalloc(64, GFP_KERNEL);
p += 64 + (get_cycles() & 0xff) * sizeof(void *);
*p = 0x56;
pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
p);
pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[6]);
+ validate_slab_cache(kmalloc_caches[type][6]);
pr_err("\nB. Corruption after free\n");
p = kzalloc(128, GFP_KERNEL);
kfree(p);
*p = 0x78;
pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[7]);
+ validate_slab_cache(kmalloc_caches[type][7]);
p = kzalloc(256, GFP_KERNEL);
kfree(p);
p[50] = 0x9a;
pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[8]);
+ validate_slab_cache(kmalloc_caches[type][8]);
p = kzalloc(512, GFP_KERNEL);
kfree(p);
p[512] = 0xab;
pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[9]);
+ validate_slab_cache(kmalloc_caches[type][9]);
}
#else
#ifdef CONFIG_SYSFS
diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07ee..67ad061 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
goto out;
}
-#ifdef CONFIG_DEBUG_VM
/*
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
-#endif
+ page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
section_mark_present(ms);
sparse_init_one_section(ms, section_nr, memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 26fc9b5..87a54c8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,7 +29,6 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
-#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ecee9c6..0d6a7f2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Initiate read into locked page and return.
*/
+ SetPageWorkingset(new_page);
lru_cache_add_anon(new_page);
*new_page_allocated = true;
return new_page;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d954b71..644f746 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY 0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED 0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL 0x4
+
/* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ page = find_get_page(swap_address_space(entry), offset);
if (!page)
return 0;
/*
- * This function is called from scan_swap_map() and it's called
- * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
- * We have to use trylock for avoiding deadlock. This is a special
+ * When this function is called from scan_swap_map_slots() and it's
+ * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * here. We have to use trylock for avoiding deadlock. This is a special
* case and you should use try_to_free_swap() with explicit lock_page()
* in usual operations.
*/
if (trylock_page(page)) {
- ret = try_to_free_swap(page);
+ if ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+ ret = try_to_free_swap(page);
unlock_page(page);
}
put_page(page);
@@ -780,7 +793,7 @@ checks:
int swap_was_freed;
unlock_cluster(ci);
spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
@@ -919,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
cluster_set_count_flag(ci, 0, 0);
free_cluster(si, idx);
unlock_cluster(ci);
@@ -989,7 +1003,7 @@ start_over:
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FILE))
+ if (!(si->flags & SWP_FS))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -1169,6 +1183,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
ci = lock_cluster_or_swap_info(p, offset);
usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+ if (!usage)
+ free_swap_slot(entry);
return usage;
}
@@ -1199,10 +1215,8 @@ void swap_free(swp_entry_t entry)
struct swap_info_struct *p;
p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, 1))
- free_swap_slot(entry);
- }
+ if (p)
+ __swap_entry_free(p, entry, 1);
}
/*
@@ -1237,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
swap_free_cluster(si, idx);
spin_unlock(&si->lock);
@@ -1612,7 +1623,6 @@ int try_to_free_swap(struct page *page)
int free_swap_and_cache(swp_entry_t entry)
{
struct swap_info_struct *p;
- struct page *page = NULL;
unsigned char count;
if (non_swap_entry(entry))
@@ -1622,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry)
if (p) {
count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
- if (page && !trylock_page(page)) {
- put_page(page);
- page = NULL;
- }
- } else if (!count)
- free_swap_slot(entry);
- }
- if (page) {
- /*
- * Not mapped elsewhere, or swap space full? Free it!
- * Also recheck PageSwapCache now page is locked (above).
- */
- if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- }
- unlock_page(page);
- put_page(page);
+ !swap_page_trans_huge_swapped(p, entry))
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
}
return p != NULL;
}
@@ -2310,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
kfree(se);
}
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_ACTIVATED) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
- sis->flags &= ~SWP_FILE;
- mapping->a_ops->swap_deactivate(swap_file);
+ sis->flags &= ~SWP_ACTIVATED;
+ if (mapping->a_ops->swap_deactivate)
+ mapping->a_ops->swap_deactivate(swap_file);
}
}
@@ -2364,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
list_add_tail(&new_se->list, &sis->first_swap_extent.list);
return 1;
}
+EXPORT_SYMBOL_GPL(add_swap_extent);
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
@@ -2411,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+ if (ret >= 0)
+ sis->flags |= SWP_ACTIVATED;
if (!ret) {
- sis->flags |= SWP_FILE;
+ sis->flags |= SWP_FS;
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
}
diff --git a/mm/util.c b/mm/util.c
index 470f5cd..8bf08b5 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -435,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node);
* It is slightly more efficient to use kfree() or vfree() if you are certain
* that you know which one to use.
*
- * Context: Any context except NMI.
+ * Context: Either preemptible task context or not-NMI interrupt.
*/
void kvfree(const void *addr)
{
@@ -678,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* Part of the kernel memory, which can be released
* under memory pressure.
*/
- free += global_node_page_state(
- NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
+ free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a728fc4..97d4b25 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr)
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
+ * May sleep if called *not* from interrupt context.
+ *
* NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
@@ -1585,6 +1587,8 @@ void vfree(const void *addr)
kmemleak_free(addr);
+ might_sleep_if(!in_interrupt());
+
if (!addr)
return;
if (unlikely(in_interrupt()))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5ef724..28c9ae5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -49,6 +49,7 @@
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
+#include <linux/psi.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -473,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
/*
* Make sure we apply some minimal pressure on default priority
@@ -2145,6 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
@@ -2456,9 +2467,11 @@ out:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page
+ * because of a round-off error.
*/
- scan = div64_u64(scan * fraction[file],
- denominator);
+ scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
@@ -3302,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long pflags;
int nid;
unsigned int noreclaim_flag;
struct scan_control sc = {
@@ -3330,9 +3344,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.gfp_mask,
sc.reclaim_idx);
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3497,6 +3515,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long pflags;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -3507,6 +3526,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_swap = 1,
};
+ psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN);
@@ -3608,6 +3628,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
+ psi_memstall_leave(&pflags);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7878da7..6038ce5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = {
"nr_slab_unreclaimable",
"nr_isolated_anon",
"nr_isolated_file",
+ "workingset_nodes",
"workingset_refault",
"workingset_activate",
+ "workingset_restore",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
@@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
- "", /* nr_indirectly_reclaimable */
+ "nr_kernel_misc_reclaimable",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1663,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
stat_items_size += sizeof(struct vm_event_state);
#endif
+ BUILD_BUG_ON(stat_items_size !=
+ ARRAY_SIZE(vmstat_text) * sizeof(unsigned long));
v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
@@ -1706,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
- /* Skip hidden vmstat items. */
- if (*vmstat_text[off] == '\0')
- return 0;
-
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
diff --git a/mm/workingset.c b/mm/workingset.c
index 4516dd7..cbc13d4 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -121,7 +121,7 @@
* the only thing eating into inactive list space is active pages.
*
*
- * Activating refaulting pages
+ * Refaulting inactive pages
*
* All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given
@@ -134,6 +134,10 @@
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
* used once will be evicted from memory.
@@ -141,6 +145,14 @@
* But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache.
*
+ * Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
*
* Implementation
*
@@ -156,8 +168,7 @@
*/
#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
- NODES_SHIFT + \
- MEM_CGROUP_ID_SHIFT)
+ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
@@ -170,23 +181,28 @@
*/
static unsigned int bucket_order __read_mostly;
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
{
eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+ eviction = (eviction << 1) | workingset;
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, bool *workingsetp)
{
unsigned long entry = (unsigned long)shadow;
int memcgid, nid;
+ bool workingset;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+ workingset = entry & 1;
+ entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+ *workingsetp = workingset;
}
/**
@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, pgdat, eviction);
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
/**
* workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
*/
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
{
unsigned long refault_distance;
+ struct pglist_data *pgdat;
unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
- struct pglist_data *pgdat;
+ bool workingset;
int memcgid;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow)
* configurations instead.
*/
memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg) {
- rcu_read_unlock();
- return false;
- }
+ if (!mem_cgroup_disabled() && !memcg)
+ goto out;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
/*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
+ * Calculate the refault distance
*
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * inactive_age to lap a shadow entry in the field, which can
+ * then result in a false small refault distance, leading to a
+ * false activation should this old entry actually refault
+ * again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
- if (refault_distance <= active_file) {
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
- rcu_read_unlock();
- return true;
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */
+ if (refault_distance > active_file)
+ goto out;
+
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
}
+out:
rcu_read_unlock();
- return false;
}
/**
@@ -350,12 +377,20 @@ void workingset_update_node(struct radix_tree_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
+ VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+
if (node->count && node->count == node->exceptional) {
- if (list_empty(&node->private_list))
+ if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
+ __inc_lruvec_page_state(virt_to_page(node),
+ WORKINGSET_NODES);
+ }
} else {
- if (!list_empty(&node->private_list))
+ if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
+ __dec_lruvec_page_state(virt_to_page(node),
+ WORKINGSET_NODES);
+ }
}
}
@@ -364,7 +399,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
{
unsigned long max_nodes;
unsigned long nodes;
- unsigned long cache;
+ unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
@@ -390,14 +425,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
*
* PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
*/
+#ifdef CONFIG_MEMCG
if (sc->memcg) {
- cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL_FILE);
- } else {
- cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
- node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
- }
- max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
+ struct lruvec *lruvec;
+
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL);
+ lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
+ pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+ } else
+#endif
+ pages = node_present_pages(sc->nid);
+
+ max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);
if (!nodes)
return SHRINK_EMPTY;
@@ -440,6 +481,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
list_lru_isolate(lru, item);
+ __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
+
spin_unlock(lru_lock);
/*
@@ -467,7 +510,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
- inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->i_pages, node,
workingset_lookup_update(mapping));
@@ -491,7 +534,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
- .seeks = DEFAULT_SEEKS,
+ .seeks = 0, /* ->count reports only fully expendable nodes */
.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9da6555..0787d33 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle,
case ZPOOL_MM_WO:
zs_mm = ZS_MM_WO;
break;
- case ZPOOL_MM_RW: /* fallthru */
+ case ZPOOL_MM_RW: /* fall through */
default:
zs_mm = ZS_MM_RW;
break;
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 26de7d5..4fa070f 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -203,7 +203,7 @@ regex_c=(
'/\<DECLARE_\(TASKLET\|WORK\|DELAYED_WORK\)(\([[:alnum:]_]*\)/\2/v/'
'/\(^\s\)OFFSET(\([[:alnum:]_]*\)/\2/v/'
'/\(^\s\)DEFINE(\([[:alnum:]_]*\)/\2/v/'
- '/\<DEFINE_HASHTABLE(\([[:alnum:]_]*\)/\1/v/'
+ '/\<\(DEFINE\|DECLARE\)_HASHTABLE(\([[:alnum:]_]*\)/\2/v/'
)
regex_kconfig=(
'/^[[:blank:]]*\(menu\|\)config[[:blank:]]\+\([[:alnum:]_]\+\)/\2/'
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 9f420d9..8cb504d 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t)
"SWAP %15s%15s%15s\n"
" %15llu%15llu%15llums\n"
"RECLAIM %12s%15s%15s\n"
+ " %15llu%15llu%15llums\n"
+ "THRASHING%12s%15s%15s\n"
" %15llu%15llu%15llums\n",
"count", "real total", "virtual total",
"delay total", "delay average",
@@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t)
"count", "delay total", "delay average",
(unsigned long long)t->freepages_count,
(unsigned long long)t->freepages_delay_total,
- average_ms(t->freepages_delay_total, t->freepages_count));
+ average_ms(t->freepages_delay_total, t->freepages_count),
+ "count", "delay total", "delay average",
+ (unsigned long long)t->thrashing_count,
+ (unsigned long long)t->thrashing_delay_total,
+ average_ms(t->thrashing_delay_total, t->thrashing_count));
}
static void task_context_switch_counts(struct taskstats *t)
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index af5ff83..31b3c98 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -13,3 +13,4 @@ mlock-random-test
virtual_address_range
gup_benchmark
va_128TBswitch
+map_fixed_noreplace
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index e94b7b1..6e67e72 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark
TEST_GEN_FILES += hugepage-mmap
TEST_GEN_FILES += hugepage-shm
TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_fixed_noreplace
TEST_GEN_FILES += map_populate
TEST_GEN_FILES += mlock-random-test
TEST_GEN_FILES += mlock2-tests
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
index 36df551..880b96f 100644
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -15,9 +15,12 @@
#define PAGE_SIZE sysconf(_SC_PAGESIZE)
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
struct gup_benchmark {
- __u64 delta_usec;
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
__u64 addr;
__u64 size;
__u32 nr_pages_per_call;
@@ -28,10 +31,12 @@ int main(int argc, char **argv)
{
struct gup_benchmark gup;
unsigned long size = 128 * MB;
- int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+ int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+ int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE;
+ char *file = "/dev/zero";
char *p;
- while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) {
+ while ((opt = getopt(argc, argv, "m:r:n:f:tTLUSH")) != -1) {
switch (opt) {
case 'm':
size = atoi(optarg) * MB;
@@ -48,13 +53,36 @@ int main(int argc, char **argv)
case 'T':
thp = 0;
break;
+ case 'L':
+ cmd = GUP_LONGTERM_BENCHMARK;
+ break;
+ case 'U':
+ cmd = GUP_BENCHMARK;
+ break;
case 'w':
write = 1;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 'S':
+ flags &= ~MAP_PRIVATE;
+ flags |= MAP_SHARED;
+ break;
+ case 'H':
+ flags |= MAP_HUGETLB;
+ break;
default:
return -1;
}
}
+ filed = open(file, O_RDWR|O_CREAT);
+ if (filed < 0) {
+ perror("open");
+ exit(filed);
+ }
+
gup.nr_pages_per_call = nr_pages;
gup.flags = write;
@@ -62,8 +90,7 @@ int main(int argc, char **argv)
if (fd == -1)
perror("open"), exit(1);
- p = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
if (p == MAP_FAILED)
perror("mmap"), exit(1);
gup.addr = (unsigned long)p;
@@ -78,10 +105,11 @@ int main(int argc, char **argv)
for (i = 0; i < repeats; i++) {
gup.size = size;
- if (ioctl(fd, GUP_FAST_BENCHMARK, &gup))
+ if (ioctl(fd, cmd, &gup))
perror("ioctl"), exit(1);
- printf("Time: %lld us", gup.delta_usec);
+ printf("Time: get:%lld put:%lld us", gup.get_delta_usec,
+ gup.put_delta_usec);
if (gup.size != size)
printf(", truncated (size: %lld)", gup.size);
printf("\n");
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c
new file mode 100644
index 0000000..d91bde5
--- /dev/null
+++ b/tools/testing/selftests/vm/map_fixed_noreplace.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test that MAP_FIXED_NOREPLACE works.
+ *
+ * Copyright 2018, Jann Horn <jannh@google.com>
+ * Copyright 2018, Michael Ellerman, IBM Corporation.
+ */
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
+
+#define BASE_ADDRESS (256ul * 1024 * 1024)
+
+
+static void dump_maps(void)
+{
+ char cmd[32];
+
+ snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+ system(cmd);
+}
+
+int main(void)
+{
+ unsigned long flags, addr, size, page_size;
+ char *p;
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+
+ // Check we can map all the areas we need below
+ errno = 0;
+ addr = BASE_ADDRESS;
+ size = 5 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error: couldn't map the space we need for the test\n");
+ return 1;
+ }
+
+ errno = 0;
+ if (munmap((void *)addr, 5 * page_size) != 0) {
+ dump_maps();
+ printf("Error: munmap failed!?\n");
+ return 1;
+ }
+ printf("unmap() successful\n");
+
+ errno = 0;
+ addr = BASE_ADDRESS + page_size;
+ size = 3 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error: first mmap() failed unexpectedly\n");
+ return 1;
+ }
+
+ /*
+ * Exact same mapping again:
+ * base | free | new
+ * +1 | mapped | new
+ * +2 | mapped | new
+ * +3 | mapped | new
+ * +4 | free | new
+ */
+ errno = 0;
+ addr = BASE_ADDRESS;
+ size = 5 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:1: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Second mapping contained within first:
+ *
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped | new
+ * +3 | mapped |
+ * +4 | free |
+ */
+ errno = 0;
+ addr = BASE_ADDRESS + (2 * page_size);
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:2: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Overlap end of existing mapping:
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped | new
+ * +4 | free | new
+ */
+ errno = 0;
+ addr = BASE_ADDRESS + (3 * page_size);
+ size = 2 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:3: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Overlap start of existing mapping:
+ * base | free | new
+ * +1 | mapped | new
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free |
+ */
+ errno = 0;
+ addr = BASE_ADDRESS;
+ size = 2 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:4: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Adjacent to start of existing mapping:
+ * base | free | new
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free |
+ */
+ errno = 0;
+ addr = BASE_ADDRESS;
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error:5: mmap() failed when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Adjacent to end of existing mapping:
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free | new
+ */
+ errno = 0;
+ addr = BASE_ADDRESS + (4 * page_size);
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error:6: mmap() failed when it shouldn't have\n");
+ return 1;
+ }
+
+ addr = BASE_ADDRESS;
+ size = 5 * page_size;
+ if (munmap((void *)addr, size) != 0) {
+ dump_maps();
+ printf("Error: munmap failed!?\n");
+ return 1;
+ }
+ printf("unmap() successful\n");
+
+ printf("OK\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 7b8171e..5d1db82 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -34,18 +34,6 @@
* per-CPU threads 1 by triggering userfaults inside
* pthread_mutex_lock will also verify the atomicity of the memory
* transfer (UFFDIO_COPY).
- *
- * The program takes two parameters: the amounts of physical memory in
- * megabytes (MiB) of the area and the number of bounces to execute.
- *
- * # 100MiB 99999 bounces
- * ./userfaultfd 100 99999
- *
- * # 1GiB 99 bounces
- * ./userfaultfd 1000 99
- *
- * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
- * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
*/
#define _GNU_SOURCE
@@ -115,6 +103,30 @@ pthread_attr_t attr;
~(unsigned long)(sizeof(unsigned long long) \
- 1)))
+const char *examples =
+ "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+ "./userfaultfd anon 100 99999\n\n"
+ "# Run share memory test on 1GiB region with 99 bounces:\n"
+ "./userfaultfd shmem 1000 99\n\n"
+ "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
+ "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
+ "# Run the same hugetlb test but using shmem:\n"
+ "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
+ "# 10MiB-~6GiB 999 bounces anonymous test, "
+ "continue forever unless an error triggers\n"
+ "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+
+static void usage(void)
+{
+ fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
+ "[hugetlbfs_file]\n\n");
+ fprintf(stderr, "Supported <test type>: anon, hugetlb, "
+ "hugetlb_shared, shmem\n\n");
+ fprintf(stderr, "Examples:\n\n");
+ fprintf(stderr, examples);
+ exit(1);
+}
+
static int anon_release_pages(char *rel_area)
{
int ret = 0;
@@ -439,6 +451,43 @@ static int copy_page(int ufd, unsigned long offset)
return __copy_page(ufd, offset, false);
}
+static int uffd_read_msg(int ufd, struct uffd_msg *msg)
+{
+ int ret = read(uffd, msg, sizeof(*msg));
+
+ if (ret != sizeof(*msg)) {
+ if (ret < 0) {
+ if (errno == EAGAIN)
+ return 1;
+ else
+ perror("blocking read error"), exit(1);
+ } else {
+ fprintf(stderr, "short read\n"), exit(1);
+ }
+ }
+
+ return 0;
+}
+
+/* Return 1 if page fault handled by us; otherwise 0 */
+static int uffd_handle_page_fault(struct uffd_msg *msg)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg->event), exit(1);
+
+ if (bounces & BOUNCE_VERIFY &&
+ msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+
+ return copy_page(uffd, offset);
+}
+
static void *uffd_poll_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
@@ -446,7 +495,6 @@ static void *uffd_poll_thread(void *arg)
struct uffd_msg msg;
struct uffdio_register uffd_reg;
int ret;
- unsigned long offset;
char tmp_chr;
unsigned long userfaults = 0;
@@ -470,25 +518,15 @@ static void *uffd_poll_thread(void *arg)
if (!(pollfd[0].revents & POLLIN))
fprintf(stderr, "pollfd[0].revents %d\n",
pollfd[0].revents), exit(1);
- ret = read(uffd, &msg, sizeof(msg));
- if (ret < 0) {
- if (errno == EAGAIN)
- continue;
- perror("nonblocking read error"), exit(1);
- }
+ if (uffd_read_msg(uffd, &msg))
+ continue;
switch (msg.event) {
default:
fprintf(stderr, "unexpected msg event %u\n",
msg.event), exit(1);
break;
case UFFD_EVENT_PAGEFAULT:
- if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
- fprintf(stderr, "unexpected write fault\n"), exit(1);
- offset = (char *)(unsigned long)msg.arg.pagefault.address -
- area_dst;
- offset &= ~(page_size-1);
- if (copy_page(uffd, offset))
- userfaults++;
+ userfaults += uffd_handle_page_fault(&msg);
break;
case UFFD_EVENT_FORK:
close(uffd);
@@ -516,8 +554,6 @@ static void *uffd_read_thread(void *arg)
{
unsigned long *this_cpu_userfaults;
struct uffd_msg msg;
- unsigned long offset;
- int ret;
this_cpu_userfaults = (unsigned long *) arg;
*this_cpu_userfaults = 0;
@@ -526,24 +562,9 @@ static void *uffd_read_thread(void *arg)
/* from here cancellation is ok */
for (;;) {
- ret = read(uffd, &msg, sizeof(msg));
- if (ret != sizeof(msg)) {
- if (ret < 0)
- perror("blocking read error"), exit(1);
- else
- fprintf(stderr, "short read\n"), exit(1);
- }
- if (msg.event != UFFD_EVENT_PAGEFAULT)
- fprintf(stderr, "unexpected msg event %u\n",
- msg.event), exit(1);
- if (bounces & BOUNCE_VERIFY &&
- msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
- fprintf(stderr, "unexpected write fault\n"), exit(1);
- offset = (char *)(unsigned long)msg.arg.pagefault.address -
- area_dst;
- offset &= ~(page_size-1);
- if (copy_page(uffd, offset))
- (*this_cpu_userfaults)++;
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ (*this_cpu_userfaults) += uffd_handle_page_fault(&msg);
}
return (void *)NULL;
}
@@ -605,6 +626,12 @@ static int stress(unsigned long *userfaults)
if (uffd_test_ops->release_pages(area_src))
return 1;
+
+ finished = 1;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(locking_threads[cpu], NULL))
+ return 1;
+
for (cpu = 0; cpu < nr_cpus; cpu++) {
char c;
if (bounces & BOUNCE_POLL) {
@@ -622,11 +649,6 @@ static int stress(unsigned long *userfaults)
}
}
- finished = 1;
- for (cpu = 0; cpu < nr_cpus; cpu++)
- if (pthread_join(locking_threads[cpu], NULL))
- return 1;
-
return 0;
}
@@ -1272,8 +1294,7 @@ static void sigalrm(int sig)
int main(int argc, char **argv)
{
if (argc < 4)
- fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
- exit(1);
+ usage();
if (signal(SIGALRM, sigalrm) == SIG_ERR)
fprintf(stderr, "failed to arm SIGALRM"), exit(1);
@@ -1286,20 +1307,19 @@ int main(int argc, char **argv)
nr_cpus;
if (!nr_pages_per_cpu) {
fprintf(stderr, "invalid MiB\n");
- fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ usage();
}
bounces = atoi(argv[3]);
if (bounces <= 0) {
fprintf(stderr, "invalid bounces\n");
- fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ usage();
}
nr_pages = nr_pages_per_cpu * nr_cpus;
if (test_type == TEST_HUGETLB) {
if (argc < 5)
- fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"),
- exit(1);
+ usage();
huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
if (huge_fd < 0) {
fprintf(stderr, "Open of %s failed", argv[3]);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 786ade1..2679e47 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -497,7 +497,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
OpenPOWER on IntegriCloud