summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorTimothy Pearson <tpearson@raptorengineering.com>2017-08-23 14:45:25 -0500
committerTimothy Pearson <tpearson@raptorengineering.com>2017-08-23 14:45:25 -0500
commitfcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch)
tree22962a4387943edc841c72a4e636a068c66d58fd /mm
downloadast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip
ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz
Initial import of modified Linux 2.6.28 tree
Original upstream URL: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig224
-rw-r--r--mm/Makefile36
-rw-r--r--mm/allocpercpu.c143
-rw-r--r--mm/backing-dev.c306
-rw-r--r--mm/bootmem.c727
-rw-r--r--mm/bounce.c293
-rw-r--r--mm/dmapool.c504
-rw-r--r--mm/fadvise.c151
-rw-r--r--mm/filemap.c2478
-rw-r--r--mm/filemap_xip.c474
-rw-r--r--mm/fremap.c260
-rw-r--r--mm/highmem.c375
-rw-r--r--mm/hugetlb.c2293
-rw-r--r--mm/internal.h283
-rw-r--r--mm/maccess.c55
-rw-r--r--mm/madvise.c365
-rw-r--r--mm/memcontrol.c1174
-rw-r--r--mm/memory.c3051
-rw-r--r--mm/memory_hotplug.c867
-rw-r--r--mm/mempolicy.c2338
-rw-r--r--mm/mempool.c340
-rw-r--r--mm/migrate.c1151
-rw-r--r--mm/mincore.c229
-rw-r--r--mm/mlock.c629
-rw-r--r--mm/mm_init.c152
-rw-r--r--mm/mmap.c2482
-rw-r--r--mm/mmu_notifier.c277
-rw-r--r--mm/mmzone.c74
-rw-r--r--mm/mprotect.c319
-rw-r--r--mm/mremap.c433
-rw-r--r--mm/msync.c103
-rw-r--r--mm/nommu.c1523
-rw-r--r--mm/oom_kill.c593
-rw-r--r--mm/page-writeback.c1389
-rw-r--r--mm/page_alloc.c4779
-rw-r--r--mm/page_cgroup.c275
-rw-r--r--mm/page_io.c141
-rw-r--r--mm/page_isolation.c142
-rw-r--r--mm/pagewalk.c137
-rw-r--r--mm/pdflush.c241
-rw-r--r--mm/prio_tree.c207
-rw-r--r--mm/quicklist.c103
-rw-r--r--mm/readahead.c483
-rw-r--r--mm/rmap.c1236
-rw-r--r--mm/shmem.c2608
-rw-r--r--mm/shmem_acl.c197
-rw-r--r--mm/slab.c4522
-rw-r--r--mm/slob.c647
-rw-r--r--mm/slub.c4515
-rw-r--r--mm/sparse-vmemmap.c159
-rw-r--r--mm/sparse.c636
-rw-r--r--mm/swap.c606
-rw-r--r--mm/swap_state.c372
-rw-r--r--mm/swapfile.c1870
-rw-r--r--mm/thrash.c79
-rw-r--r--mm/tiny-shmem.c134
-rw-r--r--mm/truncate.c473
-rw-r--r--mm/util.c188
-rw-r--r--mm/vmalloc.c1812
-rw-r--r--mm/vmscan.c2592
-rw-r--r--mm/vmstat.c969
61 files changed, 56214 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 0000000..5b5790f
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,224 @@
+config SELECT_MEMORY_MODEL
+ def_bool y
+ depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
+
+choice
+ prompt "Memory model"
+ depends on SELECT_MEMORY_MODEL
+ default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+ default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+ default FLATMEM_MANUAL
+
+config FLATMEM_MANUAL
+ bool "Flat Memory"
+ depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here: FLATMEM. This is normal
+ and a correct option.
+
+ Some users of more advanced features like NUMA and
+ memory hotplug may have different options here.
+ DISCONTIGMEM is a more mature, better tested system,
+ but is incompatible with memory hotplug and may suffer
+ decreased performance over SPARSEMEM. If unsure between
+ "Sparse Memory" and "Discontiguous Memory", choose
+ "Discontiguous Memory".
+
+ If unsure, choose this option (Flat Memory) over any other.
+
+config DISCONTIGMEM_MANUAL
+ bool "Discontiguous Memory"
+ depends on ARCH_DISCONTIGMEM_ENABLE
+ help
+ This option provides enhanced support for discontiguous
+ memory systems, over FLATMEM. These systems have holes
+ in their physical address spaces, and this option provides
+ more efficient handling of these holes. However, the vast
+ majority of hardware has quite flat address spaces, and
+ can have degraded performance from the extra overhead that
+ this option imposes.
+
+ Many NUMA configurations will have this as the only option.
+
+ If unsure, choose "Flat Memory" over this option.
+
+config SPARSEMEM_MANUAL
+ bool "Sparse Memory"
+ depends on ARCH_SPARSEMEM_ENABLE
+ help
+ This will be the only option for some systems, including
+ memory hotplug systems. This is normal.
+
+ For many other systems, this will be an alternative to
+ "Discontiguous Memory". This option provides some potential
+ performance benefits, along with decreased code complexity,
+ but it is newer, and more experimental.
+
+ If unsure, choose "Discontiguous Memory" or "Flat Memory"
+ over this option.
+
+endchoice
+
+config DISCONTIGMEM
+ def_bool y
+ depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+
+config SPARSEMEM
+ def_bool y
+ depends on SPARSEMEM_MANUAL
+
+config FLATMEM
+ def_bool y
+ depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+ def_bool y
+ depends on !SPARSEMEM
+
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory. This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+ def_bool y
+ depends on DISCONTIGMEM || NUMA
+
+config HAVE_MEMORY_PRESENT
+ def_bool y
+ depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
+
+#
+# SPARSEMEM_EXTREME (which is the default) does some bootmem
+# allocations when memory_present() is called. If this cannot
+# be done on your architecture, select this option. However,
+# statically allocating the mem_section[] array can potentially
+# consume vast quantities of .bss, so be careful.
+#
+# This option will also potentially produce smaller runtime code
+# with gcc 3.4 and later.
+#
+config SPARSEMEM_STATIC
+ bool
+
+#
+# Architecture platforms which require a two level mem_section in SPARSEMEM
+# must select this option. This is usually for architecture platforms with
+# an extremely sparse physical address space.
+#
+config SPARSEMEM_EXTREME
+ def_bool y
+ depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+config SPARSEMEM_VMEMMAP_ENABLE
+ bool
+
+config SPARSEMEM_VMEMMAP
+ bool "Sparse Memory virtual memmap"
+ depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+ default y
+ help
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ depends on SPARSEMEM || X86_64_ACPI_NUMA
+ depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on (IA64 || X86 || PPC64 || SUPERH || S390)
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+ depends on SPARSEMEM && HOTPLUG && HIBERNATION
+
+config MEMORY_HOTPLUG_SPARSE
+ def_bool y
+ depends on SPARSEMEM && MEMORY_HOTPLUG
+
+config MEMORY_HOTREMOVE
+ bool "Allow for memory hot remove"
+ depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
+ depends on MIGRATION
+
+#
+# If we have space for more page flags then we can enable additional
+# optimizations and functionality.
+#
+# Regular Sparsemem takes page flag bits for the sectionid if it does not
+# use a virtual memmap. Disable extended page flags for 32 bit platforms
+# that require the use of a sectionid in the page flags.
+#
+config PAGEFLAGS_EXTENDED
+ def_bool y
+ depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && !PA20
+ default "4"
+
+#
+# support for page migration
+#
+config MIGRATION
+ bool "Page migration"
+ def_bool y
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
+ help
+ Allows the migration of the physical location of pages of processes
+ while the virtual addresses are not changed. This is useful for
+ example on NUMA systems to put pages nearer to the processors accessing
+ the page.
+
+config RESOURCES_64BIT
+ bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
+ default 64BIT
+ help
+ This option allows memory and IO resources to be 64 bit.
+
+config PHYS_ADDR_T_64BIT
+ def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+
+config ZONE_DMA_FLAG
+ int
+ default "0" if !ZONE_DMA
+ default "1"
+
+config BOUNCE
+ def_bool y
+ depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+
+config NR_QUICK
+ int
+ depends on QUICKLIST
+ default "2" if SUPERH || AVR32
+ default "1"
+
+config VIRT_TO_BUS
+ def_bool y
+ depends on !ARCH_NO_VIRT_TO_BUS
+
+config UNEVICTABLE_LRU
+ bool "Add LRU list to track non-evictable pages"
+ default y
+ depends on MMU
+ help
+ Keeps unevictable pages off of the active and inactive pageout
+ lists, so kswapd will not waste CPU time or have its balancing
+ algorithms thrown off by scanning these pages. Selecting this
+ will use one page flag and increase the code size a little,
+ say Y unless you know what you are doing.
+
+config MMU_NOTIFIER
+ bool
diff --git a/mm/Makefile b/mm/Makefile
new file mode 100644
index 0000000..c06b45a
--- /dev/null
+++ b/mm/Makefile
@@ -0,0 +1,36 @@
+#
+# Makefile for the linux memory manager.
+#
+
+mmu-y := nommu.o
+mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
+ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
+ vmalloc.o
+
+obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+ maccess.o page_alloc.o page-writeback.o pdflush.o \
+ readahead.o swap.o truncate.o vmscan.o \
+ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+ page_isolation.o mm_init.o $(mmu-y)
+
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
+obj-$(CONFIG_BOUNCE) += bounce.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HAS_DMA) += dmapool.o
+obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_SPARSEMEM) += sparse.o
+obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+obj-$(CONFIG_SHMEM) += shmem.o
+obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
+obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_SLAB) += slab.o
+obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_SMP) += allocpercpu.o
+obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 0000000..4297bc4
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,143 @@
+/*
+ * linux/mm/allocpercpu.c
+ *
+ * Separated from slab.c August 11, 2006 Christoph Lameter
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#ifndef cache_line_size
+#define cache_line_size() L1_CACHE_BYTES
+#endif
+
+/**
+ * percpu_depopulate - depopulate per-cpu data for given cpu
+ * @__pdata: per-cpu data to depopulate
+ * @cpu: depopulate per-cpu data for this cpu
+ *
+ * Depopulating per-cpu data for a cpu going offline would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ */
+static void percpu_depopulate(void *__pdata, int cpu)
+{
+	/* Undo the pointer disguise to reach the real bookkeeping struct. */
+	struct percpu_data *real = __percpu_disguise(__pdata);
+
+	kfree(real->ptrs[cpu]);
+	/* Leave the slot empty; percpu_populate() BUG()s on a non-NULL slot. */
+	real->ptrs[cpu] = NULL;
+}
+
+/**
+ * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
+ * @__pdata: per-cpu data to depopulate
+ * @mask: depopulate per-cpu data for cpu's selected through mask bits
+ */
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+{
+	int i;
+
+	/* Release the per-cpu object of every cpu selected in *mask. */
+	for_each_cpu_mask_nr(i, *mask) {
+		percpu_depopulate(__pdata, i);
+	}
+}
+
+#define percpu_depopulate_mask(__pdata, mask) \
+ __percpu_depopulate_mask((__pdata), &(mask))
+
+/**
+ * percpu_populate - populate per-cpu data for given cpu
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @cpu: populate per-cpu data for this cpu
+ *
+ * Populating per-cpu data for a cpu coming online would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ * Per-cpu object is populated with zeroed buffer.
+ */
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+{
+	struct percpu_data *real = __percpu_disguise(__pdata);
+	int nid = cpu_to_node(cpu);
+	void *buf;
+
+	/*
+	 * Pad the object to a whole number of cache lines so that each
+	 * CPU gets private memory.
+	 */
+	size = roundup(size, cache_line_size());
+
+	/* The slot for this cpu must still be empty. */
+	BUG_ON(real->ptrs[cpu]);
+
+	/* Allocate zeroed memory on the cpu's node when that node is online. */
+	if (!node_online(nid))
+		buf = kzalloc(size, gfp);
+	else
+		buf = kmalloc_node(size, gfp | __GFP_ZERO, nid);
+
+	real->ptrs[cpu] = buf;
+	return buf;
+}
+
+/**
+ * percpu_populate_mask - populate per-cpu data for more cpu's
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+				  cpumask_t *mask)
+{
+	cpumask_t done;
+	int i;
+
+	/* Tracks the cpus populated so far so a failure can be rolled back. */
+	cpus_clear(done);
+
+	for_each_cpu_mask_nr(i, *mask) {
+		if (likely(percpu_populate(__pdata, size, gfp, i))) {
+			cpu_set(i, done);
+			continue;
+		}
+		/* Allocation failed: release everything populated above. */
+		__percpu_depopulate_mask(__pdata, &done);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+ __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
+
+/**
+ * percpu_alloc_mask - initial setup of per-cpu data
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Populating per-cpu data for all online cpu's would be a typical use case,
+ * which is simplified by the percpu_alloc() wrapper.
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+{
+	/*
+	 * The pointer table itself is rounded up to whole cache lines to
+	 * avoid false sharing.
+	 */
+	size_t table_sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
+	void *table = kzalloc(table_sz, gfp);
+	void *handle;
+
+	if (unlikely(!table))
+		return NULL;
+
+	/* Callers only ever see the disguised pointer. */
+	handle = __percpu_disguise(table);
+
+	if (unlikely(__percpu_populate_mask(handle, size, gfp, mask))) {
+		/* The populate step already undid its own allocations. */
+		kfree(table);
+		return NULL;
+	}
+	return handle;
+}
+EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+
+/**
+ * percpu_free - final cleanup of per-cpu data
+ * @__pdata: object to clean up
+ *
+ * We simply clean up any per-cpu object left. No need for the client to
+ * track and specify through a bit mask which per-cpu objects are to be freed.
+ */
+void percpu_free(void *__pdata)
+{
+	/* A NULL handle is accepted and ignored. */
+	if (likely(__pdata)) {
+		/* Drop every possible cpu's object, then the pointer table. */
+		__percpu_depopulate_mask(__pdata, &cpu_possible_map);
+		kfree(__percpu_disguise(__pdata));
+	}
+}
+EXPORT_SYMBOL_GPL(percpu_free);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 0000000..801c08b
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,306 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+
+
+static struct class *bdi_class;
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bdi_debug_root;
+
+/* Create the top-level "bdi" debugfs directory and remember it in bdi_debug_root. */
+static void bdi_debug_init(void)
+{
+ bdi_debug_root = debugfs_create_dir("bdi", NULL);
+}
+
+/*
+ * seq_file show routine for a bdi's debugfs "stats" file.
+ * @m: seq_file whose ->private holds the backing_dev_info to report on
+ * @v: unused
+ *
+ * Prints the bdi's BDI_WRITEBACK and BDI_RECLAIMABLE counters and the three
+ * thresholds filled in by get_dirty_limits(), each scaled to kB.
+ * Always returns 0.
+ */
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ long background_thresh;
+ long dirty_thresh;
+ long bdi_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+
+/* Scale a value counted in pages to kB; local to this function (#undef below). */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ seq_printf(m,
+ "BdiWriteback: %8lu kB\n"
+ "BdiReclaimable: %8lu kB\n"
+ "BdiDirtyThresh: %8lu kB\n"
+ "DirtyThresh: %8lu kB\n"
+ "BackgroundThresh: %8lu kB\n",
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
+ (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh));
+#undef K
+
+ return 0;
+}
+
+/*
+ * open() handler for the "stats" file: binds bdi_debug_stats_show() to the
+ * file via single_open(), passing inode->i_private as the show routine's data.
+ */
+static int bdi_debug_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, bdi_debug_stats_show, inode->i_private);
+}
+
+/* File operations for the debugfs "stats" file: seq_file helpers plus our open. */
+static const struct file_operations bdi_debug_stats_fops = {
+ .open = bdi_debug_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Create a debugfs directory called @name under bdi_debug_root and a
+ * read-only (0444) "stats" file inside it whose private data is @bdi.
+ * Both dentries are stored in @bdi so bdi_debug_unregister() can remove them.
+ * The debugfs return values are not checked here.
+ */
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+ bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+ bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
+ bdi, &bdi_debug_stats_fops);
+}
+
+/* Remove the "stats" file first, then the directory that contained it. */
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+ debugfs_remove(bdi->debug_stats);
+ debugfs_remove(bdi->debug_dir);
+}
+#else
+/* !CONFIG_DEBUG_FS: the debug hooks below are empty stubs. */
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
+/*
+ * sysfs store routine for read_ahead_kb.
+ *
+ * Accepts a decimal number optionally followed by a single newline and
+ * stores it, converted from kB to pages, in bdi->ra_pages. Returns @count
+ * on success, or -EINVAL for an empty buffer or trailing garbage.
+ */
+static ssize_t read_ahead_kb_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	unsigned long kb;
+	char *tail;
+
+	kb = simple_strtoul(buf, &tail, 10);
+
+	if (!*buf)
+		return -EINVAL;
+	/* Only "" or "\n" may follow the number. */
+	if (tail[0] != '\0' && !(tail[0] == '\n' && tail[1] == '\0'))
+		return -EINVAL;
+
+	bdi->ra_pages = kb >> (PAGE_SHIFT - 10);
+	return count;
+}
+
+/* Scale a value counted in pages to kB. */
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+/*
+ * Generate a sysfs show routine called <name>_show. The routine looks up the
+ * backing_dev_info from the device's drvdata (available to @expr as "bdi")
+ * and prints @expr as a signed decimal followed by a newline.
+ */
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+/*
+ * sysfs store routine for min_ratio.
+ *
+ * Parses a decimal number optionally followed by a single newline and hands
+ * it to bdi_set_min_ratio(). Returns @count on success, the error from
+ * bdi_set_min_ratio() if it fails, or -EINVAL for malformed input.
+ */
+static ssize_t min_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	unsigned int ratio;
+	char *tail;
+	ssize_t rc;
+
+	ratio = simple_strtoul(buf, &tail, 10);
+
+	if (!*buf)
+		return -EINVAL;
+	/* Only "" or "\n" may follow the number. */
+	if (tail[0] != '\0' && !(tail[0] == '\n' && tail[1] == '\0'))
+		return -EINVAL;
+
+	rc = bdi_set_min_ratio(bdi, ratio);
+	if (rc)
+		return rc;
+	return count;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
+/*
+ * sysfs store routine for max_ratio.
+ *
+ * Parses a decimal number optionally followed by a single newline and hands
+ * it to bdi_set_max_ratio(). Returns @count on success, the error from
+ * bdi_set_max_ratio() if it fails, or -EINVAL for malformed input.
+ */
+static ssize_t max_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	unsigned int ratio;
+	char *tail;
+	ssize_t rc;
+
+	ratio = simple_strtoul(buf, &tail, 10);
+
+	if (!*buf)
+		return -EINVAL;
+	/* Only "" or "\n" may follow the number. */
+	if (tail[0] != '\0' && !(tail[0] == '\n' && tail[1] == '\0'))
+		return -EINVAL;
+
+	rc = bdi_set_max_ratio(bdi, ratio);
+	if (rc)
+		return rc;
+	return count;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
+/* Build a mode-0644 attribute wired to <attr>_show and <attr>_store. */
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
+/* Attribute table that bdi_class_init() installs as the bdi class's dev_attrs. */
+static struct device_attribute bdi_dev_attrs[] = {
+ __ATTR_RW(read_ahead_kb),
+ __ATTR_RW(min_ratio),
+ __ATTR_RW(max_ratio),
+ __ATTR_NULL,
+};
+
+/*
+ * Create the "bdi" device class, attach the default attribute table to it
+ * and set up the debugfs root.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * class_create(). The result must be checked before it is dereferenced:
+ * on failure class_create() hands back an ERR_PTR value, and writing
+ * ->dev_attrs through it would corrupt memory or oops during boot.
+ */
+static __init int bdi_class_init(void)
+{
+	bdi_class = class_create(THIS_MODULE, "bdi");
+	if (IS_ERR(bdi_class))
+		return PTR_ERR(bdi_class);
+
+	bdi_class->dev_attrs = bdi_dev_attrs;
+	bdi_debug_init();
+	return 0;
+}
+
+postcore_initcall(bdi_class_init);
+
+/*
+ * Register @bdi as a device of the bdi class.
+ * @bdi: backing_dev_info to register
+ * @parent: parent device passed on to device_create_vargs()
+ * @fmt: printf-style format for the device name, followed by its arguments
+ *
+ * Returns 0 on success, or when @bdi already has a device; otherwise the
+ * error carried by the pointer device_create_vargs() returned.
+ */
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, ...)
+{
+	struct device *dev;
+	va_list ap;
+
+	/* The driver needs to use separate queues per device */
+	if (bdi->dev)
+		return 0;
+
+	va_start(ap, fmt);
+	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, ap);
+	va_end(ap);
+
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	bdi->dev = dev;
+	bdi_debug_register(bdi, dev_name(dev));
+	return 0;
+}
+EXPORT_SYMBOL(bdi_register);
+
+/*
+ * Register @bdi with no parent device, naming it "<major>:<minor>" after @dev.
+ * Returns whatever bdi_register() returns.
+ */
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+{
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
+/*
+ * Undo bdi_register(): remove the debugfs entries, unregister the device
+ * and clear bdi->dev. Does nothing if @bdi has no device.
+ */
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+	if (!bdi->dev)
+		return;
+
+	bdi_debug_unregister(bdi);
+	device_unregister(bdi->dev);
+	bdi->dev = NULL;
+}
+EXPORT_SYMBOL(bdi_unregister);
+
+/*
+ * Initialise the fields of @bdi that this file owns: no device, ratio
+ * limits of 0..100, max_prop_frac, the per-cpu stat counters and the
+ * completions proportion.
+ *
+ * Returns 0 on success. On failure every stat counter that had already
+ * been initialised is destroyed again and the error is returned.
+ */
+int bdi_init(struct backing_dev_info *bdi)
+{
+	int i;
+	int err;
+
+	bdi->dev = NULL;
+
+	bdi->min_ratio = 0;
+	bdi->max_ratio = 100;
+	bdi->max_prop_frac = PROP_FRAC_BASE;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		if (err)
+			goto out_destroy;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+	if (!err)
+		return 0;
+
+out_destroy:
+	/* i counts the counters set up so far; tear them down newest first. */
+	while (i--)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+	return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+/*
+ * Tear down @bdi: unregister its device (if any), then destroy all stat
+ * counters and the completions proportion set up by bdi_init().
+ */
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ int i;
+
+ bdi_unregister(bdi);
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+ prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
+/*
+ * Two global wait queues, indexed by the rw argument of
+ * clear_bdi_congested() and congestion_wait().
+ */
+static wait_queue_head_t congestion_wqh[2] = {
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+ };
+
+
+/*
+ * Clear the congested bit for direction @rw in bdi->state (the write bit
+ * when @rw == WRITE, the read bit otherwise) and wake anyone sleeping on
+ * the matching congestion wait queue.
+ *
+ * NOTE(review): @rw indexes congestion_wqh[2] without a range check;
+ * presumably callers only pass READ or WRITE - confirm.
+ */
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+ enum bdi_state bit;
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+ clear_bit(bit, &bdi->state);
+ /* Barrier between clearing the bit and looking for waiters. */
+ smp_mb__after_clear_bit();
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+/*
+ * Mark @bdi congested for direction @rw: the write bit is set when
+ * @rw == WRITE, the read bit for any other value.
+ */
+void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+
+	if (rw == WRITE)
+		bit = BDI_write_congested;
+	else
+		bit = BDI_read_congested;
+
+	set_bit(bit, &bdi->state);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @rw: READ or WRITE
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion. If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int rw, long timeout)
+{
+ long ret;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ /* Queue ourselves uninterruptibly on the wait queue selected by rw. */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ /* Sleep until woken (see clear_bdi_congested()) or the timeout expires. */
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ /* Hand back io_schedule_timeout()'s result unchanged. */
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
diff --git a/mm/bootmem.c b/mm/bootmem.c
new file mode 100644
index 0000000..ac5a891
--- /dev/null
+++ b/mm/bootmem.c
@@ -0,0 +1,727 @@
+/*
+ * bootmem - A boot-time physical memory allocator and configurator
+ *
+ * Copyright (C) 1999 Ingo Molnar
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
+ *
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
+ */
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+
+#include <asm/bug.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#include "internal.h"
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+#endif
+
+/*
+ * Per-node bootmem state. The debug output in this file derives a node id
+ * from a bdata pointer as (bdata - bootmem_node_data).
+ */
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
+
+/* All registered bdata entries, kept sorted by link_bootmem(). */
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
+
+/* Non-zero once "bootmem_debug" was seen on the command line; gates bdebug(). */
+static int bootmem_debug;
+
+/* early_param handler for "bootmem_debug": switch debug output on; @buf is ignored. */
+static int __init bootmem_debug_setup(char *buf)
+{
+ bootmem_debug = 1;
+ return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
+
+/*
+ * printk() at KERN_INFO, prefixed with "bootmem::<function name> ", but only
+ * when bootmem_debug is set.
+ */
+#define bdebug(fmt, args...) ({ \
+ if (unlikely(bootmem_debug)) \
+ printk(KERN_INFO \
+ "bootmem::%s " fmt, \
+ __func__, ## args); \
+})
+
+/*
+ * Size in bytes of a bitmap with one bit per page for @pages pages,
+ * rounded up to a multiple of sizeof(long).
+ */
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+	/* One bit per page, rounded up to whole bytes. */
+	unsigned long nbytes = (pages + 7) / 8;
+
+	return ALIGN(nbytes, sizeof(long));
+}
+
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
+ */
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+{
+	/* Bitmap size in bytes, rounded up to whole pages. */
+	unsigned long map_bytes = bootmap_bytes(pages);
+	unsigned long padded = PAGE_ALIGN(map_bytes);
+
+	return padded >> PAGE_SHIFT;
+}
+
+/*
+ * link bdata in order
+ */
+static void __init link_bootmem(bootmem_data_t *bdata)
+{
+ struct list_head *iter;
+
+ /* Stop at the first entry that starts at a higher pfn than bdata. */
+ list_for_each(iter, &bdata_list) {
+ bootmem_data_t *ent;
+
+ ent = list_entry(iter, bootmem_data_t, list);
+ if (bdata->node_min_pfn < ent->node_min_pfn)
+ break;
+ }
+ /*
+ * Insert in front of that entry. If the loop ran to completion, iter is
+ * the list head and bdata ends up at the tail.
+ */
+ list_add_tail(&bdata->list, iter);
+}
+
+/*
+ * Called once to set up the allocator itself.
+ */
+/*
+ * @bdata: bootmem state to initialise
+ * @mapstart: pfn at which the bitmap is placed
+ * @start: first pfn covered by the bitmap
+ * @end: first pfn past the covered range
+ *
+ * Returns the size of the bitmap in bytes.
+ */
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
+ unsigned long mapstart, unsigned long start, unsigned long end)
+{
+ unsigned long mapsize;
+
+ mminit_validate_memmodel_limits(&start, &end);
+ bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
+ bdata->node_min_pfn = start;
+ bdata->node_low_pfn = end;
+ link_bootmem(bdata);
+
+ /*
+ * Initially all pages are reserved - setup_arch() has to
+ * register free RAM areas explicitly.
+ */
+ mapsize = bootmap_bytes(end - start);
+ /* A set bit means "reserved", so fill the whole bitmap with ones. */
+ memset(bdata->node_bootmem_map, 0xff, mapsize);
+
+ bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
+ bdata - bootmem_node_data, start, mapstart, end, mapsize);
+
+ return mapsize;
+}
+
+/**
+ * init_bootmem_node - register a node as boot memory
+ * @pgdat: node to register
+ * @freepfn: pfn where the bitmap for this node is to be placed
+ * @startpfn: first pfn on the node
+ * @endpfn: first pfn after the node
+ *
+ * Returns the number of bytes needed to hold the bitmap for this node.
+ */
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
+ unsigned long startpfn, unsigned long endpfn)
+{
+ /* Thin wrapper: operate on the node's own bdata. */
+ return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
+}
+
+/**
+ * init_bootmem - register boot memory
+ * @start: pfn where the bitmap is to be placed
+ * @pages: number of available physical pages
+ *
+ * Returns the number of bytes needed to hold the bitmap.
+ */
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+{
+ /* Record the global limits before setting up node 0. */
+ max_low_pfn = pages;
+ min_low_pfn = start;
+ /* Node 0's bitmap lives at pfn @start and covers pfns [0, pages). */
+ return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+}
+
+/*
+ * Pass every page of @bdata's range whose bitmap bit is clear to
+ * __free_pages_bootmem(), then release the pages that hold the bitmap.
+ *
+ * Returns the number of pages released, including the bitmap's own pages,
+ * or 0 if the node has no bitmap.
+ */
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+{
+ int aligned;
+ struct page *page;
+ unsigned long start, end, pages, count = 0;
+
+ if (!bdata->node_bootmem_map)
+ return 0;
+
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+
+ /*
+ * If the start is aligned to the machine's word size, we might
+ * be able to free pages in bulks of that order.
+ */
+ aligned = !(start & (BITS_PER_LONG - 1));
+
+ bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
+ bdata - bootmem_node_data, start, end, aligned);
+
+ /* Walk the range one bitmap word (BITS_PER_LONG pages) at a time. */
+ while (start < end) {
+ unsigned long *map, idx, vec;
+
+ map = bdata->node_bootmem_map;
+ idx = start - bdata->node_min_pfn;
+ /* Inverted word: a set bit in vec marks a page that is not reserved. */
+ vec = ~map[idx / BITS_PER_LONG];
+
+ if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+ /* Whole word free: release it as one block of BITS_PER_LONG pages. */
+ int order = ilog2(BITS_PER_LONG);
+
+ __free_pages_bootmem(pfn_to_page(start), order);
+ count += BITS_PER_LONG;
+ } else {
+ unsigned long off = 0;
+
+ /* Otherwise release the free pages of this word one by one. */
+ while (vec && off < BITS_PER_LONG) {
+ if (vec & 1) {
+ page = pfn_to_page(start + off);
+ __free_pages_bootmem(page, 0);
+ count++;
+ }
+ vec >>= 1;
+ off++;
+ }
+ }
+ start += BITS_PER_LONG;
+ }
+
+ /* Finally give back the pages occupied by the bitmap itself. */
+ page = virt_to_page(bdata->node_bootmem_map);
+ pages = bdata->node_low_pfn - bdata->node_min_pfn;
+ pages = bootmem_bootmap_pages(pages);
+ count += pages;
+ while (pages--)
+ __free_pages_bootmem(page++, 0);
+
+ bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
+
+ return count;
+}
+
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+{
+ /* Called before the node's pages are handed over. */
+ register_page_bootmem_info_node(pgdat);
+ return free_all_bootmem_core(pgdat->bdata);
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+ /* Operates on node 0 only. */
+ return free_all_bootmem_core(NODE_DATA(0)->bdata);
+}
+
+/*
+ * Clear the bitmap bits [sidx, eidx) of @bdata, i.e. mark those pages free.
+ * Lowers bdata->hint_idx to @sidx if it currently lies above it.
+ * BUG()s if any bit in the range was already clear.
+ */
+static void __init __free(bootmem_data_t *bdata,
+			unsigned long sidx, unsigned long eidx)
+{
+	unsigned long bit;
+
+	bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+		sidx + bdata->node_min_pfn,
+		eidx + bdata->node_min_pfn);
+
+	if (sidx < bdata->hint_idx)
+		bdata->hint_idx = sidx;
+
+	for (bit = sidx; bit < eidx; bit++) {
+		if (test_and_clear_bit(bit, bdata->node_bootmem_map))
+			continue;
+		/* The page was not marked reserved: double free. */
+		BUG();
+	}
+}
+
+/*
+ * Set the bitmap bits [sidx, eidx) of @bdata, i.e. mark those pages reserved.
+ *
+ * If a bit is already set and BOOTMEM_EXCLUSIVE is given in @flags, the bits
+ * set so far are cleared again and -EBUSY is returned. Without the flag a
+ * double reservation is only logged. Returns 0 otherwise.
+ */
+static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
+			unsigned long eidx, int flags)
+{
+	int exclusive = flags & BOOTMEM_EXCLUSIVE;
+	unsigned long bit;
+
+	bdebug("nid=%td start=%lx end=%lx flags=%x\n",
+		bdata - bootmem_node_data,
+		sidx + bdata->node_min_pfn,
+		eidx + bdata->node_min_pfn,
+		flags);
+
+	for (bit = sidx; bit < eidx; bit++) {
+		/* Bit was clear before: nothing more to do for this page. */
+		if (!test_and_set_bit(bit, bdata->node_bootmem_map))
+			continue;
+
+		if (exclusive) {
+			/* Roll back the part of the range reserved so far. */
+			__free(bdata, sidx, bit);
+			return -EBUSY;
+		}
+		bdebug("silent double reserve of PFN %lx\n",
+			bit + bdata->node_min_pfn);
+	}
+	return 0;
+}
+
+/*
+ * Reserve (@reserve != 0) or free (@reserve == 0) the pfn range
+ * [start, end) on a single node. The range must lie within the node,
+ * otherwise this BUG()s.
+ *
+ * Returns the result of __reserve() when reserving, 0 when freeing.
+ */
+static int __init mark_bootmem_node(bootmem_data_t *bdata,
+				unsigned long start, unsigned long end,
+				int reserve, int flags)
+{
+	unsigned long first, last;
+
+	bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
+		bdata - bootmem_node_data, start, end, reserve, flags);
+
+	BUG_ON(start < bdata->node_min_pfn);
+	BUG_ON(end > bdata->node_low_pfn);
+
+	/* Translate absolute pfns into indices of this node's bitmap. */
+	first = start - bdata->node_min_pfn;
+	last = end - bdata->node_min_pfn;
+
+	if (!reserve) {
+		__free(bdata, first, last);
+		return 0;
+	}
+	return __reserve(bdata, first, last, flags);
+}
+
+/*
+ * Reserve or free the pfn range [start, end), which may span several nodes.
+ *
+ * Walks bdata_list and marks the range piece by piece on each node it
+ * crosses. If a reservation fails, the part reserved so far is freed again
+ * and the error is returned. BUG()s if the range is not fully covered by
+ * consecutive nodes.
+ */
+static int __init mark_bootmem(unsigned long start, unsigned long end,
+ int reserve, int flags)
+{
+ unsigned long pos;
+ bootmem_data_t *bdata;
+
+ pos = start;
+ list_for_each_entry(bdata, &bdata_list, list) {
+ int err;
+ unsigned long max;
+
+ if (pos < bdata->node_min_pfn ||
+ pos >= bdata->node_low_pfn) {
+ /* Skipping a node is only legal before marking has begun. */
+ BUG_ON(pos != start);
+ continue;
+ }
+
+ /* Clip the piece to this node's upper bound. */
+ max = min(bdata->node_low_pfn, end);
+
+ err = mark_bootmem_node(bdata, pos, max, reserve, flags);
+ if (reserve && err) {
+ /* Undo the reservations made on earlier nodes: [start, pos). */
+ mark_bootmem(start, pos, 0, 0);
+ return err;
+ }
+
+ if (max == end)
+ return 0;
+ /* Continue on the next node, right where this one ends. */
+ pos = bdata->node_low_pfn;
+ }
+ /* Ran out of nodes before reaching end. */
+ BUG();
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+			      unsigned long size)
+{
+	/* Round inward so that partial pages at either edge stay reserved. */
+	unsigned long first_pfn = PFN_UP(physaddr);
+	unsigned long end_pfn = PFN_DOWN(physaddr + size);
+
+	mark_bootmem_node(pgdat->bdata, first_pfn, end_pfn, 0, 0);
+}
+
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
+{
+	/* Round inward so that partial pages at either edge stay reserved. */
+	unsigned long first_pfn = PFN_UP(addr);
+	unsigned long end_pfn = PFN_DOWN(addr + size);
+
+	mark_bootmem(first_pfn, end_pfn, 0, 0);
+}
+
+/**
+ * reserve_bootmem_node - mark a page range as reserved
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must reside completely on the specified node.
+ */
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+				 unsigned long size, int flags)
+{
+	/* Round outwards so that partially covered pages are reserved too. */
+	unsigned long first_pfn = PFN_DOWN(physaddr);
+	unsigned long end_pfn = PFN_UP(physaddr + size);
+
+	return mark_bootmem_node(pgdat->bdata, first_pfn, end_pfn, 1, flags);
+}
+
+#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+/**
+ * reserve_bootmem - mark a page range as reserved
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+ int flags)
+{
+ unsigned long start, end;
+
+ /* Round outwards so that partially covered pages are included. */
+ start = PFN_DOWN(addr);
+ end = PFN_UP(addr + size);
+
+ return mark_bootmem(start, end, 1, flags);
+}
+#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
+
+static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+			unsigned long step)
+{
+	/*
+	 * The alignment has to hold for the absolute page frame number,
+	 * not for the node-relative index: align in absolute terms and
+	 * convert the result back to an index.
+	 */
+	unsigned long node_start = bdata->node_min_pfn;
+	unsigned long aligned_pfn = ALIGN(node_start + idx, step);
+
+	return aligned_pfn - node_start;
+}
+
+static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+			unsigned long align)
+{
+	/* Byte-offset counterpart of align_idx(): align the absolute address. */
+	unsigned long node_start = PFN_PHYS(bdata->node_min_pfn);
+	unsigned long aligned_addr = ALIGN(node_start + off, align);
+
+	return aligned_addr - node_start;
+}
+
+/*
+ * Try to allocate @size bytes with @align alignment from one node's bootmem
+ * bitmap.  @goal is a preferred start address and @limit an upper address
+ * bound (0 means none); both are converted to page frame numbers below.
+ *
+ * Returns the zeroed region, or NULL if the node has no bitmap or no
+ * suitable free range was found.
+ */
+static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ unsigned long fallback = 0;
+ unsigned long min, max, start, sidx, midx, step;
+
+ BUG_ON(!size);
+ BUG_ON(align & (align - 1));
+ BUG_ON(limit && goal + size > limit);
+
+ if (!bdata->node_bootmem_map)
+ return NULL;
+
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
+
+ min = bdata->node_min_pfn;
+ max = bdata->node_low_pfn;
+
+ /* From here on goal and limit are page frame numbers. */
+ goal >>= PAGE_SHIFT;
+ limit >>= PAGE_SHIFT;
+
+ if (limit && max > limit)
+ max = limit;
+ if (max <= min)
+ return NULL;
+
+ /* Search granularity in pages: the alignment, but at least one page. */
+ step = max(align >> PAGE_SHIFT, 1UL);
+
+ /* Honour the goal only if it lies strictly inside this node's range. */
+ if (goal && min < goal && goal < max)
+ start = ALIGN(goal, step);
+ else
+ start = ALIGN(min, step);
+
+ /* sidx/midx: node-relative start and end indexes into the bitmap. */
+ sidx = start - bdata->node_min_pfn;
+ midx = max - bdata->node_min_pfn;
+
+ if (bdata->hint_idx > sidx) {
+ /*
+ * Handle the valid case of sidx being zero and still
+ * catch the fallback below.
+ */
+ fallback = sidx + 1;
+ sidx = align_idx(bdata, bdata->hint_idx, step);
+ }
+
+ while (1) {
+ int merge;
+ void *region;
+ unsigned long eidx, i, start_off, end_off;
+find_block:
+ sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
+ sidx = align_idx(bdata, sidx, step);
+ eidx = sidx + PFN_UP(size);
+
+ if (sidx >= midx || eidx > midx)
+ break;
+
+ /*
+ * Every page of the candidate range must be free.  On the
+ * first set bit, restart the search at the next aligned index
+ * beyond it.
+ */
+ for (i = sidx; i < eidx; i++)
+ if (test_bit(i, bdata->node_bootmem_map)) {
+ sidx = align_idx(bdata, i, step);
+ if (sidx == i)
+ sidx += step;
+ goto find_block;
+ }
+
+ /*
+ * If the previous allocation ended in the middle of the page
+ * directly in front of sidx, start in the unused tail of that
+ * page instead of on the page boundary.
+ */
+ if (bdata->last_end_off & (PAGE_SIZE - 1) &&
+ PFN_DOWN(bdata->last_end_off) + 1 == sidx)
+ start_off = align_off(bdata, bdata->last_end_off, align);
+ else
+ start_off = PFN_PHYS(sidx);
+
+ /*
+ * merge is 1 when the region begins in a page before sidx; that
+ * first page is then left out of the reservation below.
+ */
+ merge = PFN_DOWN(start_off) < sidx;
+ end_off = start_off + size;
+
+ bdata->last_end_off = end_off;
+ bdata->hint_idx = PFN_UP(end_off);
+
+ /*
+ * Reserve the area now:
+ */
+ if (__reserve(bdata, PFN_DOWN(start_off) + merge,
+ PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
+ BUG();
+
+ region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
+ start_off);
+ memset(region, 0, size);
+ return region;
+ }
+
+ /*
+ * The search started at the hint instead of the requested start.
+ * Retry once from the original start index, which was stored
+ * off by one so that index 0 differs from "no fallback".
+ */
+ if (fallback) {
+ sidx = align_idx(bdata, fallback - 1, step);
+ fallback = 0;
+ goto find_block;
+ }
+
+ return NULL;
+}
+
+/*
+ * Try alloc_bootmem_core() on each node of bdata_list in turn and return
+ * the first region obtained.  If nothing was found and a goal was given,
+ * the whole walk is repeated once without the goal.  Returns NULL on
+ * failure.
+ */
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ bootmem_data_t *bdata;
+
+restart:
+ list_for_each_entry(bdata, &bdata_list, list) {
+ void *region;
+
+ /* Node ends at or below the goal: skip it. */
+ if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
+ continue;
+ /*
+ * Node starts at or above the limit: stop walking.
+ * NOTE(review): breaking here presumably relies on
+ * bdata_list being sorted by start pfn -- confirm.
+ */
+ if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
+ break;
+
+ region = alloc_bootmem_core(bdata, size, align, goal, limit);
+ if (region)
+ return region;
+ }
+
+ /* Second chance: drop the goal and scan all nodes again. */
+ if (goal) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ /* A limit of 0 places no upper bound on the address. */
+ return ___alloc_bootmem_nopanic(size, align, goal, 0);
+}
+
+/*
+ * Panicking variant of ___alloc_bootmem_nopanic(): returns the region on
+ * success, otherwise logs the failed size and calls panic().
+ */
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+
+ if (mem)
+ return mem;
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ /* Follows the panic() call; only present to give the function a return value. */
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ /* A limit of 0 places no upper bound on the address. */
+ return ___alloc_bootmem(size, align, goal, 0);
+}
+
+/*
+ * Allocate from the node described by @bdata if possible; otherwise fall
+ * back to ___alloc_bootmem(), which tries every node and panics when the
+ * request cannot be satisfied at all.
+ */
+static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *ptr;
+
+ /* Preferred node first. */
+ ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ return ___alloc_bootmem(size, align, goal, limit);
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ /* A limit of 0 places no upper bound on the address. */
+ return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+}
+
+#ifdef CONFIG_SPARSEMEM
+/**
+ * alloc_bootmem_section - allocate boot memory from a specific section
+ * @size: size of the request in bytes
+ * @section_nr: sparse map section to allocate from
+ *
+ * Return NULL on failure.
+ */
+void * __init alloc_bootmem_section(unsigned long size,
+ unsigned long section_nr)
+{
+ bootmem_data_t *bdata;
+ unsigned long pfn, goal, limit;
+
+ /*
+ * goal is the first byte of the section and limit the first byte of
+ * the following section.
+ */
+ pfn = section_nr_to_pfn(section_nr);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+ bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
+
+ /* Only the section's own node is tried; there is no fallback. */
+ return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+}
+#endif
+
+/*
+ * Non-panicking node allocation: try the node of @pgdat first, then any
+ * node via __alloc_bootmem_nopanic().  Returns NULL if both attempts fail.
+ */
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ /* A limit of 0 places no upper bound on the address. */
+ ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+ if (ptr)
+ return ptr;
+
+ return __alloc_bootmem_nopanic(size, align, goal);
+}
+
+/*
+ * Upper address limit passed by the __alloc_bootmem_low*() helpers below;
+ * this default applies only when ARCH_LOW_ADDRESS_LIMIT is not defined yet.
+ */
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
+#endif
+
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ /* Same as __alloc_bootmem() but bounded by ARCH_LOW_ADDRESS_LIMIT. */
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ /* Same as __alloc_bootmem_node() but bounded by ARCH_LOW_ADDRESS_LIMIT. */
+ return ___alloc_bootmem_node(pgdat->bdata, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+}
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 0000000..06722c4
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,293 @@
+/* bounce buffer handling for block devices
+ *
+ * - Split from highmem.c
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/blktrace_api.h>
+#include <asm/tlbflush.h>
+
+#define POOL_SIZE 64
+#define ISA_POOL_SIZE 16
+
+static mempool_t *page_pool, *isa_page_pool;
+
+#ifdef CONFIG_HIGHMEM
+static __init int init_emergency_pool(void)
+{
+	struct sysinfo meminfo;
+
+	si_meminfo(&meminfo);
+	si_swapinfo(&meminfo);
+
+	/* The highmem bounce pool is only created when high memory exists. */
+	if (meminfo.totalhigh) {
+		page_pool = mempool_create_page_pool(POOL_SIZE, 0);
+		BUG_ON(!page_pool);
+		printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+	}
+
+	return 0;
+}
+
+__initcall(init_emergency_pool);
+
+/*
+ * highmem version, map in to vec
+ *
+ * Copies to->bv_len bytes from @vfrom into the page of @to at
+ * to->bv_offset, using a temporary atomic mapping of that page.
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+ unsigned long flags;
+ unsigned char *vto;
+
+ /*
+ * Interrupts stay disabled while the KM_BOUNCE_READ mapping exists.
+ * NOTE(review): presumably so that no interrupt handler can reuse
+ * the same kmap slot in the meantime -- confirm.
+ */
+ local_irq_save(flags);
+ vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
+ memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+ kunmap_atomic(vto, KM_BOUNCE_READ);
+ local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+/* !CONFIG_HIGHMEM version: plain memcpy through page_address(), no kmap. */
+#define bounce_copy_vec(to, vfrom) \
+ memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * allocate pages in the DMA region for the ISA pool
+ */
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
+{
+	/* Same as the generic page allocator, with GFP_DMA forced on. */
+	gfp_t isa_mask = gfp_mask | GFP_DMA;
+
+	return mempool_alloc_pages(isa_mask, data);
+}
+
+/*
+ * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
+ * as the max address, so check if the pool has already been created.
+ */
+int init_emergency_isa_pool(void)
+{
+	/* Create the pool on the first call only; later calls are no-ops. */
+	if (!isa_page_pool) {
+		isa_page_pool = mempool_create(ISA_POOL_SIZE,
+					       mempool_alloc_pages_isa,
+					       mempool_free_pages, (void *) 0);
+		BUG_ON(!isa_page_pool);
+
+		printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+	}
+
+	return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ *
+ * Copies the data of every bounced segment of @from back into the
+ * matching segment of @to.  Segments that share the same page in both
+ * bios were never bounced and are skipped.
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+	unsigned char *vfrom;
+	struct bio_vec *tovec, *fromvec;
+	int i;
+
+	__bio_for_each_segment(tovec, to, i, 0) {
+		fromvec = from->bi_io_vec + i;
+
+		/*
+		 * not bounced
+		 */
+		if (tovec->bv_page == fromvec->bv_page)
+			continue;
+
+		/*
+		 * fromvec->bv_offset and fromvec->bv_len might have been
+		 * modified by the block layer, so use the original copy,
+		 * bounce_copy_vec already uses tovec->bv_len
+		 */
+		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+		/*
+		 * Flush only after the kernel has written the page, so that
+		 * the flush covers the data that was just copied in.
+		 */
+		bounce_copy_vec(tovec, vfrom);
+		flush_dcache_page(tovec->bv_page);
+	}
+}
+
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *orig_bio = bio->bi_private;
+	struct bio_vec *bounce_vec;
+	int idx;
+
+	/* Propagate "operation not supported" to the original bio. */
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		set_bit(BIO_EOPNOTSUPP, &orig_bio->bi_flags);
+
+	/*
+	 * Give back every page that was substituted for bouncing; pages
+	 * shared with the original bio are left alone.
+	 */
+	__bio_for_each_segment(bounce_vec, bio, idx, 0) {
+		struct bio_vec *orig_vec = orig_bio->bi_io_vec + idx;
+
+		if (bounce_vec->bv_page != orig_vec->bv_page) {
+			dec_zone_page_state(bounce_vec->bv_page, NR_BOUNCE);
+			mempool_free(bounce_vec->bv_page, pool);
+		}
+	}
+
+	bio_endio(orig_bio, err);
+	bio_put(bio);
+}
+
+/* Write completion for bios bounced through page_pool. */
+static void bounce_end_io_write(struct bio *bio, int err)
+{
+ bounce_end_io(bio, page_pool, err);
+}
+
+/* Write completion for bios bounced through isa_page_pool. */
+static void bounce_end_io_write_isa(struct bio *bio, int err)
+{
+
+ bounce_end_io(bio, isa_page_pool, err);
+}
+
+/*
+ * Common read completion: copy the data from the bounce pages back into
+ * the original bio, then release the bounce pages and complete the
+ * original bio via bounce_end_io().
+ */
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+
+ /* The copy-back is skipped unless BIO_UPTODATE is set on the bio. */
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ copy_to_high_bio_irq(bio_orig, bio);
+
+ bounce_end_io(bio, pool, err);
+}
+
+/* Read completion for bios bounced through page_pool. */
+static void bounce_end_io_read(struct bio *bio, int err)
+{
+ __bounce_end_io_read(bio, page_pool, err);
+}
+
+/* Read completion for bios bounced through isa_page_pool. */
+static void bounce_end_io_read_isa(struct bio *bio, int err)
+{
+ __bounce_end_io_read(bio, isa_page_pool, err);
+}
+
+/*
+ * Build a bounce bio for *bio_orig: every segment whose page lies above
+ * q->bounce_pfn gets a replacement page from @pool (with the data copied
+ * in for writes); all other segments are shared with the original.  If
+ * nothing had to be bounced, *bio_orig is left untouched; otherwise it is
+ * replaced by the new bio, whose bi_private points back to the original.
+ */
+static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
+ mempool_t *pool)
+{
+ struct page *page;
+ struct bio *bio = NULL;
+ int i, rw = bio_data_dir(*bio_orig);
+ struct bio_vec *to, *from;
+
+ bio_for_each_segment(from, *bio_orig, i) {
+ page = from->bv_page;
+
+ /*
+ * is destination page below bounce pfn?
+ */
+ if (page_to_pfn(page) <= q->bounce_pfn)
+ continue;
+
+ /*
+ * irk, bounce it
+ */
+ /* The bounce bio is allocated lazily, on the first bounced page. */
+ if (!bio)
+ bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
+
+ to = bio->bi_io_vec + i;
+
+ to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ inc_zone_page_state(to->bv_page, NR_BOUNCE);
+
+ /* For writes the payload is copied into the bounce page up front. */
+ if (rw == WRITE) {
+ char *vto, *vfrom;
+
+ flush_dcache_page(from->bv_page);
+ vto = page_address(to->bv_page) + to->bv_offset;
+ vfrom = kmap(from->bv_page) + from->bv_offset;
+ memcpy(vto, vfrom, to->bv_len);
+ kunmap(from->bv_page);
+ }
+ }
+
+ /*
+ * no pages bounced
+ */
+ if (!bio)
+ return;
+
+ blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
+ /*
+ * at least one page was bounced, fill in possible non-highmem
+ * pages
+ */
+ /*
+ * NOTE(review): the !to->bv_page test presumably relies on bio_alloc()
+ * returning zeroed io vectors -- confirm.
+ */
+ __bio_for_each_segment(from, *bio_orig, i, 0) {
+ to = bio_iovec_idx(bio, i);
+ if (!to->bv_page) {
+ to->bv_page = from->bv_page;
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ }
+ }
+
+ bio->bi_bdev = (*bio_orig)->bi_bdev;
+ bio->bi_flags |= (1 << BIO_BOUNCED);
+ bio->bi_sector = (*bio_orig)->bi_sector;
+ bio->bi_rw = (*bio_orig)->bi_rw;
+
+ bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+ bio->bi_idx = (*bio_orig)->bi_idx;
+ bio->bi_size = (*bio_orig)->bi_size;
+
+ /* Pick the completion handler matching the pool and the direction. */
+ if (pool == page_pool) {
+ bio->bi_end_io = bounce_end_io_write;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read;
+ } else {
+ bio->bi_end_io = bounce_end_io_write_isa;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read_isa;
+ }
+
+ bio->bi_private = *bio_orig;
+ *bio_orig = bio;
+}
+
+void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+{
+	mempool_t *pool;
+
+	/* A bio that carries no data has nothing to bounce. */
+	if (!bio_has_data(*bio_orig))
+		return;
+
+	if (q->bounce_gfp & GFP_DMA) {
+		/* ISA case: the ISA pool must have been set up already. */
+		BUG_ON(!isa_page_pool);
+		pool = isa_page_pool;
+	} else {
+		/*
+		 * Non-ISA case: when the bounce pfn is at or above the
+		 * highest pfn in the system no page can need bouncing, so
+		 * skip the walk over the bio segments.
+		 */
+		if (q->bounce_pfn >= blk_max_pfn)
+			return;
+		pool = page_pool;
+	}
+
+	/* Slow path: inspect every segment. */
+	__blk_queue_bounce(q, bio_orig, pool);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 0000000..b1f0885
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,504 @@
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 as published by the
+ * Free Software Foundation.
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device. It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple. The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages. Each page in the page_list is split into blocks of at
+ * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page. Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+#define DMAPOOL_DEBUG 1
+#endif
+
+struct dma_pool { /* the pool */
+ struct list_head page_list; /* struct dma_page entries of this pool */
+ spinlock_t lock; /* taken around page_list walks and free-list updates */
+ size_t size; /* block size, after rounding up for alignment */
+ struct device *dev; /* device doing the DMA; NULL is tolerated */
+ size_t allocation; /* bytes requested per dma_alloc_coherent() call */
+ size_t boundary; /* blocks must not cross multiples of this offset */
+ char name[32]; /* diagnostic name, copied with strlcpy() */
+ wait_queue_head_t waitq; /* allocators waiting for a free block */
+ struct list_head pools; /* link in dev->dma_pools */
+};
+
+struct dma_page { /* cacheable header for 'allocation' bytes */
+ struct list_head page_list; /* link in dma_pool.page_list */
+ void *vaddr; /* kernel virtual address of the coherent memory */
+ dma_addr_t dma; /* DMA address of the same memory */
+ unsigned int in_use; /* number of blocks currently handed out */
+ unsigned int offset; /* first free block; >= allocation when full */
+};
+
+#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
+
+static DEFINE_MUTEX(pools_lock);
+
+static ssize_t
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	char *cursor = buf;
+	unsigned remaining = PAGE_SIZE;
+	unsigned written;
+	struct dma_pool *pool;
+
+	/* Header line identifying the format. */
+	written = scnprintf(cursor, remaining, "poolinfo - 0.1\n");
+	cursor += written;
+	remaining -= written;
+
+	mutex_lock(&pools_lock);
+	list_for_each_entry(pool, &dev->dma_pools, pools) {
+		struct dma_page *page;
+		unsigned pages = 0;
+		unsigned blocks = 0;
+
+		/* Count this pool's pages and the blocks in use on them. */
+		list_for_each_entry(page, &pool->page_list, page_list) {
+			blocks += page->in_use;
+			pages++;
+		}
+
+		/* per-pool info, no real statistics yet */
+		written = scnprintf(cursor, remaining,
+				    "%-16s %4u %4Zu %4Zu %2u\n",
+				    pool->name, blocks,
+				    pages * (pool->allocation / pool->size),
+				    pool->size, pages);
+		cursor += written;
+		remaining -= written;
+	}
+	mutex_unlock(&pools_lock);
+
+	/* Bytes produced so far == what was consumed from the page. */
+	return PAGE_SIZE - remaining;
+}
+
+static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @boundary: returned blocks won't cross this power of two boundary
+ * Context: !in_interrupt()
+ *
+ * Returns a dma allocation pool with the requested characteristics, or
+ * null if one can't be created. Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory. Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives. The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary. This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ */
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+				 size_t size, size_t align, size_t boundary)
+{
+	struct dma_pool *retval;
+	size_t allocation;
+
+	/* No alignment requested means byte alignment; else power of two. */
+	if (align == 0) {
+		align = 1;
+	} else if (align & (align - 1)) {
+		return NULL;
+	}
+
+	/* A free block stores an int-sized link, so it needs 4 bytes. */
+	if (size == 0) {
+		return NULL;
+	} else if (size < 4) {
+		size = 4;
+	}
+
+	if ((size % align) != 0)
+		size = ALIGN(size, align);
+
+	allocation = max_t(size_t, size, PAGE_SIZE);
+
+	if (!boundary) {
+		boundary = allocation;
+	} else if ((boundary < size) || (boundary & (boundary - 1))) {
+		return NULL;
+	} else if (boundary > allocation) {
+		/*
+		 * Blocks are laid out relative to the start of each
+		 * allocation.  With a boundary beyond its end the layout
+		 * code would never skip, and the last block could extend
+		 * past the allocated memory.  Clamp the boundary so that
+		 * the end of the allocation acts as the boundary.
+		 */
+		boundary = allocation;
+	}
+
+	retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+	if (!retval)
+		return retval;
+
+	strlcpy(retval->name, name, sizeof(retval->name));
+
+	retval->dev = dev;
+
+	INIT_LIST_HEAD(&retval->page_list);
+	spin_lock_init(&retval->lock);
+	retval->size = size;
+	retval->boundary = boundary;
+	retval->allocation = allocation;
+	init_waitqueue_head(&retval->waitq);
+
+	if (dev) {
+		int ret;
+
+		mutex_lock(&pools_lock);
+		/* The first pool of a device also creates its "pools" file. */
+		if (list_empty(&dev->dma_pools))
+			ret = device_create_file(dev, &dev_attr_pools);
+		else
+			ret = 0;
+		/* note: not currently insisting "name" be unique */
+		if (!ret)
+			list_add(&retval->pools, &dev->dma_pools);
+		else {
+			kfree(retval);
+			retval = NULL;
+		}
+		mutex_unlock(&pools_lock);
+	} else
+		INIT_LIST_HEAD(&retval->pools);
+
+	return retval;
+}
+EXPORT_SYMBOL(dma_pool_create);
+
+/*
+ * Thread the free list through a freshly allocated page: the first int of
+ * every free block holds the offset of the next free block, and an offset
+ * of pool->allocation or more terminates the list.  A block that would
+ * straddle a multiple of pool->boundary is avoided by jumping to that
+ * boundary.
+ */
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+	unsigned int offset = 0;
+	unsigned int next_boundary = pool->boundary;
+
+	do {
+		unsigned int next = offset + pool->size;
+
+		/*
+		 * Strictly greater: a block that ends exactly on the
+		 * boundary does not cross it and must stay usable.
+		 */
+		if (unlikely((next + pool->size) > next_boundary)) {
+			next = next_boundary;
+			next_boundary += pool->boundary;
+		}
+		*(int *)(page->vaddr + offset) = next;
+		offset = next;
+	} while (offset < pool->allocation);
+}
+
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
+{
+	struct dma_page *page = kmalloc(sizeof(*page), mem_flags);
+
+	if (!page)
+		return NULL;
+
+	page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
+					 &page->dma, mem_flags);
+	if (!page->vaddr) {
+		/* No coherent memory: drop the header again. */
+		kfree(page);
+		return NULL;
+	}
+
+#ifdef DMAPOOL_DEBUG
+	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+	/* Build the free list, then publish the page on the pool. */
+	pool_initialise_page(pool, page);
+	list_add(&page->page_list, &pool->page_list);
+	page->in_use = 0;
+	page->offset = 0;
+
+	return page;
+}
+
+/* Nonzero while at least one block of @page is still handed out. */
+static inline int is_page_busy(struct dma_page *page)
+{
+ return page->in_use != 0;
+}
+
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+{
+#ifdef DMAPOOL_DEBUG
+	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+	/* Release the coherent memory, then unlink and free the header. */
+	dma_free_coherent(pool->dev, pool->allocation, page->vaddr,
+			  page->dma);
+	list_del(&page->page_list);
+	kfree(page);
+}
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ */
+void dma_pool_destroy(struct dma_pool *pool)
+{
+ /*
+ * Unlink the pool from its device; if the device's pool list is now
+ * empty, the "pools" attribute file is removed as well.
+ */
+ mutex_lock(&pools_lock);
+ list_del(&pool->pools);
+ if (pool->dev && list_empty(&pool->dev->dma_pools))
+ device_remove_file(pool->dev, &dev_attr_pools);
+ mutex_unlock(&pools_lock);
+
+ /* Each iteration removes the first page, so the loop terminates. */
+ while (!list_empty(&pool->page_list)) {
+ struct dma_page *page;
+ page = list_entry(pool->page_list.next,
+ struct dma_page, page_list);
+ if (is_page_busy(page)) {
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_destroy %s, %p busy\n",
+ pool->name, page->vaddr);
+ else
+ printk(KERN_ERR
+ "dma_pool_destroy %s, %p busy\n",
+ pool->name, page->vaddr);
+ /* leak the still-in-use consistent memory */
+ list_del(&page->page_list);
+ kfree(page);
+ } else
+ pool_free_page(pool, page);
+ }
+
+ kfree(pool);
+}
+EXPORT_SYMBOL(dma_pool_destroy);
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * This returns the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, %NULL is returned.
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+{
+ unsigned long flags;
+ struct dma_page *page;
+ size_t offset;
+ void *retval;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ restart:
+ /* A page whose free-list head lies inside the allocation has room. */
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (page->offset < pool->allocation)
+ goto ready;
+ }
+ /* The pool lock is held here, hence GFP_ATOMIC instead of mem_flags. */
+ page = pool_alloc_page(pool, GFP_ATOMIC);
+ if (!page) {
+ if (mem_flags & __GFP_WAIT) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /*
+ * Sleep until dma_pool_free() wakes the queue or the
+ * timeout expires, then search again.
+ * NOTE(review): with TASK_INTERRUPTIBLE a pending signal
+ * presumably makes schedule_timeout() return at once, so
+ * this could spin -- confirm.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ __add_wait_queue(&pool->waitq, &wait);
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ schedule_timeout(POOL_TIMEOUT_JIFFIES);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ __remove_wait_queue(&pool->waitq, &wait);
+ goto restart;
+ }
+ retval = NULL;
+ goto done;
+ }
+
+ ready:
+ /*
+ * Pop the head of the page's free list: the first int of the block
+ * holds the offset of the next free block.
+ */
+ page->in_use++;
+ offset = page->offset;
+ page->offset = *(int *)(page->vaddr + offset);
+ retval = offset + page->vaddr;
+ *handle = offset + page->dma;
+#ifdef DMAPOOL_DEBUG
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+ done:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+	unsigned long flags;
+	struct dma_page *cursor;
+	struct dma_page *found = NULL;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	list_for_each_entry(cursor, &pool->page_list, page_list) {
+		/* Match the page whose DMA range contains the address. */
+		if (dma >= cursor->dma &&
+		    dma < (cursor->dma + pool->allocation)) {
+			found = cursor;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	return found;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ */
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+ struct dma_page *page;
+ unsigned long flags;
+ unsigned int offset;
+
+ /* Locate the page by DMA address; unknown addresses are only logged. */
+ page = pool_find_page(pool, dma);
+ if (!page) {
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pool->name, vaddr, (unsigned long)dma);
+ else
+ printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pool->name, vaddr, (unsigned long)dma);
+ return;
+ }
+
+ offset = vaddr - page->vaddr;
+#ifdef DMAPOOL_DEBUG
+ /* Debug check 1: vaddr and dma must name the same offset in the page. */
+ if ((dma - page->dma) != offset) {
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pool->name, vaddr, (unsigned long long)dma);
+ else
+ printk(KERN_ERR
+ "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pool->name, vaddr, (unsigned long long)dma);
+ return;
+ }
+ /* Debug check 2: walk the free list to catch a double free. */
+ {
+ unsigned int chain = page->offset;
+ while (chain < pool->allocation) {
+ if (chain != offset) {
+ chain = *(int *)(page->vaddr + chain);
+ continue;
+ }
+ if (pool->dev)
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
+ "already free\n", pool->name,
+ (unsigned long long)dma);
+ else
+ printk(KERN_ERR "dma_pool_free %s, dma %Lx "
+ "already free\n", pool->name,
+ (unsigned long long)dma);
+ return;
+ }
+ }
+ memset(vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+ /*
+ * Push the block onto the head of the page's free list and wake any
+ * allocator sleeping in dma_pool_alloc().
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ page->in_use--;
+ *(int *)vaddr = page->offset;
+ page->offset = offset;
+ if (waitqueue_active(&pool->waitq))
+ wake_up_locked(&pool->waitq);
+ /*
+ * Resist a temptation to do
+ * if (!is_page_busy(page)) pool_free_page(pool, page);
+ * Better have a few empty pages hang around.
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+EXPORT_SYMBOL(dma_pool_free);
+
+/*
+ * Managed DMA pool
+ */
+/* devres release callback: @res holds a pointer to the pool to destroy. */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+ struct dma_pool *pool = *(struct dma_pool **)res;
+
+ dma_pool_destroy(pool);
+}
+
+/* devres match callback: true if @res refers to the pool in @match_data. */
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+ return *(struct dma_pool **)res == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ *
+ * Managed dma_pool_create(). DMA pool created with this function is
+ * automatically destroyed on driver detach.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+				  size_t size, size_t align, size_t allocation)
+{
+	struct dma_pool *pool;
+	struct dma_pool **slot;
+
+	/* The devres entry only stores a pointer to the pool. */
+	slot = devres_alloc(dmam_pool_release, sizeof(*slot), GFP_KERNEL);
+	if (!slot)
+		return NULL;
+
+	pool = dma_pool_create(name, dev, size, align, allocation);
+	*slot = pool;
+	if (!pool) {
+		/* Nothing to manage: give the devres entry back. */
+		devres_free(slot);
+		return NULL;
+	}
+
+	devres_add(dev, slot);
+	return pool;
+}
+EXPORT_SYMBOL(dmam_pool_create);
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+ /* Read the device first: dma_pool_destroy() frees the pool structure. */
+ struct device *dev = pool->dev;
+
+ dma_pool_destroy(pool);
+ /* After this point @pool is used only as a lookup key for matching. */
+ WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 0000000..54a0f80
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,151 @@
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/fadvise.h>
+#include <linux/writeback.h>
+#include <linux/syscalls.h>
+
+#include <asm/unistd.h>
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ */
+/*
+ * fadvise64_64 - apply access-pattern advice to a byte range of a file
+ *
+ * Returns 0 on success, -EBADF for a bad descriptor, -ESPIPE for a FIFO
+ * and -EINVAL for a negative @len, a file without a mapping, unknown
+ * advice, or POSIX_FADV_WILLNEED on a mapping without ->readpage.
+ */
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
+{
+ struct file *file = fget(fd);
+ struct address_space *mapping;
+ struct backing_dev_info *bdi;
+ loff_t endbyte; /* inclusive */
+ pgoff_t start_index;
+ pgoff_t end_index;
+ unsigned long nrpages;
+ int ret = 0;
+
+ if (!file)
+ return -EBADF;
+
+ if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+ ret = -ESPIPE;
+ goto out;
+ }
+
+ mapping = file->f_mapping;
+ if (!mapping || len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Mappings with ->get_xip_mem: validate the advice but do nothing. */
+ if (mapping->a_ops->get_xip_mem) {
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_NOREUSE:
+ case POSIX_FADV_DONTNEED:
+ /* no bad return value, but ignore advice */
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ goto out;
+ }
+
+ /* Careful about overflows. Len == 0 means "as much as possible" */
+ endbyte = offset + len;
+ if (!len || endbyte < len)
+ endbyte = -1;
+ else
+ endbyte--; /* inclusive */
+
+ bdi = mapping->backing_dev_info;
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ /* Restore the backing device's default readahead window. */
+ file->f_ra.ra_pages = bdi->ra_pages;
+ break;
+ case POSIX_FADV_RANDOM:
+ /* Readahead window of zero pages. */
+ file->f_ra.ra_pages = 0;
+ break;
+ case POSIX_FADV_SEQUENTIAL:
+ /* Twice the default readahead window. */
+ file->f_ra.ra_pages = bdi->ra_pages * 2;
+ break;
+ case POSIX_FADV_WILLNEED:
+ if (!mapping->a_ops->readpage) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+
+ /* Careful about overflow on the "+1" */
+ nrpages = end_index - start_index + 1;
+ if (!nrpages)
+ nrpages = ~0UL;
+
+ ret = force_page_cache_readahead(mapping, file,
+ start_index,
+ max_sane_readahead(nrpages));
+ /* A positive return value is not an error; report success. */
+ if (ret > 0)
+ ret = 0;
+ break;
+ case POSIX_FADV_NOREUSE:
+ break;
+ case POSIX_FADV_DONTNEED:
+ /* Start writeback first, unless the device is write-congested. */
+ if (!bdi_write_congested(mapping->backing_dev_info))
+ filemap_flush(mapping);
+
+ /* First and last FULL page! */
+ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+ end_index = (endbyte >> PAGE_CACHE_SHIFT);
+
+ if (end_index >= start_index)
+ invalidate_mapping_pages(mapping, start_index,
+ end_index);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out:
+ /* Drop the reference taken by fget(). */
+ fput(file);
+ return ret;
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+/*
+ * Wrapper entry point: takes fd and advice as long and narrows them to int
+ * before calling the implementation above.
+ */
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+ return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+/* Variant with a size_t length; forwards to sys_fadvise64_64(). */
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
+{
+ return sys_fadvise64_64(fd, offset, len, advice);
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+/* Wrapper entry point: narrows the long arguments before the call. */
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+ return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif
+
+#endif
diff --git a/mm/filemap.c b/mm/filemap.c
new file mode 100644
index 0000000..6f62ef9
--- /dev/null
+++ b/mm/filemap.c
@@ -0,0 +1,2478 @@
+/*
+ * linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/aio.h>
+#include <linux/capability.h>
+#include <linux/kernel_stat.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/hash.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/cpuset.h>
+#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include "internal.h"
+
+/*
+ * FIXME: remove all knowledge of the buffer layer from the core VM
+ */
+#include <linux/buffer_head.h> /* for generic_osync_inode */
+
+#include <asm/mman.h>
+
+
+/*
+ * Shared mappings implemented 30.11.1994. It's not fully working yet,
+ * though.
+ *
+ * Shared mappings now work. 15.8.1995 Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
+ */
+
+/*
+ * Lock ordering:
+ *
+ * ->i_mmap_lock (vmtruncate)
+ * ->private_lock (__free_pte->__set_page_dirty_buffers)
+ * ->swap_lock (exclusive_swap_page, others)
+ * ->mapping->tree_lock
+ *
+ * ->i_mutex
+ * ->i_mmap_lock (truncate->unmap_mapping_range)
+ *
+ * ->mmap_sem
+ * ->i_mmap_lock
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
+ * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
+ *
+ * ->mmap_sem
+ * ->lock_page (access_process_vm)
+ *
+ * ->i_mutex (generic_file_buffered_write)
+ * ->mmap_sem (fault_in_pages_readable->do_page_fault)
+ *
+ * ->i_mutex
+ * ->i_alloc_sem (various)
+ *
+ * ->inode_lock
+ * ->sb_lock (fs/fs-writeback.c)
+ * ->mapping->tree_lock (__sync_single_inode)
+ *
+ * ->i_mmap_lock
+ * ->anon_vma.lock (vma_adjust)
+ *
+ * ->anon_vma.lock
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
+ *
+ * ->page_table_lock or pte_lock
+ * ->swap_lock (try_to_unmap_one)
+ * ->private_lock (try_to_unmap_one)
+ * ->tree_lock (try_to_unmap_one)
+ * ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
+ * ->private_lock (page_remove_rmap->set_page_dirty)
+ * ->tree_lock (page_remove_rmap->set_page_dirty)
+ * ->inode_lock (page_remove_rmap->set_page_dirty)
+ * ->inode_lock (zap_pte_range->set_page_dirty)
+ * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
+ *
+ * ->task->proc_lock
+ * ->dcache_lock (proc_pid_lookup)
+ */
+
+/*
+ * Remove a page from the page cache: delete it from the mapping's radix
+ * tree, clear page->mapping and fix up the page-count and dirty accounting.
+ * No page reference is dropped here. Caller has to make sure the page is
+ * locked and that nobody else uses it - or that usage is safe. The caller
+ * must hold the mapping's tree_lock.
+ */
+void __remove_from_page_cache(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ radix_tree_delete(&mapping->page_tree, page->index);
+ page->mapping = NULL;
+ mapping->nrpages--;
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ /* A page that is still mapped must never be removed from the cache. */
+ BUG_ON(page_mapped(page));
+ mem_cgroup_uncharge_cache_page(page);
+
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ *
+ * Fix it up by doing a final dirty accounting check after
+ * having removed the page entirely.
+ */
+ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ }
+}
+
+/*
+ * Locking wrapper around __remove_from_page_cache(): the page must already
+ * be locked (BUG otherwise); the mapping's tree_lock is taken with
+ * interrupts disabled for the duration of the removal.
+ */
+void remove_from_page_cache(struct page *page)
+{
+ /* Read the mapping first: __remove_from_page_cache() clears page->mapping. */
+ struct address_space *mapping = page->mapping;
+
+ BUG_ON(!PageLocked(page));
+
+ spin_lock_irq(&mapping->tree_lock);
+ __remove_from_page_cache(page);
+ spin_unlock_irq(&mapping->tree_lock);
+}
+
+/*
+ * Wait-bit action used when sleeping on a page flag. @word is the address
+ * of a struct page's flags field; the page is recovered from it, the
+ * mapping's ->sync_page() method is invoked if there is one, and then the
+ * task sleeps via io_schedule(). Always returns 0.
+ */
+static int sync_page(void *word)
+{
+ struct address_space *mapping;
+ struct page *page;
+
+ page = container_of((unsigned long *)word, struct page, flags);
+
+ /*
+ * page_mapping() is being called without PG_locked held.
+ * Some knowledge of the state and use of the page is used to
+ * reduce the requirements down to a memory barrier.
+ * The danger here is of a stale page_mapping() return value
+ * indicating a struct address_space different from the one it's
+ * associated with when it is associated with one.
+ * After smp_mb(), it's either the correct page_mapping() for
+ * the page, or an old page_mapping() and the page's own
+ * page_mapping() has gone NULL.
+ * The ->sync_page() address_space operation must tolerate
+ * page_mapping() going NULL. By an amazing coincidence,
+ * this comes about because none of the users of the page
+ * in the ->sync_page() methods make essential use of the
+ * page_mapping(), merely passing the page down to the backing
+ * device's unplug functions when it's non-NULL, which in turn
+ * ignore it for all cases but swap, where only page_private(page) is
+ * of interest. When page_mapping() does go NULL, the entire
+ * call stack gracefully ignores the page and returns.
+ * -- wli
+ */
+ smp_mb();
+ mapping = page_mapping(page);
+ /* Every link of the chain is optional; skip the callback if any is NULL. */
+ if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
+ mapping->a_ops->sync_page(page);
+ io_schedule();
+ return 0;
+}
+
+/*
+ * Killable flavour of sync_page(): perform the normal sync_page() wait
+ * step, then report -EINTR if a fatal signal is pending for the current
+ * task, or 0 otherwise.
+ */
+static int sync_page_killable(void *word)
+{
+	sync_page(word);
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+	return 0;
+}
+
+/**
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping:	address space structure to write
+ * @start:	byte offset of the first byte of the range
+ * @end:	byte offset of the last byte of the range (inclusive)
+ * @sync_mode:	writeback sync mode handed to the writeback control
+ *
+ * Kick off writeback for every dirty page of @mapping that lies within the
+ * byte range <start, end> inclusive, with no limit on the number of pages
+ * written (nr_to_write is LONG_MAX).
+ *
+ * With sync_mode == WB_SYNC_ALL this is a "data integrity" operation rather
+ * than ordinary memory-cleansing writeback: a dirty page/buffer that is
+ * encountered has to be waited upon instead of being skipped.
+ *
+ * Returns 0 without doing anything when the mapping is not capable of
+ * dirty writeback, otherwise the result of do_writepages().
+ */
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.range_start = start,
+		.range_end = end,
+		.nr_to_write = LONG_MAX,
+		.sync_mode = sync_mode,
+	};
+
+	if (mapping_cap_writeback_dirty(mapping))
+		return do_writepages(mapping, &wbc);
+
+	return 0;
+}
+
+/*
+ * Whole-file variant of __filemap_fdatawrite_range(): starts writeback over
+ * the byte range 0..LLONG_MAX with the given @sync_mode.
+ */
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+ int sync_mode)
+{
+ return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+}
+
+/*
+ * Start WB_SYNC_ALL writeback over the whole of @mapping. Returns the
+ * result of __filemap_fdatawrite().
+ */
+int filemap_fdatawrite(struct address_space *mapping)
+{
+ return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite);
+
+/*
+ * Start WB_SYNC_ALL writeback over the byte range @start..@end of @mapping.
+ * Returns the result of __filemap_fdatawrite_range().
+ */
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
+
+/**
+ * filemap_flush - mostly a non-blocking flush
+ * @mapping: target address_space
+ *
+ * This is a mostly non-blocking flush. Not suitable for data-integrity
+ * purposes - I/O may not be started against all dirty pages.
+ *
+ * Implemented as whole-file writeback in WB_SYNC_NONE mode; returns the
+ * result of __filemap_fdatawrite().
+ */
+int filemap_flush(struct address_space *mapping)
+{
+ return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL(filemap_flush);
+
+/**
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping:	target address_space
+ * @start:	first page index of the range
+ * @end:	last page index of the range (inclusive)
+ *
+ * Waits for writeback to finish on every page of @mapping tagged as under
+ * writeback whose index lies in start..end inclusive.
+ *
+ * Returns 0 for an empty range (end < start). Otherwise returns -EIO if a
+ * waited-on page has its error flag set or AS_EIO was pending on the
+ * mapping, -ENOSPC if only AS_ENOSPC was pending, and 0 if neither. Both
+ * mapping flags are cleared as a side effect.
+ */
+int wait_on_page_writeback_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t next = start;
+	int err = 0;
+
+	if (end < start)
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	while (next <= end) {
+		int found;
+		unsigned k;
+
+		/* Never ask for more pages than remain in the range. */
+		found = pagevec_lookup_tag(&pvec, mapping, &next,
+				PAGECACHE_TAG_WRITEBACK,
+				min(end - next, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (found == 0)
+			break;
+
+		for (k = 0; k < found; k++) {
+			struct page *page = pvec.pages[k];
+
+			/* until radix tree lookup accepts end_index */
+			if (page->index <= end) {
+				wait_on_page_writeback(page);
+				if (PageError(page))
+					err = -EIO;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	/* Collect outstanding write errors; -EIO overrides -ENOSPC. */
+	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		err = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &mapping->flags))
+		err = -EIO;
+
+	return err;
+}
+
+/**
+ * sync_page_range - write and wait on all pages in the passed range
+ * @inode:	target inode
+ * @mapping:	target address_space
+ * @pos:	byte offset at which the range starts
+ * @count:	number of bytes in the range
+ *
+ * Write out and then wait upon every page covering the given byte range.
+ * This is a "data integrity" operation. If any step reports an error it is
+ * returned and the remaining steps are skipped.
+ *
+ * i_mutex is taken around the generic_osync_inode() call because that list
+ * walk is otherwise livelockable.
+ *
+ * Returns 0 when @count is zero or the mapping is not capable of dirty
+ * writeback.
+ */
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+			loff_t pos, loff_t count)
+{
+	loff_t last_byte = pos + count - 1;
+	pgoff_t first_pg = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t last_pg = last_byte >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (!mapping_cap_writeback_dirty(mapping) || !count)
+		return 0;
+
+	err = filemap_fdatawrite_range(mapping, pos, last_byte);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	mutex_unlock(&inode->i_mutex);
+	if (err)
+		return err;
+
+	return wait_on_page_writeback_range(mapping, first_pg, last_pg);
+}
+EXPORT_SYMBOL(sync_page_range);
+
+/**
+ * sync_page_range_nolock - write & wait on all pages in the passed range without locking
+ * @inode:	target inode
+ * @mapping:	target address_space
+ * @pos:	byte offset at which the range starts
+ * @count:	number of bytes in the range
+ *
+ * Same sequence as sync_page_range() - start writeback, sync the inode
+ * metadata, wait for writeback - except that i_mutex is not taken here.
+ *
+ * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
+ * as it forces O_SYNC writers to different parts of the same file
+ * to be serialised right until io completion.
+ */
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, loff_t count)
+{
+	loff_t last_byte = pos + count - 1;
+	pgoff_t first_pg = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t last_pg = last_byte >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (!mapping_cap_writeback_dirty(mapping) || !count)
+		return 0;
+
+	err = filemap_fdatawrite_range(mapping, pos, last_byte);
+	if (err)
+		return err;
+
+	err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	if (err)
+		return err;
+
+	return wait_on_page_writeback_range(mapping, first_pg, last_pg);
+}
+EXPORT_SYMBOL(sync_page_range_nolock);
+
+/**
+ * filemap_fdatawait - wait for all under-writeback pages to complete
+ * @mapping: address space structure to wait for
+ *
+ * Waits for writeback on every page of @mapping from index 0 up to the
+ * page holding the last byte of the host inode (as given by i_size).
+ * Returns 0 straight away for an empty file, otherwise the result of
+ * wait_on_page_writeback_range().
+ */
+int filemap_fdatawait(struct address_space *mapping)
+{
+	loff_t size = i_size_read(mapping->host);
+	pgoff_t last_pg;
+
+	if (!size)
+		return 0;
+
+	last_pg = (size - 1) >> PAGE_CACHE_SHIFT;
+	return wait_on_page_writeback_range(mapping, 0, last_pg);
+}
+EXPORT_SYMBOL(filemap_fdatawait);
+
+/*
+ * Write out all of @mapping and wait for the writeback to finish.
+ * Nothing is done (and 0 is returned) when the mapping holds no pages.
+ * The write error, if any, takes precedence over the wait error.
+ */
+int filemap_write_and_wait(struct address_space *mapping)
+{
+	int write_err, wait_err;
+
+	if (!mapping->nrpages)
+		return 0;
+
+	write_err = filemap_fdatawrite(mapping);
+
+	/*
+	 * -EIO is the special case: it may indicate that the worst thing
+	 * (e.g. a bug) happened, so do not wait after it.
+	 */
+	if (write_err == -EIO)
+		return write_err;
+
+	/*
+	 * For any other outcome, even an error such as -ENOSPC, pages may
+	 * have been written partially, so wait for them.
+	 */
+	wait_err = filemap_fdatawait(mapping);
+
+	return write_err ? write_err : wait_err;
+}
+EXPORT_SYMBOL(filemap_write_and_wait);
+
+/**
+ * filemap_write_and_wait_range - write out & wait on a file range
+ * @mapping:	the address_space for the pages
+ * @lstart:	byte offset of the first byte of the range
+ * @lend:	byte offset of the last byte of the range (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Because @lend names the last byte to be written rather than one past it,
+ * passing lend = -1 lets this function write to the very end-of-file.
+ *
+ * Returns 0 when the mapping holds no pages. The write error, if any, takes
+ * precedence over the wait error.
+ */
+int filemap_write_and_wait_range(struct address_space *mapping,
+				 loff_t lstart, loff_t lend)
+{
+	int write_err, wait_err;
+
+	if (!mapping->nrpages)
+		return 0;
+
+	write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
+					       WB_SYNC_ALL);
+
+	/* Same policy as filemap_write_and_wait(): never wait after -EIO. */
+	if (write_err == -EIO)
+		return write_err;
+
+	wait_err = wait_on_page_writeback_range(mapping,
+						lstart >> PAGE_CACHE_SHIFT,
+						lend >> PAGE_CACHE_SHIFT);
+
+	return write_err ? write_err : wait_err;
+}
+
+/**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page:	page to add
+ * @mapping:	the page's address_space
+ * @offset:	page index
+ * @gfp_mask:	page allocation mode
+ *
+ * Inserts an already-locked page into @mapping's radix tree at @offset.
+ * The page is not put on the LRU; that is left to the caller.
+ *
+ * Returns 0 on success, in which case the page has gained a reference and
+ * been charged to the memory cgroup. On failure the charge and reference
+ * are undone and the error from the failing step is returned.
+ */
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+		pgoff_t offset, gfp_t gfp_mask)
+{
+	gfp_t lowmem_gfp = gfp_mask & ~__GFP_HIGHMEM;
+	int error;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	error = mem_cgroup_cache_charge(page, current->mm, lowmem_gfp);
+	if (error)
+		return error;
+
+	error = radix_tree_preload(lowmem_gfp);
+	if (error) {
+		mem_cgroup_uncharge_cache_page(page);
+		return error;
+	}
+
+	/* Set the page up before it becomes visible in the tree. */
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = offset;
+
+	spin_lock_irq(&mapping->tree_lock);
+	error = radix_tree_insert(&mapping->page_tree, offset, page);
+	if (unlikely(error)) {
+		/* Insertion failed: roll back everything done above. */
+		page->mapping = NULL;
+		mem_cgroup_uncharge_cache_page(page);
+		page_cache_release(page);
+	} else {
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+	}
+	spin_unlock_irq(&mapping->tree_lock);
+	radix_tree_preload_end();
+
+	return error;
+}
+EXPORT_SYMBOL(add_to_page_cache_locked);
+
+/*
+ * Add @page to the page cache of @mapping at @offset and, on success, put
+ * it on an LRU list: the file LRU for file-cache pages, the active anon
+ * LRU otherwise. Returns the result of add_to_page_cache().
+ */
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+				pgoff_t offset, gfp_t gfp_mask)
+{
+	int err;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	err = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (err)
+		return err;
+
+	if (page_is_file_cache(page))
+		lru_cache_add_file(page);
+	else
+		lru_cache_add_active_anon(page);
+
+	return 0;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * NUMA page cache allocator: when cpuset memory spreading is enabled for
+ * the caller, allocate a single page on the node chosen by the cpuset;
+ * otherwise fall back to a plain alloc_pages().
+ */
+struct page *__page_cache_alloc(gfp_t gfp)
+{
+	int node;
+
+	if (!cpuset_do_page_mem_spread())
+		return alloc_pages(gfp, 0);
+
+	node = cpuset_mem_spread_node();
+	return alloc_pages_node(node, gfp, 0);
+}
+EXPORT_SYMBOL(__page_cache_alloc);
+#endif
+
+/*
+ * Wait-bit action that just sleeps via io_schedule() and returns 0; unlike
+ * sync_page() it never looks at the page or its mapping (@word is unused).
+ */
+static int __sleep_on_page_lock(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+/*
+ * Waiting for a page to become available requires a waitqueue to sleep
+ * on. Rather than one queue per page, each zone carries a hash table of
+ * waitqueues: every waiter hashing to a bucket sits on the same queue, all
+ * of them are woken when any of those pages becomes available, and each
+ * woken context checks whether its own page is the one that became
+ * available. This saves space at the cost of "thundering herd" wakeups
+ * during rare hash collisions.
+ *
+ * Returns the waitqueue head for @page, selected by hashing the page
+ * pointer into the wait table of the page's zone.
+ */
+static wait_queue_head_t *page_waitqueue(struct page *page)
+{
+	const struct zone *z = page_zone(page);
+	unsigned long bucket = hash_ptr(page, z->wait_table_bits);
+
+	return &z->wait_table[bucket];
+}
+
+/*
+ * Wake the tasks waiting on flag number @bit of @page->flags, using the
+ * hashed waitqueue returned by page_waitqueue().
+ */
+static inline void wake_up_page(struct page *page, int bit)
+{
+ __wake_up_bit(page_waitqueue(page), &page->flags, bit);
+}
+
+/*
+ * Sleep uninterruptibly until flag @bit_nr of @page->flags is clear, using
+ * sync_page() as the wait action. Returns at once if the bit is not set.
+ */
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+	DEFINE_WAIT_BIT(waiter, &page->flags, bit_nr);
+
+	if (!test_bit(bit_nr, &page->flags))
+		return;
+
+	__wait_on_bit(page_waitqueue(page), &waiter, sync_page,
+		      TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+/**
+ * unlock_page - unlock a locked page
+ * @page: the page
+ *
+ * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Also wakes sleepers in wait_on_page_writeback() because the wakeup
+ * mechanism between PageLocked pages and PageWriteback pages is shared.
+ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ *
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+ */
+void unlock_page(struct page *page)
+{
+ /* Unlocking a page that is not locked is a caller bug. */
+ VM_BUG_ON(!PageLocked(page));
+ clear_bit_unlock(PG_locked, &page->flags);
+ smp_mb__after_clear_bit();
+ wake_up_page(page, PG_locked);
+}
+EXPORT_SYMBOL(unlock_page);
+
+/**
+ * end_page_writeback - end writeback against a page
+ * @page: the page
+ *
+ * Clears the page's writeback state and wakes anybody waiting on
+ * PG_writeback. It is a BUG if the page was not under writeback.
+ */
+void end_page_writeback(struct page *page)
+{
+ /* If the reclaim flag was set, clear it and rotate the page. */
+ if (TestClearPageReclaim(page))
+ rotate_reclaimable_page(page);
+
+ if (!test_clear_page_writeback(page))
+ BUG();
+
+ /* Order the bit clear before the waitqueue is examined for sleepers. */
+ smp_mb__after_clear_bit();
+ wake_up_page(page, PG_writeback);
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+/**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @page: the page to lock
+ *
+ * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
+ * random driver's requestfn sets TASK_RUNNING, we could busywait. However
+ * chances are that on the second loop, the block layer's plug list is empty,
+ * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
+ */
+void __lock_page(struct page *page)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ /* Sleep on the page's hashed waitqueue, using sync_page() as the action. */
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(__lock_page);
+
+/*
+ * Like __lock_page(), but the sleep is TASK_KILLABLE and the wait action is
+ * sync_page_killable(), which returns -EINTR when a fatal signal is pending.
+ * Returns the result of __wait_on_bit_lock().
+ */
+int __lock_page_killable(struct page *page)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ sync_page_killable, TASK_KILLABLE);
+}
+
+/**
+ * __lock_page_nosync - get a lock on the page, without calling sync_page()
+ * @page: the page to lock
+ *
+ * Variant of lock_page that does not require the caller to hold a reference
+ * on the page's mapping.
+ */
+void __lock_page_nosync(struct page *page)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ /* The wait action is __sleep_on_page_lock(), which never touches the mapping. */
+ __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
+ TASK_UNINTERRUPTIBLE);
+}
+
+/**
+ * find_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Is there a pagecache struct page at the given (mapping, offset) tuple?
+ * If yes, increment its refcount and return it; if no, return NULL.
+ *
+ * The lookup takes no tree_lock; it runs under rcu_read_lock() and retries
+ * whenever the slot contents change underneath it.
+ */
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
+{
+ void **pagep;
+ struct page *page;
+
+ rcu_read_lock();
+repeat:
+ page = NULL;
+ pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+ if (pagep) {
+ page = radix_tree_deref_slot(pagep);
+ /* Empty slot or retry marker: look the slot up again. */
+ if (unlikely(!page || page == RADIX_TREE_RETRY))
+ goto repeat;
+
+ /* Could not take a speculative reference: start over. */
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+ }
+ rcu_read_unlock();
+
+ return page;
+}
+EXPORT_SYMBOL(find_get_page);
+
+/**
+ * find_lock_page - locate, pin and lock a pagecache page
+ * @mapping:	the address_space to search
+ * @offset:	the page index
+ *
+ * Looks the page up with find_get_page() (which takes a reference) and
+ * then locks it. If the page no longer belongs to @mapping once the lock
+ * is held, it is unlocked and released and the lookup is repeated.
+ *
+ * Returns the locked, referenced page, or NULL if no page was present.
+ * find_lock_page() may sleep.
+ */
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
+{
+	struct page *page;
+
+	for (;;) {
+		page = find_get_page(mapping, offset);
+		if (!page)
+			return NULL;
+
+		lock_page(page);
+		if (likely(page->mapping == mapping))
+			break;
+
+		/* The page was truncated while we slept on the lock. */
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	VM_BUG_ON(page->index != offset);
+	return page;
+}
+EXPORT_SYMBOL(find_lock_page);
+
+/**
+ * find_or_create_page - locate or add a pagecache page
+ * @mapping:	the page's address_space
+ * @index:	the page's index into the mapping
+ * @gfp_mask:	page allocation mode
+ *
+ * Looks for the page in the pagecache. If it is absent, a fresh page is
+ * allocated with @gfp_mask and inserted into the pagecache and the VM's LRU
+ * list. The page handed back is locked and has its reference count
+ * incremented.
+ *
+ * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
+ * allocation!
+ *
+ * Returns the page's address, or NULL if the allocation failed or the
+ * insertion failed with an error other than -EEXIST.
+ */
+struct page *find_or_create_page(struct address_space *mapping,
+		pgoff_t index, gfp_t gfp_mask)
+{
+	for (;;) {
+		struct page *page;
+		int err;
+
+		page = find_lock_page(mapping, index);
+		if (page)
+			return page;
+
+		page = __page_cache_alloc(gfp_mask);
+		if (!page)
+			return NULL;
+
+		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+		if (likely(!err))
+			return page;
+
+		page_cache_release(page);
+		/* -EEXIST: a page is already cached at @index, look it up again. */
+		if (err != -EEXIST)
+			return NULL;
+	}
+}
+EXPORT_SYMBOL(find_or_create_page);
+
+/**
+ * find_get_pages - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages() will search for and return a group of up to
+ * @nr_pages pages in the mapping. The pages are placed at @pages.
+ * find_get_pages() takes a reference against the returned pages.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes. There may be holes in the indices due to not-present pages.
+ *
+ * find_get_pages() returns the number of pages which were found.
+ */
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ /* @pages temporarily holds radix tree slot pointers, not pages. */
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ /*
+ * this can only trigger if nr_found == 1, making livelock
+ * a non issue.
+ */
+ if (unlikely(page == RADIX_TREE_RETRY))
+ goto restart;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ /* Overwrite an already-consumed slot pointer (ret <= i) with the page. */
+ pages[ret] = page;
+ ret++;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * find_get_pages_contig - gang contiguous pagecache lookup
+ * @mapping: The address_space to search
+ * @index: The starting page index
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages_contig() works exactly like find_get_pages(), except
+ * that the returned number of pages are guaranteed to be contiguous.
+ *
+ * find_get_pages_contig() returns the number of pages which were found.
+ */
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+ unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ /* @pages temporarily holds radix tree slot pointers, not pages. */
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, index, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ /*
+ * this can only trigger if nr_found == 1, making livelock
+ * a non issue.
+ */
+ if (unlikely(page == RADIX_TREE_RETRY))
+ goto restart;
+
+ /* Stop at the first page that is detached or not at the expected index. */
+ if (page->mapping == NULL || page->index != index)
+ break;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
+ /* The next page accepted must sit at the following index. */
+ index++;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(find_get_pages_contig);
+
+/**
+ * find_get_pages_tag - find and return pages that match @tag
+ * @mapping: the address_space to search
+ * @index: the starting page index
+ * @tag: the tag index
+ * @nr_pages: the maximum number of pages
+ * @pages: where the resulting pages are placed
+ *
+ * Like find_get_pages, except we only return pages which are tagged with
+ * @tag. We update @index to index the next page for the traversal.
+ *
+ * Returns the number of pages found; @index is left untouched when that
+ * number is zero.
+ */
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+ int tag, unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ /* @pages temporarily holds radix tree slot pointers, not pages. */
+ nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+ (void ***)pages, *index, nr_pages, tag);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ /*
+ * this can only trigger if nr_found == 1, making livelock
+ * a non issue.
+ */
+ if (unlikely(page == RADIX_TREE_RETRY))
+ goto restart;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
+ }
+ rcu_read_unlock();
+
+ /* Resume the next traversal just past the last page returned. */
+ if (ret)
+ *index = pages[ret - 1]->index + 1;
+
+ return ret;
+}
+EXPORT_SYMBOL(find_get_pages_tag);
+
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
+ * Same as grab_cache_page(), but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed. This routine should
+ * be safe to call while holding the lock for another page.
+ *
+ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+ * and deadlock against the caller's locked page. The same restriction
+ * applies to the allocations made while inserting the page into the page
+ * cache, so that step uses GFP_NOFS rather than GFP_KERNEL.
+ *
+ * Returns the locked, referenced page, or NULL if the page exists but is
+ * locked by somebody else, if allocation failed, or if the new page could
+ * not be added to the page cache.
+ */
+struct page *
+grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
+{
+	struct page *page = find_get_page(mapping, index);
+
+	if (page) {
+		/* Already cached: take it only if the lock is free right now. */
+		if (trylock_page(page))
+			return page;
+		page_cache_release(page);
+		return NULL;
+	}
+	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+	/*
+	 * GFP_NOFS, not GFP_KERNEL: the caller may hold another page lock,
+	 * so nothing allocated on this path may recurse into the filesystem.
+	 */
+	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
+		page_cache_release(page);
+		page = NULL;
+	}
+	return page;
+}
+EXPORT_SYMBOL(grab_cache_page_nowait);
+
+/*
+ * CD/DVD media are error prone, and on a medium error the driver may fail
+ * a _large_ part of the i/o request. Picture the worst case:
+ *
+ * ---R__________________________________________B__________
+ *    ^ reading here                             ^ bad block(assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * That way lies madness, so scale the readahead size down quickly: each
+ * call cuts @ra->ra_pages to a quarter. A window that is already zero is
+ * left alone. @filp is not used.
+ */
+static void shrink_readahead_size_eio(struct file *filp,
+					struct file_ra_state *ra)
+{
+	if (ra->ra_pages)
+		ra->ra_pages /= 4;
+}
+
+/**
+ * do_generic_file_read - generic file read routine
+ * @filp: the file to read
+ * @ppos: current file position
+ * @desc: read_descriptor
+ * @actor: read method
+ *
+ * This is a generic file read routine, and uses the
+ * mapping->a_ops->readpage() function for the actual low-level stuff.
+ *
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ *
+ * Nothing is returned: errors are reported through @desc->error, and on
+ * exit *@ppos and the file's readahead state (prev_pos) are updated to
+ * reflect how far the read got.
+ */
+static void do_generic_file_read(struct file *filp, loff_t *ppos,
+ read_descriptor_t *desc, read_actor_t actor)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct file_ra_state *ra = &filp->f_ra;
+ pgoff_t index;
+ pgoff_t last_index;
+ pgoff_t prev_index;
+ unsigned long offset; /* offset into pagecache page */
+ unsigned int prev_offset;
+ int error;
+
+ /*
+ * Split the current and previous positions into a page index and an
+ * offset within that page; last_index is the index just past the final
+ * page touched by this request.
+ */
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
+ last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_CACHE_MASK;
+
+ for (;;) {
+ struct page *page;
+ pgoff_t end_index;
+ loff_t isize;
+ unsigned long nr, ret;
+
+ cond_resched();
+find_page:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ /* Not cached: start synchronous readahead, then look again. */
+ page_cache_sync_readahead(mapping,
+ ra, filp,
+ index, last_index - index);
+ page = find_get_page(mapping, index);
+ if (unlikely(page == NULL))
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(mapping,
+ ra, filp, page,
+ index, last_index - index);
+ }
+ if (!PageUptodate(page)) {
+ /*
+ * The page as a whole is not uptodate. If the mapping offers
+ * ->is_partially_uptodate() and the block size differs from the
+ * page size, ask it (under the page lock) whether the part this
+ * read needs is uptodate; otherwise take the slow path.
+ */
+ if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ desc, offset))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ }
+page_ok:
+ /*
+ * i_size must be checked after we know the page is Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ goto out;
+ }
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index == end_index) {
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ page_cache_release(page);
+ goto out;
+ }
+ }
+ nr = nr - offset;
+
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ /*
+ * When a sequential read accesses a page several times,
+ * only mark it as accessed the first time.
+ */
+ if (prev_index != index || offset != prev_offset)
+ mark_page_accessed(page);
+ prev_index = index;
+
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ ret = actor(desc, page, offset, nr);
+ /* Advance the position; carry any page overflow of offset into index. */
+ offset += ret;
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+ prev_offset = offset;
+
+ page_cache_release(page);
+ /* Keep going only if the actor consumed everything and more is wanted. */
+ if (ret == nr && desc->count)
+ continue;
+ goto out;
+
+page_not_up_to_date:
+ /* Get exclusive access to the page ... */
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
+
+page_not_up_to_date_locked:
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ /* Did somebody else fill it already? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto page_ok;
+ }
+
+readpage:
+ /* Start the actual read. The read will unlock the page. */
+ error = mapping->a_ops->readpage(filp, page);
+
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
+ goto readpage_error;
+ }
+
+ if (!PageUptodate(page)) {
+ /* Take the page lock and recheck whether the page became uptodate. */
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
+ if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_inode_pages got it
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+ unlock_page(page);
+ shrink_readahead_size_eio(filp, ra);
+ error = -EIO;
+ goto readpage_error;
+ }
+ unlock_page(page);
+ }
+
+ goto page_ok;
+
+readpage_error:
+ /* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
+ page_cache_release(page);
+ goto out;
+
+no_cached_page:
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ */
+ page = page_cache_alloc_cold(mapping);
+ if (!page) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+ error = add_to_page_cache_lru(page, mapping,
+ index, GFP_KERNEL);
+ if (error) {
+ page_cache_release(page);
+ /* A page is already cached at this index: go back and use it. */
+ if (error == -EEXIST)
+ goto find_page;
+ desc->error = error;
+ goto out;
+ }
+ goto readpage;
+ }
+
+out:
+ /* Rebuild prev_pos from the last page index and in-page offset. */
+ ra->prev_pos = prev_index;
+ ra->prev_pos <<= PAGE_CACHE_SHIFT;
+ ra->prev_pos |= prev_offset;
+
+ /* Publish the new file position back to the caller. */
+ *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
+ file_accessed(filp);
+}
+
+/*
+ * Read actor for the generic page-cache read path.
+ *
+ * Copies up to @size bytes, starting @offset bytes into @page, to the user
+ * buffer tracked by @desc.  The request is clipped to desc->count.  If the
+ * copy is only partial, desc->error is set to -EFAULT.  desc->count,
+ * desc->written and desc->arg.buf are advanced by the number of bytes that
+ * were actually copied, which is also the return value.
+ */
+int file_read_actor(read_descriptor_t *desc, struct page *page,
+			unsigned long offset, unsigned long size)
+{
+	unsigned long remaining = desc->count;
+	unsigned long uncopied = 1;	/* non-zero until a copy fully succeeds */
+	char *vaddr;
+
+	if (size > remaining)
+		size = remaining;
+
+	/*
+	 * Faults on the destination of a read are common, so pre-fault the
+	 * user buffer before taking the atomic kmap and trying the fast copy.
+	 */
+	if (fault_in_pages_writeable(desc->arg.buf, size) == 0) {
+		vaddr = kmap_atomic(page, KM_USER0);
+		uncopied = __copy_to_user_inatomic(desc->arg.buf,
+						   vaddr + offset, size);
+		kunmap_atomic(vaddr, KM_USER0);
+	}
+
+	if (uncopied != 0) {
+		/* Fast path unavailable or incomplete: do it the slow way */
+		vaddr = kmap(page);
+		uncopied = __copy_to_user(desc->arg.buf, vaddr + offset, size);
+		kunmap(page);
+
+		if (uncopied != 0) {
+			size -= uncopied;
+			desc->error = -EFAULT;
+		}
+	}
+
+	desc->count = remaining - size;
+	desc->written += size;
+	desc->arg.buf += size;
+	return size;
+}
+
+/*
+ * Validate an iovec array before a read or a write
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec (in/out)
+ * @count:	receives the total number of usable bytes
+ * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
+ *
+ * Walks the segments, summing their lengths and checking each one with
+ * access_ok().  *nr_segs must be initialized by the caller.
+ *
+ * Returns -EINVAL if a segment length or the running total is negative when
+ * viewed as ssize_t, and -EFAULT if the very first segment is inaccessible.
+ * If a later segment is inaccessible, *nr_segs is trimmed to the segments
+ * before it and the call still succeeds.  On success returns 0 with *count
+ * set to the total length of the accepted segments.
+ */
+int generic_segment_checks(const struct iovec *iov,
+			unsigned long *nr_segs, size_t *count, int access_flags)
+{
+	size_t total = 0;
+	unsigned long n;
+
+	for (n = 0; n < *nr_segs; n++) {
+		const struct iovec *cur = iov + n;
+
+		/*
+		 * A negative segment length, or a cumulative length that
+		 * wraps negative, is rejected outright.
+		 */
+		total += cur->iov_len;
+		if (unlikely((ssize_t)(total | cur->iov_len) < 0))
+			return -EINVAL;
+
+		if (!access_ok(access_flags, cur->iov_base, cur->iov_len)) {
+			if (n == 0)
+				return -EFAULT;
+			/* This segment is no good: stop just before it */
+			*nr_segs = n;
+			total -= cur->iov_len;
+			break;
+		}
+	}
+
+	*count = total;
+	return 0;
+}
+EXPORT_SYMBOL(generic_segment_checks);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb:	kernel I/O control block
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @pos:	current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * For O_DIRECT files that have data at @pos, dirty page-cache pages in the
+ * range are written back first and the read is handed to ->direct_IO(); any
+ * non-zero result of that path is returned as is.  Otherwise each iovec
+ * segment is filled through do_generic_file_read().  Returns the number of
+ * bytes read, or a negative error if nothing was read.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	loff_t *ppos = &iocb->ki_pos;
+	size_t count = 0;
+	unsigned long n;
+	ssize_t ret;
+
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+	if (ret)
+		return ret;
+
+	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+	if (filp->f_flags & O_DIRECT) {
+		struct address_space *mapping = filp->f_mapping;
+		struct inode *inode = mapping->host;
+
+		if (count == 0)
+			return ret;	/* nothing to read; skip atime */
+
+		if (pos < i_size_read(inode)) {
+			ret = filemap_write_and_wait_range(mapping, pos,
+					pos + iov_length(iov, nr_segs) - 1);
+			if (ret == 0)
+				ret = mapping->a_ops->direct_IO(READ, iocb,
+							iov, pos, nr_segs);
+			if (ret > 0)
+				*ppos = pos + ret;
+			if (ret != 0) {
+				file_accessed(filp);
+				return ret;
+			}
+		}
+	}
+
+	for (n = 0; n < nr_segs; n++) {
+		read_descriptor_t desc;
+
+		/* Zero-length segments contribute nothing */
+		if (iov[n].iov_len == 0)
+			continue;
+
+		desc.written = 0;
+		desc.error = 0;
+		desc.arg.buf = iov[n].iov_base;
+		desc.count = iov[n].iov_len;
+
+		do_generic_file_read(filp, ppos, &desc, file_read_actor);
+		ret += desc.written;
+
+		if (desc.error) {
+			/* Report the error only if no data was read at all */
+			if (!ret)
+				ret = desc.error;
+			break;
+		}
+		/* Short read on this segment: do not go on to the next */
+		if (desc.count > 0)
+			break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_aio_read);
+
+/*
+ * Start readahead of @nr pages of @mapping beginning at page @index.
+ * Returns -EINVAL when the mapping has no ->readpage operation, otherwise 0;
+ * the result of the readahead itself is not reported.
+ */
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+	     pgoff_t index, unsigned long nr)
+{
+	int can_read = mapping && mapping->a_ops && mapping->a_ops->readpage;
+
+	if (!can_read)
+		return -EINVAL;
+
+	force_page_cache_readahead(mapping, filp, index,
+				   max_sane_readahead(nr));
+	return 0;
+}
+
+/*
+ * readahead() system call: read ahead the pages of @fd that cover the byte
+ * range [@offset, @offset + @count).  Returns -EBADF if @fd is not an open
+ * file or was not opened for reading, otherwise the result of do_readahead().
+ */
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
+{
+	struct file *file = fget(fd);
+	ssize_t ret = -EBADF;
+
+	if (!file)
+		return ret;
+
+	if (file->f_mode & FMODE_READ) {
+		/* Convert the byte range into an inclusive page range */
+		pgoff_t first = offset >> PAGE_CACHE_SHIFT;
+		pgoff_t last = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+
+		ret = do_readahead(file->f_mapping, file, first,
+				   last - first + 1);
+	}
+
+	fput(file);
+	return ret;
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+/*
+ * Syscall wrapper taking long-typed @fd and @count; it casts them to the
+ * int / size_t that SYSC_readahead() expects and forwards the call.
+ * sys_readahead is made an alias of this wrapper.
+ */
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+ return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
+
+#ifdef CONFIG_MMU
+/**
+ * page_cache_read - adds requested page to the page cache if not already there
+ * @file:	file to read
+ * @offset:	page index
+ *
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
+ *
+ * Returns -ENOMEM if no page could be allocated, 0 if another task added the
+ * page first, and otherwise the result of add_to_page_cache_lru() or
+ * ->readpage().  The whole sequence is retried while ->readpage() reports
+ * AOP_TRUNCATED_PAGE.
+ */
+static int page_cache_read(struct file *file, pgoff_t offset)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	for (;;) {
+		struct page *page = page_cache_alloc_cold(mapping);
+		int ret;
+
+		if (!page)
+			return -ENOMEM;
+
+		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+		if (ret == -EEXIST)
+			ret = 0; /* losing race to add is OK */
+		else if (ret == 0)
+			ret = mapping->a_ops->readpage(file, page);
+
+		/* Drop our reference in every case */
+		page_cache_release(page);
+
+		if (ret != AOP_TRUNCATED_PAGE)
+			return ret;
+	}
+}
+
+/*
+ * Miss threshold: once ra->mmap_miss exceeds this, filemap_fault() stops
+ * doing read-around and just reads the single faulting page.
+ */
+#define MMAP_LOTSAMISS (100)
+
+/**
+ * filemap_fault - read in file data for page fault handling
+ * @vma: vma in which the fault was taken
+ * @vmf: struct vm_fault containing details of the fault
+ *
+ * filemap_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ *
+ * On success vmf->page is set to the locked, referenced page and the return
+ * value has VM_FAULT_LOCKED set (plus VM_FAULT_MAJOR if this fault counted
+ * as a major fault).  Returns VM_FAULT_SIGBUS if the offset lies beyond
+ * i_size or the page could not be read, and VM_FAULT_OOM if
+ * page_cache_read() failed with -ENOMEM.
+ */
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int error;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct file_ra_state *ra = &file->f_ra;
+ struct inode *inode = mapping->host;
+ struct page *page;
+ pgoff_t size;
+ int did_readaround = 0;
+ int ret = 0;
+
+ /* size is the file length in pages, rounded up */
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (VM_RandomReadHint(vma))
+ goto no_cached_page;
+
+ /*
+ * Do we have something in the page cache already?
+ */
+retry_find:
+ page = find_lock_page(mapping, vmf->pgoff);
+ /*
+ * For sequential accesses, we use the generic readahead logic.
+ */
+ if (VM_SequentialReadHint(vma)) {
+ if (!page) {
+ page_cache_sync_readahead(mapping, ra, file,
+ vmf->pgoff, 1);
+ page = find_lock_page(mapping, vmf->pgoff);
+ if (!page)
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(mapping, ra, file, page,
+ vmf->pgoff, 1);
+ }
+ }
+
+ if (!page) {
+ unsigned long ra_pages;
+
+ ra->mmap_miss++;
+
+ /*
+ * Do we miss much more than hit in this file? If so,
+ * stop bothering with read-ahead. It will only hurt.
+ */
+ if (ra->mmap_miss > MMAP_LOTSAMISS)
+ goto no_cached_page;
+
+ /*
+ * To keep the pgmajfault counter straight, we need to
+ * check did_readaround, as this is an inner loop.
+ */
+ if (!did_readaround) {
+ ret = VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ }
+ did_readaround = 1;
+ ra_pages = max_sane_readahead(file->f_ra.ra_pages);
+ if (ra_pages) {
+ pgoff_t start = 0;
+
+ /* Centre the read-around window on the faulting page */
+ if (vmf->pgoff > ra_pages / 2)
+ start = vmf->pgoff - ra_pages / 2;
+ do_page_cache_readahead(mapping, file, start, ra_pages);
+ }
+ page = find_lock_page(mapping, vmf->pgoff);
+ if (!page)
+ goto no_cached_page;
+ }
+
+ /* A hit without read-around counts against the miss statistic */
+ if (!did_readaround)
+ ra->mmap_miss--;
+
+ /*
+ * We have a locked page in the page cache, now we need to check
+ * that it's up-to-date. If not, it is going to be due to an error.
+ */
+ if (unlikely(!PageUptodate(page)))
+ goto page_not_uptodate;
+
+ /* Must recheck i_size under page lock */
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(vmf->pgoff >= size)) {
+ unlock_page(page);
+ page_cache_release(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ /*
+ * Found the page and have a reference on it.
+ */
+ mark_page_accessed(page);
+ ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ vmf->page = page;
+ return ret | VM_FAULT_LOCKED;
+
+no_cached_page:
+ /*
+ * We're only likely to ever get here if MADV_RANDOM is in
+ * effect.
+ */
+ error = page_cache_read(file, vmf->pgoff);
+
+ /*
+ * The page we want has now been added to the page cache.
+ * In the unlikely event that someone removed it in the
+ * meantime, we'll just come back here and read it again.
+ */
+ if (error >= 0)
+ goto retry_find;
+
+ /*
+ * An error return from page_cache_read can result if the
+ * system is low on memory, or a problem occurs while trying
+ * to schedule I/O.
+ */
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
+
+page_not_uptodate:
+ /* IO error path */
+ if (!did_readaround) {
+ ret = VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ }
+
+ /*
+ * Umm, take care of errors if the page isn't up-to-date.
+ * Try to re-read it _once_. We do this synchronously,
+ * because there really aren't any performance issues here
+ * and we need to check for errors.
+ */
+ ClearPageError(page);
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ error = -EIO;
+ }
+ page_cache_release(page);
+
+ if (!error || error == AOP_TRUNCATED_PAGE)
+ goto retry_find;
+
+ /* Things didn't work out. Report SIGBUS to tell the mm layer so. */
+ shrink_readahead_size_eio(file, ra);
+ return VM_FAULT_SIGBUS;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+/*
+ * vm operations installed by generic_file_mmap(): page faults on the
+ * mapping are served from the page cache by filemap_fault().
+ */
+struct vm_operations_struct generic_file_vm_ops = {
+ .fault = filemap_fault,
+};
+
+/*
+ * General-purpose mmap of a disk file.
+ *
+ * Returns -ENOEXEC when the mapping has no ->readpage operation.  Otherwise
+ * updates the file's access time, installs generic_file_vm_ops on @vma,
+ * flags the vma VM_CAN_NONLINEAR and returns 0.
+ */
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!file->f_mapping->a_ops->readpage)
+		return -ENOEXEC;
+
+	file_accessed(file);
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+/*
+ * mmap for filesystems which do not implement ->writepage: a mapping that
+ * is both shared and potentially writable is refused with -EINVAL, anything
+ * else is passed on to generic_file_mmap().
+ */
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	const unsigned long shared_writable = VM_SHARED | VM_MAYWRITE;
+
+	if ((vma->vm_flags & shared_writable) == shared_writable)
+		return -EINVAL;
+
+	return generic_file_mmap(file, vma);
+}
+#else
+/* !CONFIG_MMU stub: file mmap is not provided here, always fails with -ENOSYS. */
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENOSYS;
+}
+/* !CONFIG_MMU stub: read-only file mmap always fails with -ENOSYS. */
+int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_MMU */
+
+EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap);
+
+/*
+ * Look up the page at @index in @mapping and return it with a reference.
+ * If it is not cached, allocate a page, insert it into the page cache and
+ * start filling it with filler(@data, page).
+ *
+ * Returns the page, or an ERR_PTR(): -ENOMEM if allocation fails, the
+ * insertion error (other than -EEXIST, which restarts the lookup), or a
+ * negative value returned by @filler.
+ */
+static struct page *__read_cache_page(struct address_space *mapping,
+				pgoff_t index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page;
+	int err;
+
+	for (;;) {
+		page = find_get_page(mapping, index);
+		if (page)
+			return page;
+
+		page = page_cache_alloc_cold(mapping);
+		if (!page)
+			return ERR_PTR(-ENOMEM);
+
+		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+		if (likely(!err))
+			break;
+
+		page_cache_release(page);
+		/* -EEXIST: somebody else added it first, look it up again */
+		if (err != -EEXIST)
+			return ERR_PTR(err); /* Presumably ENOMEM for radix tree node */
+	}
+
+	err = filler(data, page);
+	if (err < 0) {
+		page_cache_release(page);
+		return ERR_PTR(err);
+	}
+	return page;
+}
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @filler:	function to perform the read
+ * @data:	opaque argument passed as the first parameter of @filler
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * Returns the page with a reference held, or an ERR_PTR() carrying the error
+ * from the lookup/allocation step or a negative value returned by @filler.
+ * The returned page is not necessarily uptodate yet.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+				pgoff_t index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page;
+	int err;
+
+	for (;;) {
+		page = __read_cache_page(mapping, index, filler, data);
+		if (IS_ERR(page))
+			return page;
+		if (PageUptodate(page))
+			break;
+
+		lock_page(page);
+		if (page->mapping) {
+			if (PageUptodate(page)) {
+				/* Filled while we waited for the lock */
+				unlock_page(page);
+				break;
+			}
+			err = filler(data, page);
+			if (err < 0) {
+				page_cache_release(page);
+				return ERR_PTR(err);
+			}
+			break;
+		}
+
+		/* Page left the mapping before we got the lock: start over */
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	mark_page_accessed(page);
+	return page;
+}
+EXPORT_SYMBOL(read_cache_page_async);
+
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @filler:	function to perform the read
+ * @data:	opaque argument passed as the first parameter of @filler
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page then wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+				pgoff_t index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page = read_cache_page_async(mapping, index, filler, data);
+
+	if (IS_ERR(page))
+		return page;
+
+	/* Wait for the read started by the async variant to finish */
+	wait_on_page_locked(page);
+	if (PageUptodate(page))
+		return page;
+
+	page_cache_release(page);
+	return ERR_PTR(-EIO);
+}
+EXPORT_SYMBOL(read_cache_page);
+
+/*
+ * Work out which privilege bits must be stripped from @dentry's inode:
+ *
+ *	if suid or (sgid and xgrp)
+ *		remove privs
+ *
+ * Returns a mask of ATTR_KILL_SUID / ATTR_KILL_SGID, or 0 when nothing needs
+ * to be removed or the caller has CAP_FSETID.
+ */
+int should_remove_suid(struct dentry *dentry)
+{
+	const mode_t sgid_exec = S_ISGID | S_IXGRP;
+	mode_t mode = dentry->d_inode->i_mode;
+	int kill = 0;
+
+	/* suid always must be killed */
+	if (unlikely(mode & S_ISUID))
+		kill |= ATTR_KILL_SUID;
+
+	/*
+	 * sgid without group-exec is just a mandatory locking mark and is
+	 * left alone; with group-exec it is a real sgid and gets killed.
+	 */
+	if (unlikely((mode & sgid_exec) == sgid_exec))
+		kill |= ATTR_KILL_SGID;
+
+	if (likely(!kill))
+		return 0;
+
+	return capable(CAP_FSETID) ? 0 : kill;
+}
+EXPORT_SYMBOL(should_remove_suid);
+
+/*
+ * Apply the ATTR_KILL_* bits in @kill to @dentry through notify_change(),
+ * with ATTR_FORCE set.  Returns notify_change()'s result.
+ */
+static int __remove_suid(struct dentry *dentry, int kill)
+{
+	struct iattr attrs;
+
+	attrs.ia_valid = kill | ATTR_FORCE;
+	return notify_change(dentry, &attrs);
+}
+
+/*
+ * Strip privileges from @file's inode: security-module state first (when
+ * security_inode_need_killpriv() asks for it), then the suid/sgid bits
+ * selected by should_remove_suid().  Returns 0 or the first error hit.
+ */
+int file_remove_suid(struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	int suid_bits = should_remove_suid(dentry);
+	int need_killpriv = security_inode_need_killpriv(dentry);
+	int err = 0;
+
+	if (need_killpriv < 0)
+		return need_killpriv;
+
+	if (need_killpriv)
+		err = security_inode_killpriv(dentry);
+
+	if (err || !suid_bits)
+		return err;
+
+	return __remove_suid(dentry, suid_bits);
+}
+EXPORT_SYMBOL(file_remove_suid);
+
+/*
+ * Copy @bytes bytes from the user segments starting at @iov into @vaddr,
+ * beginning @base bytes into the first segment.  Copying stops at the first
+ * segment that could not be copied completely.  Returns the number of bytes
+ * that were copied.
+ */
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t done = 0;
+	size_t uncopied = 0;
+
+	for (; bytes; iov++) {
+		char __user *src = iov->iov_base + base;
+		int chunk = min(bytes, iov->iov_len - base);
+
+		/* Only the first segment is entered part-way through */
+		base = 0;
+
+		uncopied = __copy_from_user_inatomic_nocache(vaddr, src, chunk);
+		done += chunk;
+		bytes -= chunk;
+		vaddr += chunk;
+
+		if (unlikely(uncopied))
+			break;
+	}
+
+	/* The tail of the last chunk may not have made it across */
+	return done - uncopied;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered the copy stops there
+ * and the number of bytes copied up to that point is returned.
+ *
+ * Must be called in atomic context; the page is mapped with kmap_atomic().
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	char *vaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+
+	vaddr = kmap_atomic(page, KM_USER0);
+	if (unlikely(i->nr_segs != 1)) {
+		copied = __iovec_copy_from_user_inatomic(vaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	} else {
+		/* Common case: a single segment, copy straight from it */
+		char __user *src = i->iov->iov_base + i->iov_offset;
+		int left;
+
+		left = __copy_from_user_inatomic_nocache(vaddr + offset,
+							src, bytes);
+		copied = bytes - left;
+	}
+	kunmap_atomic(vaddr, KM_USER0);
+
+	return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
+ * This has the same side effects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	char *vaddr = kmap(page);
+	size_t copied;
+
+	if (unlikely(i->nr_segs != 1)) {
+		copied = __iovec_copy_from_user_inatomic(vaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	} else {
+		/* Common case: a single segment, copy straight from it */
+		char __user *src = i->iov->iov_base + i->iov_offset;
+		int left;
+
+		left = __copy_from_user_nocache(vaddr + offset, src, bytes);
+		copied = bytes - left;
+	}
+	kunmap(page);
+
+	return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user);
+
+/*
+ * Advance iterator @i by @bytes bytes.
+ *
+ * @bytes must not exceed i->count (enforced with BUG_ON).  With a single
+ * segment only iov_offset and count are adjusted.  With several segments the
+ * walk moves i->iov on to the segment holding the new position and leaves the
+ * offset within that segment in i->iov_offset.
+ */
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+		i->count -= bytes;
+	} else {
+		const struct iovec *iov = i->iov;
+		size_t base = i->iov_offset;
+
+		/*
+		 * The !iov->iov_len check ensures we skip over unlikely
+		 * zero-length segments (without overrunning the iovec).
+		 */
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
+			/*
+			 * size_t, not int: both operands of min() are size_t
+			 * and the result feeds size_t arithmetic below, so a
+			 * narrower signed type could truncate a large step.
+			 */
+			size_t copy;
+
+			copy = min(bytes, iov->iov_len - base);
+			BUG_ON(!i->count || i->count < copy);
+			i->count -= copy;
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				/* Segment used up: move to the next one */
+				iov++;
+				base = 0;
+			}
+		}
+		i->iov = iov;
+		i->iov_offset = base;
+	}
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	const struct iovec *seg = i->iov;
+	char __user *start = seg->iov_base + i->iov_offset;
+	size_t room = seg->iov_len - i->iov_offset;
+
+	/* Never reach past the end of the current segment */
+	if (room < bytes)
+		bytes = room;
+
+	return fault_in_pages_readable(start, bytes);
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+/*
+ * Return the count of just the current iov_iter segment: the whole remaining
+ * count for a single-segment iterator, otherwise the remaining count capped
+ * at what is left of the current segment.
+ */
+size_t iov_iter_single_seg_count(struct iov_iter *i)
+{
+	size_t seg_left;
+
+	if (i->nr_segs == 1)
+		return i->count;
+
+	seg_left = i->iov->iov_len - i->iov_offset;
+	return i->count < seg_left ? i->count : seg_left;
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
+
+/*
+ * Performs necessary checks before doing a write
+ * @file: file being written
+ * @pos: write position (in/out); set to i_size for O_APPEND on non-block files
+ * @count: number of bytes to write (in/out); may be reduced to fit a limit
+ * @isblk: non-zero when the target is a block device
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ *
+ * Errors produced here: -EINVAL for a negative position, -EFBIG when the
+ * position is at or beyond RLIMIT_FSIZE (SIGXFSZ is sent as well), the
+ * non-LFS limit or s_maxbytes, -EPERM for a read-only block device (or any
+ * block device without CONFIG_BLOCK), and -ENOSPC when writing at or past the
+ * end of a block device.
+ */
+inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+
+ if (unlikely(*pos < 0))
+ return -EINVAL;
+
+ if (!isblk) {
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (file->f_flags & O_APPEND)
+ *pos = i_size_read(inode);
+
+ if (limit != RLIM_INFINITY) {
+ if (*pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ /* Trim the write so that it ends at the resource limit */
+ if (*count > limit - (typeof(limit))*pos) {
+ *count = limit - (typeof(limit))*pos;
+ }
+ }
+ }
+
+ /*
+ * LFS rule
+ */
+ if (unlikely(*pos + *count > MAX_NON_LFS &&
+ !(file->f_flags & O_LARGEFILE))) {
+ if (*pos >= MAX_NON_LFS) {
+ return -EFBIG;
+ }
+ if (*count > MAX_NON_LFS - (unsigned long)*pos) {
+ *count = MAX_NON_LFS - (unsigned long)*pos;
+ }
+ }
+
+ /*
+ * Are we about to exceed the fs block limit ?
+ *
+ * If we have written data it becomes a short write. If we have
+ * exceeded without writing data we send a signal and return EFBIG.
+ * Linus frestrict idea will clean these up nicely..
+ */
+ if (likely(!isblk)) {
+ if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
+ if (*count || *pos > inode->i_sb->s_maxbytes) {
+ return -EFBIG;
+ }
+ /* zero-length writes at ->s_maxbytes are OK */
+ }
+
+ if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
+ *count = inode->i_sb->s_maxbytes - *pos;
+ } else {
+#ifdef CONFIG_BLOCK
+ loff_t isize;
+ if (bdev_read_only(I_BDEV(inode)))
+ return -EPERM;
+ isize = i_size_read(inode);
+ /* Only a zero-length write exactly at the device end is allowed */
+ if (*pos >= isize) {
+ if (*count || *pos > isize)
+ return -ENOSPC;
+ }
+
+ if (*pos + *count > isize)
+ *count = isize - *pos;
+#else
+ return -EPERM;
+#endif
+ }
+ return 0;
+}
+EXPORT_SYMBOL(generic_write_checks);
+
+/*
+ * Thin dispatcher: forwards all arguments unchanged to the mapping's
+ * ->write_begin() operation and returns its result.
+ */
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	return mapping->a_ops->write_begin(file, mapping, pos, len, flags,
+					   pagep, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+/*
+ * Marks @page accessed, then forwards all arguments to the mapping's
+ * ->write_end() operation and returns its result.
+ */
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	mark_page_accessed(page);
+
+	return mapping->a_ops->write_end(file, mapping, pos, len, copied,
+					 page, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
+/*
+ * Perform an O_DIRECT write of the iovec at @pos via ->direct_IO().
+ *
+ * If @count differs from @ocount the iovec is shortened to @count bytes
+ * first (*nr_segs is updated).  Dirty page-cache pages in the range are
+ * written back and cached pages invalidated before the write, and
+ * invalidation is attempted again afterwards.  On a positive result *ppos is
+ * advanced and i_size extended if the write went past it (not for block
+ * devices).
+ *
+ * Returns the result of ->direct_IO(), an error from the preparatory steps,
+ * or 0 when a cached page could not be invalidated (-EBUSY), which tells the
+ * caller to fall back to a buffered write.
+ */
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, size_t ocount)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t written;
+ size_t write_len;
+ pgoff_t end;
+
+ if (count != ocount)
+ *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+ write_len = iov_length(iov, *nr_segs);
+ /* Index of the last page touched by the write */
+ end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+
+ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
+ if (written)
+ goto out;
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ if (mapping->nrpages) {
+ written = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ goto out;
+ }
+ }
+
+ written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+
+ /*
+ * Finally, try again to invalidate clean pages which might have been
+ * cached by non-direct readahead, or faulted in by get_user_pages()
+ * if the source of the write was an mmap'ed region of the file
+ * we're writing. Either one is a pretty crazy thing to do,
+ * so we don't support it 100%. If this invalidation
+ * fails, tough, the write still worked...
+ */
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ }
+
+ if (written > 0) {
+ /* Byte offset just past the write; shadows the page index 'end' above */
+ loff_t end = pos + written;
+ if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, end);
+ mark_inode_dirty(inode);
+ }
+ *ppos = end;
+ }
+
+ /*
+ * Sync the fs metadata but not the minor inode changes and
+ * of course not the data as we did direct DMA for the IO.
+ * i_mutex is held, which protects generic_osync_inode() from
+ * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
+ */
+out:
+ if ((written >= 0 || written == -EIOCBQUEUED) &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ if (err < 0)
+ written = err;
+ }
+ return written;
+}
+EXPORT_SYMBOL(generic_file_direct_write);
+
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ *
+ * With AOP_FLAG_NOFS in @flags, __GFP_FS is masked out of the allocations
+ * made here.  Returns NULL if the page cannot be allocated or inserted.
+ */
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+					pgoff_t index, unsigned flags)
+{
+	gfp_t gfp_notmask = (flags & AOP_FLAG_NOFS) ? __GFP_FS : 0;
+	struct page *page;
+	int status;
+
+	do {
+		page = find_lock_page(mapping, index);
+		if (likely(page))
+			return page;
+
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+					  ~gfp_notmask);
+		if (!page)
+			return NULL;
+
+		status = add_to_page_cache_lru(page, mapping, index,
+					       GFP_KERNEL & ~gfp_notmask);
+		if (likely(!status))
+			return page;
+
+		page_cache_release(page);
+		/* -EEXIST: somebody else added the page first, look again */
+	} while (status == -EEXIST);
+
+	return NULL;
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
+
+/*
+ * Core buffered-write loop: copies the data described by iterator @i into
+ * the page cache of @file starting at @pos, one page-sized piece at a time,
+ * bracketing each copy with ->write_begin() / ->write_end().
+ *
+ * Returns the number of bytes written, or, if nothing was written, the last
+ * status (0 or a negative error such as -EFAULT).
+ */
+static ssize_t generic_perform_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = 0;
+
+ /*
+ * Copies from kernel address space cannot fail (NFSD is a big user).
+ */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ pgoff_t index; /* Pagecache index for current page */
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE - 1));
+ index = pos >> PAGE_CACHE_SHIFT;
+ /* At most up to the end of this page, and no more than is left */
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(i));
+
+again:
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ &page, &fsdata);
+ if (unlikely(status))
+ break;
+
+ /* The copy runs with page faults disabled, so it may come up short */
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+ flush_dcache_page(page);
+
+ status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ page, fsdata);
+ if (unlikely(status < 0))
+ break;
+ /* From here on, trust the byte count ->write_end() reported */
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ } while (iov_iter_count(i));
+
+ return written ? written : status;
+}
+
+/*
+ * Write @count bytes from @iov through the page cache at @pos.
+ *
+ * @written is the byte count the caller has already completed; it is handed
+ * to iov_iter_init() and included in the returned total.  On success *ppos
+ * is set just past the data written by this call.  O_SYNC files and
+ * IS_SYNC() inodes are synced afterwards unless the mapping has ->writepage
+ * and @iocb is a synchronous kiocb.
+ *
+ * Returns the running total of bytes written, or the last status when that
+ * total is zero.
+ */
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	struct iov_iter iter;
+	ssize_t status;
+
+	iov_iter_init(&iter, iov, nr_segs, count, written);
+	status = generic_perform_write(file, &iter, pos);
+
+	if (likely(status >= 0)) {
+		int want_sync = (file->f_flags & O_SYNC) || IS_SYNC(inode);
+
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
+		if (unlikely(want_sync) &&
+		    (!a_ops->writepage || !is_sync_kiocb(iocb)))
+			status = generic_osync_inode(inode, mapping,
+					OSYNC_METADATA|OSYNC_DATA);
+	}
+
+	/*
+	 * If we get here for O_DIRECT writes then we must have fallen through
+	 * to buffered writes (block instantiation inside i_size). So we sync
+	 * the file data here, to try to honour O_DIRECT expectations.
+	 */
+	if (written && unlikely(file->f_flags & O_DIRECT))
+		status = filemap_write_and_wait_range(mapping,
+						pos, pos + written - 1);
+
+	if (written)
+		return written;
+	return status;
+}
+EXPORT_SYMBOL(generic_file_buffered_write);
+
+/*
+ * Common write path behind generic_file_aio_write() and
+ * generic_file_aio_write_nolock().
+ *
+ * Validates the iovec, applies generic_write_checks(), strips suid/sgid
+ * privileges, updates the file times and then writes either directly
+ * (O_DIRECT, falling back to a buffered write for whatever the direct write
+ * did not cover) or through the page cache.  *ppos is advanced by the
+ * helpers that do the writing.
+ *
+ * Returns the number of bytes written, or, if that is zero, the error (or 0).
+ */
+static ssize_t
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
+ struct inode *inode = mapping->host;
+ loff_t pos;
+ ssize_t written;
+ ssize_t err;
+
+ ocount = 0;
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ return err;
+
+ count = ocount;
+ pos = *ppos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ file_update_time(file);
+
+ /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ loff_t endbyte;
+ ssize_t written_buffered;
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+ ppos, count, ocount);
+ if (written < 0 || written == count)
+ goto out;
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+ written_buffered = generic_file_buffered_write(iocb, iov,
+ nr_segs, pos, ppos, count,
+ written);
+ /*
+ * If generic_file_buffered_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ /* written_buffered includes the direct-written bytes, hence "- written" */
+ endbyte = pos + written_buffered - written - 1;
+ err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
+ SYNC_FILE_RANGE_WAIT_BEFORE|
+ SYNC_FILE_RANGE_WRITE|
+ SYNC_FILE_RANGE_WAIT_AFTER);
+ if (err == 0) {
+ written = written_buffered;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_file_buffered_write(iocb, iov, nr_segs,
+ pos, ppos, count, written);
+ }
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+
+/*
+ * Write entry point that does not take i_mutex itself.
+ *
+ * @pos must equal iocb->ki_pos (enforced with BUG_ON).  After a successful
+ * write to an O_SYNC file or IS_SYNC() inode the written range is synced
+ * with sync_page_range_nolock(), and a sync error replaces the byte count.
+ */
+ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
+		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret, err;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
+					      &iocb->ki_pos);
+	if (ret <= 0)
+		return ret;
+
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
+		return ret;
+
+	err = sync_page_range_nolock(inode, mapping, pos, ret);
+	return err < 0 ? err : ret;
+}
+EXPORT_SYMBOL(generic_file_aio_write_nolock);
+
+/*
+ * Write entry point that takes inode->i_mutex around the actual write.
+ *
+ * @pos must equal iocb->ki_pos (enforced with BUG_ON).  After a successful
+ * write to an O_SYNC file or IS_SYNC() inode the written range is synced
+ * with sync_page_range() once the mutex has been dropped, and a sync error
+ * replaces the byte count.
+ */
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret, err;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
+					      &iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret <= 0)
+		return ret;
+
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
+		return ret;
+
+	err = sync_page_range(inode, mapping, pos, ret);
+	return err < 0 ? err : ret;
+}
+EXPORT_SYMBOL(generic_file_aio_write);
+
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private). If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ *
+ * The page must be locked.  A page under writeback is never released.
+ */
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
+{
+	struct address_space * const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+
+	if (PageWriteback(page))
+		return 0;
+
+	/* No mapping, or no ->releasepage: fall back to the buffer code */
+	if (!mapping || !mapping->a_ops->releasepage)
+		return try_to_free_buffers(page);
+
+	return mapping->a_ops->releasepage(page, gfp_mask);
+}
+
+EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 0000000..b5167df
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,474 @@
+/*
+ * linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+/*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
+static struct page *__xip_sparse_page;
+
+/* called under xip_sparse_mutex */
+static struct page *xip_sparse_page(void)
+{
+ if (!__xip_sparse_page) {
+ struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+
+ if (page)
+ __xip_sparse_page = page;
+ }
+ return __xip_sparse_page;
+}
+
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_mem() function for the actual low-level
+ * stuff.
+ *
+ * Note the struct file* is only used for file_accessed(). It may be NULL.
+ */
+static ssize_t
+do_xip_mapping_read(struct address_space *mapping,
+ struct file_ra_state *_ra,
+ struct file *filp,
+ char __user *buf,
+ size_t len,
+ loff_t *ppos)
+{
+ struct inode *inode = mapping->host;
+ pgoff_t index, end_index;
+ unsigned long offset;
+ loff_t isize, pos;
+ size_t copied = 0, error = 0;
+
+ BUG_ON(!mapping->a_ops->get_xip_mem);
+
+ pos = *ppos;
+ index = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_CACHE_MASK;
+
+ isize = i_size_read(inode);
+ if (!isize)
+ goto out;
+
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ do {
+ unsigned long nr, left;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int zero = 0;
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index >= end_index) {
+ if (index > end_index)
+ goto out;
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ goto out;
+ }
+ }
+ nr = nr - offset;
+ if (nr > len)
+ nr = len;
+
+ error = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(error)) {
+ if (error == -ENODATA) {
+ /* sparse */
+ zero = 1;
+ } else
+ goto out;
+ }
+
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ /* address based flush */ ;
+
+ /*
+ * Ok, we have the mem, so now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ if (!zero)
+ left = __copy_to_user(buf+copied, xip_mem+offset, nr);
+ else
+ left = __clear_user(buf + copied, nr);
+
+ if (left) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ copied += (nr - left);
+ offset += (nr - left);
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+ } while (copied < len);
+
+out:
+ *ppos = pos + copied;
+ if (filp)
+ file_accessed(filp);
+
+ return (copied ? copied : error);
+}
+
+ssize_t
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
+
+ return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+ buf, len, ppos);
+}
+EXPORT_SYMBOL_GPL(xip_file_read);
+
+/*
+ * __xip_unmap is invoked from xip_file_fault and
+ * __xip_file_write
+ *
+ * This function walks all vmas of the address_space and unmaps the
+ * __xip_sparse_page when found at pgoff.
+ */
+static void
+__xip_unmap (struct address_space * mapping,
+ unsigned long pgoff)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct prio_tree_iter iter;
+ unsigned long address;
+ pte_t *pte;
+ pte_t pteval;
+ spinlock_t *ptl;
+ struct page *page;
+ unsigned count;
+ int locked = 0;
+
+ count = read_seqcount_begin(&xip_sparse_seq);
+
+ page = __xip_sparse_page;
+ if (!page)
+ return;
+
+retry:
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ mm = vma->vm_mm;
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ pte = page_check_address(page, mm, address, &ptl, 1);
+ if (pte) {
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ pteval = ptep_clear_flush_notify(vma, address, pte);
+ page_remove_rmap(page, vma);
+ dec_mm_counter(mm, file_rss);
+ BUG_ON(pte_dirty(pteval));
+ pte_unmap_unlock(pte, ptl);
+ page_cache_release(page);
+ }
+ }
+ spin_unlock(&mapping->i_mmap_lock);
+
+ if (locked) {
+ mutex_unlock(&xip_sparse_mutex);
+ } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+ mutex_lock(&xip_sparse_mutex);
+ locked = 1;
+ goto retry;
+ }
+}
+
+/*
+ * xip_file_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_fault, but used for execute in place
+ */
+static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t size;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ struct page *page;
+ int error;
+
+ /* XXX: are VM_FAULT_ codes OK? */
+again:
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
+
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (likely(!error))
+ goto found;
+ if (error != -ENODATA)
+ return VM_FAULT_OOM;
+
+ /* sparse block */
+ if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+ (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
+ (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+ int err;
+
+ /* maybe shared writable, allocate new block */
+ mutex_lock(&xip_sparse_mutex);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
+ &xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
+ if (error)
+ return VM_FAULT_SIGBUS;
+ /* unmap sparse mappings at pgoff from all other vmas */
+ __xip_unmap(mapping, vmf->pgoff);
+
+found:
+ err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+ xip_pfn);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ BUG_ON(err);
+ return VM_FAULT_NOPAGE;
+ } else {
+ int err, ret = VM_FAULT_OOM;
+
+ mutex_lock(&xip_sparse_mutex);
+ write_seqcount_begin(&xip_sparse_seq);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(!error)) {
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+ goto again;
+ }
+ if (error != -ENODATA)
+ goto out;
+ /* not shared and writable, use xip_sparse_page() */
+ page = xip_sparse_page();
+ if (!page)
+ goto out;
+ err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+ page);
+ if (err == -ENOMEM)
+ goto out;
+
+ ret = VM_FAULT_NOPAGE;
+out:
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+
+ return ret;
+ }
+}
+
+static struct vm_operations_struct xip_file_vm_ops = {
+ .fault = xip_file_fault,
+};
+
+int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
+
+ file_accessed(file);
+ vma->vm_ops = &xip_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+
+static ssize_t
+__xip_file_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t pos, loff_t *ppos)
+{
+ struct address_space * mapping = filp->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status = 0;
+ size_t bytes;
+ ssize_t written = 0;
+
+ BUG_ON(!mapping->a_ops->get_xip_mem);
+
+ do {
+ unsigned long index;
+ unsigned long offset;
+ size_t copied;
+ void *xip_mem;
+ unsigned long xip_pfn;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ index = pos >> PAGE_CACHE_SHIFT;
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (status == -ENODATA) {
+ /* we allocate a new block, then unmap the sparse page mapped at index */
+ mutex_lock(&xip_sparse_mutex);
+ status = a_ops->get_xip_mem(mapping, index, 1,
+ &xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
+ if (!status)
+ /* unmap page at pgoff from all other vmas */
+ __xip_unmap(mapping, index);
+ }
+
+ if (status)
+ break;
+
+ copied = bytes -
+ __copy_from_user_nocache(xip_mem + offset, buf, bytes);
+
+ if (likely(copied > 0)) {
+ status = copied;
+
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ }
+ }
+ if (unlikely(copied != bytes))
+ if (status >= 0)
+ status = -EFAULT;
+ if (status < 0)
+ break;
+ } while (count);
+ *ppos = pos;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ */
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+
+ return written ? written : status;
+}
+
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ size_t count;
+ loff_t pos;
+ ssize_t ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (!access_ok(VERIFY_READ, buf, len)) {
+ ret=-EFAULT;
+ goto out_up;
+ }
+
+ pos = *ppos;
+ count = len;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_backing;
+ if (count == 0)
+ goto out_backing;
+
+ ret = file_remove_suid(filp);
+ if (ret)
+ goto out_backing;
+
+ file_update_time(filp);
+
+ ret = __xip_file_write (filp, buf, count, pos, ppos);
+
+ out_backing:
+ current->backing_dev_info = NULL;
+ out_up:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_write);
+
+/*
+ * truncate a page used for execute in place
+ * functionality is analogous to block_truncate_page but uses get_xip_mem
+ * to get the page instead of page cache
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize;
+ unsigned length;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int err;
+
+ BUG_ON(!mapping->a_ops->get_xip_mem);
+
+ blocksize = 1 << mapping->host->i_blkbits;
+ length = offset & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!length)
+ return 0;
+
+ length = blocksize - length;
+
+ err = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(err)) {
+ if (err == -ENODATA)
+ /* Hole? No need to truncate */
+ return 0;
+ else
+ return err;
+ }
+ memset(xip_mem + offset, 0, length);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
new file mode 100644
index 0000000..b602c27
--- /dev/null
+++ b/mm/fremap.c
@@ -0,0 +1,260 @@
+/*
+ * linux/mm/fremap.c
+ *
+ * Explicit pagetable population and nonlinear (random) mappings support.
+ *
+ * started by Ingo Molnar, Copyright (C) 2002, 2003
+ */
+#include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/rmap.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = *ptep;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ flush_cache_page(vma, addr, pte_pfn(pte));
+ pte = ptep_clear_flush(vma, addr, ptep);
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ page_remove_rmap(page, vma);
+ page_cache_release(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, file_rss);
+ }
+ } else {
+ if (!pte_file(pte))
+ free_swap_and_cache(pte_to_swp_entry(pte));
+ pte_clear_not_present_full(mm, addr, ptep, 0);
+ }
+}
+
+/*
+ * Install a file pte to a given virtual memory address, release any
+ * previously existing mapping.
+ */
+static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pgoff, pgprot_t prot)
+{
+ int err = -ENOMEM;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+ goto out;
+
+ if (!pte_none(*pte))
+ zap_pte(mm, vma, addr, pte);
+
+ set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+ /*
+ * We don't need to run update_mmu_cache() here because the "file pte"
+ * being installed by install_file_pte() is not a real pte - it's a
+ * non-present entry (like a swap entry), noting what file offset should
+ * be mapped there when there's a fault (in a non-linear vma where
+ * that's not obvious).
+ */
+ pte_unmap_unlock(pte, ptl);
+ err = 0;
+out:
+ return err;
+}
+
+static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long size, pgoff_t pgoff)
+{
+ int err;
+
+ do {
+ err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
+ if (err)
+ return err;
+
+ size -= PAGE_SIZE;
+ addr += PAGE_SIZE;
+ pgoff++;
+ } while (size);
+
+ return 0;
+
+}
+
+/**
+ * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
+ * @start: start of the remapped virtual memory range
+ * @size: size of the remapped virtual memory range
+ * @prot: new protection bits of the range (see NOTE)
+ * @pgoff: to-be-mapped page of the backing store file
+ * @flags: 0 or MAP_NONBLOCK - the latter will cause no IO.
+ *
+ * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
+ * (shared backing store file).
+ *
+ * This syscall works purely via pagetables, so it's the most efficient
+ * way to map the same (large) file into a given virtual window. Unlike
+ * mmap()/mremap() it does not create any new vmas. The new mappings are
+ * also safe across swapout.
+ *
+ * NOTE: the @prot parameter right now is ignored (but must be zero),
+ * and the vma's default protection is used. Arbitrary protections
+ * might be implemented in the future.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct address_space *mapping;
+ unsigned long end = start + size;
+ struct vm_area_struct *vma;
+ int err = -EINVAL;
+ int has_write_lock = 0;
+
+ if (prot)
+ return err;
+ /*
+ * Sanitize the syscall parameters:
+ */
+ start = start & PAGE_MASK;
+ size = size & PAGE_MASK;
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ if (start + size <= start)
+ return err;
+
+ /* Can we represent this offset inside this architecture's pte's? */
+#if PTE_FILE_MAX_BITS < BITS_PER_LONG
+ if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
+ return err;
+#endif
+
+ /* Start with down_read(); down_write() is taken below if vma->vm_flags must change. */
+ down_read(&mm->mmap_sem);
+ retry:
+ vma = find_vma(mm, start);
+
+ /*
+ * Make sure the vma is shared, that it supports prefaulting,
+ * and that the remapped range is valid and fully within
+ * the single existing vma. vm_private_data is used as a
+ * swapout cursor in a VM_NONLINEAR vma.
+ */
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
+
+ if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
+ goto out;
+
+ if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+ goto out;
+
+ if (end <= start || start < vma->vm_start || end > vma->vm_end)
+ goto out;
+
+ /* Must set VM_NONLINEAR before any pages are populated. */
+ if (!(vma->vm_flags & VM_NONLINEAR)) {
+ /* Don't need a nonlinear mapping, exit success */
+ if (pgoff == linear_page_index(vma, start)) {
+ err = 0;
+ goto out;
+ }
+
+ if (!has_write_lock) {
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ has_write_lock = 1;
+ goto retry;
+ }
+ mapping = vma->vm_file->f_mapping;
+ /*
+ * page_mkclean doesn't work on nonlinear vmas, so if
+ * dirty pages need to be accounted, emulate with linear
+ * vmas.
+ */
+ if (mapping_cap_account_dirty(mapping)) {
+ unsigned long addr;
+ struct file *file = vma->vm_file;
+
+ flags &= MAP_NONBLOCK;
+ get_file(file);
+ addr = mmap_region(file, start, size,
+ flags, vma->vm_flags, pgoff, 1);
+ fput(file);
+ if (IS_ERR_VALUE(addr)) {
+ err = addr;
+ } else {
+ BUG_ON(addr != start);
+ err = 0;
+ }
+ goto out;
+ }
+ spin_lock(&mapping->i_mmap_lock);
+ flush_dcache_mmap_lock(mapping);
+ vma->vm_flags |= VM_NONLINEAR;
+ vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+ flush_dcache_mmap_unlock(mapping);
+ spin_unlock(&mapping->i_mmap_lock);
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ /*
+ * drop PG_Mlocked flag for over-mapped range
+ */
+ unsigned int saved_flags = vma->vm_flags;
+ munlock_vma_pages_range(vma, start, start + size);
+ vma->vm_flags = saved_flags;
+ }
+
+ mmu_notifier_invalidate_range_start(mm, start, start + size);
+ err = populate_range(mm, vma, start, size, pgoff);
+ mmu_notifier_invalidate_range_end(mm, start, start + size);
+ if (!err && !(flags & MAP_NONBLOCK)) {
+ if (vma->vm_flags & VM_LOCKED) {
+ /*
+ * might be mapping previously unmapped range of file
+ */
+ mlock_vma_pages_range(vma, start, start + size);
+ } else {
+ if (unlikely(has_write_lock)) {
+ downgrade_write(&mm->mmap_sem);
+ has_write_lock = 0;
+ }
+ make_pages_present(start, start+size);
+ }
+ }
+
+ /*
+ * We can't clear VM_NONLINEAR because we'd have to do
+ * it after ->populate completes, and that would prevent
+ * downgrading the lock. (Locks can't be upgraded).
+ */
+
+out:
+ if (likely(!has_write_lock))
+ up_read(&mm->mmap_sem);
+ else
+ up_write(&mm->mmap_sem);
+
+ return err;
+}
diff --git a/mm/highmem.c b/mm/highmem.c
new file mode 100644
index 0000000..b36b83b
--- /dev/null
+++ b/mm/highmem.c
@@ -0,0 +1,375 @@
+/*
+ * High memory handling common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * 64-bit physical space. With current x86 CPUs this
+ * means up to 64 Gigabytes physical RAM.
+ *
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/blktrace_api.h>
+#include <asm/tlbflush.h>
+
+/*
+ * pkmap_count is not a pure "count".
+ * 0 means that it is not mapped, and has not been mapped
+ * since a TLB flush - it is usable.
+ * 1 means that there are no users, but it has been mapped
+ * since the last TLB flush - so we can't use it.
+ * n means that there are (n-1) current users of it.
+ */
+#ifdef CONFIG_HIGHMEM
+
+unsigned long totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(totalhigh_pages);
+
+unsigned int nr_free_highpages (void)
+{
+ pg_data_t *pgdat;
+ unsigned int pages = 0;
+
+ for_each_online_pgdat(pgdat) {
+ pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+ NR_FREE_PAGES);
+ if (zone_movable_is_highmem())
+ pages += zone_page_state(
+ &pgdat->node_zones[ZONE_MOVABLE],
+ NR_FREE_PAGES);
+ }
+
+ return pages;
+}
+
+static int pkmap_count[LAST_PKMAP];
+static unsigned int last_pkmap_nr;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+
+pte_t * pkmap_page_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+static void flush_all_zero_pkmaps(void)
+{
+ int i;
+ int need_flush = 0;
+
+ flush_cache_kmaps();
+
+ for (i = 0; i < LAST_PKMAP; i++) {
+ struct page *page;
+
+ /*
+ * zero means we don't have anything to do,
+ * >1 means that it is still in use. Only
+ * a count of 1 means that it is free but
+ * needs to be unmapped
+ */
+ if (pkmap_count[i] != 1)
+ continue;
+ pkmap_count[i] = 0;
+
+ /* sanity check */
+ BUG_ON(pte_none(pkmap_page_table[i]));
+
+ /*
+ * Don't need an atomic fetch-and-clear op here;
+ * no-one has the page mapped, and cannot get at
+ * its virtual address (and hence PTE) without first
+ * getting the kmap_lock (which is held here).
+ * So no dangers, even with speculative execution.
+ */
+ page = pte_page(pkmap_page_table[i]);
+ pte_clear(&init_mm, (unsigned long)page_address(page),
+ &pkmap_page_table[i]);
+
+ set_page_address(page, NULL);
+ need_flush = 1;
+ }
+ if (need_flush)
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+}
+
+/**
+ * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+ */
+void kmap_flush_unused(void)
+{
+ spin_lock(&kmap_lock);
+ flush_all_zero_pkmaps();
+ spin_unlock(&kmap_lock);
+}
+
+static inline unsigned long map_new_virtual(struct page *page)
+{
+ unsigned long vaddr;
+ int count;
+
+start:
+ count = LAST_PKMAP;
+ /* Find an empty entry */
+ for (;;) {
+ last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+ if (!last_pkmap_nr) {
+ flush_all_zero_pkmaps();
+ count = LAST_PKMAP;
+ }
+ if (!pkmap_count[last_pkmap_nr])
+ break; /* Found a usable entry */
+ if (--count)
+ continue;
+
+ /*
+ * Sleep for somebody else to unmap their entries
+ */
+ {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&pkmap_map_wait, &wait);
+ spin_unlock(&kmap_lock);
+ schedule();
+ remove_wait_queue(&pkmap_map_wait, &wait);
+ spin_lock(&kmap_lock);
+
+ /* Somebody else might have mapped it while we slept */
+ if (page_address(page))
+ return (unsigned long)page_address(page);
+
+ /* Re-start */
+ goto start;
+ }
+ }
+ vaddr = PKMAP_ADDR(last_pkmap_nr);
+ set_pte_at(&init_mm, vaddr,
+ &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+
+ pkmap_count[last_pkmap_nr] = 1;
+ set_page_address(page, (void *)vaddr);
+
+ return vaddr;
+}
+
+/**
+ * kmap_high - map a highmem page into memory
+ * @page: &struct page to map
+ *
+ * Returns the page's virtual memory address.
+ *
+ * We cannot call this from interrupts, as it may block.
+ */
+void *kmap_high(struct page *page)
+{
+ unsigned long vaddr;
+
+ /*
+ * For highmem pages, we can't trust "virtual" until
+ * after we have the lock.
+ */
+ spin_lock(&kmap_lock);
+ vaddr = (unsigned long)page_address(page);
+ if (!vaddr)
+ vaddr = map_new_virtual(page);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
+ spin_unlock(&kmap_lock);
+ return (void*) vaddr;
+}
+
+EXPORT_SYMBOL(kmap_high);
+
+/**
+ * kunmap_high - unmap a highmem page from memory
+ * @page: &struct page to unmap
+ */
+void kunmap_high(struct page *page)
+{
+ unsigned long vaddr;
+ unsigned long nr;
+ int need_wakeup;
+
+ spin_lock(&kmap_lock);
+ vaddr = (unsigned long)page_address(page);
+ BUG_ON(!vaddr);
+ nr = PKMAP_NR(vaddr);
+
+ /*
+ * A count must never go down to zero
+ * without a TLB flush!
+ */
+ need_wakeup = 0;
+ switch (--pkmap_count[nr]) {
+ case 0:
+ BUG();
+ case 1:
+ /*
+ * Avoid an unnecessary wake_up() function call.
+ * The common case is pkmap_count[] == 1, but
+ * no waiters.
+ * The tasks queued in the wait-queue are guarded
+ * by both the lock in the wait-queue-head and by
+ * the kmap_lock. As the kmap_lock is held here,
+ * no need for the wait-queue-head's lock. Simply
+ * test if the queue is empty.
+ */
+ need_wakeup = waitqueue_active(&pkmap_map_wait);
+ }
+ spin_unlock(&kmap_lock);
+
+ /* do wake-up, if needed, race-free outside of the spin lock */
+ if (need_wakeup)
+ wake_up(&pkmap_map_wait);
+}
+
+EXPORT_SYMBOL(kunmap_high);
+#endif
+
+#if defined(HASHED_PAGE_VIRTUAL)
+
+#define PA_HASH_ORDER 7
+
+/*
+ * Describes one page->virtual association
+ */
+struct page_address_map {
+ struct page *page;
+ void *virtual;
+ struct list_head list;
+};
+
+/*
+ * page_address_map freelist, allocated from page_address_maps.
+ */
+static struct list_head page_address_pool; /* freelist */
+static spinlock_t pool_lock; /* protects page_address_pool */
+
+/*
+ * Hash table bucket
+ */
+static struct page_address_slot {
+ struct list_head lh; /* List of page_address_maps */
+ spinlock_t lock; /* Protect this bucket's list */
+} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
+
+static struct page_address_slot *page_slot(struct page *page)
+{
+ return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
+}
+
+/**
+ * page_address - get the mapped virtual address of a page
+ * @page: &struct page to get the virtual address of
+ *
+ * Returns the page's virtual address.
+ */
+void *page_address(struct page *page)
+{
+ unsigned long flags;
+ void *ret;
+ struct page_address_slot *pas;
+
+ if (!PageHighMem(page))
+ return lowmem_page_address(page);
+
+ pas = page_slot(page);
+ ret = NULL;
+ spin_lock_irqsave(&pas->lock, flags);
+ if (!list_empty(&pas->lh)) {
+ struct page_address_map *pam;
+
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ ret = pam->virtual;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock_irqrestore(&pas->lock, flags);
+ return ret;
+}
+
+EXPORT_SYMBOL(page_address);
+
+/**
+ * set_page_address - set a page's virtual address
+ * @page: &struct page to set
+ * @virtual: virtual address to use
+ */
+void set_page_address(struct page *page, void *virtual)
+{
+ unsigned long flags;
+ struct page_address_slot *pas;
+ struct page_address_map *pam;
+
+ BUG_ON(!PageHighMem(page));
+
+ pas = page_slot(page);
+ if (virtual) { /* Add */
+ BUG_ON(list_empty(&page_address_pool));
+
+ spin_lock_irqsave(&pool_lock, flags);
+ pam = list_entry(page_address_pool.next,
+ struct page_address_map, list);
+ list_del(&pam->list);
+ spin_unlock_irqrestore(&pool_lock, flags);
+
+ pam->page = page;
+ pam->virtual = virtual;
+
+ spin_lock_irqsave(&pas->lock, flags);
+ list_add_tail(&pam->list, &pas->lh);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ } else { /* Remove */
+ spin_lock_irqsave(&pas->lock, flags);
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ list_del(&pam->list);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ spin_lock_irqsave(&pool_lock, flags);
+ list_add_tail(&pam->list, &page_address_pool);
+ spin_unlock_irqrestore(&pool_lock, flags);
+ goto done;
+ }
+ }
+ spin_unlock_irqrestore(&pas->lock, flags);
+ }
+done:
+ return;
+}
+
+static struct page_address_map page_address_maps[LAST_PKMAP];
+
+void __init page_address_init(void)
+{
+ int i;
+
+ INIT_LIST_HEAD(&page_address_pool);
+ for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
+ list_add(&page_address_maps[i].list, &page_address_pool);
+ for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
+ INIT_LIST_HEAD(&page_address_htable[i].lh);
+ spin_lock_init(&page_address_htable[i].lock);
+ }
+ spin_lock_init(&pool_lock);
+}
+
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 0000000..6058b53
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,2293 @@
+/*
+ * Generic hugetlb support.
+ * (C) William Irwin, April 2004
+ */
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
+#include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/bootmem.h>
+#include <linux/sysfs.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+
+#include <linux/hugetlb.h>
+#include "internal.h"
+
+ /* Constant bounds; NOTE(review): presumably used as sysctl limits elsewhere -- confirm. */
+ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+ /* GFP mask passed to the buddy allocator whenever this file allocates a huge page. */
+ static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
+ unsigned long hugepages_treat_as_movable;
+
+ /* Number of hstates[] entries in use; upper bound of for_each_hstate(). */
+ static int max_hstate;
+ unsigned int default_hstate_idx;
+ struct hstate hstates[HUGE_MAX_HSTATE];
+
+ /* Huge pages taken from bootmem, queued until gather_bootmem_prealloc() runs. */
+ __initdata LIST_HEAD(huge_boot_pages);
+
+ /* for command line parsing */
+ static struct hstate * __initdata parsed_hstate;
+ static unsigned long __initdata default_hstate_max_huge_pages;
+ static unsigned long __initdata default_hstate_size;
+
+ /* Walk every registered hstate: hstates[0] .. hstates[max_hstate - 1]. */
+ #define for_each_hstate(h) \
+ for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
+
+ /*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
+ static DEFINE_SPINLOCK(hugetlb_lock);
+
+/*
+ * Region tracking -- allows tracking of reservations and instantiated pages
+ * across the pages in a mapping.
+ *
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * must either hold the mmap_sem for write, or the mmap_sem for read and
+ * the hugetlb_instantiation mutex:
+ *
+ * down_write(&mm->mmap_sem);
+ * or
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
+ */
+struct file_region {
+ struct list_head link;
+ long from;
+ long to;
+};
+
+ /*
+ * Record the range [f, t) in the region list @head, merging it with every
+ * existing region it touches. No memory is allocated here: the first region
+ * found at or after @f is reused and widened, and any further regions
+ * swallowed by the merge are freed. Always returns 0.
+ *
+ * NOTE(review): the code dereferences the located entry unconditionally, so
+ * it appears to rely on a suitable region already being present (presumably
+ * placed there by an earlier region_chg() call) -- confirm against callers.
+ */
+ static long region_add(struct list_head *head, long f, long t)
+ {
+ struct file_region *rg, *nrg, *trg;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg;
+ /* Using rg->link.prev as the list head makes the walk start at rg itself. */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* If this area reaches higher then extend our area to
+ * include it completely. If this is not the first area
+ * which we intend to reuse, free it. */
+ if (rg->to > t)
+ t = rg->to;
+ if (rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+ /* nrg now stands for the whole merged range. */
+ nrg->from = f;
+ nrg->to = t;
+ return 0;
+ }
+
+ /*
+ * Work out how much of the range [f, t) is not yet covered by the region
+ * list @head, without recording the range itself.
+ *
+ * If no existing region touches the range, a zero-sized placeholder region
+ * is allocated and linked in at the right position and (t - f) is returned.
+ * Otherwise the return value is the size of the merged range minus what the
+ * overlapping regions already cover.
+ *
+ * Returns -ENOMEM if the placeholder cannot be allocated.
+ */
+ static long region_chg(struct list_head *head, long f, long t)
+ {
+ struct file_region *rg, *nrg;
+ long chg = 0;
+
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* If we are below the current region then a new region is required.
+ * Subtle, allocate a new region at the position but make it zero
+ * size such that we can guarantee to record the reservation. */
+ if (&rg->link == head || t < rg->from) {
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ nrg->from = f;
+ nrg->to = f;
+ INIT_LIST_HEAD(&nrg->link);
+ /* Insert immediately before rg (or at the tail when rg is the head). */
+ list_add(&nrg->link, rg->link.prev);
+
+ return t - f;
+ }
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ list_for_each_entry(rg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ return chg;
+
+ /* We overlap with this area, if it extends futher than
+ * us then we must extend ourselves. Account for its
+ * existing reservation. */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+ }
+ return chg;
+ }
+
+ /*
+ * Discard everything in the region list @head at or beyond offset @end.
+ * A region straddling @end is shortened; regions lying wholly beyond it are
+ * unlinked and freed. Returns the total size that was removed.
+ */
+ static long region_truncate(struct list_head *head, long end)
+ {
+ struct file_region *rg, *trg;
+ long chg = 0;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (end <= rg->to)
+ break;
+ /* Nothing reaches as far as @end: nothing to trim. */
+ if (&rg->link == head)
+ return 0;
+
+ /* If we are in the middle of a region then adjust it. */
+ if (end > rg->from) {
+ chg = rg->to - end;
+ rg->to = end;
+ rg = list_entry(rg->link.next, typeof(*rg), link);
+ }
+
+ /* Drop any remaining regions. */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ chg += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return chg;
+ }
+
+ /*
+  * Count how much of the range [f, t) is covered by the regions on @head.
+  *
+  * The walk stops at the first region starting at or beyond @t, so it relies
+  * on the list being sorted by offset. Returns the summed size of the overlap
+  * between [f, t) and each region.
+  */
+ static long region_count(struct list_head *head, long f, long t)
+ {
+ 	struct file_region *rg;
+ 	long chg = 0;
+
+ 	/* Locate each segment we overlap with, and count that overlap. */
+ 	list_for_each_entry(rg, head, link) {
+ 		/*
+ 		 * Region bounds are long; keep the clamped bounds long too so
+ 		 * large offsets are not truncated and max()/min() compare
+ 		 * like with like.
+ 		 */
+ 		long seg_from;
+ 		long seg_to;
+
+ 		if (rg->to <= f)
+ 			continue;
+ 		if (rg->from >= t)
+ 			break;
+
+ 		seg_from = max(rg->from, f);
+ 		seg_to = min(rg->to, t);
+
+ 		chg += seg_to - seg_from;
+ 	}
+
+ 	return chg;
+ }
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_hugecache_offset(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return ((address - vma->vm_start) >> huge_page_shift(h)) +
+ (vma->vm_pgoff >> huge_page_order(h));
+}
+
+/*
+ * Flags for MAP_PRIVATE reservations. These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping. A shared mapping has a region map associated
+ * with the underlying file; this region map represents the backing file
+ * pages which have ever had a reservation assigned, and it persists even
+ * after the page is instantiated. A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it; this region map represents those offsets which have consumed
+ * a reservation, i.e. where pages have been instantiated.
+ */
+ /* Read vma->vm_private_data as a plain integer (pointer plus flag bits). */
+ static unsigned long get_vma_private_data(struct vm_area_struct *vma)
+ {
+ 	void *raw = vma->vm_private_data;
+
+ 	return (unsigned long)raw;
+ }
+
+ /* Store an integer (pointer plus flag bits) into vma->vm_private_data. */
+ static void set_vma_private_data(struct vm_area_struct *vma,
+ 					unsigned long value)
+ {
+ 	void *raw = (void *)value;
+
+ 	vma->vm_private_data = raw;
+ }
+
+ /*
+ * Reference-counted reservation map: a list of file_region entries.
+ * Created by resv_map_alloc() and torn down by resv_map_release().
+ */
+ struct resv_map {
+ struct kref refs; /* lifetime; resv_map_release() is the kref release callback */
+ struct list_head regions; /* file_region list handled by the region_*() helpers */
+ };
+
+ /*
+  * Allocate an empty reservation map holding a single reference.
+  * Returns NULL if the allocation fails.
+  */
+ static struct resv_map *resv_map_alloc(void)
+ {
+ 	struct resv_map *map;
+
+ 	map = kmalloc(sizeof(*map), GFP_KERNEL);
+ 	if (map) {
+ 		kref_init(&map->refs);
+ 		INIT_LIST_HEAD(&map->regions);
+ 	}
+
+ 	return map;
+ }
+
+ /* kref release callback: free every region, then the map itself. */
+ static void resv_map_release(struct kref *ref)
+ {
+ 	struct resv_map *map;
+
+ 	map = container_of(ref, struct resv_map, refs);
+
+ 	/* Truncating at offset 0 drops every region still on the list. */
+ 	region_truncate(&map->regions, 0);
+ 	kfree(map);
+ }
+
+ /*
+  * Return the reservation map attached to a private hugetlb VMA, with the
+  * HPAGE_RESV_* flag bits stripped. Shared VMAs yield NULL.
+  */
+ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
+ {
+ 	unsigned long data;
+
+ 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+ 	if (vma->vm_flags & VM_SHARED)
+ 		return NULL;
+
+ 	data = get_vma_private_data(vma);
+ 	return (struct resv_map *)(data & ~HPAGE_RESV_MASK);
+ }
+
+ /*
+  * Attach @map to a private hugetlb VMA, keeping whatever HPAGE_RESV_* flag
+  * bits are already stored alongside the pointer.
+  */
+ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+ {
+ 	unsigned long kept_flags;
+
+ 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ 	VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+ 	kept_flags = get_vma_private_data(vma) & HPAGE_RESV_MASK;
+ 	set_vma_private_data(vma, kept_flags | (unsigned long)map);
+ }
+
+ /* OR the given HPAGE_RESV_* bits into a private hugetlb VMA's private data. */
+ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+ {
+ 	unsigned long data;
+
+ 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ 	VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+ 	data = get_vma_private_data(vma);
+ 	data |= flags;
+ 	set_vma_private_data(vma, data);
+ }
+
+ /* Return 1 if any bit of @flag is set in the VMA's private data, else 0. */
+ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+ {
+ 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+ 	return !!(get_vma_private_data(vma) & flag);
+ }
+
+ /*
+  * Consume one page of the pool's reservation on behalf of @vma, if this
+  * VMA is entitled to draw on the reserve at all.
+  */
+ static void decrement_hugepage_resv_vma(struct hstate *h,
+ 			struct vm_area_struct *vma)
+ {
+ 	int draws_on_reserve;
+
+ 	if (vma->vm_flags & VM_NORESERVE)
+ 		return;
+
+ 	/*
+ 	 * Shared mappings always use reserves; for private mappings only
+ 	 * the process that called mmap() (the owner) has any.
+ 	 */
+ 	draws_on_reserve = (vma->vm_flags & VM_SHARED) ||
+ 			   is_vma_resv_set(vma, HPAGE_RESV_OWNER);
+ 	if (draws_on_reserve)
+ 		h->resv_huge_pages--;
+ }
+
+ /*
+  * Wipe the reservation state of a private hugetlb VMA: the map pointer and
+  * all HPAGE_RESV_* flags are cleared together. Shared VMAs are left alone.
+  */
+ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+ {
+ 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+ 	if (vma->vm_flags & VM_SHARED)
+ 		return;
+
+ 	vma->vm_private_data = NULL;
+ }
+
+ /*
+  * Returns 1 when @vma may draw on reserved pages: every shared mapping,
+  * plus private mappings flagged as reservation owner. Otherwise 0.
+  */
+ static int vma_has_reserves(struct vm_area_struct *vma)
+ {
+ 	return (vma->vm_flags & VM_SHARED) ||
+ 	       is_vma_resv_set(vma, HPAGE_RESV_OWNER);
+ }
+
+ /*
+  * Zero a huge page of @sz bytes one base page at a time, stepping through
+  * the struct pages with mem_map_next() instead of plain pointer arithmetic.
+  * May sleep.
+  */
+ static void clear_gigantic_page(struct page *page,
+ 			unsigned long addr, unsigned long sz)
+ {
+ 	struct page *cur = page;
+ 	int idx = 0;
+
+ 	might_sleep();
+ 	while (idx < sz/PAGE_SIZE) {
+ 		cond_resched();
+ 		clear_user_highpage(cur, addr + idx * PAGE_SIZE);
+
+ 		idx++;
+ 		cur = mem_map_next(cur, page, idx);
+ 	}
+ }
+ /*
+  * Zero a huge page.
+  * @page: first struct page of the huge page
+  * @addr: user virtual address the huge page will be mapped at
+  * @sz:   size of the huge page in bytes
+  *
+  * Pages larger than the buddy allocator's maximum order are handed to
+  * clear_gigantic_page(), which steps through the struct pages with
+  * mem_map_next(). May sleep.
+  */
+ static void clear_huge_page(struct page *page,
+ 			unsigned long addr, unsigned long sz)
+ {
+ 	int i;
+
+ 	/*
+ 	 * @sz is in bytes while MAX_ORDER_NR_PAGES is a page count, so
+ 	 * convert before comparing; comparing raw bytes sent every ordinary
+ 	 * huge page down the slower gigantic path.
+ 	 */
+ 	if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
+ 		clear_gigantic_page(page, addr, sz);
+ 		return;
+ 	}
+
+ 	might_sleep();
+ 	for (i = 0; i < sz/PAGE_SIZE; i++) {
+ 		cond_resched();
+ 		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ 	}
+ }
+
+ /*
+  * Copy one huge page to another a base page at a time, advancing both
+  * struct page cursors with mem_map_next(). May sleep.
+  */
+ static void copy_gigantic_page(struct page *dst, struct page *src,
+ 			unsigned long addr, struct vm_area_struct *vma)
+ {
+ 	struct hstate *h = hstate_vma(vma);
+ 	struct page *to = dst;
+ 	struct page *from = src;
+ 	int n = 0;
+
+ 	might_sleep();
+ 	while (n < pages_per_huge_page(h)) {
+ 		cond_resched();
+ 		copy_user_highpage(to, from, addr + n*PAGE_SIZE, vma);
+
+ 		n++;
+ 		to = mem_map_next(to, dst, n);
+ 		from = mem_map_next(from, src, n);
+ 	}
+ }
+ /*
+  * Copy the huge page @src to @dst for a mapping at @addr in @vma.
+  * Oversized pages are delegated to copy_gigantic_page(). May sleep.
+  */
+ static void copy_huge_page(struct page *dst, struct page *src,
+ 			unsigned long addr, struct vm_area_struct *vma)
+ {
+ 	struct hstate *h = hstate_vma(vma);
+ 	int n;
+
+ 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ 		copy_gigantic_page(dst, src, addr, vma);
+ 		return;
+ 	}
+
+ 	might_sleep();
+ 	for (n = 0; n < pages_per_huge_page(h); n++) {
+ 		cond_resched();
+ 		copy_user_highpage(&dst[n], &src[n], addr + n*PAGE_SIZE, vma);
+ 	}
+ }
+
+ /* Put @page on its node's free list and bump the free-page counters. */
+ static void enqueue_huge_page(struct hstate *h, struct page *page)
+ {
+ 	int node = page_to_nid(page);
+ 	struct list_head *freelist = &h->hugepage_freelists[node];
+
+ 	list_add(&page->lru, freelist);
+ 	h->free_huge_pages_node[node]++;
+ 	h->free_huge_pages++;
+ }
+
+ /*
+  * Take a free huge page from the lowest-numbered node that has one and
+  * adjust the free counters. Returns NULL when every free list is empty.
+  */
+ static struct page *dequeue_huge_page(struct hstate *h)
+ {
+ 	int node;
+
+ 	for (node = 0; node < MAX_NUMNODES; ++node) {
+ 		struct list_head *freelist = &h->hugepage_freelists[node];
+ 		struct page *page;
+
+ 		if (list_empty(freelist))
+ 			continue;
+
+ 		page = list_entry(freelist->next, struct page, lru);
+ 		list_del(&page->lru);
+ 		h->free_huge_pages--;
+ 		h->free_huge_pages_node[node]--;
+ 		return page;
+ 	}
+
+ 	return NULL;
+ }
+
+ /*
+  * Take a free huge page for a fault at @address in @vma, honouring the
+  * VMA's memory policy and the task's cpuset.
+  * @h:             hstate to allocate from
+  * @vma:           faulting VMA
+  * @address:       faulting address, used to select the policy zonelist
+  * @avoid_reserve: non-zero if reserved pages must not be consumed
+  *
+  * Returns the page, or NULL if no suitable free page may be used. The
+  * counters touched here are the ones hugetlb_lock protects; the caller in
+  * this file (alloc_huge_page) holds that lock around the call.
+  */
+ static struct page *dequeue_huge_page_vma(struct hstate *h,
+ 				struct vm_area_struct *vma,
+ 				unsigned long address, int avoid_reserve)
+ {
+ 	int nid;
+ 	struct page *page = NULL;
+ 	struct mempolicy *mpol;
+ 	nodemask_t *nodemask;
+ 	struct zonelist *zonelist = huge_zonelist(vma, address,
+ 					htlb_alloc_mask, &mpol, &nodemask);
+ 	struct zone *zone;
+ 	struct zoneref *z;
+
+ 	/*
+ 	 * A child process with MAP_PRIVATE mappings created by their parent
+ 	 * have no page reserves. This check ensures that reservations are
+ 	 * not "stolen". The child may still get SIGKILLed
+ 	 *
+ 	 * Leave through 'out' rather than returning directly: huge_zonelist()
+ 	 * has already filled in mpol, and it must be released on every path.
+ 	 */
+ 	if (!vma_has_reserves(vma) &&
+ 			h->free_huge_pages - h->resv_huge_pages == 0)
+ 		goto out;
+
+ 	/* If reserves cannot be used, ensure enough pages are in the pool */
+ 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ 		goto out;
+
+ 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ 						MAX_NR_ZONES - 1, nodemask) {
+ 		nid = zone_to_nid(zone);
+ 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
+ 		    !list_empty(&h->hugepage_freelists[nid])) {
+ 			page = list_entry(h->hugepage_freelists[nid].next,
+ 					  struct page, lru);
+ 			list_del(&page->lru);
+ 			h->free_huge_pages--;
+ 			h->free_huge_pages_node[nid]--;
+
+ 			if (!avoid_reserve)
+ 				decrement_hugepage_resv_vma(h, vma);
+
+ 			break;
+ 		}
+ 	}
+ out:
+ 	mpol_cond_put(mpol);
+ 	return page;
+ }
+
+ /*
+  * Remove @page from the pool's accounting and give it back to the buddy
+  * allocator. Only valid for hstates the buddy allocator can handle.
+  */
+ static void update_and_free_page(struct hstate *h, struct page *page)
+ {
+ 	/* Page flags cleared on every base page before the free. */
+ 	const int stale_flags =
+ 		1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+ 		1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+ 		1 << PG_private | 1<< PG_writeback;
+ 	int n;
+
+ 	VM_BUG_ON(h->order >= MAX_ORDER);
+
+ 	h->nr_huge_pages--;
+ 	h->nr_huge_pages_node[page_to_nid(page)]--;
+
+ 	for (n = 0; n < pages_per_huge_page(h); n++)
+ 		page[n].flags &= ~stale_flags;
+
+ 	/* Drop the hugetlb destructor and restore a normal refcount. */
+ 	set_compound_page_dtor(page, NULL);
+ 	set_page_refcounted(page);
+ 	arch_release_hugepage(page);
+ 	__free_pages(page, huge_page_order(h));
+ }
+
+ /*
+  * Look up the registered hstate whose huge page size equals @size (bytes).
+  * Returns NULL if no hstate has that size.
+  */
+ struct hstate *size_to_hstate(unsigned long size)
+ {
+ 	struct hstate *cur;
+ 	struct hstate *match = NULL;
+
+ 	for_each_hstate(cur) {
+ 		if (huge_page_size(cur) != size)
+ 			continue;
+ 		match = cur;
+ 		break;
+ 	}
+
+ 	return match;
+ }
+
+ /*
+  * Compound page destructor for hugetlb pages. The hstate cannot be passed
+  * in, so it is looked up from the page.
+  *
+  * A page from a node that still has surplus pages goes back to the buddy
+  * allocator (unless it is too large for the buddy allocator); any other
+  * page returns to the free list. Quota is handed back to the mapping saved
+  * in page_private, if there was one.
+  */
+ static void free_huge_page(struct page *page)
+ {
+ 	struct hstate *h = page_hstate(page);
+ 	int node = page_to_nid(page);
+ 	struct address_space *owner;
+
+ 	owner = (struct address_space *) page_private(page);
+ 	set_page_private(page, 0);
+ 	BUG_ON(page_count(page));
+ 	INIT_LIST_HEAD(&page->lru);
+
+ 	spin_lock(&hugetlb_lock);
+ 	if (!h->surplus_huge_pages_node[node] ||
+ 	    huge_page_order(h) >= MAX_ORDER) {
+ 		enqueue_huge_page(h, page);
+ 	} else {
+ 		update_and_free_page(h, page);
+ 		h->surplus_huge_pages--;
+ 		h->surplus_huge_pages_node[node]--;
+ 	}
+ 	spin_unlock(&hugetlb_lock);
+
+ 	if (owner)
+ 		hugetlb_put_quota(owner, 1);
+ }
+
+/*
+ * Increment or decrement surplus_huge_pages. Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(struct hstate *h, int delta)
+{
+ static int prev_nid;
+ int nid = prev_nid;
+ int ret = 0;
+
+ VM_BUG_ON(delta != -1 && delta != 1);
+ do {
+ nid = next_node(nid, node_online_map);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(node_online_map);
+
+ /* To shrink on this node, there must be a surplus page */
+ if (delta < 0 && !h->surplus_huge_pages_node[nid])
+ continue;
+ /* Surplus cannot exceed the total number of pages */
+ if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+ h->nr_huge_pages_node[nid])
+ continue;
+
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[nid] += delta;
+ ret = 1;
+ break;
+ } while (nid != prev_nid);
+
+ prev_nid = nid;
+ return ret;
+}
+
+ /*
+  * Adopt a freshly allocated compound page into the pool of @h: install the
+  * hugetlb destructor, account for the page on node @nid, then drop the
+  * allocation reference so the destructor files it as a free huge page.
+  */
+ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+ {
+ 	set_compound_page_dtor(page, free_huge_page);
+
+ 	spin_lock(&hugetlb_lock);
+ 	h->nr_huge_pages_node[nid]++;
+ 	h->nr_huge_pages++;
+ 	spin_unlock(&hugetlb_lock);
+
+ 	/* Releasing the reference runs free_huge_page() on the page. */
+ 	put_page(page);
+ }
+
+ /*
+  * Allocate one huge page from the buddy allocator strictly on node @nid
+  * and add it to the pool. Returns the page, or NULL if the hstate is too
+  * large for the buddy allocator, the allocation fails, or the architecture
+  * hook rejects the page.
+  */
+ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+ {
+ 	struct page *page;
+
+ 	if (h->order >= MAX_ORDER)
+ 		return NULL;
+
+ 	page = alloc_pages_node(nid,
+ 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ 					__GFP_REPEAT|__GFP_NOWARN,
+ 		huge_page_order(h));
+ 	if (!page)
+ 		return NULL;
+
+ 	if (arch_prepare_hugepage(page)) {
+ 		__free_pages(page, huge_page_order(h));
+ 		return NULL;
+ 	}
+
+ 	prep_new_huge_page(h, page, nid);
+ 	return page;
+ }
+
+/*
+ * Use a helper variable to find the next node and then
+ * copy it back to hugetlb_next_nid afterwards:
+ * otherwise there's a window in which a racer might
+ * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * But we don't need to use a spin_lock here: it really
+ * doesn't matter if occasionally a racer chooses the
+ * same nid as we do. Move nid forward in the mask even
+ * if we just successfully allocated a hugepage so that
+ * the next caller gets hugepages on the next node.
+ */
+static int hstate_next_node(struct hstate *h)
+{
+ int next_nid;
+ next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+ if (next_nid == MAX_NUMNODES)
+ next_nid = first_node(node_online_map);
+ h->hugetlb_next_nid = next_nid;
+ return next_nid;
+}
+
+ /*
+  * Allocate one new huge page for the pool of @h, trying each online node
+  * in turn starting at h->hugetlb_next_nid. The node cursor is advanced
+  * after every attempt, successful or not.
+  *
+  * Returns 1 if a page was added to the pool, 0 if every node failed.
+  * The result is also recorded in the HTLB_BUDDY_PGALLOC{,_FAIL} vm events.
+  */
+ static int alloc_fresh_huge_page(struct hstate *h)
+ {
+ 	struct page *page;
+ 	int start_nid;
+ 	int ret = 0;
+
+ 	start_nid = h->hugetlb_next_nid;
+
+ 	do {
+ 		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+ 		if (page)
+ 			ret = 1;
+ 		/* Only the side effect on h->hugetlb_next_nid is wanted. */
+ 		hstate_next_node(h);
+ 	} while (!page && h->hugetlb_next_nid != start_nid);
+
+ 	if (ret)
+ 		count_vm_event(HTLB_BUDDY_PGALLOC);
+ 	else
+ 		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ 	return ret;
+ }
+
+ /*
+  * Allocate a surplus huge page straight from the buddy allocator.
+  * @h:       hstate to allocate for
+  * @vma:     unused here
+  * @address: unused here
+  *
+  * Returns a page with a reference count of zero and the hugetlb destructor
+  * installed, or NULL if the hstate is too large for the buddy allocator,
+  * the overcommit limit has been reached, or the allocation or the
+  * architecture hook fails. On every failure after the limit check the
+  * speculative counter increments are rolled back.
+  */
+ static struct page *alloc_buddy_huge_page(struct hstate *h,
+ 			struct vm_area_struct *vma, unsigned long address)
+ {
+ 	struct page *page;
+ 	unsigned int nid;
+
+ 	if (h->order >= MAX_ORDER)
+ 		return NULL;
+
+ 	/*
+ 	 * Assume we will successfully allocate the surplus page to
+ 	 * prevent racing processes from causing the surplus to exceed
+ 	 * overcommit
+ 	 *
+ 	 * This however introduces a different race, where a process B
+ 	 * tries to grow the static hugepage pool while alloc_pages() is
+ 	 * called by process A. B will only examine the per-node
+ 	 * counters in determining if surplus huge pages can be
+ 	 * converted to normal huge pages in adjust_pool_surplus(). A
+ 	 * won't be able to increment the per-node counter, until the
+ 	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
+ 	 * no more huge pages can be converted from surplus to normal
+ 	 * state (and doesn't try to convert again). Thus, we have a
+ 	 * case where a surplus huge page exists, the pool is grown, and
+ 	 * the surplus huge page still exists after, even though it
+ 	 * should just have been converted to a normal huge page. This
+ 	 * does not leak memory, though, as the hugepage will be freed
+ 	 * once it is out of use. It also does not allow the counters to
+ 	 * go out of whack in adjust_pool_surplus() as we don't modify
+ 	 * the node values until we've gotten the hugepage and only the
+ 	 * per-node value is checked there.
+ 	 */
+ 	spin_lock(&hugetlb_lock);
+ 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
+ 		spin_unlock(&hugetlb_lock);
+ 		return NULL;
+ 	} else {
+ 		h->nr_huge_pages++;
+ 		h->surplus_huge_pages++;
+ 	}
+ 	spin_unlock(&hugetlb_lock);
+
+ 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ 					__GFP_REPEAT|__GFP_NOWARN,
+ 					huge_page_order(h));
+
+ 	/*
+ 	 * Treat a rejected page as a failed allocation rather than returning
+ 	 * here: the global counters were bumped above and must be rolled
+ 	 * back by the failure branch below.
+ 	 */
+ 	if (page && arch_prepare_hugepage(page)) {
+ 		__free_pages(page, huge_page_order(h));
+ 		page = NULL;
+ 	}
+
+ 	spin_lock(&hugetlb_lock);
+ 	if (page) {
+ 		/*
+ 		 * This page is now managed by the hugetlb allocator and has
+ 		 * no users -- drop the buddy allocator's reference.
+ 		 */
+ 		put_page_testzero(page);
+ 		VM_BUG_ON(page_count(page));
+ 		nid = page_to_nid(page);
+ 		set_compound_page_dtor(page, free_huge_page);
+ 		/*
+ 		 * We incremented the global counters already
+ 		 */
+ 		h->nr_huge_pages_node[nid]++;
+ 		h->surplus_huge_pages_node[nid]++;
+ 		__count_vm_event(HTLB_BUDDY_PGALLOC);
+ 	} else {
+ 		h->nr_huge_pages--;
+ 		h->surplus_huge_pages--;
+ 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ 	}
+ 	spin_unlock(&hugetlb_lock);
+
+ 	return page;
+ }
+
+/*
+ * Increase the hugetlb pool such that it can accomodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(struct hstate *h, int delta)
+{
+ struct list_head surplus_list;
+ struct page *page, *tmp;
+ int ret, i;
+ int needed, allocated;
+
+ needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
+ if (needed <= 0) {
+ h->resv_huge_pages += delta;
+ return 0;
+ }
+
+ allocated = 0;
+ INIT_LIST_HEAD(&surplus_list);
+
+ ret = -ENOMEM;
+retry:
+ spin_unlock(&hugetlb_lock);
+ for (i = 0; i < needed; i++) {
+ page = alloc_buddy_huge_page(h, NULL, 0);
+ if (!page) {
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ spin_lock(&hugetlb_lock);
+ needed = 0;
+ goto free;
+ }
+
+ list_add(&page->lru, &surplus_list);
+ }
+ allocated += needed;
+
+ /*
+ * After retaking hugetlb_lock, we need to recalculate 'needed'
+ * because either resv_huge_pages or free_huge_pages may have changed.
+ */
+ spin_lock(&hugetlb_lock);
+ needed = (h->resv_huge_pages + delta) -
+ (h->free_huge_pages + allocated);
+ if (needed > 0)
+ goto retry;
+
+ /*
+ * The surplus_list now contains _at_least_ the number of extra pages
+ * needed to accomodate the reservation. Add the appropriate number
+ * of pages to the hugetlb pool and free the extras back to the buddy
+ * allocator. Commit the entire reservation here to prevent another
+ * process from stealing the pages as they are added to the pool but
+ * before they are reserved.
+ */
+ needed += allocated;
+ h->resv_huge_pages += delta;
+ ret = 0;
+free:
+ /* Free the needed pages to the hugetlb pool */
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ if ((--needed) < 0)
+ break;
+ list_del(&page->lru);
+ enqueue_huge_page(h, page);
+ }
+
+ /* Free unnecessary surplus pages to the buddy allocator */
+ if (!list_empty(&surplus_list)) {
+ spin_unlock(&hugetlb_lock);
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ list_del(&page->lru);
+ /*
+ * The page has a reference count of zero already, so
+ * call free_huge_page directly instead of using
+ * put_page. This must be done with hugetlb_lock
+ * unlocked which is safe because free_huge_page takes
+ * hugetlb_lock before deciding how to free the page.
+ */
+ free_huge_page(page);
+ }
+ spin_lock(&hugetlb_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ */
+static void return_unused_surplus_pages(struct hstate *h,
+ unsigned long unused_resv_pages)
+{
+ static int nid = -1;
+ struct page *page;
+ unsigned long nr_pages;
+
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes. Iterate across all nodes until we
+ * can no longer free unreserved surplus pages. This occurs when
+ * the nodes with surplus pages have no free pages.
+ */
+ unsigned long remaining_iterations = num_online_nodes();
+
+ /* Uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
+
+ /* Cannot return gigantic pages currently */
+ if (h->order >= MAX_ORDER)
+ return;
+
+ nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
+
+ while (remaining_iterations-- && nr_pages) {
+ nid = next_node(nid, node_online_map);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(node_online_map);
+
+ if (!h->surplus_huge_pages_node[nid])
+ continue;
+
+ if (!list_empty(&h->hugepage_freelists[nid])) {
+ page = list_entry(h->hugepage_freelists[nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ update_and_free_page(h, page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ nr_pages--;
+ remaining_iterations = num_online_nodes();
+ }
+ }
+}
+
+/*
+ * Determine if the huge page at addr within the vma has an associated
+ * reservation. Where it does not we will need to logically increase
+ * reservation and actually increase quota before an allocation can occur.
+ * Where any new reservation would be required the reservation change is
+ * prepared, but not committed. Once the page has been quota'd allocated
+ * an instantiated the change should be committed via vma_commit_reservation.
+ * No action is required on failure.
+ */
+static int vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ if (vma->vm_flags & VM_SHARED) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ return region_chg(&inode->i_mapping->private_list,
+ idx, idx + 1);
+
+ } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ return 1;
+
+ } else {
+ int err;
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ err = region_chg(&reservations->regions, idx, idx + 1);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+}
+ /*
+  * Commit the change prepared by vma_needs_reservation(): record the huge
+  * page at @addr in the file's region list (shared mappings) or in the
+  * owner's reservation map (private mappings). Private mappings that do not
+  * own a reservation have nothing to record.
+  */
+ static void vma_commit_reservation(struct hstate *h,
+ 			struct vm_area_struct *vma, unsigned long addr)
+ {
+ 	struct address_space *mapping = vma->vm_file->f_mapping;
+ 	struct inode *inode = mapping->host;
+ 	struct list_head *regions;
+ 	pgoff_t idx;
+
+ 	if (vma->vm_flags & VM_SHARED)
+ 		regions = &inode->i_mapping->private_list;
+ 	else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ 		regions = &vma_resv_map(vma)->regions;
+ 	else
+ 		return;
+
+ 	/* Mark this page used in the map. */
+ 	idx = vma_hugecache_offset(h, vma, addr);
+ 	region_add(regions, idx, idx + 1);
+ }
+
+ /*
+  * Allocate a huge page for a fault at @addr in @vma.
+  * @vma:           faulting VMA
+  * @addr:          faulting address
+  * @avoid_reserve: non-zero if reserved pages must not be consumed
+  *
+  * Returns a page holding one reference, with page_private pointing at the
+  * owning mapping, or an ERR_PTR:
+  *   - the negative errno from vma_needs_reservation(),
+  *   - -ENOSPC if the quota cannot be charged,
+  *   - -VM_FAULT_OOM if no page is available from the pool or the buddy
+  *     allocator.
+  */
+ static struct page *alloc_huge_page(struct vm_area_struct *vma,
+ 				    unsigned long addr, int avoid_reserve)
+ {
+ 	struct hstate *h = hstate_vma(vma);
+ 	struct page *page;
+ 	struct address_space *mapping = vma->vm_file->f_mapping;
+ 	struct inode *inode = mapping->host;
+ 	/* Signed: vma_needs_reservation() reports failure as a negative errno. */
+ 	long chg;
+
+ 	/*
+ 	 * Processes that did not create the mapping will have no reserves and
+ 	 * will not have accounted against quota. Check that the quota can be
+ 	 * made before satisfying the allocation
+ 	 * MAP_NORESERVE mappings may also need pages and quota allocated
+ 	 * if no reserve mapping overlaps.
+ 	 */
+ 	chg = vma_needs_reservation(h, vma, addr);
+ 	if (chg < 0)
+ 		return ERR_PTR(chg);
+ 	if (chg)
+ 		if (hugetlb_get_quota(inode->i_mapping, chg))
+ 			return ERR_PTR(-ENOSPC);
+
+ 	spin_lock(&hugetlb_lock);
+ 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
+ 	spin_unlock(&hugetlb_lock);
+
+ 	if (!page) {
+ 		/* Nothing usable in the pool: try for a surplus page. */
+ 		page = alloc_buddy_huge_page(h, vma, addr);
+ 		if (!page) {
+ 			hugetlb_put_quota(inode->i_mapping, chg);
+ 			return ERR_PTR(-VM_FAULT_OOM);
+ 		}
+ 	}
+
+ 	set_page_refcounted(page);
+ 	/* free_huge_page() reads this back to return the quota. */
+ 	set_page_private(page, (unsigned long) mapping);
+
+ 	vma_commit_reservation(h, vma, addr);
+
+ 	return page;
+ }
+
+ /*
+  * Carve one huge page for @h out of bootmem, trying each online node in
+  * round-robin order starting at h->hugetlb_next_nid. The page is queued on
+  * huge_boot_pages until mem_map exists. Weak, so an architecture can
+  * supply its own version.
+  *
+  * Returns 1 if a page was obtained, 0 if every node failed.
+  */
+ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+ {
+ 	struct huge_bootmem_page *m = NULL;
+ 	int tries;
+
+ 	for (tries = nodes_weight(node_online_map); tries; tries--) {
+ 		/*
+ 		 * Use the beginning of the huge page to store the
+ 		 * huge_bootmem_page struct (until gather_bootmem
+ 		 * puts them into the mem_map).
+ 		 */
+ 		m = __alloc_bootmem_node_nopanic(
+ 				NODE_DATA(h->hugetlb_next_nid),
+ 				huge_page_size(h), huge_page_size(h), 0);
+ 		if (m)
+ 			break;
+
+ 		hstate_next_node(h);
+ 	}
+
+ 	if (!m)
+ 		return 0;
+
+ 	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+ 	/* Put them into a private list first because mem_map is not up yet */
+ 	list_add(&m->list, &huge_boot_pages);
+ 	m->hstate = h;
+ 	return 1;
+ }
+
+ /*
+  * Turn @page into a compound page of the given order, using the gigantic
+  * variant when the order is beyond what the buddy allocator handles.
+  */
+ static void prep_compound_huge_page(struct page *page, int order)
+ {
+ 	if (unlikely(order > (MAX_ORDER - 1))) {
+ 		prep_compound_gigantic_page(page, order);
+ 		return;
+ 	}
+
+ 	prep_compound_page(page, order);
+ }
+
+ /*
+  * Once mem_map is available, turn every bootmem huge page queued on
+  * huge_boot_pages into a proper compound page and add it to its hstate.
+  */
+ static void __init gather_bootmem_prealloc(void)
+ {
+ 	struct huge_bootmem_page *boot;
+
+ 	list_for_each_entry(boot, &huge_boot_pages, list) {
+ 		struct page *page = virt_to_page(boot);
+ 		struct hstate *h = boot->hstate;
+
+ 		__ClearPageReserved(page);
+ 		WARN_ON(page_count(page) != 1);
+ 		prep_compound_huge_page(page, h->order);
+ 		prep_new_huge_page(h, page, page_to_nid(page));
+ 	}
+ }
+
+ /*
+  * Try to preallocate h->max_huge_pages pages for @h, from bootmem for
+  * oversized hstates and from the buddy allocator otherwise. Stops at the
+  * first failure and lowers max_huge_pages to the number actually obtained.
+  */
+ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+ {
+ 	unsigned long got = 0;
+
+ 	while (got < h->max_huge_pages) {
+ 		int ok;
+
+ 		if (h->order >= MAX_ORDER)
+ 			ok = alloc_bootmem_huge_page(h);
+ 		else
+ 			ok = alloc_fresh_huge_page(h);
+
+ 		if (!ok)
+ 			break;
+ 		++got;
+ 	}
+
+ 	h->max_huge_pages = got;
+ }
+
+ /* Preallocate pages for every hstate the buddy allocator can serve. */
+ static void __init hugetlb_init_hstates(void)
+ {
+ 	struct hstate *h;
+
+ 	for_each_hstate(h) {
+ 		/* oversize hugepages were init'ed in early boot */
+ 		if (h->order >= MAX_ORDER)
+ 			continue;
+
+ 		hugetlb_hstate_alloc_pages(h);
+ 	}
+ }
+
+ /*
+  * Format the byte count @n into @buf as a whole number of GB, MB or KB
+  * (largest unit that fits, rounded down) and return @buf.
+  */
+ static char * __init memfmt(char *buf, unsigned long n)
+ {
+ 	if (n >= (1UL << 30)) {
+ 		sprintf(buf, "%lu GB", n >> 30);
+ 		return buf;
+ 	}
+
+ 	if (n >= (1UL << 20)) {
+ 		sprintf(buf, "%lu MB", n >> 20);
+ 		return buf;
+ 	}
+
+ 	sprintf(buf, "%lu KB", n >> 10);
+ 	return buf;
+ }
+
+ /* Log one line per registered hstate: page size and free page count. */
+ static void __init report_hugepages(void)
+ {
+ 	struct hstate *h;
+ 	char size_str[32];
+
+ 	for_each_hstate(h) {
+ 		memfmt(size_str, huge_page_size(h));
+ 		printk(KERN_INFO "HugeTLB registered %s page size, "
+ 				 "pre-allocated %ld pages\n",
+ 			size_str,
+ 			h->free_huge_pages);
+ 	}
+ }
+
+ #ifdef CONFIG_HIGHMEM
+ /*
+  * Release free lowmem huge pages back to the buddy allocator until the
+  * pool is down to @count pages. Highmem pages are skipped. Does nothing
+  * for hstates too large for the buddy allocator.
+  */
+ static void try_to_free_low(struct hstate *h, unsigned long count)
+ {
+ 	int node;
+
+ 	if (h->order >= MAX_ORDER)
+ 		return;
+
+ 	for (node = 0; node < MAX_NUMNODES; ++node) {
+ 		struct list_head *freelist = &h->hugepage_freelists[node];
+ 		struct page *page, *tmp;
+
+ 		list_for_each_entry_safe(page, tmp, freelist, lru) {
+ 			if (count >= h->nr_huge_pages)
+ 				return;
+ 			if (!PageHighMem(page)) {
+ 				list_del(&page->lru);
+ 				update_and_free_page(h, page);
+ 				h->free_huge_pages--;
+ 				h->free_huge_pages_node[page_to_nid(page)]--;
+ 			}
+ 		}
+ 	}
+ }
+ #else
+ /* Without highmem there is no lowmem pressure to relieve. */
+ static inline void try_to_free_low(struct hstate *h, unsigned long count)
+ {
+ }
+ #endif
+
+ /*
+ * Pages in the pool that are not surplus. The argument is not parenthesised
+ * in the expansion, so pass a plain identifier.
+ */
+ #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
+ /*
+ * Resize the persistent pool of @h towards @count pages and return the
+ * persistent page count actually reached. For hstates too large for the
+ * buddy allocator nothing is changed and h->max_huge_pages is returned.
+ *
+ * hugetlb_lock is taken here and dropped around each fresh allocation.
+ */
+ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+ {
+ unsigned long min_count, ret;
+
+ if (h->order >= MAX_ORDER)
+ return h->max_huge_pages;
+
+ /*
+ * Increase the pool size
+ * First take pages out of surplus state. Then make up the
+ * remaining difference by allocating fresh huge pages.
+ *
+ * We might race with alloc_buddy_huge_page() here and be unable
+ * to convert a surplus huge page to a normal huge page. That is
+ * not critical, though, it just means the overall size of the
+ * pool might be one hugepage larger than it needs to be, but
+ * within all the constraints specified by the sysctls.
+ */
+ spin_lock(&hugetlb_lock);
+ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, -1))
+ break;
+ }
+
+ while (count > persistent_huge_pages(h)) {
+ /*
+ * If this allocation races such that we no longer need the
+ * page, free_huge_page will handle it by freeing the page
+ * and reducing the surplus.
+ */
+ spin_unlock(&hugetlb_lock);
+ ret = alloc_fresh_huge_page(h);
+ spin_lock(&hugetlb_lock);
+ if (!ret)
+ goto out;
+
+ }
+
+ /*
+ * Decrease the pool size
+ * First return free pages to the buddy allocator (being careful
+ * to keep enough around to satisfy reservations). Then place
+ * pages into surplus state as needed so the pool will shrink
+ * to the desired size as pages become free.
+ *
+ * By placing pages into the surplus state independent of the
+ * overcommit value, we are allowing the surplus pool size to
+ * exceed overcommit. There are few sane options here. Since
+ * alloc_buddy_huge_page() is checking the global counter,
+ * though, we'll note that we're not allowed to exceed surplus
+ * and won't grow the pool anywhere else. Not until one of the
+ * sysctls are changed, or the surplus pages go out of use.
+ */
+ /* Floor for the pool: pages in use plus pages still reserved. */
+ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ min_count = max(count, min_count);
+ try_to_free_low(h, min_count);
+ while (min_count < persistent_huge_pages(h)) {
+ struct page *page = dequeue_huge_page(h);
+ if (!page)
+ break;
+ update_and_free_page(h, page);
+ }
+ while (count < persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, 1))
+ break;
+ }
+ out:
+ ret = persistent_huge_pages(h);
+ spin_unlock(&hugetlb_lock);
+ return ret;
+ }
+
+/*
+ * Define a static kobj_attribute named <_name>_attr using __ATTR_RO().
+ * In this file it is always used right after the matching <_name>_show().
+ */
+#define HSTATE_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+/*
+ * Define a static kobj_attribute named <_name>_attr with mode 0644 that is
+ * wired to <_name>_show() and <_name>_store().
+ */
+#define HSTATE_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+/* The "hugepages" kobject created under mm_kobj by hugetlb_sysfs_init(). */
+static struct kobject *hugepages_kobj;
+/* One kobject per hstate, indexed like hstates[] (i.e. by h - hstates). */
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+/*
+ * Translate a per-hstate sysfs kobject back into its hstate by searching
+ * hstate_kobjs[]. A kobject that is not in the table is a bug.
+ */
+static struct hstate *kobj_to_hstate(struct kobject *kobj)
+{
+	int idx = 0;
+
+	while (idx < HUGE_MAX_HSTATE) {
+		if (hstate_kobjs[idx] == kobj)
+			return &hstates[idx];
+		idx++;
+	}
+
+	/* No slot matched the kobject we were handed. */
+	BUG();
+	return NULL;
+}
+
+/* sysfs show handler: print nr_huge_pages of the hstate behind @kobj. */
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	unsigned long nr = kobj_to_hstate(kobj)->nr_huge_pages;
+
+	return sprintf(buf, "%lu\n", nr);
+}
+/*
+ * sysfs store handler: resize the persistent pool of the hstate behind
+ * @kobj to the decimal value in @buf.
+ *
+ * Returns @count on success, or the negative errno from strict_strtoul()
+ * when @buf does not parse.
+ */
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int err;
+	unsigned long input;
+	struct hstate *h = kobj_to_hstate(kobj);
+
+	err = strict_strtoul(buf, 10, &input);
+	/*
+	 * Report the parse error. Returning 0 here would tell the writer
+	 * that nothing was consumed, so userspace would retry the same
+	 * bogus input forever.
+	 */
+	if (err)
+		return err;
+
+	/* Record the size the pool actually reached, not the request. */
+	h->max_huge_pages = set_max_huge_pages(h, input);
+
+	return count;
+}
+HSTATE_ATTR(nr_hugepages);
+
+/* sysfs show handler: print nr_overcommit_huge_pages of this hstate. */
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+					    struct kobj_attribute *attr, char *buf)
+{
+	unsigned long limit = kobj_to_hstate(kobj)->nr_overcommit_huge_pages;
+
+	return sprintf(buf, "%lu\n", limit);
+}
+/*
+ * sysfs store handler: set nr_overcommit_huge_pages of the hstate behind
+ * @kobj to the decimal value in @buf.
+ *
+ * Returns @count on success, or the negative errno from strict_strtoul()
+ * when @buf does not parse.
+ */
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int err;
+	unsigned long input;
+	struct hstate *h = kobj_to_hstate(kobj);
+
+	err = strict_strtoul(buf, 10, &input);
+	/*
+	 * Report the parse error. Returning 0 would make the writer retry
+	 * the same bogus input forever.
+	 */
+	if (err)
+		return err;
+
+	spin_lock(&hugetlb_lock);
+	h->nr_overcommit_huge_pages = input;
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+/* sysfs show handler: print free_huge_pages of the hstate behind @kobj. */
+static ssize_t free_hugepages_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	unsigned long nr_free = kobj_to_hstate(kobj)->free_huge_pages;
+
+	return sprintf(buf, "%lu\n", nr_free);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+/* sysfs show handler: print resv_huge_pages of the hstate behind @kobj. */
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	unsigned long nr_resv = kobj_to_hstate(kobj)->resv_huge_pages;
+
+	return sprintf(buf, "%lu\n", nr_resv);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+/* sysfs show handler: print surplus_huge_pages of the hstate behind @kobj. */
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	unsigned long nr_surplus = kobj_to_hstate(kobj)->surplus_huge_pages;
+
+	return sprintf(buf, "%lu\n", nr_surplus);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+/* NULL-terminated list of the attributes exposed for every hstate. */
+static struct attribute *hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &nr_overcommit_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &resv_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+/* Group handed to sysfs_create_group() for each hstate kobject. */
+static struct attribute_group hstate_attr_group = {
+ .attrs = hstate_attrs,
+};
+
+/*
+ * Create the sysfs directory for @h (named h->name) under hugepages_kobj
+ * and populate it with hstate_attr_group.
+ *
+ * Returns 0 on success, -ENOMEM if the kobject could not be created, or
+ * the error from sysfs_create_group(). On failure the hstate's slot in
+ * hstate_kobjs[] is left NULL.
+ */
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+{
+	struct kobject **slot = &hstate_kobjs[h - hstates];
+	int retval;
+
+	*slot = kobject_create_and_add(h->name, hugepages_kobj);
+	if (!*slot)
+		return -ENOMEM;
+
+	retval = sysfs_create_group(*slot, &hstate_attr_group);
+	if (retval) {
+		kobject_put(*slot);
+		/*
+		 * Do not leave a stale pointer behind: hugetlb_exit() puts
+		 * every entry of hstate_kobjs[] and would otherwise drop
+		 * this reference a second time.
+		 */
+		*slot = NULL;
+	}
+
+	return retval;
+}
+
+/*
+ * Create the "hugepages" sysfs directory under mm_kobj and one
+ * sub-directory per registered hstate. Failures are not fatal: a missing
+ * parent silently disables the interface, and a failing hstate is only
+ * logged.
+ */
+static void __init hugetlb_sysfs_init(void)
+{
+	struct hstate *h;
+	int err;
+
+	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+	if (!hugepages_kobj)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h);
+		/* Terminate the message so later log lines do not run on. */
+		if (err)
+			printk(KERN_ERR "Hugetlb: Unable to add hstate %s\n",
+								h->name);
+	}
+}
+
+/* Module exit: release the per-hstate kobjects, then their parent. */
+static void __exit hugetlb_exit(void)
+{
+	struct hstate *h;
+
+	for_each_hstate(h) {
+		struct kobject *kobj = hstate_kobjs[h - hstates];
+
+		kobject_put(kobj);
+	}
+
+	kobject_put(hugepages_kobj);
+}
+module_exit(hugetlb_exit);
+
+/*
+ * Module init: settle on the default hstate, apply a boot-time pool size
+ * given for it, initialise all hstates and publish them through sysfs.
+ * Always returns 0.
+ */
+static int __init hugetlb_init(void)
+{
+ /*
+ * Some platforms decide whether they support huge pages at boot
+ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+ * there is no such support.
+ */
+ if (HPAGE_SHIFT == 0)
+ return 0;
+
+ /*
+ * If no hstate matches the requested default size, fall back to
+ * HPAGE_SIZE and register an hstate for it if none exists yet.
+ */
+ if (!size_to_hstate(default_hstate_size)) {
+ default_hstate_size = HPAGE_SIZE;
+ if (!size_to_hstate(default_hstate_size))
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ }
+ default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ /* Set by a hugepages= parameter seen before any hugepagesz=. */
+ if (default_hstate_max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+
+ hugetlb_init_hstates();
+
+ gather_bootmem_prealloc();
+
+ report_hugepages();
+
+ hugetlb_sysfs_init();
+
+ return 0;
+}
+module_init(hugetlb_init);
+
+/*
+ * Register an hstate for huge pages of (PAGE_SIZE << order) bytes in the
+ * next free hstates[] slot and remember it in parsed_hstate.
+ * Should be called on processing a hugepagesz=... option.
+ */
+void __init hugetlb_add_hstate(unsigned order)
+{
+ struct hstate *h;
+ unsigned long i;
+
+ /* A size that is already registered is ignored with a warning. */
+ if (size_to_hstate(PAGE_SIZE << order)) {
+ printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
+ return;
+ }
+ BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(order == 0);
+ h = &hstates[max_hstate++];
+ h->order = order;
+ /* Mask that clears the offset bits within one huge page. */
+ h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->nr_huge_pages = 0;
+ h->free_huge_pages = 0;
+ for (i = 0; i < MAX_NUMNODES; ++i)
+ INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ h->hugetlb_next_nid = first_node(node_online_map);
+ /* The name doubles as the sysfs directory name, e.g. hugepages-<N>kB. */
+ snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
+ huge_page_size(h)/1024);
+
+ /* A following hugepages= parameter applies to this hstate. */
+ parsed_hstate = h;
+}
+
+/*
+ * Handle a "hugepages=" boot parameter: store the requested pool size
+ * for the most recently parsed hstate, or for the default hstate if no
+ * hugepagesz= has been seen yet. Always returns 1.
+ */
+static int __init hugetlb_nrpages_setup(char *s)
+{
+ unsigned long *mhp;
+ /* Target of the previous hugepages=, used to detect a repeat. */
+ static unsigned long *last_mhp;
+
+ /*
+ * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * so this hugepages= parameter goes to the "default hstate".
+ */
+ if (!max_hstate)
+ mhp = &default_hstate_max_huge_pages;
+ else
+ mhp = &parsed_hstate->max_huge_pages;
+
+ if (mhp == last_mhp) {
+ printk(KERN_WARNING "hugepages= specified twice without "
+ "interleaving hugepagesz=, ignoring\n");
+ return 1;
+ }
+
+ /* An unparsable value is treated as a request for zero pages. */
+ if (sscanf(s, "%lu", mhp) <= 0)
+ *mhp = 0;
+
+ /*
+ * Global state is always initialized later in hugetlb_init.
+ * But we need to allocate >= MAX_ORDER hstates here early to still
+ * use the bootmem allocator.
+ */
+ if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ hugetlb_hstate_alloc_pages(parsed_hstate);
+
+ last_mhp = mhp;
+
+ return 1;
+}
+__setup("hugepages=", hugetlb_nrpages_setup);
+
+/*
+ * Handle "default_hugepagesz=": record the requested default huge page
+ * size as parsed by memparse(). Always returns 1.
+ */
+static int __init hugetlb_default_setup(char *s)
+{
+	char *end = s;
+
+	default_hstate_size = memparse(s, &end);
+	return 1;
+}
+__setup("default_hugepagesz=", hugetlb_default_setup);
+
+/*
+ * Sum the entries of a per-node counter array over the nodes in
+ * cpuset_current_mems_allowed.
+ */
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	unsigned int total = 0;
+	int nid;
+
+	for_each_node_mask(nid, cpuset_current_mems_allowed) {
+		total += array[nid];
+	}
+
+	return total;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * sysctl handler for the default hstate's pool size.
+ *
+ * Reads report h->max_huge_pages; writes resize the pool through
+ * set_max_huge_pages() and record the size actually reached.
+ *
+ * Returns 0 on success or the error from proc_doulongvec_minmax().
+ */
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void __user *buffer,
+			   size_t *length, loff_t *ppos)
+{
+	struct hstate *h = &default_hstate;
+	/*
+	 * Always start from the current value so that a write which fails
+	 * or parses nothing never feeds stack garbage to
+	 * set_max_huge_pages().
+	 */
+	unsigned long tmp = h->max_huge_pages;
+	int ret;
+
+	table->data = &tmp;
+	table->maxlen = sizeof(unsigned long);
+	ret = proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+	if (write)
+		h->max_huge_pages = set_max_huge_pages(h, tmp);
+
+	return 0;
+}
+
+/*
+ * sysctl handler for hugepages_treat_as_movable: after the integer has
+ * been processed, pick the matching allocation mask. Always returns 0.
+ */
+int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
+				  struct file *file, void __user *buffer,
+				  size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, file, buffer, length, ppos);
+
+	htlb_alloc_mask = hugepages_treat_as_movable ?
+				GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER;
+	return 0;
+}
+
+/*
+ * sysctl handler for the default hstate's overcommit limit.
+ *
+ * Reads report h->nr_overcommit_huge_pages; writes update it under
+ * hugetlb_lock.
+ *
+ * Returns 0 on success or the error from proc_doulongvec_minmax().
+ */
+int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+			       struct file *file, void __user *buffer,
+			       size_t *length, loff_t *ppos)
+{
+	struct hstate *h = &default_hstate;
+	/*
+	 * Always start from the current value so that a write which fails
+	 * or parses nothing cannot store stack garbage as the new limit.
+	 */
+	unsigned long tmp = h->nr_overcommit_huge_pages;
+	int ret;
+
+	table->data = &tmp;
+	table->maxlen = sizeof(unsigned long);
+	ret = proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+	if (write) {
+		spin_lock(&hugetlb_lock);
+		h->nr_overcommit_huge_pages = tmp;
+		spin_unlock(&hugetlb_lock);
+	}
+
+	return 0;
+}
+
+#endif /* CONFIG_SYSCTL */
+
+/* Emit the default hstate's pool counters and page size (in kB) to @m. */
+void hugetlb_report_meminfo(struct seq_file *m)
+{
+	struct hstate *h = &default_hstate;
+	unsigned long size_kb = 1UL << (huge_page_order(h) + PAGE_SHIFT - 10);
+
+	seq_printf(m,
+		   "HugePages_Total: %5lu\n"
+		   "HugePages_Free: %5lu\n"
+		   "HugePages_Rsvd: %5lu\n"
+		   "HugePages_Surp: %5lu\n"
+		   "Hugepagesize: %8lu kB\n",
+		   h->nr_huge_pages,
+		   h->free_huge_pages,
+		   h->resv_huge_pages,
+		   h->surplus_huge_pages,
+		   size_kb);
+}
+
+/*
+ * Format the default hstate's per-node counters for node @nid into @buf.
+ * Returns the number of characters written, as reported by sprintf().
+ */
+int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+	struct hstate *h = &default_hstate;
+	unsigned int total = h->nr_huge_pages_node[nid];
+	unsigned int avail = h->free_huge_pages_node[nid];
+	unsigned int surplus = h->surplus_huge_pages_node[nid];
+
+	return sprintf(buf,
+		       "Node %d HugePages_Total: %5u\n"
+		       "Node %d HugePages_Free: %5u\n"
+		       "Node %d HugePages_Surp: %5u\n",
+		       nid, total,
+		       nid, avail,
+		       nid, surplus);
+}
+
+/*
+ * Return the amount of memory held by the default hstate's pool,
+ * expressed in PAGE_SIZE units.
+ */
+unsigned long hugetlb_total_pages(void)
+{
+	struct hstate *h = &default_hstate;
+	unsigned long nr_huge = h->nr_huge_pages;
+
+	return nr_huge * pages_per_huge_page(h);
+}
+
+/*
+ * Adjust the huge page accounting of @h by @delta pages, under
+ * hugetlb_lock.
+ *
+ * A positive @delta gathers surplus pages and then checks the request
+ * against the free pages on the nodes of the current cpuset; a negative
+ * @delta returns unused surplus pages.
+ *
+ * Returns 0 on success or -ENOMEM if a positive request cannot be met.
+ */
+static int hugetlb_acct_memory(struct hstate *h, long delta)
+{
+ int ret = -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. Application can still potentially OOM'ed by kernel
+ * with lack of free htlb page in cpuset that the task is in.
+ * Attempt to enforce strict accounting with cpuset is almost
+ * impossible (or too ugly) because cpuset is too fluid that
+ * task or memory node can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to check against current free page availability as
+ * a best attempt and hopefully to minimize the impact of changing
+ * semantics that cpuset has.
+ */
+ if (delta > 0) {
+ if (gather_surplus_pages(h, delta) < 0)
+ goto out;
+
+ /* Not enough free pages in this cpuset: undo the gathering. */
+ if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ return_unused_surplus_pages(h, delta);
+ goto out;
+ }
+ }
+
+ ret = 0;
+ if (delta < 0)
+ return_unused_surplus_pages(h, (unsigned long) -delta);
+
+out:
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+/* vm_ops->open: take a reference on the VMA's reservation map, if any. */
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+	struct resv_map *resv = vma_resv_map(vma);
+
+	if (!resv)
+		return;
+
+	/*
+	 * This new VMA should share its sibling's reservation map if
+	 * present. The VMA will only ever have a valid reservation map
+	 * pointer where it is being copied for another still existing VMA.
+	 * As that VMA has a reference to the reservation map it cannot
+	 * disappear until after this open call completes. It is therefore
+	 * safe to take a new reference here without additional locking.
+	 */
+	kref_get(&resv->refs);
+}
+
+/*
+ * vm_ops->close: drop the VMA's reference on its reservation map and
+ * give back the part of the reservation that region_count() does not
+ * account for within the VMA's range.
+ */
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct resv_map *resv = vma_resv_map(vma);
+	unsigned long first, last, unused;
+
+	if (!resv)
+		return;
+
+	first = vma_hugecache_offset(h, vma, vma->vm_start);
+	last = vma_hugecache_offset(h, vma, vma->vm_end);
+
+	/* Must be computed before the map reference is dropped. */
+	unused = (last - first) - region_count(&resv->regions, first, last);
+
+	kref_put(&resv->refs, resv_map_release);
+
+	if (!unused)
+		return;
+
+	hugetlb_acct_memory(h, -unused);
+	hugetlb_put_quota(vma->vm_file->f_mapping, unused);
+}
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all. They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
+ * this far.
+ */
+static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ BUG();
+ return 0;
+}
+
+/*
+ * VMA operations for hugetlb mappings: open/close manage the reservation
+ * map reference, while the generic fault hook must never be reached.
+ */
+struct vm_operations_struct hugetlb_vm_ops = {
+ .fault = hugetlb_vm_op_fault,
+ .open = hugetlb_vm_op_open,
+ .close = hugetlb_vm_op_close,
+};
+
+/*
+ * Build a huge PTE for @page using the VMA's page protection. The entry
+ * is made writable and dirty when @writable is set and write-protected
+ * otherwise; it is always marked young and huge.
+ */
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+				int writable)
+{
+	pte_t entry = mk_pte(page, vma->vm_page_prot);
+
+	if (writable)
+		entry = pte_mkwrite(pte_mkdirty(entry));
+	else
+		entry = huge_pte_wrprotect(entry);
+
+	return pte_mkhuge(pte_mkyoung(entry));
+}
+
+/*
+ * Mark the huge PTE at @ptep writable and dirty, updating the MMU cache
+ * when huge_ptep_set_access_flags() reports a change.
+ */
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+				   unsigned long address, pte_t *ptep)
+{
+	pte_t entry = huge_ptep_get(ptep);
+
+	entry = pte_mkwrite(pte_mkdirty(entry));
+	if (!huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
+		return;
+
+	update_mmu_cache(vma, address, entry);
+}
+
+
+/*
+ * Copy the huge PTEs covering @vma from mm @src into mm @dst, taking a
+ * reference on every mapped page. For a private mapping that may be
+ * written, the source PTE is write-protected first so both sides COW.
+ *
+ * Returns 0 on success or -ENOMEM if a destination PTE cannot be
+ * allocated.
+ */
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr;
+ int cow;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+
+ /* COW applies when the mapping is not shared but may be written. */
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte)
+ continue;
+ dst_pte = huge_pte_alloc(dst, addr, sz);
+ if (!dst_pte)
+ goto nomem;
+
+ /* If the pagetables are shared don't copy or take references */
+ if (dst_pte == src_pte)
+ continue;
+
+ spin_lock(&dst->page_table_lock);
+ spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+ if (!huge_pte_none(huge_ptep_get(src_pte))) {
+ if (cow)
+ huge_ptep_set_wrprotect(src, addr, src_pte);
+ /* Re-read so the copy carries any write-protection. */
+ entry = huge_ptep_get(src_pte);
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ }
+ spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
+ }
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
+
+/*
+ * Unmap the huge pages mapped by @vma in [@start, @end), both of which
+ * must be huge page aligned. If @ref_page is non-NULL only PTEs mapping
+ * that page are cleared and the VMA is flagged HPAGE_RESV_UNMAPPED.
+ *
+ * The cleared pages are collected on a local list and released after the
+ * TLB flush. unmap_hugepage_range() calls this with the file's
+ * i_mmap_lock held.
+ */
+void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *ptep;
+ pte_t pte;
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+
+ /*
+ * A page gathering list, protected by per file i_mmap_lock. The
+ * lock is used to avoid list corruption from multiple unmapping
+ * of the same page since we are using page->lru.
+ */
+ LIST_HEAD(page_list);
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~huge_page_mask(h));
+ BUG_ON(end & ~huge_page_mask(h));
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ spin_lock(&mm->page_table_lock);
+ for (address = start; address < end; address += sz) {
+ ptep = huge_pte_offset(mm, address);
+ if (!ptep)
+ continue;
+
+ /* Nothing more to clear here if a shared PMD was dropped. */
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
+
+ /*
+ * If a reference page is supplied, it is because a specific
+ * page is being unmapped, not a range. Ensure the page we
+ * are about to unmap is the actual page of interest.
+ */
+ if (ref_page) {
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ continue;
+ page = pte_page(pte);
+ if (page != ref_page)
+ continue;
+
+ /*
+ * Mark the VMA as having unmapped its page so that
+ * future faults in this VMA will fail rather than
+ * looking like data was lost
+ */
+ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ }
+
+ pte = huge_ptep_get_and_clear(mm, address, ptep);
+ if (huge_pte_none(pte))
+ continue;
+
+ page = pte_page(pte);
+ /* Carry the PTE dirty bit over to the page before unmapping. */
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ list_add(&page->lru, &page_list);
+ }
+ spin_unlock(&mm->page_table_lock);
+ flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ /* Drop the mapping references only after the TLB flush. */
+ list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ list_del(&page->lru);
+ put_page(page);
+ }
+}
+
+/*
+ * Locked wrapper around __unmap_hugepage_range(): holds the backing
+ * file's i_mmap_lock for the duration of the unmap.
+ */
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+
+	spin_lock(&mapping->i_mmap_lock);
+	__unmap_hugepage_range(vma, start, end, ref_page);
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mapping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ *
+ * @vma is the faulting (owner) VMA, @page the page to unmap elsewhere and
+ * @address the faulting address. Always returns 1.
+ */
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+				struct page *page, unsigned long address)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct vm_area_struct *iter_vma;
+	struct address_space *mapping;
+	struct prio_tree_iter iter;
+	pgoff_t pgoff;
+
+	/*
+	 * The prio tree is indexed in PAGE_SIZE units, unlike the page
+	 * cache lookup which is in HPAGE_SIZE units. vm_pgoff is already
+	 * in PAGE_SIZE units, so it is added as is; shifting it again would
+	 * look up the wrong offset and miss the VMAs mapping this page.
+	 */
+	address = address & huge_page_mask(h);
+	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	mapping = (struct address_space *)page_private(page);
+
+	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		/* Do not unmap the current VMA */
+		if (iter_vma == vma)
+			continue;
+
+		/*
+		 * Unmap the page from other VMAs without their own reserves.
+		 * They get marked to be SIGKILLed if they fault in these
+		 * areas. This is because a future no-page fault on this VMA
+		 * could insert a zeroed page instead of the data existing
+		 * from the time of fork. This would look like data corruption
+		 */
+		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+			unmap_hugepage_range(iter_vma,
+				address, address + huge_page_size(h),
+				page);
+	}
+
+	return 1;
+}
+
+/*
+ * Break copy-on-write for the huge page mapped by @pte at @address.
+ *
+ * Both callers in this file enter with mm->page_table_lock held; the
+ * lock is dropped and retaken around the page copy. If the old page has
+ * a single reference the PTE is simply made writable. Otherwise a new
+ * huge page is allocated and filled, and installed only if the PTE is
+ * still unchanged.
+ *
+ * @pagecache_page is the page cache page at this address (or NULL); it is
+ * used to decide whether the owner's reserve was already consumed.
+ *
+ * Returns 0 on success, or -PTR_ERR() of the failed allocation.
+ * NOTE(review): this relies on alloc_huge_page() encoding its error so
+ * that -PTR_ERR() yields a fault code - confirm against its definition.
+ */
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte,
+ struct page *pagecache_page)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct page *old_page, *new_page;
+ int avoidcopy;
+ int outside_reserve = 0;
+
+ old_page = pte_page(pte);
+
+retry_avoidcopy:
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return 0;
+ }
+
+ /*
+ * If the process that created a MAP_PRIVATE mapping is about to
+ * perform a COW due to a shared page count, attempt to satisfy
+ * the allocation without using the existing reserves. The pagecache
+ * page is used to determine if the reserve at this address was
+ * consumed or not. If reserves were used, a partial faulted mapping
+ * at the time of fork() could consume its reserves on COW instead
+ * of the full address range.
+ */
+ if (!(vma->vm_flags & VM_SHARED) &&
+ is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ old_page != pagecache_page)
+ outside_reserve = 1;
+
+ /* Pin the old page while the new one is allocated and filled. */
+ page_cache_get(old_page);
+ new_page = alloc_huge_page(vma, address, outside_reserve);
+
+ if (IS_ERR(new_page)) {
+ page_cache_release(old_page);
+
+ /*
+ * If a process owning a MAP_PRIVATE mapping fails to COW,
+ * it is due to references held by a child and an insufficient
+ * huge page pool. To guarantee the original mappers
+ * reliability, unmap the page from child processes. The child
+ * may get SIGKILLed if it later faults.
+ */
+ if (outside_reserve) {
+ BUG_ON(huge_pte_none(pte));
+ if (unmap_ref_private(mm, vma, old_page, address)) {
+ BUG_ON(page_count(old_page) != 1);
+ BUG_ON(huge_pte_none(pte));
+ goto retry_avoidcopy;
+ }
+ WARN_ON_ONCE(1);
+ }
+
+ return -PTR_ERR(new_page);
+ }
+
+ /* The copy is done without the page table lock held. */
+ spin_unlock(&mm->page_table_lock);
+ copy_huge_page(new_page, old_page, address, vma);
+ __SetPageUptodate(new_page);
+ spin_lock(&mm->page_table_lock);
+
+ /* Re-check the PTE: it may have changed while the lock was dropped. */
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ /* Break COW */
+ huge_ptep_clear_flush(vma, address, ptep);
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ /*
+ * Drops either the old mapping's reference (COW succeeded) or the
+ * unused new page (PTE changed), then the pin taken above.
+ */
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return 0;
+}
+
+/*
+ * Look up and lock the page cache page backing @address in @vma.
+ * Returns whatever find_lock_page() returns for that index.
+ */
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	pgoff_t idx = vma_hugecache_offset(h, vma, address);
+
+	return find_lock_page(vma->vm_file->f_mapping, idx);
+}
+
+/*
+ * Handle a fault on a huge PTE that is currently empty: find the page in
+ * the page cache or allocate and zero a new one, then install it at
+ * @ptep. For a write fault on a private mapping the COW is performed
+ * immediately.
+ *
+ * Returns 0 on success, VM_FAULT_SIGBUS for a fault beyond i_size or in
+ * a VMA flagged HPAGE_RESV_UNMAPPED, VM_FAULT_OOM if the reservation
+ * check fails, or the negated PTR_ERR() of a failed allocation.
+ */
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, int write_access)
+{
+ struct hstate *h = hstate_vma(vma);
+ int ret = VM_FAULT_SIGBUS;
+ pgoff_t idx;
+ unsigned long size;
+ struct page *page;
+ struct address_space *mapping;
+ pte_t new_pte;
+
+ /*
+ * Currently, we are forced to kill the process in the event the
+ * original mapper has unmapped pages from the child due to a failed
+ * COW. Warn that such a situation has occurred as it may not be obvious
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+ printk(KERN_WARNING
+ "PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
+ return ret;
+ }
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+retry:
+ page = find_lock_page(mapping, idx);
+ if (!page) {
+ /* Faults beyond the end of the file get SIGBUS. */
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+ page = alloc_huge_page(vma, address, 0);
+ if (IS_ERR(page)) {
+ ret = -PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, address, huge_page_size(h));
+ __SetPageUptodate(page);
+
+ if (vma->vm_flags & VM_SHARED) {
+ int err;
+ struct inode *inode = mapping->host;
+
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ /* Someone else added the page first: look it up again. */
+ if (err == -EEXIST)
+ goto retry;
+ goto out;
+ }
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ } else
+ lock_page(page);
+ }
+
+ /*
+ * If we are going to COW a private mapping later, we examine the
+ * pending reservations for this page now. This will ensure that
+ * any allocations necessary to record that reservation occur outside
+ * the spinlock.
+ */
+ if (write_access && !(vma->vm_flags & VM_SHARED))
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ /* Re-check the file size under the page table lock. */
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto backout;
+
+ ret = 0;
+ /* The PTE got populated in the meantime: nothing left to do. */
+ if (!huge_pte_none(huge_ptep_get(ptep)))
+ goto backout;
+
+ /* Only shared writable mappings get a writable PTE straight away. */
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
+ return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+backout_unlocked:
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
+/*
+ * Entry point for faults on a hugetlb VMA. An empty PTE is handed to
+ * hugetlb_no_page(); a write to a read-only PTE goes through
+ * hugetlb_cow(); otherwise the access flags of the PTE are updated.
+ *
+ * Returns 0 on success, VM_FAULT_OOM if the PTE cannot be allocated or
+ * the reservation check fails, or the result of hugetlb_no_page() /
+ * hugetlb_cow().
+ */
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+ int ret;
+ struct page *pagecache_page = NULL;
+ /* A single mutex shared by every hugetlb fault in the system. */
+ static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+ struct hstate *h = hstate_vma(vma);
+
+ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+ if (!ptep)
+ return VM_FAULT_OOM;
+
+ /*
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mutex_lock(&hugetlb_instantiation_mutex);
+ entry = huge_ptep_get(ptep);
+ if (huge_pte_none(entry)) {
+ ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+ goto out_mutex;
+ }
+
+ ret = 0;
+
+ /*
+ * If we are going to COW the mapping later, we examine the pending
+ * reservations for this page now. This will ensure that any
+ * allocations necessary to record that reservation occur outside the
+ * spinlock. For private mappings, we also lookup the pagecache
+ * page now as it is used to determine if a reservation has been
+ * consumed.
+ */
+ if (write_access && !pte_write(entry)) {
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto out_mutex;
+ }
+
+ if (!(vma->vm_flags & VM_SHARED))
+ pagecache_page = hugetlbfs_pagecache_page(h,
+ vma, address);
+ }
+
+ spin_lock(&mm->page_table_lock);
+ /* Check for a racing update before calling hugetlb_cow */
+ if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ goto out_page_table_lock;
+
+
+ if (write_access) {
+ if (!pte_write(entry)) {
+ ret = hugetlb_cow(mm, vma, address, ptep, entry,
+ pagecache_page);
+ goto out_page_table_lock;
+ }
+ entry = pte_mkdirty(entry);
+ }
+ entry = pte_mkyoung(entry);
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
+ update_mmu_cache(vma, address, entry);
+
+out_page_table_lock:
+ spin_unlock(&mm->page_table_lock);
+
+ /* Release the page cache page that was looked up (locked) above. */
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+
+out_mutex:
+ mutex_unlock(&hugetlb_instantiation_mutex);
+
+ return ret;
+}
+
+/*
+ * Default follow_huge_pud(): reaching it is treated as a bug.
+ * Can be overridden by architectures (the symbol is weak).
+ */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+/*
+ * Tell whether the zero page may stand in for the huge page at @ptep:
+ * only for a read of a private mapping whose PTE exists but is empty.
+ */
+static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
+{
+	if (ptep && !write && !shared)
+		return huge_pte_none(huge_ptep_get(ptep));
+
+	return 0;
+}
+
+/*
+ * Walk the hugetlb VMA @vma from *@position, filling @pages and @vmas (if
+ * non-NULL) with one entry per PAGE_SIZE page, starting at index @i, for
+ * up to *@length entries or until the end of the VMA. Missing or
+ * read-only PTEs are faulted in via hugetlb_fault(); reads of empty PTEs
+ * in a private mapping are served with the zero page instead.
+ *
+ * On return *@position and *@length are updated to the address and count
+ * still outstanding. Returns the next free index, or -EFAULT if a fault
+ * failed before any entry was filled.
+ */
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct page **pages, struct vm_area_struct **vmas,
+			unsigned long *position, int *length, int i,
+			int write)
+{
+	unsigned long pfn_offset;
+	unsigned long vaddr = *position;
+	int remainder = *length;
+	struct hstate *h = hstate_vma(vma);
+	int shared = vma->vm_flags & VM_SHARED;
+
+	spin_lock(&mm->page_table_lock);
+	while (vaddr < vma->vm_end && remainder) {
+		pte_t *pte;
+		struct page *page;
+		int zeropage_ok;
+
+		/*
+		 * Some archs (sparc64, sh*) have multiple pte_ts to
+		 * each hugepage. We have to make sure we get the
+		 * first, for the page indexing below to work.
+		 */
+		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+
+		/*
+		 * Decide afresh for every huge page. A sticky flag would
+		 * make every later, populated huge page be reported as the
+		 * zero page once a single empty PTE had been seen.
+		 */
+		zeropage_ok = huge_zeropage_ok(pte, write, shared);
+
+		if (!pte ||
+		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
+		    (write && !pte_write(huge_ptep_get(pte)))) {
+			int ret;
+
+			spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_fault(mm, vma, vaddr, write);
+			spin_lock(&mm->page_table_lock);
+			/* Fault handled: look at the same address again. */
+			if (!(ret & VM_FAULT_ERROR))
+				continue;
+
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
+
+		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
+		page = pte_page(huge_ptep_get(pte));
+same_page:
+		if (pages) {
+			if (zeropage_ok)
+				pages[i] = ZERO_PAGE(0);
+			else
+				pages[i] = mem_map_offset(page, pfn_offset);
+			get_page(pages[i]);
+		}
+
+		if (vmas)
+			vmas[i] = vma;
+
+		vaddr += PAGE_SIZE;
+		++pfn_offset;
+		--remainder;
+		++i;
+		if (vaddr < vma->vm_end && remainder &&
+				pfn_offset < pages_per_huge_page(h)) {
+			/*
+			 * We use pfn_offset to avoid touching the pageframes
+			 * of this compound page.
+			 */
+			goto same_page;
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+	*length = remainder;
+	*position = vaddr;
+
+	return i;
+}
+
+/*
+ * Apply the protection @newprot to every populated huge PTE of @vma in
+ * [@address, @end), then flush the TLB for the whole range. Runs under
+ * the file's i_mmap_lock and the mm's page_table_lock.
+ */
+void hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ /* address is advanced by the loop; keep the start for the flush. */
+ unsigned long start = address;
+ pte_t *ptep;
+ pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+
+ BUG_ON(address >= end);
+ flush_cache_range(vma, address, end);
+
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ spin_lock(&mm->page_table_lock);
+ for (; address < end; address += huge_page_size(h)) {
+ ptep = huge_pte_offset(mm, address);
+ if (!ptep)
+ continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
+ /* Clear, modify and re-install the entry. */
+ pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = pte_mkhuge(pte_modify(pte, newprot));
+ set_huge_pte_at(mm, address, ptep, pte);
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+
+ flush_tlb_range(vma, start, end);
+}
+
+/*
+ * Reserve huge pages for the range [@from, @to) of @inode on behalf of
+ * @vma (NULL is treated like a shared mapping).
+ *
+ * Returns 0 on success or when the VMA has VM_NORESERVE set, -ENOMEM if
+ * a private reservation map cannot be allocated, -ENOSPC if the quota
+ * is exhausted, or a negative value from region_chg() /
+ * hugetlb_acct_memory().
+ */
+int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma)
+{
+ long ret, chg;
+ struct hstate *h = hstate_inode(inode);
+
+ if (vma && vma->vm_flags & VM_NORESERVE)
+ return 0;
+
+ /*
+ * Shared mappings base their reservation on the number of pages that
+ * are already allocated on behalf of the file. Private mappings need
+ * to reserve the full area even if read-only as mprotect() may be
+ * called to make the mapping read-write. Assume !vma is a shm mapping
+ */
+ if (!vma || vma->vm_flags & VM_SHARED)
+ chg = region_chg(&inode->i_mapping->private_list, from, to);
+ else {
+ struct resv_map *resv_map = resv_map_alloc();
+ if (!resv_map)
+ return -ENOMEM;
+
+ chg = to - from;
+
+ /* The private VMA becomes the owner of this reservation. */
+ set_vma_resv_map(vma, resv_map);
+ set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ }
+
+ if (chg < 0)
+ return chg;
+
+ if (hugetlb_get_quota(inode->i_mapping, chg))
+ return -ENOSPC;
+ ret = hugetlb_acct_memory(h, chg);
+ if (ret < 0) {
+ /* Give the quota back if the pages cannot be accounted. */
+ hugetlb_put_quota(inode->i_mapping, chg);
+ return ret;
+ }
+ /* Record the shared reservation only once it is fully accounted. */
+ if (!vma || vma->vm_flags & VM_SHARED)
+ region_add(&inode->i_mapping->private_list, from, to);
+ return 0;
+}
+
+/*
+ * Drop the reservations of @inode from huge page index @offset onwards
+ * after @freed huge pages have been removed from the file.
+ *
+ * The reservation map is truncated, i_blocks is reduced by the blocks of
+ * all @freed pages, and the reservations that were not backed by a freed
+ * page (chg - freed) are returned to the quota and the pool accounting.
+ */
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+	struct hstate *h = hstate_inode(inode);
+	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+
+	spin_lock(&inode->i_lock);
+	/*
+	 * Account for every freed huge page, not just one. The fault path
+	 * adds blocks_per_huge_page() per page, so subtracting a single
+	 * page's worth here would leave i_blocks too high.
+	 */
+	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
+	spin_unlock(&inode->i_lock);
+
+	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugetlb_acct_memory(h, -(chg - freed));
+}
diff --git a/mm/internal.h b/mm/internal.h
new file mode 100644
index 0000000..13333bc
--- /dev/null
+++ b/mm/internal.h
@@ -0,0 +1,283 @@
+/* internal.h: mm/ internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
+
+#include <linux/mm.h>
+
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+
+extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+/* Overwrite the reference count of @page (page->_count) with @v. */
+static inline void set_page_count(struct page *page, int v)
+{
+ atomic_set(&page->_count, v);
+}
+
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ *
+ * The VM_BUG_ON() checks assert that @page is not a compound tail page
+ * and that its count really is zero before it is set.
+ */
+static inline void set_page_refcounted(struct page *page)
+{
+ VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(atomic_read(&page->_count));
+ set_page_count(page, 1);
+}
+
+/*
+ * Decrement the reference count of @page. Nothing here looks at the
+ * resulting value, so the caller is responsible for any freeing.
+ */
+static inline void __put_page(struct page *page)
+{
+ atomic_dec(&page->_count);
+}
+
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+/*
+ * in mm/page_alloc.c
+ */
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
+
+/*
+ * Return the buddy-system order of @page, which is kept in page_private().
+ * The page must be marked PageBuddy (asserted with VM_BUG_ON).
+ * zone->lock is already acquired when we use this,
+ * so we don't need atomic page->flags operations here.
+ */
+static inline unsigned long page_order(struct page *page)
+{
+ VM_BUG_ON(!PageBuddy(page));
+ return page_private(page);
+}
+
+extern long mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+/* Munlock the whole of @vma, i.e. the range [vm_start, vm_end). */
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+ munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * unevictable_migrate_page() called only from migrate_page_copy() to
+ * migrate unevictable flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+ /* clear the flag on @old; set it on @new only if it had been set */
+ if (TestClearPageUnevictable(old))
+ SetPageUnevictable(new);
+}
+#else
+/* Without CONFIG_UNEVICTABLE_LRU there is no flag to carry over. */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Called only in fault path via page_evictable() for a new page
+ * to determine if it's being mapped into a LOCKED vma.
+ * If so, mark page as mlocked.
+ *
+ * Returns 1 when @vma has VM_LOCKED set and none of the VM_SPECIAL bits,
+ * 0 otherwise. @page must not be on an LRU list (asserted).
+ */
+static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+{
+ VM_BUG_ON(PageLRU(page));
+
+ /* common case: not locked, or locked but also a special mapping */
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+ return 0;
+
+ /* account the page only on the transition from not-mlocked to mlocked */
+ if (!TestSetPageMlocked(page)) {
+ inc_zone_page_state(page, NR_MLOCK);
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ }
+ return 1;
+}
+
+/*
+ * must be called with vma's mmap_sem held for read, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked(). This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ *
+ * The inline wrapper only tests and clears the flag; the out-of-line
+ * __clear_page_mlock() is called solely when the flag had been set.
+ */
+extern void __clear_page_mlock(struct page *page);
+static inline void clear_page_mlock(struct page *page)
+{
+ if (unlikely(TestClearPageMlocked(page)))
+ __clear_page_mlock(page);
+}
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag; update statistics.
+ *
+ * Does nothing unless @page was mlocked. Otherwise the flag moves to
+ * @newpage and NR_MLOCK is decremented for the old page's zone and
+ * incremented for the new page's zone.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+ if (TestClearPageMlocked(page)) {
+ unsigned long flags;
+
+ /* the __ counter helpers are used with interrupts disabled */
+ local_irq_save(flags);
+ __dec_zone_page_state(page, NR_MLOCK);
+ SetPageMlocked(newpage);
+ __inc_zone_page_state(newpage, NR_MLOCK);
+ local_irq_restore(flags);
+ }
+}
+
+/*
+ * free_page_mlock() -- clean up attempts to free an mlocked page.
+ * Page should not be on lru, so no need to fix that up.
+ * free_pages_check() will verify...
+ *
+ * If the Mlocked flag was still set it is cleared, NR_MLOCK is decremented
+ * and the UNEVICTABLE_MLOCKFREED event is counted.
+ */
+static inline void free_page_mlock(struct page *page)
+{
+ if (unlikely(TestClearPageMlocked(page))) {
+ unsigned long flags;
+
+ /* the __ counter helpers are used with interrupts disabled */
+ local_irq_save(flags);
+ __dec_zone_page_state(page, NR_MLOCK);
+ __count_vm_event(UNEVICTABLE_MLOCKFREED);
+ local_irq_restore(flags);
+ }
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+/*
+ * Stubs used when CONFIG_UNEVICTABLE_LRU is not set: no vma is reported
+ * as mlocked and the page-flag helpers do nothing.
+ */
+static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+{
+ return 0;
+}
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+static inline void free_page_mlock(struct page *page) { }
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Look up the struct page for subpage number 'offset' of the maximally
+ * aligned gigantic page starting at 'base'.
+ *
+ * Offsets below MAX_ORDER_NR_PAGES use plain pointer arithmetic. Beyond
+ * that the mem_map may be discontiguous, so the lookup goes through the
+ * page frame number instead.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+	if (likely(offset < MAX_ORDER_NR_PAGES))
+		return base + offset;
+
+	return pfn_to_page(page_to_pfn(base) + offset);
+}
+
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'. Handle any discontiguity in the mem_map.
+ *
+ * @iter is the current page and @offset the index of the page wanted next.
+ * Returns NULL if @offset sits on a MAX_ORDER_NR_PAGES boundary whose pfn
+ * is not valid.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+ struct page *base, int offset)
+{
+ /* on a MAX_ORDER_NR_PAGES boundary, recompute from the pfn */
+ if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+ unsigned long pfn = page_to_pfn(base) + offset;
+ if (!pfn_valid(pfn))
+ return NULL;
+ return pfn_to_page(pfn);
+ }
+ return iter + 1;
+}
+
+/*
+ * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
+ * so all functions starting at paging_init should be marked __init
+ * in those cases. SPARSEMEM, however, allows for memory hotplug,
+ * and alloc_bootmem_node is not used.
+ */
+#ifdef CONFIG_SPARSEMEM
+#define __paginginit __meminit
+#else
+#define __paginginit __init
+#endif
+
+/* Memory initialisation debug and verification */
+enum mminit_level {
+ MMINIT_WARNING,
+ MMINIT_VERIFY,
+ MMINIT_TRACE
+};
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+
+extern int mminit_loglevel;
+
+/*
+ * Print a memory-init debug message when @level is below mminit_loglevel.
+ * Levels up to MMINIT_WARNING are logged at KERN_WARNING, the rest at
+ * KERN_DEBUG. @prefix and @fmt are pasted into the format string, so both
+ * must be string literals. @level is evaluated more than once and is not
+ * parenthesized, so pass a plain constant or variable.
+ */
+#define mminit_dprintk(level, prefix, fmt, arg...) \
+do { \
+ if (level < mminit_loglevel) { \
+ printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
+ printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ } \
+} while (0)
+
+extern void mminit_verify_pageflags_layout(void);
+extern void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn);
+extern void mminit_verify_zonelist(void);
+
+#else
+
+static inline void mminit_dprintk(enum mminit_level level,
+ const char *prefix, const char *fmt, ...)
+{
+}
+
+static inline void mminit_verify_pageflags_layout(void)
+{
+}
+
+static inline void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn)
+{
+}
+
+static inline void mminit_verify_zonelist(void)
+{
+}
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
+#if defined(CONFIG_SPARSEMEM)
+extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn);
+#else
+static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+}
+#endif /* CONFIG_SPARSEMEM */
+
+#define GUP_FLAGS_WRITE 0x1
+#define GUP_FLAGS_FORCE 0x2
+#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int len, int flags,
+ struct page **pages, struct vm_area_struct **vmas);
+
+#endif
diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 0000000..ac40796
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,55 @@
+/*
+ * Access kernel memory without faulting.
+ */
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+/**
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ *
+ * Returns 0 if the copy routine reported success, -EFAULT otherwise.
+ */
+long probe_kernel_read(void *dst, void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ /* switch to KERNEL_DS so the user-copy helper can be given @src */
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst,
+ (__force const void __user *)src, size);
+ pagefault_enable();
+ /* restore the caller's address limit */
+ set_fs(old_fs);
+
+ /* any nonzero result from the copy is reported as -EFAULT */
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+/**
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ *
+ * Returns 0 if the copy routine reported success, -EFAULT otherwise.
+ */
+long probe_kernel_write(void *dst, void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ /* switch to KERNEL_DS so the user-copy helper can be given @dst */
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
+ /* restore the caller's address limit */
+ set_fs(old_fs);
+
+ /* any nonzero result from the copy is reported as -EFAULT */
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_write);
diff --git a/mm/madvise.c b/mm/madvise.c
new file mode 100644
index 0000000..b9ce574
--- /dev/null
+++ b/mm/madvise.c
@@ -0,0 +1,365 @@
+/*
+ * linux/mm/madvise.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 2002 Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/mempolicy.h>
+#include <linux/hugetlb.h>
+#include <linux/sched.h>
+
+/*
+ * Decide how mmap_sem must be taken for a given advice value.
+ *
+ * Advice that ends up modifying vma->vm_flags needs the semaphore held
+ * for writing; advice that merely walks the vmas can make do with a read
+ * lock. Returns 0 for the read-only cases and 1 for everything else.
+ */
+static int madvise_need_mmap_write(int behavior)
+{
+	/* the explicit exceptions; anything not listed stays on the safe side */
+	if (behavior == MADV_REMOVE ||
+	    behavior == MADV_WILLNEED ||
+	    behavior == MADV_DONTNEED)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ *
+ * Apply a flag-changing advice (MADV_NORMAL, MADV_SEQUENTIAL, MADV_RANDOM,
+ * MADV_DONTFORK, MADV_DOFORK) to the part [start, end) of @vma. The range
+ * is first offered to vma_merge(); if that does not succeed, @vma is split
+ * at @start and/or @end as needed. On return *prev names the vma that now
+ * covers the range.
+ *
+ * Returns 0 on success or a negative errno; -ENOMEM from split_vma() is
+ * reported as -EAGAIN.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end, int behavior)
+{
+	struct mm_struct * mm = vma->vm_mm;
+	int error = 0;
+	pgoff_t pgoff;
+	/*
+	 * Keep the working copy in the same type as vm_flags: an int would
+	 * truncate the value and sign-extend it again when compared with or
+	 * stored back into vma->vm_flags.
+	 */
+	unsigned long new_flags = vma->vm_flags;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
+	case MADV_SEQUENTIAL:
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+		break;
+	case MADV_RANDOM:
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+		break;
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		new_flags &= ~VM_DONTCOPY;
+		break;
+	}
+
+	/* nothing to change: just report this vma as the one processed */
+	if (new_flags == vma->vm_flags) {
+		*prev = vma;
+		goto out;
+	}
+
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
+				vma->vm_file, pgoff, vma_policy(vma));
+	if (*prev) {
+		vma = *prev;
+		goto success;
+	}
+
+	*prev = vma;
+
+	if (start != vma->vm_start) {
+		error = split_vma(mm, vma, start, 1);
+		if (error)
+			goto out;
+	}
+
+	if (end != vma->vm_end) {
+		error = split_vma(mm, vma, end, 0);
+		if (error)
+			goto out;
+	}
+
+success:
+	/*
+	 * vm_flags is protected by the mmap_sem held in write mode.
+	 */
+	vma->vm_flags = new_flags;
+
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
+/*
+ * Schedule all required I/O operations. Do not wait for completion.
+ *
+ * Starts readahead on the file pages backing [start, end) of @vma, with
+ * @end clamped to the end of the vma. Returns -EBADF if the vma is not
+ * file-backed, 0 otherwise. Mappings whose address space provides
+ * get_xip_mem are accepted but the advice is ignored.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+			     struct vm_area_struct ** prev,
+			     unsigned long start, unsigned long end)
+{
+	struct file *file = vma->vm_file;
+
+	/*
+	 * sys_madvise() advances through the address space via *prev, so it
+	 * must be updated on every path that returns success. Leaving it
+	 * untouched on the XIP path makes the caller revisit this vma
+	 * forever when the range extends beyond it.
+	 */
+	*prev = vma;
+
+	if (!file)
+		return -EBADF;
+
+	if (file->f_mapping->a_ops->get_xip_mem) {
+		/* no bad return value, but ignore advice */
+		return 0;
+	}
+
+	/* convert the virtual range into page offsets within the file */
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	force_page_cache_readahead(file->f_mapping,
+			file, start, max_sane_readahead(end - start));
+	return 0;
+}
+
+/*
+ * Application no longer needs these pages. If the pages are dirty,
+ * it's OK to just throw them away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too. The
+ * zap_page_range call sets things up for shrink_active_list to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * shrink_active_list to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do. This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them. There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
+ unsigned long start, unsigned long end)
+{
+ /* report this vma to the caller before any early return */
+ *prev = vma;
+ /* locked, hugetlb and PFN-mapped areas are refused with -EINVAL */
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ /* nonlinear vmas are zapped with explicit details for this vma */
+ struct zap_details details = {
+ .nonlinear_vma = vma,
+ .last_index = ULONG_MAX,
+ };
+ zap_page_range(vma, start, end - start, &details);
+ } else
+ zap_page_range(vma, start, end - start, NULL);
+ return 0;
+}
+
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct address_space *mapping;
+ loff_t offset, endoff;
+ int error;
+
+ *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+
+ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+ return -EINVAL;
+
+ /* a file with a mapping and a host inode is required */
+ if (!vma->vm_file || !vma->vm_file->f_mapping
+ || !vma->vm_file->f_mapping->host) {
+ return -EINVAL;
+ }
+
+ /* only shared, writable mappings may punch holes */
+ if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+ return -EACCES;
+
+ mapping = vma->vm_file->f_mapping;
+
+ /* byte offsets into the file; endoff is the last byte, inclusive */
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ endoff = (loff_t)(end - vma->vm_start - 1)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+
+ /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+ up_read(&current->mm->mmap_sem);
+ /* vma is not touched while mmap_sem is dropped; only mapping->host is used */
+ error = vmtruncate_range(mapping->host, offset, endoff);
+ down_read(&current->mm->mmap_sem);
+ return error;
+}
+
+/*
+ * Dispatch one advice value for the part [start, end) of a single vma to
+ * the matching helper. Unknown values, and MADV_DOFORK on a VM_IO vma,
+ * yield -EINVAL. Returns the helper's result otherwise.
+ */
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
+{
+ long error;
+
+ switch (behavior) {
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ break;
+ }
+ /* fall through: handled like the other flag-changing advice */
+ case MADV_DONTFORK:
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ error = madvise_behavior(vma, prev, start, end, behavior);
+ break;
+ case MADV_REMOVE:
+ error = madvise_remove(vma, prev, start, end);
+ break;
+
+ case MADV_WILLNEED:
+ error = madvise_willneed(vma, prev, start, end);
+ break;
+
+ case MADV_DONTNEED:
+ error = madvise_dontneed(vma, prev, start, end);
+ break;
+
+ default:
+ error = -EINVAL;
+ break;
+ }
+ return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques. The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the appli-
+ * cation will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ * once, so they can be aggressively read ahead, and
+ * can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ * MADV_REMOVE - the application wants to free up the given range of
+ * pages and associated backing store.
+ *
+ * return values:
+ * zero - success
+ * -EINVAL - start + len < 0, start is not page-aligned,
+ * "behavior" is not a valid value, or application
+ * is attempting to release locked or shared pages.
+ * -ENOMEM - addresses in the specified range are not currently
+ * mapped, or are outside the AS of the process.
+ * -EIO - an I/O error occurred while paging in data.
+ * -EBADF - map exists, but area maps something that isn't a file.
+ * -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+ unsigned long end, tmp;
+ struct vm_area_struct * vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+ int write;
+ size_t len;
+
+ /* take mmap_sem in the mode this advice needs; released at out: */
+ write = madvise_need_mmap_write(behavior);
+ if (write)
+ down_write(&current->mm->mmap_sem);
+ else
+ down_read(&current->mm->mmap_sem);
+
+ /* start must be page aligned */
+ if (start & ~PAGE_MASK)
+ goto out;
+ /* round the length up to a whole number of pages */
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ goto out;
+
+ /* reject ranges that wrap around the address space */
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ /* an empty range is a successful no-op */
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
+ */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ /* hole before this vma: remember it and skip ahead */
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = madvise_vma(vma, &prev, start, tmp, behavior);
+ if (error)
+ goto out;
+ start = tmp;
+ /* the helper may have merged vmas; continue after the one it reported */
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ goto out;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(current->mm, start);
+ }
+out:
+ if (write)
+ up_write(&current->mm->mmap_sem);
+ else
+ up_read(&current->mm->mmap_sem);
+
+ return error;
+}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 0000000..866dcc7
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,1174 @@
+/* memcontrol.c - Memory Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/res_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/page-flags.h>
+#include <linux/backing-dev.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
+
+#include <asm/uaccess.h>
+
+struct cgroup_subsys mem_cgroup_subsys __read_mostly;
+#define MEM_CGROUP_RECLAIM_RETRIES 5
+
+/*
+ * Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+ /*
+ * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+ */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
+ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
+
+ MEM_CGROUP_STAT_NSTATS,
+};
+
+struct mem_cgroup_stat_cpu {
+ s64 count[MEM_CGROUP_STAT_NSTATS];
+} ____cacheline_aligned_in_smp;
+
+struct mem_cgroup_stat {
+ struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+};
+
+/*
+ * Add @val to counter @idx of one per-cpu statistics slot.
+ * For accounting under irq disable, no need for increment preempt count.
+ * The update is a plain, non-atomic addition.
+ */
+static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ stat->count[idx] += val;
+}
+
+/*
+ * Sum counter @idx over the per-cpu slots of every possible cpu and
+ * return the total.
+ */
+static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
+		enum mem_cgroup_stat_index idx)
+{
+	s64 sum = 0;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct mem_cgroup_stat_cpu *slot = &stat->cpustat[i];
+
+		sum += slot->count[idx];
+	}
+
+	return sum;
+}
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+ /*
+ * spin_lock to protect the per cgroup LRU
+ */
+ spinlock_t lru_lock;
+ struct list_head lists[NR_LRU_LISTS];
+ unsigned long count[NR_LRU_LISTS];
+};
+/* Macro for accessing counter */
+#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
+
+struct mem_cgroup_per_node {
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_lru_info {
+ struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ *
+ * TODO: Add a water mark for the memory controller. Reclaim will begin when
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark, this is
+ * a feature that will be implemented much later in the future.
+ */
+struct mem_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the counter to account for memory usage
+ */
+ struct res_counter res;
+ /*
+ * Per cgroup active and inactive list, similar to the
+ * per zone LRU lists.
+ */
+ struct mem_cgroup_lru_info info;
+
+ int prev_priority; /* for recording reclaim priority */
+ /*
+ * statistics.
+ */
+ struct mem_cgroup_stat stat;
+};
+static struct mem_cgroup init_mem_cgroup;
+
+enum charge_type {
+ MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
+ MEM_CGROUP_CHARGE_TYPE_MAPPED,
+ MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
+ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
+ NR_CHARGE_TYPE,
+};
+
+/* only for here (for easy reading.) */
+#define PCGF_CACHE (1UL << PCG_CACHE)
+#define PCGF_USED (1UL << PCG_USED)
+#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
+#define PCGF_LOCK (1UL << PCG_LOCK)
+#define PCGF_FILE (1UL << PCG_FILE)
+static const unsigned long
+pcg_default_flags[NR_CHARGE_TYPE] = {
+ PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
+ PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
+ PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+ 0, /* FORCE */
+};
+
+/*
+ * Update the per-cpu statistics of @mem for one page being charged
+ * (@charge true) or uncharged (@charge false).
+ *
+ * The CACHE or RSS counter, chosen by PageCgroupCache(@pc), moves by +1 or
+ * -1, and the page-in or page-out event counter is incremented.
+ * Always modified under lru lock. Then, not necessary to preempt_disable()
+ * Interrupts must be disabled (asserted).
+ */
+static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ bool charge)
+{
+ int val = (charge)? 1 : -1;
+ struct mem_cgroup_stat *stat = &mem->stat;
+ struct mem_cgroup_stat_cpu *cpustat;
+
+ VM_BUG_ON(!irqs_disabled());
+
+ /* use the slot of the cpu we are running on */
+ cpustat = &stat->cpustat[smp_processor_id()];
+ if (PageCgroupCache(pc))
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+ else
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+
+ /* the event counters only ever grow */
+ if (charge)
+ __mem_cgroup_stat_add_safe(cpustat,
+ MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+ else
+ __mem_cgroup_stat_add_safe(cpustat,
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+}
+
+/*
+ * Return the per-zone LRU bookkeeping of @mem for node @nid, zone @zid.
+ * The node's entry in info.nodeinfo[] is dereferenced without a check.
+ */
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+ return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+/*
+ * Return the per-zone bookkeeping that @pc belongs to, i.e. the entry of
+ * pc->mem_cgroup for the node and zone reported for @pc.
+ */
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	return mem_cgroup_zoneinfo(pc->mem_cgroup,
+				   page_cgroup_nid(pc),
+				   page_cgroup_zid(pc));
+}
+
+/*
+ * Return the number of pages @mem has on LRU list @idx, summed over every
+ * zone of every online node.
+ */
+static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
+ enum lru_list idx)
+{
+ int nid, zid;
+ struct mem_cgroup_per_zone *mz;
+ u64 total = 0;
+
+ for_each_online_node(nid)
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ total += MEM_CGROUP_ZSTAT(mz, idx);
+ }
+ /* the u64 sum is converted to the unsigned long return type */
+ return total;
+}
+
+/*
+ * Map a cgroup to its mem_cgroup: look up the memory subsystem state of
+ * @cont and step back to the structure that embeds it as 'css'.
+ */
+static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+{
+ return container_of(cgroup_subsys_state(cont,
+ mem_cgroup_subsys_id), struct mem_cgroup,
+ css);
+}
+
+/*
+ * Return the mem_cgroup that task @p belongs to, or NULL if @p is NULL.
+ */
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+{
+ /*
+ * mm_update_next_owner() may clear mm->owner to NULL
+ * if it races with swapoff, page migration, etc.
+ * So this can be called with p == NULL.
+ */
+ if (unlikely(!p))
+ return NULL;
+
+ return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
+ struct mem_cgroup, css);
+}
+
+/*
+ * Take @pc off the per-zone LRU list it is on, as determined by its
+ * Unevictable/Active/File flags, and update the list count and the
+ * cgroup's charge statistics accordingly.
+ */
+static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
+ struct page_cgroup *pc)
+{
+ int lru = LRU_BASE;
+
+ /* work out the list index from the page_cgroup flags */
+ if (PageCgroupUnevictable(pc))
+ lru = LRU_UNEVICTABLE;
+ else {
+ if (PageCgroupActive(pc))
+ lru += LRU_ACTIVE;
+ if (PageCgroupFile(pc))
+ lru += LRU_FILE;
+ }
+
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+
+ mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
+ list_del(&pc->lru);
+}
+
+/*
+ * Put @pc at the head of the per-zone LRU list selected by its
+ * Unevictable/Active/File flags, and update the list count and the
+ * cgroup's charge statistics accordingly.
+ */
+static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
+ struct page_cgroup *pc)
+{
+ int lru = LRU_BASE;
+
+ /* work out the list index from the page_cgroup flags */
+ if (PageCgroupUnevictable(pc))
+ lru = LRU_UNEVICTABLE;
+ else {
+ if (PageCgroupActive(pc))
+ lru += LRU_ACTIVE;
+ if (PageCgroupFile(pc))
+ lru += LRU_FILE;
+ }
+
+ MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ list_add(&pc->lru, &mz->lists[lru]);
+
+ mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
+}
+
+/*
+ * Move @pc from the LRU list implied by its current flags to list @lru,
+ * updating the flags and both list counts. Does nothing if @pc is already
+ * on @lru.
+ */
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+ int active = PageCgroupActive(pc);
+ int file = PageCgroupFile(pc);
+ int unevictable = PageCgroupUnevictable(pc);
+ /* list the flags say the page_cgroup is on right now */
+ enum lru_list from = unevictable ? LRU_UNEVICTABLE :
+ (LRU_FILE * !!file + !!active);
+
+ if (lru == from)
+ return;
+
+ MEM_CGROUP_ZSTAT(mz, from) -= 1;
+ /*
+ * However this is done under mz->lru_lock, another flags, which
+ * are not related to LRU, will be modified from out-of-lock.
+ * We have to use atomic set/clear flags.
+ */
+ if (is_unevictable_lru(lru)) {
+ ClearPageCgroupActive(pc);
+ SetPageCgroupUnevictable(pc);
+ } else {
+ if (is_active_lru(lru))
+ SetPageCgroupActive(pc);
+ else
+ ClearPageCgroupActive(pc);
+ ClearPageCgroupUnevictable(pc);
+ }
+
+ MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ list_move(&pc->lru, &mz->lists[lru]);
+}
+
+/*
+ * Return nonzero if @task has an mm and that mm matches cgroup @mem,
+ * 0 otherwise. task->mm is examined under task_lock().
+ */
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+{
+ int ret;
+
+ task_lock(task);
+ ret = task->mm && mm_match_cgroup(task->mm, mem);
+ task_unlock(task);
+ return ret;
+}
+
+/*
+ * Mirror a global LRU move of @page onto the owning cgroup's per-zone
+ * list @lru. This is best effort: the move is skipped when the memory
+ * controller is disabled, when @page has no page_cgroup, when the
+ * page_cgroup lock cannot be taken immediately, or when the page_cgroup
+ * is not in use.
+ *
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
+{
+	struct page_cgroup *pc;
+	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
+
+	if (mem_cgroup_subsys.disabled)
+		return;
+
+	/*
+	 * lookup_page_cgroup() can return NULL (the charge path notes this
+	 * happens at boot), so check before the trylock touches pc.
+	 */
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc))
+		return;
+
+	/*
+	 * We cannot lock_page_cgroup while holding zone's lru_lock,
+	 * because other holders of lock_page_cgroup can be interrupted
+	 * with an attempt to rotate_reclaimable_page. But we cannot
+	 * safely get to page_cgroup without it, so just try_lock it:
+	 * mem_cgroup_isolate_pages allows for page left on wrong list.
+	 */
+	if (!trylock_page_cgroup(pc))
+		return;
+	if (PageCgroupUsed(pc)) {
+		mz = page_cgroup_zoneinfo(pc);
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		__mem_cgroup_move_lists(pc, lru);
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+	}
+	unlock_page_cgroup(pc);
+}
+
+/*
+ * Calculate mapped_ratio under memory controller. This will be used in
+ * vmscan.c for determining whether we have to reclaim mapped pages.
+ *
+ * Returns the RSS page count as a percentage of the cgroup's usage in
+ * pages plus one.
+ */
+int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
+{
+ long total, rss;
+
+ /*
+ * usage is recorded in bytes. But, here, we assume the number of
+ * physical pages can be represented by "long" on any arch.
+ */
+ /* the + 1L keeps the divisor below from ever being zero */
+ total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
+ rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+ return (int)((rss * 100L) / total);
+}
+
+/*
+ * prev_priority control...this will be used in memory reclaim path.
+ *
+ * Return the reclaim priority last stored for @mem.
+ */
+int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
+{
+ return mem->prev_priority;
+}
+
+/*
+ * Lower the stored reclaim priority of @mem to @priority. The stored
+ * value is left alone unless @priority is numerically smaller.
+ */
+void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
+{
+	if (priority >= mem->prev_priority)
+		return;
+
+	mem->prev_priority = priority;
+}
+
+/* Unconditionally store @priority as the reclaim priority of @mem. */
+void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
+{
+ mem->prev_priority = priority;
+}
+
+/*
+ * Work out how many pages to scan on one LRU list of a cgroup for the
+ * given zone and reclaim priority. See also vmscan.c
+ *
+ * The result is the cgroup's page count on list @lru in @zone, shifted
+ * right by @priority. Priority starts at "DEF_PRIORITY" and is
+ * decremented in each reclaim loop (see include/linux/mmzone.h).
+ */
+
+long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	mz = mem_cgroup_zoneinfo(mem, zone->zone_pgdat->node_id,
+				 zone_idx(zone));
+
+	return (long)MEM_CGROUP_ZSTAT(mz, lru) >> priority;
+}
+
+/*
+ * Isolate pages for reclaim from one per-zone LRU list of @mem_cont.
+ *
+ * Walks the list selected by @active and @file from its tail, examining
+ * at most @nr_to_scan eligible entries. Pages that __isolate_lru_page()
+ * accepts are moved onto @dst. *@scanned receives the number of entries
+ * examined; the return value is the number of pages isolated.
+ * @order is not used by this function.
+ */
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+ struct list_head *dst,
+ unsigned long *scanned, int order,
+ int mode, struct zone *z,
+ struct mem_cgroup *mem_cont,
+ int active, int file)
+{
+ unsigned long nr_taken = 0;
+ struct page *page;
+ unsigned long scan;
+ LIST_HEAD(pc_list);
+ struct list_head *src;
+ struct page_cgroup *pc, *tmp;
+ int nid = z->zone_pgdat->node_id;
+ int zid = zone_idx(z);
+ struct mem_cgroup_per_zone *mz;
+ int lru = LRU_FILE * !!file + !!active;
+
+ BUG_ON(!mem_cont);
+ mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+ src = &mz->lists[lru];
+
+ spin_lock(&mz->lru_lock);
+ scan = 0;
+ list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
+ if (scan >= nr_to_scan)
+ break;
+ /* skipped entries below do not count towards the scan budget */
+ if (unlikely(!PageCgroupUsed(pc)))
+ continue;
+ page = pc->page;
+
+ if (unlikely(!PageLRU(page)))
+ continue;
+
+ /*
+ * TODO: play better with lumpy reclaim, grabbing anything.
+ */
+ /* page state disagrees with this list: move it to the right one */
+ if (PageUnevictable(page) ||
+ (PageActive(page) && !active) ||
+ (!PageActive(page) && active)) {
+ __mem_cgroup_move_lists(pc, page_lru(page));
+ continue;
+ }
+
+ scan++;
+ /* park the entry on a private list while the walk continues */
+ list_move(&pc->lru, &pc_list);
+
+ if (__isolate_lru_page(page, mode, file) == 0) {
+ list_move(&page->lru, dst);
+ nr_taken++;
+ }
+ }
+
+ /* put every examined entry back at the head of the source list */
+ list_splice(&pc_list, src);
+ spin_unlock(&mz->lru_lock);
+
+ *scanned = scan;
+ return nr_taken;
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ *
+ * The cgroup charged is @memcg when it is non-NULL, otherwise the one
+ * looked up from mm->owner. 0 is also returned, without leaving a charge
+ * behind, when there is no page_cgroup for the page, when no mem_cgroup
+ * is found for the task, or when the page_cgroup is already in use.
+ * While res_counter_charge() fails and @gfp_mask contains __GFP_WAIT,
+ * reclaim is attempted; after MEM_CGROUP_RECLAIM_RETRIES fruitless rounds
+ * mem_cgroup_out_of_memory() is called and -ENOMEM is returned.
+ */
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *mem;
+ struct page_cgroup *pc;
+ unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup_per_zone *mz;
+ unsigned long flags;
+
+ pc = lookup_page_cgroup(page);
+ /* can happen at boot */
+ if (unlikely(!pc))
+ return 0;
+ prefetchw(pc);
+ /*
+ * We always charge the cgroup the mm_struct belongs to.
+ * The mm_struct's mem_cgroup changes on task migration if the
+ * thread group leader migrates. It's possible that mm is not
+ * set, if so charge the init_mm (happens for pagecache usage).
+ */
+
+ if (likely(!memcg)) {
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ /*
+ * For every charge from the cgroup, increment reference count
+ */
+ css_get(&mem->css);
+ rcu_read_unlock();
+ } else {
+ mem = memcg;
+ css_get(&memcg->css);
+ }
+
+ /* Loop until the charge is accepted or we give up. */
+ while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
+ /* Caller cannot wait for reclaim: fail immediately. */
+ if (!(gfp_mask & __GFP_WAIT))
+ goto out;
+
+ if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+ continue;
+
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (res_counter_check_under_limit(&mem->res))
+ continue;
+
+ /* Only rounds that made no progress consume the retry budget. */
+ if (!nr_retries--) {
+ mem_cgroup_out_of_memory(mem, gfp_mask);
+ goto out;
+ }
+ }
+
+
+ lock_page_cgroup(pc);
+ /* Already charged: undo our charge and reference, report success. */
+ if (unlikely(PageCgroupUsed(pc))) {
+ unlock_page_cgroup(pc);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+
+ goto done;
+ }
+ pc->mem_cgroup = mem;
+ /*
+ * If a page is accounted as a page cache, insert to inactive list.
+ * If anon, insert to active list.
+ */
+ pc->flags = pcg_default_flags[ctype];
+
+ mz = page_cgroup_zoneinfo(pc);
+
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_add_list(mz, pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ unlock_page_cgroup(pc);
+
+done:
+ return 0;
+out:
+ /* Charge failed: drop the reference taken above. */
+ css_put(&mem->css);
+ return -ENOMEM;
+}
+
+/*
+ * Charge a page as MEM_CGROUP_CHARGE_TYPE_MAPPED on behalf of @mm
+ * (init_mm when @mm is NULL). Returns 0 without charging when the
+ * controller is disabled, for compound pages, and for pages that do not
+ * need accounting here (see below); otherwise returns the result of
+ * mem_cgroup_charge_common().
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+{
+	if (mem_cgroup_subsys.disabled || PageCompound(page))
+		return 0;
+
+	/*
+	 * If already mapped, we don't have to account.
+	 * If page cache, page->mapping has address_space.
+	 * But page->mapping may have an out-of-use anon_vma pointer;
+	 * detect it with the PageAnon() check. A newly-mapped anon page's
+	 * page->mapping is NULL.
+	 */
+	if (page_mapped(page))
+		return 0;
+	if (page->mapping && !PageAnon(page))
+		return 0;
+
+	if (unlikely(!mm))
+		mm = &init_mm;
+
+	return mem_cgroup_charge_common(page, mm, gfp_mask,
+				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+}
+
+/*
+ * Charge a page cache page on behalf of @mm (init_mm when @mm is NULL).
+ * The charge type is CACHE for file cache pages and SHMEM otherwise.
+ * Returns 0 without charging when the controller is disabled, for compound
+ * pages, and in the !__GFP_WAIT corner case described below.
+ */
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+				gfp_t gfp_mask)
+{
+	enum charge_type ctype;
+
+	if (mem_cgroup_subsys.disabled || PageCompound(page))
+		return 0;
+
+	/*
+	 * Corner case handling. This is usually called from
+	 * add_to_page_cache(), but some filesystems (shmem) precharge the
+	 * page first and then call add_to_page_cache() with GFP_NOWAIT.
+	 *
+	 * So for callers that cannot wait, look at the page_cgroup first
+	 * and avoid charging twice. (A second charge would work, but costs
+	 * a bit more.)
+	 */
+	if (!(gfp_mask & __GFP_WAIT)) {
+		struct page_cgroup *pc = lookup_page_cgroup(page);
+		int used;
+
+		if (!pc)
+			return 0;
+
+		lock_page_cgroup(pc);
+		used = PageCgroupUsed(pc);
+		unlock_page_cgroup(pc);
+
+		if (used)
+			return 0;
+	}
+
+	if (unlikely(!mm))
+		mm = &init_mm;
+
+	if (page_is_file_cache(page))
+		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+	else
+		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+
+	return mem_cgroup_charge_common(page, mm, gfp_mask, ctype, NULL);
+}
+
+/*
+ * uncharge if !page_mapped(page)
+ *
+ * Does nothing when the controller is disabled, when the page has no
+ * page_cgroup, or when the page_cgroup is not in use. For
+ * MEM_CGROUP_CHARGE_TYPE_MAPPED the uncharge is also skipped while the
+ * page is still mapped. Otherwise the page_cgroup is marked unused and
+ * removed from its per-zone list, PAGE_SIZE is uncharged from the owning
+ * mem_cgroup, and one css reference on that mem_cgroup is dropped.
+ */
+static void
+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem;
+ struct mem_cgroup_per_zone *mz;
+ unsigned long flags;
+
+ if (mem_cgroup_subsys.disabled)
+ return;
+
+ /*
+ * Check if our page_cgroup is valid
+ */
+ pc = lookup_page_cgroup(page);
+ /* Unlocked early check; the state is re-tested under the lock below. */
+ if (unlikely(!pc || !PageCgroupUsed(pc)))
+ return;
+
+ lock_page_cgroup(pc);
+ if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
+ || !PageCgroupUsed(pc)) {
+ /* This happens at race in zap_pte_range() and do_swap_page()*/
+ unlock_page_cgroup(pc);
+ return;
+ }
+ ClearPageCgroupUsed(pc);
+ /* Read the owner while still holding the page_cgroup lock. */
+ mem = pc->mem_cgroup;
+
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_remove_list(mz, pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ unlock_page_cgroup(pc);
+
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+
+ return;
+}
+
+/*
+ * Uncharge @page as MEM_CGROUP_CHARGE_TYPE_MAPPED, unless it is still
+ * mapped or has a non-anon page->mapping.
+ */
+void mem_cgroup_uncharge_page(struct page *page)
+{
+	/* Cheap early checks before going through the common path. */
+	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
+		return;
+
+	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+}
+
+/*
+ * Uncharge @page as MEM_CGROUP_CHARGE_TYPE_CACHE. The page is expected
+ * to be unmapped and to have a NULL page->mapping by the time this is
+ * called; both conditions are asserted with VM_BUG_ON().
+ */
+void mem_cgroup_uncharge_cache_page(struct page *page)
+{
+ VM_BUG_ON(page_mapped(page));
+ VM_BUG_ON(page->mapping);
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
+}
+
+/*
+ * Before starting migration, account against new page.
+ *
+ * If the old @page is charged, @newpage is charged to the same mem_cgroup
+ * with a charge type derived from the old page_cgroup (CACHE or SHMEM for
+ * cache-flagged page_cgroups, MAPPED otherwise).
+ *
+ * Returns 0 when nothing had to be charged (controller disabled, no
+ * page_cgroup for @page, or @page not charged); otherwise the result of
+ * mem_cgroup_charge_common().
+ */
+int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
+{
+	struct page_cgroup *pc;
+	struct mem_cgroup *mem = NULL;
+	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+	int ret = 0;
+
+	if (mem_cgroup_subsys.disabled)
+		return 0;
+
+	pc = lookup_page_cgroup(page);
+	/*
+	 * lookup_page_cgroup() may return NULL (the charge paths in this
+	 * file check for that); a page without a page_cgroup cannot be
+	 * charged, so there is nothing to carry over to the new page.
+	 */
+	if (unlikely(!pc))
+		return 0;
+
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
+		mem = pc->mem_cgroup;
+		/* Keep the cgroup alive once the page_cgroup lock is gone. */
+		css_get(&mem->css);
+		if (PageCgroupCache(pc)) {
+			if (page_is_file_cache(page))
+				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+			else
+				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+		}
+	}
+	unlock_page_cgroup(pc);
+	if (mem) {
+		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
+			ctype, mem);
+		/* Drop the temporary reference taken under the lock. */
+		css_put(&mem->css);
+	}
+	return ret;
+}
+
+/* Remove the redundant charge on @newpage if migration failed. */
+void mem_cgroup_end_migration(struct page *newpage)
+{
+	/*
+	 * On success, page->mapping is not NULL.
+	 * Special rollback care is necessary when
+	 * 1. migration failed (newpage->mapping is cleared in this case);
+	 * 2. the newpage was moved but not remapped again because the task
+	 *    exits and the newpage is obsolete. In this case the new page
+	 *    may be a swapcache page, so we simply always call
+	 *    mem_cgroup_uncharge_page() to avoid a mess. The page_cgroup
+	 *    will be removed if unnecessary. File cache pages are still on
+	 *    the radix-tree; don't care about them.
+	 */
+	if (!newpage->mapping) {
+		__mem_cgroup_uncharge_common(newpage,
+					     MEM_CGROUP_CHARGE_TYPE_FORCE);
+		return;
+	}
+
+	if (PageAnon(newpage))
+		mem_cgroup_uncharge_page(newpage);
+}
+
+/*
+ * A call to try to shrink memory usage under specified resource controller.
+ * This is typically used for page reclaiming for shmem for reducing side
+ * effect of page allocation from shmem, which is used by some mem_cgroup.
+ *
+ * Returns 0 when the controller is disabled, when @mm is NULL, when no
+ * mem_cgroup is found for mm->owner, or as soon as one round reports
+ * progress (pages freed or usage under the limit). Returns -ENOMEM after
+ * MEM_CGROUP_RECLAIM_RETRIES rounds without progress.
+ */
+int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
+{
+ struct mem_cgroup *mem;
+ int progress = 0;
+ int retry = MEM_CGROUP_RECLAIM_RETRIES;
+
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+ if (!mm)
+ return 0;
+
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ /* Pin the cgroup before leaving the RCU read-side section. */
+ css_get(&mem->css);
+ rcu_read_unlock();
+
+ /*
+ * retry is only decremented on rounds without progress, so it can
+ * reach 0 only when every round failed.
+ */
+ do {
+ progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+ progress += res_counter_check_under_limit(&mem->res);
+ } while (!progress && --retry);
+
+ css_put(&mem->css);
+ if (!retry)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Set the limit of @memcg to @val, reclaiming from the cgroup for as long
+ * as res_counter_set_limit() refuses the new value.
+ *
+ * Returns 0 on success, -EINTR if a signal is pending, or -EBUSY once
+ * MEM_CGROUP_RECLAIM_RETRIES reclaim attempts have made no progress.
+ */
+int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
+{
+	int retries_left = MEM_CGROUP_RECLAIM_RETRIES;
+
+	while (res_counter_set_limit(&memcg->res, val)) {
+		if (signal_pending(current))
+			return -EINTR;
+		if (!retries_left)
+			return -EBUSY;
+
+		/* Only fruitless reclaim attempts use up the budget. */
+		if (!try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL))
+			retries_left--;
+	}
+
+	return 0;
+}
+
+
+/*
+ * This routine traverses the page_cgroups in the given list and drops them all.
+ * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
+ */
+#define FORCE_UNCHARGE_BATCH	(128)
+/*
+ * Force-uncharge the page_cgroups on list @lru of @mz, working from the
+ * tail of the list. The walk stops early at the first entry that is not
+ * in use or whose page is not on the LRU. cond_resched() is called after
+ * every FORCE_UNCHARGE_BATCH uncharges. @mem is not used in this body.
+ */
+static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+			    struct mem_cgroup_per_zone *mz,
+			    enum lru_list lru)
+{
+	struct page_cgroup *pc;
+	struct page *page;
+	int count = FORCE_UNCHARGE_BATCH;
+	unsigned long flags;
+	struct list_head *list;
+
+	list = &mz->lists[lru];
+
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	while (!list_empty(list)) {
+		pc = list_entry(list->prev, struct page_cgroup, lru);
+		page = pc->page;
+		if (!PageCgroupUsed(pc))
+			break;
+		/* Pin the page so it stays valid once lru_lock is dropped. */
+		get_page(page);
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+		/*
+		 * Check if this page is on LRU. !LRU page can be found
+		 * if it's under page migration.
+		 */
+		if (PageLRU(page)) {
+			__mem_cgroup_uncharge_common(page,
+					MEM_CGROUP_CHARGE_TYPE_FORCE);
+			put_page(page);
+			if (--count <= 0) {
+				count = FORCE_UNCHARGE_BATCH;
+				cond_resched();
+			}
+		} else {
+			/*
+			 * Drop the reference taken above before bailing out;
+			 * without this the page's refcount is leaked on
+			 * every early exit.
+			 */
+			put_page(page);
+			spin_lock_irqsave(&mz->lru_lock, flags);
+			break;
+		}
+		spin_lock_irqsave(&mz->lru_lock, flags);
+	}
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+}
+
+/*
+ * make mem_cgroup's charge to be 0 if there is no task.
+ * This enables deleting this mem_cgroup.
+ *
+ * Returns 0 once mem->res.usage has dropped to 0, or -EBUSY if the
+ * cgroup's count is found to be positive while usage is still non-zero.
+ * A css reference is held for the duration of the call.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *mem)
+{
+ int ret = -EBUSY;
+ int node, zid;
+
+ css_get(&mem->css);
+ /*
+ * page reclaim code (kswapd etc..) will move pages between
+ * active_list <-> inactive_list while we don't take a lock.
+ * So, we have to do loop here until all lists are empty.
+ */
+ while (mem->res.usage > 0) {
+ if (atomic_read(&mem->css.cgroup->count) > 0)
+ goto out;
+ /* This is for making all *used* pages to be on LRU. */
+ lru_add_drain_all();
+ /* Empty every LRU list of every zone on every possible node. */
+ for_each_node_state(node, N_POSSIBLE)
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct mem_cgroup_per_zone *mz;
+ enum lru_list l;
+ mz = mem_cgroup_zoneinfo(mem, node, zid);
+ for_each_lru(l)
+ mem_cgroup_force_empty_list(mem, mz, l);
+ }
+ cond_resched();
+ }
+ ret = 0;
+out:
+ css_put(&mem->css);
+ return ret;
+}
+
+/*
+ * read_u64 handler: return the res_counter member selected by
+ * cft->private for the mem_cgroup behind @cont.
+ */
+static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	return res_counter_read_u64(&mem->res, cft->private);
+}
+/*
+ * write_string handler. The only member it accepts is RES_LIMIT; any
+ * other cft->private value yields -EINVAL.
+ *
+ * Returns the parse error from res_counter_memparse_write_strategy() if
+ * @buffer cannot be parsed, otherwise the result of
+ * mem_cgroup_resize_limit().
+ */
+static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
+			    const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long long val;
+	int ret;
+
+	if (cft->private != RES_LIMIT)
+		return -EINVAL; /* should be BUG() ? */
+
+	/* This function does all the necessary parsing... reuse it. */
+	ret = res_counter_memparse_write_strategy(buffer, &val);
+	if (ret)
+		return ret;
+
+	return mem_cgroup_resize_limit(memcg, val);
+}
+
+/*
+ * trigger handler: reset the max-usage watermark (RES_MAX_USAGE) or the
+ * failure counter (RES_FAILCNT) of the cgroup's res_counter. Any other
+ * @event is silently ignored. Always returns 0.
+ */
+static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	if (event == RES_MAX_USAGE)
+		res_counter_reset_max(&mem->res);
+	else if (event == RES_FAILCNT)
+		res_counter_reset_failcnt(&mem->res);
+
+	return 0;
+}
+
+/*
+ * trigger handler for the "force_empty" file: run
+ * mem_cgroup_force_empty() on the cgroup and pass its result back.
+ * @event is ignored.
+ */
+static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	return mem_cgroup_force_empty(mem);
+}
+
+/*
+ * Presentation table for the per-cgroup statistics, indexed by the
+ * MEM_CGROUP_STAT_* value: @msg is the key reported to userspace and
+ * @unit is the factor the raw counter is multiplied by before reporting
+ * (PAGE_SIZE for cache/rss, 1 for the pgpgin/pgpgout counts).
+ */
+static const struct mem_cgroup_stat_desc {
+ const char *msg;
+ u64 unit;
+} mem_cgroup_stat_desc[] = {
+ [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
+ [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+ [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
+ [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
+};
+
+/*
+ * read_map handler for the "stat" file.
+ *
+ * First reports every counter of the cgroup's stat array, scaled by the
+ * unit from mem_cgroup_stat_desc[], then the size of each LRU list
+ * (summed by mem_cgroup_get_all_zonestat()) multiplied by PAGE_SIZE.
+ * Always returns 0.
+ */
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+				 struct cgroup_map_cb *cb)
+{
+	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+	struct mem_cgroup_stat *stat = &mem_cont->stat;
+	unsigned long inactive_anon, active_anon;
+	unsigned long inactive_file, active_file;
+	unsigned long unevictable;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
+		const struct mem_cgroup_stat_desc *desc;
+		s64 val;
+
+		desc = &mem_cgroup_stat_desc[i];
+		val = mem_cgroup_read_stat(stat, i);
+		val *= desc->unit;
+		cb->fill(cb, desc->msg, val);
+	}
+
+	/* showing # of active pages */
+	inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						    LRU_INACTIVE_ANON);
+	active_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						  LRU_ACTIVE_ANON);
+	inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
+						    LRU_INACTIVE_FILE);
+	active_file = mem_cgroup_get_all_zonestat(mem_cont,
+						  LRU_ACTIVE_FILE);
+	unevictable = mem_cgroup_get_all_zonestat(mem_cont,
+						  LRU_UNEVICTABLE);
+
+	cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
+	cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
+	cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
+	cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
+	cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
+
+	return 0;
+}
+
+/*
+ * Control files of the memory controller. The four res_counter-backed
+ * files share mem_cgroup_read() and select the counter member through
+ * .private; "max_usage_in_bytes" and "failcnt" can additionally be reset
+ * through their trigger, and "limit_in_bytes" is writable.
+ */
+static struct cftype mem_cgroup_files[] = {
+ {
+ .name = "usage_in_bytes",
+ .private = RES_USAGE,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .trigger = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "limit_in_bytes",
+ .private = RES_LIMIT,
+ .write_string = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "failcnt",
+ .private = RES_FAILCNT,
+ .trigger = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ /* No .private: the trigger ignores its event argument. */
+ .name = "force_empty",
+ .trigger = mem_force_empty_write,
+ },
+ {
+ .name = "stat",
+ .read_map = mem_control_stat_show,
+ },
+};
+
+/*
+ * Allocate and initialise the per-node info of @mem for @node and store
+ * it in mem->info.nodeinfo[node]. Every zone gets an initialised
+ * lru_lock and empty LRU lists.
+ *
+ * Returns 0 on success, 1 if the allocation failed.
+ */
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+	struct mem_cgroup_per_node *pn;
+	int alloc_node;
+	int zone;
+
+	/*
+	 * This routine is called against possible nodes.
+	 * But it's a BUG to call kmalloc() against an offline node, so a
+	 * node without normal memory is allocated with node id -1 instead.
+	 *
+	 * TODO: this routine can waste much memory for nodes which will
+	 *       never be onlined. It's better to use a memory hotplug
+	 *       callback function.
+	 */
+	alloc_node = node_state(node, N_NORMAL_MEMORY) ? node : -1;
+
+	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, alloc_node);
+	if (!pn)
+		return 1;
+
+	mem->info.nodeinfo[node] = pn;
+	memset(pn, 0, sizeof(*pn));
+
+	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		struct mem_cgroup_per_zone *mz = &pn->zoneinfo[zone];
+		enum lru_list l;
+
+		spin_lock_init(&mz->lru_lock);
+		for_each_lru(l)
+			INIT_LIST_HEAD(&mz->lists[l]);
+	}
+
+	return 0;
+}
+
+/*
+ * Free the per-node info of @mem for @node. The nodeinfo[] slot itself
+ * is not cleared here.
+ */
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+ kfree(mem->info.nodeinfo[node]);
+}
+
+/*
+ * Allocate a zeroed struct mem_cgroup: kmalloc() when the structure is
+ * smaller than a page, vmalloc() otherwise. Returns NULL on failure.
+ * Must be released with mem_cgroup_free(), which applies the same size
+ * test to pick the matching free routine.
+ */
+static struct mem_cgroup *mem_cgroup_alloc(void)
+{
+	struct mem_cgroup *mem;
+
+	if (sizeof(*mem) >= PAGE_SIZE)
+		mem = vmalloc(sizeof(*mem));
+	else
+		mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+
+	if (!mem)
+		return NULL;
+
+	memset(mem, 0, sizeof(*mem));
+	return mem;
+}
+
+/*
+ * Release a struct mem_cgroup obtained from mem_cgroup_alloc(), using
+ * the same size test to choose between vfree() and kfree().
+ */
+static void mem_cgroup_free(struct mem_cgroup *mem)
+{
+	if (sizeof(*mem) >= PAGE_SIZE) {
+		vfree(mem);
+		return;
+	}
+
+	kfree(mem);
+}
+
+
+/*
+ * cgroup "create" callback. A cgroup without a parent uses the
+ * statically defined init_mem_cgroup; any other cgroup gets a freshly
+ * allocated mem_cgroup. The res_counter is initialised and per-zone info
+ * is allocated for every possible node.
+ *
+ * Returns the embedded css on success or ERR_PTR(-ENOMEM) on failure.
+ */
+static struct cgroup_subsys_state *
+mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct mem_cgroup *mem;
+ int node;
+
+ if (unlikely((cont->parent) == NULL)) {
+ mem = &init_mem_cgroup;
+ } else {
+ mem = mem_cgroup_alloc();
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ res_counter_init(&mem->res);
+
+ for_each_node_state(node, N_POSSIBLE)
+ if (alloc_mem_cgroup_per_zone_info(mem, node))
+ goto free_out;
+
+ return &mem->css;
+free_out:
+ /*
+ * Frees the slot of every possible node, including nodes that were
+ * never reached; for a mem_cgroup from mem_cgroup_alloc() those slots
+ * are still NULL from the memset there, and kfree(NULL) is a no-op.
+ */
+ for_each_node_state(node, N_POSSIBLE)
+ free_mem_cgroup_per_zone_info(mem, node);
+ /* init_mem_cgroup is static and must not be freed. */
+ if (cont->parent != NULL)
+ mem_cgroup_free(mem);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * cgroup "pre_destroy" callback: try to drop all remaining charges of
+ * the cgroup. The result of mem_cgroup_force_empty() is not propagated.
+ */
+static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
+					struct cgroup *cont)
+{
+	mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
+}
+
+/*
+ * cgroup "destroy" callback: release the per-node info of every
+ * possible node, then the mem_cgroup itself.
+ */
+static void mem_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	int nid;
+
+	for_each_node_state(nid, N_POSSIBLE)
+		free_mem_cgroup_per_zone_info(mem, nid);
+
+	mem_cgroup_free(mem);
+}
+
+/*
+ * cgroup "populate" callback: register every entry of mem_cgroup_files[]
+ * for @cont and return the result of cgroup_add_files().
+ */
+static int mem_cgroup_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, mem_cgroup_files,
+ ARRAY_SIZE(mem_cgroup_files));
+}
+
+/*
+ * cgroup "attach" callback.
+ *
+ * NOTE(review): as written this takes a reference on the task's mm and
+ * drops it again without doing anything else; mem and old_mem are
+ * computed but never used, and both outcomes of the thread_group_leader()
+ * test end up at the same label. Looks like a leftover skeleton - confirm
+ * before relying on any migration-time behaviour here.
+ */
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ struct mem_cgroup *mem, *old_mem;
+
+ mm = get_task_mm(p);
+ if (mm == NULL)
+ return;
+
+ mem = mem_cgroup_from_cont(cont);
+ old_mem = mem_cgroup_from_cont(old_cont);
+
+ /*
+ * Only thread group leaders are allowed to migrate, the mm_struct is
+ * in effect owned by the leader
+ */
+ if (!thread_group_leader(p))
+ goto out;
+
+out:
+ /* Balance the reference taken by get_task_mm(). */
+ mmput(mm);
+}
+
+/*
+ * Subsystem descriptor for the "memory" controller, wiring the callbacks
+ * defined in this file into the cgroup core. early_init is 0.
+ */
+struct cgroup_subsys mem_cgroup_subsys = {
+ .name = "memory",
+ .subsys_id = mem_cgroup_subsys_id,
+ .create = mem_cgroup_create,
+ .pre_destroy = mem_cgroup_pre_destroy,
+ .destroy = mem_cgroup_destroy,
+ .populate = mem_cgroup_populate,
+ .attach = mem_cgroup_move_task,
+ .early_init = 0,
+};
diff --git a/mm/memory.c b/mm/memory.c
new file mode 100644
index 0000000..fe2257f
--- /dev/null
+++ b/mm/memory.c
@@ -0,0 +1,3051 @@
+/*
+ * linux/mm/memory.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+/*
+ * demand-loading started 01.12.91 - seems it is high on the list of
+ * things wanted, and it should be easy to implement. - Linus
+ */
+
+/*
+ * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
+ * pages started 02.12.91, seems to work. - Linus.
+ *
+ * Tested sharing by executing about 30 /bin/sh: under the old kernel it
+ * would have taken more than the 6M I have free, but it worked well as
+ * far as I could see.
+ *
+ * Also corrected some "invalidate()"s - I wasn't doing enough of them.
+ */
+
+/*
+ * Real VM (paging to/from disk) started 18.12.91. Much more work and
+ * thought has to go into this. Oh, well..
+ * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
+ * Found it. Everything seems to work now.
+ * 20.12.91 - Ok, making the swap-device changeable like the root.
+ */
+
+/*
+ * 05.04.94 - Multi-page memory management added for v1.1.
+ * Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *
+ * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
+ * (Gerhard.Wichert@pdb.siemens.de)
+ *
+ * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/module.h>
+#include <linux/delayacct.h>
+#include <linux/init.h>
+#include <linux/writeback.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+
+#include <linux/swapops.h>
+#include <linux/elf.h>
+
+#include "internal.h"
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+/* use the per-pgdat data instead for discontigmem - mbligh */
+/*
+ * Global page array and its size; only defined (and exported to modules)
+ * when CONFIG_NEED_MULTIPLE_NODES is not set.
+ */
+unsigned long max_mapnr;
+struct page *mem_map;
+
+EXPORT_SYMBOL(max_mapnr);
+EXPORT_SYMBOL(mem_map);
+#endif
+
+/* Exported to modules below. */
+unsigned long num_physpages;
+/*
+ * A number of key systems in x86 including ioremap() rely on the assumption
+ * that high_memory defines the upper bound on direct map memory, the end
+ * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
+ * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
+ * and ZONE_HIGHMEM.
+ */
+void * high_memory;
+
+EXPORT_SYMBOL(num_physpages);
+EXPORT_SYMBOL(high_memory);
+
+/*
+ * Randomize the address space (stacks, mmaps, brk, etc.).
+ *
+ * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
+ * as ancient (libc5 based) binaries can segfault. )
+ *
+ * The initial value is therefore 1 with CONFIG_COMPAT_BRK and 2 without;
+ * the "norandmaps" boot parameter sets it to 0.
+ */
+int randomize_va_space __read_mostly =
+#ifdef CONFIG_COMPAT_BRK
+ 1;
+#else
+ 2;
+#endif
+
+/*
+ * Handler for the "norandmaps" boot parameter: turns address space
+ * randomization off by setting randomize_va_space to 0. The argument
+ * string is ignored. Always returns 1.
+ */
+static int __init disable_randmaps(char *s)
+{
+ randomize_va_space = 0;
+ return 1;
+}
+__setup("norandmaps", disable_randmaps);
+
+
+/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none. Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+/* Report a bad pgd entry via pgd_ERROR(), then clear it. */
+void pgd_clear_bad(pgd_t *pgd)
+{
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+}
+
+/* Report a bad pud entry via pud_ERROR(), then clear it. */
+void pud_clear_bad(pud_t *pud)
+{
+ pud_ERROR(*pud);
+ pud_clear(pud);
+}
+
+/* Report a bad pmd entry via pmd_ERROR(), then clear it. */
+void pmd_clear_bad(pmd_t *pmd)
+{
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+}
+
+/*
+ * Note: this doesn't free the actual pages themselves. That
+ * has been handled earlier when unmapping all the memory regions.
+ *
+ * Detaches the pte table referenced by *pmd: the table is looked up
+ * first, the pmd entry is cleared, the table is handed to pte_free_tlb()
+ * and the mm's nr_ptes count is decremented.
+ */
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ /* Must be read before pmd_clear() wipes the entry. */
+ pgtable_t token = pmd_pgtable(*pmd);
+ pmd_clear(pmd);
+ pte_free_tlb(tlb, token);
+ tlb->mm->nr_ptes--;
+}
+
+/*
+ * Free the pte tables under every populated pmd covering [addr, end)
+ * beneath @pud. Afterwards the pmd table itself is freed (and the pud
+ * entry cleared) only when the range, with its start rounded down to a
+ * PUD boundary, still lies at or above @floor and ends at or below
+ * @ceiling rounded down to a PUD boundary. A @ceiling of 0 stands for
+ * the top of the address space.
+ */
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ unsigned long start;
+
+ start = addr;
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ free_pte_range(tlb, pmd);
+ } while (pmd++, addr = next, addr != end);
+
+ /* Decide whether anything else may still use this pmd table. */
+ start &= PUD_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PUD_MASK;
+ /* Rounded down to 0: would now be mistaken for "top". */
+ if (!ceiling)
+ return;
+ }
+ /* The "- 1"s let a ceiling of 0 compare as the highest address. */
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pmd = pmd_offset(pud, start);
+ pud_clear(pud);
+ pmd_free_tlb(tlb, pmd);
+}
+
+/*
+ * Same scheme as free_pmd_range(), one level up: descend into every
+ * populated pud covering [addr, end) beneath @pgd, then free the pud
+ * table itself (clearing the pgd entry) only when the PGDIR-aligned
+ * start is at or above @floor and @end does not exceed @ceiling rounded
+ * down to a PGDIR boundary. A @ceiling of 0 stands for the top of the
+ * address space.
+ */
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ pud_t *pud;
+ unsigned long next;
+ unsigned long start;
+
+ start = addr;
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+ } while (pud++, addr = next, addr != end);
+
+ /* Decide whether anything else may still use this pud table. */
+ start &= PGDIR_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PGDIR_MASK;
+ /* Rounded down to 0: would now be mistaken for "top". */
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pud = pud_offset(pgd, start);
+ pgd_clear(pgd);
+ pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ *
+ * [addr, end) is the range whose page tables are no longer needed;
+ * [floor, ceiling) bounds the region in which whole tables may be freed.
+ * A ceiling of 0 means the top of the address space (see below).
+ */
+void free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long start;
+
+ /*
+ * The next few lines have given us lots of grief...
+ *
+ * Why are we testing PMD* at this top level? Because often
+ * there will be no work to do at all, and we'd prefer not to
+ * go all the way down to the bottom just to discover that.
+ *
+ * Why all these "- 1"s? Because 0 represents both the bottom
+ * of the address space and the top of it (using -1 for the
+ * top wouldn't help much: the masks would do the wrong thing).
+ * The rule is that addr 0 and floor 0 refer to the bottom of
+ * the address space, but end 0 and ceiling 0 refer to the top
+ * Comparisons need to use "end - 1" and "ceiling - 1" (though
+ * that end 0 case should be mythical).
+ *
+ * Wherever addr is brought up or ceiling brought down, we must
+ * be careful to reject "the opposite 0" before it confuses the
+ * subsequent tests. But what about where end is brought down
+ * by PMD_SIZE below? no, end can't go down to 0 there.
+ *
+ * Whereas we round start (addr) and ceiling down, by different
+ * masks at different levels, in order to test whether a table
+ * now has no other vmas using it, so can be freed, we don't
+ * bother to round floor or end up - the tests don't need that.
+ */
+
+ /* Round addr down; if that dips below floor, skip the first pmd. */
+ addr &= PMD_MASK;
+ if (addr < floor) {
+ addr += PMD_SIZE;
+ /* Wrapped past the top of the address space. */
+ if (!addr)
+ return;
+ }
+ if (ceiling) {
+ ceiling &= PMD_MASK;
+ if (!ceiling)
+ return;
+ }
+ /* end reaches beyond the rounded ceiling: leave the last pmd alone. */
+ if (end - 1 > ceiling - 1)
+ end -= PMD_SIZE;
+ /* Nothing left to free after the adjustments. */
+ if (addr > end - 1)
+ return;
+
+ start = addr;
+ pgd = pgd_offset(tlb->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+ } while (pgd++, addr = next, addr != end);
+}
+
+/*
+ * Walk the vma list starting at @vma, unlink each vma from its anon_vma
+ * and file mapping, and free the page tables that covered it.
+ *
+ * For each call down, the upper bound passed is the start of the next
+ * vma in the list, or @ceiling for the last one; @floor is passed
+ * through unchanged. Hugetlb vmas are handled one at a time via
+ * hugetlb_free_pgd_range(); runs of nearby non-hugetlb vmas are merged
+ * into a single free_pgd_range() call.
+ */
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long floor, unsigned long ceiling)
+{
+ while (vma) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long addr = vma->vm_start;
+
+ /*
+ * Hide vma from rmap and vmtruncate before freeing pgtables
+ */
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
+
+ if (is_vm_hugetlb_page(vma)) {
+ hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+ } else {
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ /* "Nearby": the next vma starts within PMD_SIZE of this one's end. */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+ && !is_vm_hugetlb_page(next)) {
+ vma = next;
+ next = vma->vm_next;
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
+ }
+ /* addr is still the start of the first vma of the run. */
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+ }
+ vma = next;
+ }
+}
+
+/*
+ * Allocate a pte table for @address and install it in *@pmd, unless the
+ * pmd turns out to be populated already once page_table_lock is held, in
+ * which case the new table is freed again.
+ *
+ * Returns 0 in both of those cases, -ENOMEM if the allocation failed.
+ */
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ pgtable_t new = pte_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_read_barrier_depends() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
+ spin_lock(&mm->page_table_lock);
+ if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ mm->nr_ptes++;
+ pmd_populate(mm, pmd, new);
+ /* Ownership passed to the page table; nothing to free below. */
+ new = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+ if (new)
+ pte_free(mm, new);
+ return 0;
+}
+
+/*
+ * Kernel-space counterpart of __pte_alloc(): allocate a pte table for
+ * @address on behalf of init_mm and install it in *@pmd unless the pmd
+ * is already populated once init_mm.page_table_lock is held. Unlike
+ * __pte_alloc(), no nr_ptes counter is updated.
+ *
+ * Returns 0 on success (including when the new table was not needed),
+ * -ENOMEM if the allocation failed.
+ */
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+{
+ pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ spin_lock(&init_mm.page_table_lock);
+ if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ pmd_populate_kernel(&init_mm, pmd, new);
+ new = NULL;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (new)
+ pte_free_kernel(&init_mm, new);
+ return 0;
+}
+
+/*
+ * Add @file_rss and @anon_rss to the corresponding counters of @mm,
+ * skipping the update for a delta of 0.
+ *
+ * NOTE(review): the second argument of add_mm_counter() is spelled
+ * exactly like the parameter that supplies the value. Presumably it is a
+ * counter/member name consumed by a macro rather than an expression -
+ * confirm against the add_mm_counter() definition before renaming the
+ * parameters.
+ */
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+ if (file_rss)
+ add_mm_counter(mm, file_rss, file_rss);
+ if (anon_rss)
+ add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+/*
+ * Log a bad pte: for example, a PFN-mapped pte found in a region that
+ * doesn't allow it. The message carries the raw pte value, the process
+ * name (or "???" when the vma does not belong to the current mm), the
+ * vma flags and the faulting virtual address, followed by a stack dump.
+ *
+ * This only reports; the calling function must still handle the error.
+ */
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+			  unsigned long vaddr)
+{
+	const char *comm;
+
+	if (vma->vm_mm == current->mm)
+		comm = current->comm;
+	else
+		comm = "???";
+
+	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+			"vm_flags = %lx, vaddr = %lx\n",
+		(long long)pte_val(pte), comm, vma->vm_flags, vaddr);
+	dump_stack();
+}
+
+/*
+ * A mapping is copy-on-write when it may be written (VM_MAYWRITE) but is
+ * not shared (VM_SHARED clear). Returns non-zero in that case.
+ */
+static inline int is_cow_mapping(unsigned int flags)
+{
+	unsigned int cow_bits = flags & (VM_SHARED | VM_MAYWRITE);
+
+	return cow_bits == VM_MAYWRITE;
+}
+
+/*
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ *
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
+ *
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
+ *
+ * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
+ */
+/*
+ * Turn the presence of an architecture pte_special() bit into a constant
+ * that can be tested with a plain if () in vm_normal_page().
+ */
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ unsigned long pfn;
+
+ /* Trivial case: the architecture marks special ptes itself. */
+ if (HAVE_PTE_SPECIAL) {
+ if (likely(!pte_special(pte))) {
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ return pte_page(pte);
+ }
+ VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+ return NULL;
+ }
+
+ /* !HAVE_PTE_SPECIAL case follows: */
+
+ pfn = pte_pfn(pte);
+
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ /* MIXEDMAP: normal exactly when a struct page exists. */
+ if (!pfn_valid(pfn))
+ return NULL;
+ goto out;
+ } else {
+ unsigned long off;
+ off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ /* Still obeys the linear remap_pfn_range() rule: special. */
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ /* Not a COW mapping, so no pte can have been COWed: special. */
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ VM_BUG_ON(!pfn_valid(pfn));
+
+ /*
+ * NOTE! We still have PageReserved() pages in the page tables.
+ *
+ * eg. VDSO mappings can cause them to exist.
+ */
+out:
+ return pfn_to_page(pfn);
+}
+
+/*
+ * copy one vm_area from one task to the other. Assumes the page tables
+ * already present in the new task to be cleared in the whole range
+ * covered by this vma.
+ *
+ * copy_one_pte() handles a single entry: *src_pte is copied to *dst_pte
+ * for @addr. Non-present swap entries get their swap count duplicated;
+ * present ptes of COW mappings are write-protected in both source and
+ * destination. For a normal page the page reference and rmap are
+ * duplicated and rss[1] (anon) or rss[0] (otherwise) is incremented.
+ */
+
+static inline void
+copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr, int *rss)
+{
+ unsigned long vm_flags = vma->vm_flags;
+ pte_t pte = *src_pte;
+ struct page *page;
+
+ /* pte contains position in swap or file, so copy. */
+ if (unlikely(!pte_present(pte))) {
+ if (!pte_file(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ swap_duplicate(entry);
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ /* Re-check under the lock before linking. */
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ /*
+ * COW mappings require pages in both parent
+ * and child to be set to read.
+ */
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
+ }
+ goto out_set_pte;
+ }
+
+ /*
+ * If it's a COW mapping, write protect it both
+ * in the parent and the child
+ */
+ if (is_cow_mapping(vm_flags)) {
+ ptep_set_wrprotect(src_mm, addr, src_pte);
+ pte = pte_wrprotect(pte);
+ }
+
+ /*
+ * If it's a shared mapping, mark it clean in
+ * the child
+ */
+ if (vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ /* NULL for special mappings: nothing to refcount or account then. */
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ get_page(page);
+ page_dup_rmap(page, vma, addr);
+ rss[!!PageAnon(page)]++;
+ }
+
+out_set_pte:
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+}
+/*
+ * Copy the ptes for [addr, end) below src_pmd into the page table of
+ * dst_mm; the destination pte table is obtained (mapped and locked)
+ * with pte_alloc_map_lock().
+ *
+ * Both page table locks are held while copying. To bound the time they
+ * are held the loop keeps a "progress" count (1 per empty pte, 8 per
+ * copied pte); each time it reaches 32 the loop stops early if a
+ * reschedule or a lock break is wanted. Once the locks are dropped and
+ * cond_resched() has run, the copy restarts at "again" until addr
+ * reaches end.
+ *
+ * Returns 0 on success, or -ENOMEM if pte_alloc_map_lock() fails.
+ */
+static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ pte_t *src_pte, *dst_pte;
+ spinlock_t *src_ptl, *dst_ptl;
+ int progress = 0;
+ int rss[2];
+
+again:
+ rss[1] = rss[0] = 0; /* per pass: flushed by add_mm_rss() below before any restart */
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+ if (!dst_pte)
+ return -ENOMEM;
+ src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_ptl = pte_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); /* taken while dst_ptl is already held */
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ /*
+ * We are holding two locks at this point - either of them
+ * could generate latencies in another task on another CPU.
+ */
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
+ break;
+ }
+ if (pte_none(*src_pte)) {
+ progress++;
+ continue;
+ }
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ progress += 8;
+ } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(src_ptl);
+ pte_unmap_nested(src_pte - 1);
+ add_mm_rss(dst_mm, rss[0], rss[1]);
+ pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ cond_resched();
+ if (addr != end) /* the loop was left early by the latency check */
+ goto again;
+ return 0;
+}
+/*
+ * Copy the pmd level for [addr, end): get the destination pmd with
+ * pmd_alloc() and call copy_pte_range() for every source pmd that
+ * pmd_none_or_clear_bad() does not reject.
+ *
+ * Returns 0 on success, or -ENOMEM if pmd_alloc() or copy_pte_range()
+ * fails.
+ */
+static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ pmd_t *src_pmd, *dst_pmd;
+ unsigned long next;
+
+ dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
+ if (!dst_pmd)
+ return -ENOMEM;
+ src_pmd = pmd_offset(src_pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(src_pmd))
+ continue; /* jumps to the loop condition, so both cursors and addr still advance */
+ if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+ vma, addr, next))
+ return -ENOMEM;
+ } while (dst_pmd++, src_pmd++, addr = next, addr != end);
+ return 0;
+}
+/*
+ * Copy the pud level for [addr, end): get the destination pud with
+ * pud_alloc() and call copy_pmd_range() for every source pud that
+ * pud_none_or_clear_bad() does not reject.
+ *
+ * Returns 0 on success, or -ENOMEM if pud_alloc() or copy_pmd_range()
+ * fails.
+ */
+static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ pud_t *src_pud, *dst_pud;
+ unsigned long next;
+
+ dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
+ if (!dst_pud)
+ return -ENOMEM;
+ src_pud = pud_offset(src_pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(src_pud))
+ continue; /* jumps to the loop condition, so both cursors and addr still advance */
+ if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+ vma, addr, next))
+ return -ENOMEM;
+ } while (dst_pud++, src_pud++, addr = next, addr != end);
+ return 0;
+}
+/*
+ * Copy the page table entries of one vma from src_mm to dst_mm.
+ *
+ * Nothing is copied (and 0 is returned) for a vma that has no anon_vma
+ * and none of VM_HUGETLB, VM_NONLINEAR, VM_PFNMAP or VM_INSERTPAGE set.
+ * Hugetlb vmas are handed to copy_hugetlb_page_range(). Everything else
+ * is walked pgd by pgd over [vma->vm_start, vma->vm_end).
+ *
+ * For COW mappings the walk is bracketed by the mmu notifier
+ * invalidate_range_start/end calls on src_mm.
+ *
+ * Returns 0 on success, -ENOMEM if copy_pud_range() fails, or whatever
+ * copy_hugetlb_page_range() returns for hugetlb vmas.
+ */
+int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ struct vm_area_struct *vma)
+{
+ pgd_t *src_pgd, *dst_pgd;
+ unsigned long next;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ int ret;
+
+ /*
+ * Don't copy ptes where a page fault will fill them correctly.
+ * Fork becomes much lighter when there are big shared or private
+ * readonly mappings. The tradeoff is that copy_page_range is more
+ * efficient than faulting.
+ */
+ if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+ if (!vma->anon_vma)
+ return 0;
+ }
+
+ if (is_vm_hugetlb_page(vma))
+ return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+
+ /*
+ * We need to invalidate the secondary MMU mappings only when
+ * there could be a permission downgrade on the ptes of the
+ * parent mm. And a permission downgrade will only happen if
+ * is_cow_mapping() returns true.
+ */
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier_invalidate_range_start(src_mm, addr, end);
+
+ ret = 0;
+ dst_pgd = pgd_offset(dst_mm, addr);
+ src_pgd = pgd_offset(src_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(src_pgd))
+ continue;
+ if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+ vma, addr, next))) {
+ ret = -ENOMEM;
+ break;
+ }
+ } while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier_invalidate_range_end(src_mm,
+ vma->vm_start, end); /* addr was advanced by the loop, so the start is re-read from the vma */
+ return ret;
+}
+/*
+ * Zap the ptes for [addr, end) below pmd, with the pte lock held.
+ *
+ * *zap_work is the caller's remaining budget: it is reduced by 1 for
+ * every empty pte and by PAGE_SIZE for every other pte, and the loop
+ * stops once it is no longer positive. The address at which the loop
+ * stopped is returned so that the caller can resume from there.
+ *
+ * With a non-NULL details, present pages that fail the check_mapping
+ * or the nonlinear index test, and all non-present entries, are left
+ * in place.
+ */
+static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ long *zap_work, struct zap_details *details)
+{
+ struct mm_struct *mm = tlb->mm;
+ pte_t *pte;
+ spinlock_t *ptl;
+ int file_rss = 0; /* both counters only ever go down; they are applied as deltas below */
+ int anon_rss = 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ do {
+ pte_t ptent = *pte;
+ if (pte_none(ptent)) {
+ (*zap_work)--;
+ continue;
+ }
+
+ (*zap_work) -= PAGE_SIZE;
+
+ if (pte_present(ptent)) {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (unlikely(details) && page) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping &&
+ details->check_mapping != page->mapping)
+ continue;
+ /*
+ * Each page->index must be checked when
+ * invalidating or truncating nonlinear.
+ */
+ if (details->nonlinear_vma &&
+ (page->index < details->first_index ||
+ page->index > details->last_index))
+ continue;
+ }
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (unlikely(!page))
+ continue; /* pte is cleared, but there is no struct page to account or free */
+ if (unlikely(details) && details->nonlinear_vma
+ && linear_page_index(details->nonlinear_vma,
+ addr) != page->index)
+ set_pte_at(mm, addr, pte,
+ pgoff_to_pte(page->index)); /* leave a file pte recording page->index */
+ if (PageAnon(page))
+ anon_rss--;
+ else {
+ if (pte_dirty(ptent))
+ set_page_dirty(page);
+ if (pte_young(ptent))
+ SetPageReferenced(page);
+ file_rss--;
+ }
+ page_remove_rmap(page, vma);
+ tlb_remove_page(tlb, page);
+ continue;
+ }
+ /*
+ * If details->check_mapping, we leave swap entries;
+ * if details->nonlinear_vma, we leave file entries.
+ */
+ if (unlikely(details))
+ continue;
+ if (!pte_file(ptent))
+ free_swap_and_cache(pte_to_swp_entry(ptent));
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+
+ add_mm_rss(mm, file_rss, anon_rss);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+
+ return addr;
+}
+/*
+ * Zap the pmd level for [addr, end). A pmd rejected by
+ * pmd_none_or_clear_bad() costs one unit of *zap_work; any other pmd is
+ * handed to zap_pte_range(), whose return value is where the walk
+ * continues. The walk stops at end or once *zap_work is no longer
+ * positive, and the address reached is returned.
+ */
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ long *zap_work, struct zap_details *details)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd)) {
+ (*zap_work)--;
+ continue;
+ }
+ next = zap_pte_range(tlb, vma, pmd, addr, next,
+ zap_work, details);
+ } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
+}
+/*
+ * Zap the pud level for [addr, end). A pud rejected by
+ * pud_none_or_clear_bad() costs one unit of *zap_work; any other pud is
+ * handed to zap_pmd_range(), whose return value is where the walk
+ * continues. The walk stops at end or once *zap_work is no longer
+ * positive, and the address reached is returned.
+ */
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ long *zap_work, struct zap_details *details)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud)) {
+ (*zap_work)--;
+ continue;
+ }
+ next = zap_pmd_range(tlb, vma, pud, addr, next,
+ zap_work, details);
+ } while (pud++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
+}
+/*
+ * Zap [addr, end) of one vma, walking down from the pgd level between
+ * tlb_start_vma() and tlb_end_vma().
+ *
+ * A details structure with neither check_mapping nor nonlinear_vma set
+ * is treated like no details at all. The walk stops at end or once
+ * *zap_work is no longer positive; the address reached is returned.
+ * addr must be below end (BUG otherwise).
+ */
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ long *zap_work, struct zap_details *details)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ if (details && !details->check_mapping && !details->nonlinear_vma)
+ details = NULL;
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ (*zap_work)--;
+ continue;
+ }
+ next = zap_pud_range(tlb, vma, pgd, addr, next,
+ zap_work, details);
+ } while (pgd++, addr = next, (addr != end && *zap_work > 0));
+ tlb_end_vma(tlb, vma);
+
+ return addr;
+}
+
+#ifdef CONFIG_PREEMPT
+# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
+#else
+/* No preempt: go for improved straight-line efficiency */
+# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
+#endif
+
+/**
+ * unmap_vmas - unmap a range of memory covered by a list of vma's
+ * @tlbp: address of the caller's struct mmu_gather
+ * @vma: the starting vma
+ * @start_addr: virtual address at which to start unmapping
+ * @end_addr: virtual address at which to end unmapping
+ * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Returns the end address of the unmapping (restart addr if interrupted).
+ *
+ * Unmap all pages in the vma list.
+ *
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
+ * return the ending mmu_gather to the caller.
+ *
+ * Only addresses between @start_addr and @end_addr will be unmapped.
+ *
+ * The VMA list must be sorted in ascending virtual address order.
+ *
+ * unmap_vmas() assumes that the caller will flush the whole unmapped address
+ * range after unmap_vmas() returns. So the only responsibility here is to
+ * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
+ * drops the lock and schedules.
+ *
+ * Each time the ZAP_BLOCK_SIZE budget is used up the current mmu_gather
+ * is finished with tlb_finish_mmu() and a new one is stored in *@tlbp.
+ * If @details supplies an i_mmap_lock and a reschedule or a break of that
+ * lock is wanted at that point, *@tlbp is set to NULL instead and the
+ * function returns early with the restart address.
+ */
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ unsigned long end_addr, unsigned long *nr_accounted,
+ struct zap_details *details)
+{
+ long zap_work = ZAP_BLOCK_SIZE;
+ unsigned long tlb_start = 0; /* For tlb_finish_mmu */
+ int tlb_start_valid = 0;
+ unsigned long start = start_addr;
+ spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
+ int fullmm = (*tlbp)->fullmm;
+ struct mm_struct *mm = vma->vm_mm;
+
+ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+ for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+ unsigned long end;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ continue;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ continue;
+
+ if (vma->vm_flags & VM_ACCOUNT)
+ *nr_accounted += (end - start) >> PAGE_SHIFT;
+
+ while (start != end) {
+ if (!tlb_start_valid) {
+ tlb_start = start;
+ tlb_start_valid = 1;
+ }
+
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ /*
+ * It is undesirable to test vma->vm_file as it
+ * should be non-null for valid hugetlb area.
+ * However, vm_file will be NULL in the error
+ * cleanup path of do_mmap_pgoff. When
+ * hugetlbfs ->mmap method fails,
+ * do_mmap_pgoff() nullifies vma->vm_file
+ * before calling this function to clean up.
+ * Since no pte has actually been setup, it is
+ * safe to do nothing in this case.
+ */
+ if (vma->vm_file) {
+ unmap_hugepage_range(vma, start, end, NULL);
+ zap_work -= (end - start) /
+ pages_per_huge_page(hstate_vma(vma));
+ }
+
+ start = end;
+ } else
+ start = unmap_page_range(*tlbp, vma,
+ start, end, &zap_work, details);
+
+ if (zap_work > 0) {
+ BUG_ON(start != end); /* budget left over, so this vma must be finished */
+ break;
+ }
+
+ tlb_finish_mmu(*tlbp, tlb_start, start);
+
+ if (need_resched() ||
+ (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
+ if (i_mmap_lock) {
+ *tlbp = NULL; /* tells the caller that no gather is left to finish */
+ goto out;
+ }
+ cond_resched();
+ }
+
+ *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+ tlb_start_valid = 0;
+ zap_work = ZAP_BLOCK_SIZE;
+ }
+ }
+out:
+ mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
+ return start; /* which is now the end (or restart) address */
+}
+
+/**
+ * zap_page_range - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Sets up an mmu_gather for the vma's mm, lets unmap_vmas() zap
+ * [@address, @address + @size) and finishes the gather unless
+ * unmap_vmas() handed back a NULL one.
+ *
+ * Returns the address reported by unmap_vmas().
+ */
+unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long accounted = 0;
+	unsigned long stop;
+	struct mmu_gather *gather;
+
+	lru_add_drain();
+	gather = tlb_gather_mmu(mm, 0);
+	update_hiwater_rss(mm);
+
+	stop = unmap_vmas(&gather, vma, address, address + size,
+			  &accounted, details);
+	/* unmap_vmas() may have given up its gather; only finish a live one */
+	if (gather != NULL)
+		tlb_finish_mmu(gather, address, stop);
+
+	return stop;
+}
+
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * Only ptes that belong to a VM_PFNMAP vma are unmapped, and the range
+ * [@address, @address + @size) has to lie completely inside @vma.
+ *
+ * Returns 0 if successful, -1 if one of these conditions is not met.
+ */
+int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size)
+{
+	const unsigned long last = address + size;
+
+	/* the range must not stick out of the vma on either side */
+	if (!(address >= vma->vm_start && last <= vma->vm_end))
+		return -1;
+	if (!(vma->vm_flags & VM_PFNMAP))
+		return -1;
+
+	zap_page_range(vma, address, size, NULL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(zap_vma_ptes);
+
+/*
+ * Do a quick page-table lookup for a single page.
+ *
+ * What is returned for @address in @vma:
+ * - the mapped page; a reference is taken if FOLL_GET is set, and with
+ *   FOLL_TOUCH the page is marked accessed (and dirtied first for a
+ *   FOLL_WRITE lookup through a clean pte of a clean page);
+ * - NULL if FOLL_WRITE is set but the pte is not writable, or if the
+ *   pte is neither present nor empty;
+ * - ERR_PTR(-EFAULT) if vm_normal_page() has no page for a present pte;
+ * - for a missing or bad page table level, or an empty pte:
+ *   ZERO_PAGE(0) if FOLL_ANON is set, NULL otherwise.
+ *
+ * Huge mappings are handed to follow_huge_addr(), follow_huge_pud()
+ * and follow_huge_pmd(); FOLL_GET must not be set on those paths (BUG
+ * otherwise).
+ */
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ goto out;
+ }
+
+ page = NULL;
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto no_page_table;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ goto no_page_table;
+ if (pud_huge(*pud)) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+ goto out;
+ }
+ if (unlikely(pud_bad(*pud)))
+ goto no_page_table;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ goto no_page_table;
+ if (pmd_huge(*pmd)) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+ goto out;
+ }
+ if (unlikely(pmd_bad(*pmd)))
+ goto no_page_table;
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ pte = *ptep;
+ if (!pte_present(pte))
+ goto no_page;
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock; /* page is still NULL here */
+ page = vm_normal_page(vma, address, pte);
+ if (unlikely(!page))
+ goto bad_page;
+
+ if (flags & FOLL_GET)
+ get_page(page);
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ mark_page_accessed(page);
+ }
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return page;
+
+bad_page:
+ pte_unmap_unlock(ptep, ptl);
+ return ERR_PTR(-EFAULT);
+
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return page; /* NULL: the pte holds something, but it is not present */
+ /* Fall through to ZERO_PAGE handling */
+no_page_table:
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate page tables.
+ */
+ if (flags & FOLL_ANON) {
+ page = ZERO_PAGE(0);
+ if (flags & FOLL_GET)
+ get_page(page);
+ BUG_ON(flags & FOLL_WRITE);
+ }
+ return page;
+}
+
+/*
+ * Can we do the FOLL_ANON optimization?
+ *
+ * Returns 1 if follow_page() may be asked to hand out the zero page for
+ * untouched parts of this vma, 0 if not.
+ */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+	/*
+	 * VM_LOCKED: make_pages_present() paging in a locked region must
+	 * not be short-cut by the optimization.
+	 * VM_SHARED: the page has to come from the page tables so that we
+	 * serialize and update with any other user of that mapping.
+	 */
+	const unsigned long excluded = VM_LOCKED | VM_SHARED;
+
+	if ((vma->vm_flags & excluded) != 0)
+		return 0;
+
+	/* No vm_ops at all: nothing can supply a fault routine. */
+	if (!vma->vm_ops)
+		return 1;
+
+	/* A fault routine means that this is not an anonymous region. */
+	return vma->vm_ops->fault ? 0 : 1;
+}
+
+
+/*
+ * Look up, faulting in where necessary, up to len pages of mm starting
+ * at address start.
+ *
+ * flags is a mask of GUP_FLAGS_WRITE, GUP_FLAGS_FORCE and
+ * GUP_FLAGS_IGNORE_VMA_PERMISSIONS. If pages is non-NULL the pages found
+ * are stored there and a reference is taken on each of them; if vmas is
+ * non-NULL the vma of every page is stored there.
+ *
+ * Addresses without a vma are still served if they lie in the gate
+ * area; vmas with VM_IO or VM_PFNMAP set are refused, and hugetlb vmas
+ * are handed to follow_hugetlb_page().
+ *
+ * Returns the number of pages processed. If a failure occurs before any
+ * page was processed a negative errno is returned instead; len <= 0
+ * returns 0.
+ */
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int len, int flags,
+ struct page **pages, struct vm_area_struct **vmas)
+{
+ int i;
+ unsigned int vm_flags = 0;
+ int write = !!(flags & GUP_FLAGS_WRITE);
+ int force = !!(flags & GUP_FLAGS_FORCE);
+ int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+
+ if (len <= 0)
+ return 0;
+ /*
+ * Require read or write permissions.
+ * If 'force' is set, we only require the "MAY" flags.
+ */
+ vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ i = 0;
+
+ do {
+ struct vm_area_struct *vma;
+ unsigned int foll_flags;
+
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(tsk, start)) {
+ unsigned long pg = start & PAGE_MASK;
+ struct vm_area_struct *gate_vma = get_gate_vma(tsk);
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* user gate pages are read-only */
+ if (!ignore && write)
+ return i ? : -EFAULT;
+ if (pg > TASK_SIZE)
+ pgd = pgd_offset_k(pg);
+ else
+ pgd = pgd_offset_gate(mm, pg);
+ BUG_ON(pgd_none(*pgd));
+ pud = pud_offset(pgd, pg);
+ BUG_ON(pud_none(*pud));
+ pmd = pmd_offset(pud, pg);
+ if (pmd_none(*pmd))
+ return i ? : -EFAULT;
+ pte = pte_offset_map(pmd, pg);
+ if (pte_none(*pte)) {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
+ if (pages) {
+ struct page *page = vm_normal_page(gate_vma, start, *pte);
+ pages[i] = page; /* NOTE(review): stores NULL if vm_normal_page() found no page - confirm callers cope */
+ if (page)
+ get_page(page);
+ }
+ pte_unmap(pte);
+ if (vmas)
+ vmas[i] = gate_vma;
+ i++;
+ start += PAGE_SIZE;
+ len--;
+ continue;
+ }
+
+ if (!vma ||
+ (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ (!ignore && !(vm_flags & vma->vm_flags)))
+ return i ? : -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, pages, vmas,
+ &start, &len, i, write); /* start and len are passed by address and may be updated */
+ continue;
+ }
+
+ foll_flags = FOLL_TOUCH;
+ if (pages)
+ foll_flags |= FOLL_GET;
+ if (!write && use_zero_page(vma))
+ foll_flags |= FOLL_ANON;
+
+ do {
+ struct page *page;
+
+ /*
+ * If tsk is ooming, cut off its access to large memory
+ * allocations. It has a pending SIGKILL, but it can't
+ * be processed until returning to user space.
+ */
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+ return i ? i : -ENOMEM;
+
+ if (write)
+ foll_flags |= FOLL_WRITE; /* set again for every page: the fault loop below may clear it */
+
+ cond_resched();
+ while (!(page = follow_page(vma, start, foll_flags))) {
+ int ret;
+ ret = handle_mm_fault(mm, vma, start,
+ foll_flags & FOLL_WRITE);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return i ? i : -ENOMEM;
+ else if (ret & VM_FAULT_SIGBUS)
+ return i ? i : -EFAULT;
+ BUG();
+ }
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+
+ /*
+ * The VM_FAULT_WRITE bit tells us that
+ * do_wp_page has broken COW when necessary,
+ * even if maybe_mkwrite decided not to set
+ * pte_write. We can thus safely do subsequent
+ * page lookups as if they were reads.
+ */
+ if (ret & VM_FAULT_WRITE)
+ foll_flags &= ~FOLL_WRITE;
+
+ cond_resched();
+ }
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
+ if (pages) {
+ pages[i] = page;
+
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ }
+ if (vmas)
+ vmas[i] = vma;
+ i++;
+ start += PAGE_SIZE;
+ len--;
+ } while (len && start < vma->vm_end);
+ } while (len);
+ return i;
+}
+
+/*
+ * Wrapper around __get_user_pages() that takes the write and force
+ * requests as two separate ints and translates them into the
+ * GUP_FLAGS_WRITE and GUP_FLAGS_FORCE bits. Everything else, including
+ * the return value, is passed straight through.
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	const int flags = (write ? GUP_FLAGS_WRITE : 0) |
+			  (force ? GUP_FLAGS_FORCE : 0);
+
+	return __get_user_pages(tsk, mm, start, len, flags, pages, vmas);
+}
+
+EXPORT_SYMBOL(get_user_pages);
+
+/*
+ * Walk down to the pte for addr in mm, going through pud_alloc(),
+ * pmd_alloc() and pte_alloc_map_lock() on the way.
+ *
+ * Returns the result of pte_alloc_map_lock() (which also fills in *ptl),
+ * or NULL if pud_alloc() or pmd_alloc() fails.
+ */
+pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+		spinlock_t **ptl)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud = pud_alloc(mm, pgd, addr);
+	pmd_t *pmd;
+
+	if (!pud)
+		return NULL;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return NULL;
+
+	return pte_alloc_map_lock(mm, pmd, addr, ptl);
+}
+
+/*
+ * This is the old fallback for page remapping.
+ *
+ * Maps one page at addr in vma with protection prot. On success a page
+ * reference is taken, the mm's file_rss counter is raised and a file
+ * rmap is added for the page.
+ *
+ * Returns 0 on success, -EINVAL for PageAnon() pages, -ENOMEM if no
+ * locked pte could be obtained, and -EBUSY if the pte is already in use.
+ */
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+			struct page *page, pgprot_t prot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pte_t *ptep;
+	int retval;
+
+	if (PageAnon(page))
+		return -EINVAL;
+
+	flush_dcache_page(page);
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (!ptep)
+		return -ENOMEM;
+
+	if (pte_none(*ptep)) {
+		/* Ok, finally just insert the thing.. */
+		get_page(page);
+		inc_mm_counter(mm, file_rss);
+		page_add_file_rmap(page);
+		set_pte_at(mm, addr, ptep, mk_pte(page, prot));
+		retval = 0;
+	} else {
+		/* somebody else has already populated this slot */
+		retval = -EBUSY;
+	}
+
+	pte_unmap_unlock(ptep, ptl);
+	return retval;
+}
+
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
+ * Lets a driver map an individual page that it has allocated into a
+ * user vma.
+ *
+ * The page has to be a nice clean _individual_ kernel allocation.
+ * A compound page must have been allocated as such (__GFP_COMP), or
+ * must have been split up by hand (see split_page()).
+ *
+ * NOTE! The traditional way of doing this, "remap_pfn_range()", took an
+ * arbitrary page protection parameter. This function does not: the
+ * vma's own vm_page_prot is used, so the vma protection has to be set
+ * up correctly. If you want a shared writable mapping, you'd better
+ * ask for a shared writable mapping!
+ *
+ * The page does not need to be reserved.
+ *
+ * Returns -EFAULT if @addr lies outside @vma, -EINVAL if @page has a
+ * zero page count, and otherwise the result of insert_page(). The vma
+ * is marked VM_INSERTPAGE before the insertion is attempted.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+			struct page *page)
+{
+	const int inside = addr >= vma->vm_start && addr < vma->vm_end;
+
+	if (!inside)
+		return -EFAULT;
+	if (page_count(page) == 0)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_INSERTPAGE;
+	return insert_page(vma, addr, page, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_page);
+
+/*
+ * Install a special pte for pfn at addr in vma, using protection prot.
+ * No struct page is touched: no reference and no rss accounting.
+ *
+ * Returns 0 on success, -ENOMEM if no locked pte could be obtained and
+ * -EBUSY if the pte is already in use.
+ */
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pgprot_t prot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pte_t *ptep;
+	pte_t entry;
+	int retval;
+
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (!ptep)
+		return -ENOMEM;
+
+	if (pte_none(*ptep)) {
+		/* Ok, finally just insert the thing.. */
+		entry = pte_mkspecial(pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, ptep, entry);
+		update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
+		retval = 0;
+	} else {
+		/* somebody else has already populated this slot */
+		retval = -EBUSY;
+	}
+
+	pte_unmap_unlock(ptep, ptl);
+	return retval;
+}
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * The vma must have exactly one of VM_PFNMAP and VM_MIXEDMAP set, and
+ * for a VM_MIXEDMAP vma @pfn must not be pfn_valid(); violations BUG.
+ *
+ * Returns -EFAULT if @addr lies outside @vma, otherwise the result of
+ * insert_pfn().
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ /*
+ * Technically, architectures with pte_special can avoid all these
+ * restrictions (same for remap_pfn_range). However we would like
+ * consistency in testing and feature parity among all, so we should
+ * try to keep these invariants in place for everybody.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+ return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/*
+ * Insert a single pfn into a VM_MIXEDMAP vma (BUG if the flag is not
+ * set), using the vma's vm_page_prot.
+ *
+ * Returns -EFAULT if addr lies outside the vma, otherwise the result of
+ * insert_pfn() or insert_page(), whichever was used.
+ */
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn)
+{
+	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+	if (!(addr >= vma->vm_start && addr < vma->vm_end))
+		return -EFAULT;
+
+	/*
+	 * With pte special, or for a pfn that is not pfn_valid(), a plain
+	 * special pte is all that is needed.
+	 */
+	if (HAVE_PTE_SPECIAL || !pfn_valid(pfn))
+		return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+
+	/*
+	 * Without pte special we have to use the pfn_valid() based
+	 * VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+	 * refcount the page when pfn_valid is true: insert_page rather
+	 * than insert_pfn.
+	 */
+	return insert_page(vma, addr, pfn_to_page(pfn), vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
+/*
+ * maps a range of physical memory into the requested pages. the old
+ * mappings are removed. any references to nonexistent pages results
+ * in null mappings (currently treated as "copy-on-access")
+ *
+ * Note that this helper does not remove anything itself: it BUGs if a
+ * pte in [addr, end) is already populated. Every pte is set to a
+ * special pte for consecutive pfns, starting at pfn.
+ *
+ * Returns 0 on success, or -ENOMEM if pte_alloc_map_lock() fails.
+ */
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+ return 0;
+}
+/*
+ * Remap the pmd level for [addr, end): get the pmd with pmd_alloc() and
+ * call remap_pte_range() for each pmd-sized piece of the range. pfn is
+ * the page frame number that corresponds to the initial addr.
+ *
+ * Returns 0 on success, or -ENOMEM if pmd_alloc() or remap_pte_range()
+ * fails.
+ */
+static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pfn -= addr >> PAGE_SHIFT; /* bias pfn so that pfn + (addr >> PAGE_SHIFT) is the pfn for any later addr */
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+ if (remap_pte_range(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+/*
+ * Remap the pud level for [addr, end): get the pud with pud_alloc() and
+ * call remap_pmd_range() for each pud-sized piece of the range. pfn is
+ * the page frame number that corresponds to the initial addr.
+ *
+ * Returns 0 on success, or -ENOMEM if pud_alloc() or remap_pmd_range()
+ * fails.
+ */
+static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pfn -= addr >> PAGE_SHIFT; /* bias pfn so that pfn + (addr >> PAGE_SHIFT) is the pfn for any later addr */
+ pud = pud_alloc(mm, pgd, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+ if (remap_pmd_range(mm, pud, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: page frame number of the first page of kernel memory to map
+ * @size: size of map area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * @size is rounded up with PAGE_ALIGN(). The vma is marked
+ * VM_IO | VM_RESERVED | VM_PFNMAP before any pte is written.
+ *
+ * Returns 0 on success, -EINVAL if @vma is a COW mapping that is not
+ * covered exactly by the remapped range, or the error reported by
+ * remap_pud_range().
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long end = addr + PAGE_ALIGN(size);
+ struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ /*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ * VM_IO tells people not to look at these pages
+ * (accesses can have side effects).
+ * VM_RESERVED is specified all over the place, because
+ * in 2.4 it kept swapout's vma scan off this vma; but
+ * in 2.6 the LRU scan won't even find its pages, so this
+ * flag means no more than count its pages in reserved_vm,
+ * and omit it from core dump, even when VM_IO turned off.
+ * VM_PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
+ *
+ * There's a horrible special case to handle copy-on-write
+ * behaviour that some programs depend on. We mark the "original"
+ * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ */
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (addr != vma->vm_start || end != vma->vm_end)
+ return -EINVAL;
+ vma->vm_pgoff = pfn; /* vm_normal_page() compares pfns against vm_pgoff for PFNMAP vmas */
+ }
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+ BUG_ON(addr >= end);
+ pfn -= addr >> PAGE_SHIFT; /* bias pfn so that pfn + (addr >> PAGE_SHIFT) is the pfn for any later addr */
+ pgd = pgd_offset(mm, addr);
+ flush_cache_range(vma, addr, end);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = remap_pud_range(mm, pgd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+ return err;
+}
+EXPORT_SYMBOL(remap_pfn_range);
+/*
+ * Call fn on every pte of [addr, end) below pmd, stopping at the first
+ * non-zero return value.
+ *
+ * For init_mm the pte is obtained with pte_alloc_kernel() and no pte
+ * lock is taken; for every other mm pte_alloc_map_lock() is used and
+ * the lock is dropped again before returning.
+ *
+ * Returns 0 on success, -ENOMEM if no pte could be obtained, or the
+ * value with which fn stopped the walk.
+ */
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pte_t *pte;
+ int err;
+ pgtable_t token;
+ spinlock_t *uninitialized_var(ptl); /* only set, and only used, when mm != &init_mm */
+
+ pte = (mm == &init_mm) ?
+ pte_alloc_kernel(pmd, addr) :
+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+
+ BUG_ON(pmd_huge(*pmd));
+
+ token = pmd_pgtable(*pmd);
+
+ do {
+ err = fn(pte, token, addr, data);
+ if (err)
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ if (mm != &init_mm)
+ pte_unmap_unlock(pte-1, ptl);
+ return err;
+}
+/*
+ * Apply fn to the pmd level for [addr, end): get the pmd with
+ * pmd_alloc() and call apply_to_pte_range() for each pmd-sized piece,
+ * stopping at the first error. A huge pud is a BUG.
+ *
+ * Returns 0 on success, -ENOMEM if pmd_alloc() fails, or the error
+ * reported by apply_to_pte_range().
+ */
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err;
+
+ BUG_ON(pud_huge(*pud));
+
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pmd++, addr = next, addr != end);
+ return err;
+}
+/*
+ * Apply fn to the pud level for [addr, end): get the pud with
+ * pud_alloc() and call apply_to_pmd_range() for each pud-sized piece,
+ * stopping at the first error.
+ *
+ * Returns 0 on success, -ENOMEM if pud_alloc() fails, or the error
+ * reported by apply_to_pmd_range().
+ */
+static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err;
+
+ pud = pud_alloc(mm, pgd, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pud++, addr = next, addr != end);
+ return err;
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ *
+ * The region is [addr, addr + size) of mm and must not be empty (BUG
+ * otherwise). The walk is bracketed by the mmu notifier
+ * invalidate_range_start/end calls and stops at the first error.
+ *
+ * Returns 0 on success, or the error reported by the lower levels of
+ * the walk (which includes any non-zero value returned by fn).
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long start = addr, end = addr + size; /* start is kept because addr is advanced by the loop */
+ int err;
+
+ BUG_ON(addr >= end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_page_range);
+
+/*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically. Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ *
+ * Returns 1 if *page_table still equals orig_pte, 0 if it has changed.
+ * The comparison is only made on SMP or PREEMPT builds where a pte_t is
+ * larger than an unsigned long; in every other case 1 is returned
+ * without looking. page_table is always unmapped before returning.
+ */
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
+ pte_t *page_table, pte_t orig_pte)
+{
+ int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+ if (sizeof(pte_t) > sizeof(unsigned long)) {
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ same = pte_same(*page_table, orig_pte);
+ spin_unlock(ptl);
+ }
+#endif
+ pte_unmap(page_table);
+ return same;
+}
+
+/*
+ * Apply pte_mkwrite() to @pte, but only when @vma has VM_WRITE set;
+ * otherwise hand @pte back untouched.  Used when servicing faults for
+ * write access.  Normally the write bit is wanted, but get_user_pages
+ * can cause write faults for mappings that do not have writing enabled,
+ * when used by access_process_vm.
+ */
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+	return likely(vma->vm_flags & VM_WRITE) ? pte_mkwrite(pte) : pte;
+}
+
+/*
+ * Fill @dst with the data to be COWed.  With a source page this is just
+ * copy_user_highpage(); with no source page (@src is NULL) the data is
+ * read back through the user address @va instead.
+ */
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+{
+	void __user *uaddr;
+	void *kaddr;
+
+	if (likely(src)) {
+		copy_user_highpage(dst, src, va, vma);
+		return;
+	}
+
+	/*
+	 * The source was a PFN mapping, so there is no "struct page" for
+	 * it.  Do a best-effort copy straight from the original user
+	 * address.
+	 */
+	kaddr = kmap_atomic(dst, KM_USER0);
+	uaddr = (void __user *)(va & PAGE_MASK);
+
+	/*
+	 * The copy really shouldn't fail, because the page is there in
+	 * the page tables.  But it might just be unreadable, in which
+	 * case give up and fill the result with zeroes.  Live with it.
+	 */
+	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+		memset(kaddr, 0, PAGE_SIZE);
+
+	kunmap_atomic(kaddr, KM_USER0);
+	flush_dcache_page(dst);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * orig_pte is the pte value the caller observed. Every time the pte
+ * lock has been dropped and retaken in here, the live entry is compared
+ * against it with pte_same() and nothing is installed if they differ.
+ *
+ * Returns a VM_FAULT_* mask: VM_FAULT_WRITE once the pte has been
+ * updated (either by reusing the old page or by mapping the new copy),
+ * 0 if the pte changed while it was unlocked, VM_FAULT_OOM if
+ * anon_vma_prepare(), the page allocation or mem_cgroup_charge() fails,
+ * and VM_FAULT_SIGBUS if ->page_mkwrite() returns a negative value.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
+{
+ struct page *old_page, *new_page;
+ pte_t entry;
+ int reuse = 0, ret = 0;
+ int page_mkwrite = 0;
+ struct page *dirty_page = NULL;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable as we can't do any dirty
+ * accounting on raw pfn maps.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ goto reuse;
+ goto gotten;
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page)) {
+ if (trylock_page(old_page)) {
+ reuse = can_share_swap_page(old_page);
+ unlock_page(old_page);
+ }
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ /*
+ * Notify the address space that the page is about to
+ * become writable so that it can prohibit this or wait
+ * for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can
+ * sleep if it needs to.
+ */
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+
+ if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+ goto unwritable_page;
+
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ page_cache_release(old_page);
+ if (!pte_same(*page_table, orig_pte))
+ goto unlock;
+
+ page_mkwrite = 1;
+ }
+ dirty_page = old_page;
+ get_page(dirty_page); /* dropped by put_page() after the dirty balancing below */
+ reuse = 1;
+ }
+
+ if (reuse) {
+reuse:
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, address, page_table, entry,1))
+ update_mmu_cache(vma, address, entry);
+ ret |= VM_FAULT_WRITE;
+ goto unlock;
+ }
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ page_cache_get(old_page); /* hold old_page while the pte lock is dropped */
+gotten:
+ pte_unmap_unlock(page_table, ptl);
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+ VM_BUG_ON(old_page == ZERO_PAGE(0));
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!new_page)
+ goto oom;
+ /*
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
+ */
+ if ((vma->vm_flags & VM_LOCKED) && old_page) {
+ lock_page(old_page); /* for LRU manipulation */
+ clear_page_mlock(old_page);
+ unlock_page(old_page);
+ }
+ cow_user_page(new_page, old_page, address, vma);
+ __SetPageUptodate(new_page);
+
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+ goto oom_free_new;
+
+ /*
+ * Re-check the pte - we dropped the lock
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (likely(pte_same(*page_table, orig_pte))) {
+ if (old_page) {
+ if (!PageAnon(old_page)) {
+ dec_mm_counter(mm, file_rss);
+ inc_mm_counter(mm, anon_rss);
+ }
+ } else
+ inc_mm_counter(mm, anon_rss);
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ /*
+ * Clear the pte entry and flush it first, before updating the
+ * pte with the new entry. This will avoid a race condition
+ * seen in the presence of one thread doing SMC and another
+ * thread doing COW.
+ */
+ ptep_clear_flush_notify(vma, address, page_table);
+ SetPageSwapBacked(new_page);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ page_add_new_anon_rmap(new_page, vma, address);
+
+//TODO: is this safe? do_anonymous_page() does it this way.
+ set_pte_at(mm, address, page_table, entry);
+ update_mmu_cache(vma, address, entry);
+ if (old_page) {
+ /*
+ * Only after switching the pte to the new page may
+ * we remove the mapcount here. Otherwise another
+ * process may come and find the rmap count decremented
+ * before the pte is switched to the new page, and
+ * "reuse" the old page writing into it while our pte
+ * here still points into it and can be read by other
+ * threads.
+ *
+ * The critical issue is to order this
+ * page_remove_rmap with the ptep_clear_flush above.
+ * Those stores are ordered by (if nothing else,)
+ * the barrier present in the atomic_add_negative
+ * in page_remove_rmap.
+ *
+ * Then the TLB flush in ptep_clear_flush ensures that
+ * no process can access the old page before the
+ * decremented mapcount is visible. And the old page
+ * cannot be reused until after the decremented
+ * mapcount is visible. So transitively, TLBs to
+ * old page will be flushed before it can be reused.
+ */
+ page_remove_rmap(old_page, vma);
+ }
+
+ /* Free the old page.. */
+ new_page = old_page; /* the new_page release below now drops old_page */
+ ret |= VM_FAULT_WRITE;
+ } else
+ mem_cgroup_uncharge_page(new_page);
+
+ if (new_page)
+ page_cache_release(new_page);
+ if (old_page)
+ page_cache_release(old_page);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
+ if (dirty_page) {
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+ * bit after it clears all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * do_no_page is protected similarly.
+ */
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ put_page(dirty_page);
+ }
+ return ret;
+oom_free_new:
+ page_cache_release(new_page);
+oom:
+ if (old_page)
+ page_cache_release(old_page);
+ return VM_FAULT_OOM;
+
+unwritable_page:
+ page_cache_release(old_page);
+ return VM_FAULT_SIGBUS;
+}
+
+/*
+ * Helper functions for unmap_mapping_range().
+ *
+ * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ *
+ * We have to restart searching the prio_tree whenever we drop the lock,
+ * since the iterator is only valid while the lock is held, and anyway
+ * a later vma might be split and reinserted earlier while lock dropped.
+ *
+ * The list of nonlinear vmas could be handled more efficiently, using
+ * a placeholder, but handle it in the same way until a need is shown.
+ * It is important to search the prio_tree before nonlinear list: a vma
+ * may become nonlinear and be shifted from prio_tree to nonlinear list
+ * while the lock is dropped; but never shifted from list to prio_tree.
+ *
+ * In order to make forward progress despite restarting the search,
+ * vm_truncate_count is used to mark a vma as now dealt with, so we can
+ * quickly skip it next time around. Since the prio_tree search only
+ * shows us those vmas affected by unmapping the range in question, we
+ * can't efficiently keep all vmas in step with mapping->truncate_count:
+ * so instead reset them all whenever it wraps back to 0 (then go to 1).
+ * mapping->truncate_count and vma->vm_truncate_count are protected by
+ * i_mmap_lock.
+ *
+ * In order to make forward progress despite repeatedly restarting some
+ * large vma, note the restart_addr from unmap_vmas when it breaks out:
+ * and restart from that address when we reach that vma again. It might
+ * have been split or merged, shrunk or extended, but never shifted: so
+ * restart_addr remains valid so long as it remains in the vma's range.
+ * unmap_mapping_range forces truncate_count to leap over page-aligned
+ * values so we can save vma's restart_addr in its truncate_count field.
+ *
+ * is_restart_addr() is true when none of the bits in ~PAGE_MASK are set
+ * in the count, i.e. when the value is one of those page-aligned values
+ * and is therefore to be read as a saved restart address rather than as
+ * a plain truncate count.
+ */
+#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
+
+/*
+ * Set vm_truncate_count back to 0 on every vma hanging off @mapping:
+ * first those found in the i_mmap prio tree (searched over the whole
+ * 0..ULONG_MAX index range), then those on the i_mmap_nonlinear list.
+ */
+static void reset_vma_truncate_counts(struct address_space *mapping)
+{
+	struct prio_tree_iter tree_iter;
+	struct vm_area_struct *cur;
+
+	vma_prio_tree_foreach(cur, &tree_iter, &mapping->i_mmap, 0, ULONG_MAX) {
+		cur->vm_truncate_count = 0;
+	}
+
+	list_for_each_entry(cur, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
+		cur->vm_truncate_count = 0;
+	}
+}
+
+/*
+ * Zap the part of @vma between @start_addr and @end_addr, picking up
+ * from a restart address left in vma->vm_truncate_count by an earlier
+ * interrupted pass when there is one.
+ *
+ * Files that support invalidating or truncating portions of the file
+ * from under mmaped areas must have their ->fault function return a
+ * locked page (and set VM_FAULT_LOCKED in the return).  This provides
+ * synchronisation against concurrent unmapping here.
+ *
+ * Returns 0 once the vma is marked as dealt with and there was no need
+ * to yield.  Otherwise details->i_mmap_lock is dropped and retaken
+ * around cond_resched() and -EINTR is returned.
+ */
+static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+		unsigned long start_addr, unsigned long end_addr,
+		struct zap_details *details)
+{
+	unsigned long resume;
+	int must_yield;
+
+	for (;;) {
+		resume = vma->vm_truncate_count;
+		if (is_restart_addr(resume) && start_addr < resume) {
+			start_addr = resume;
+			if (start_addr >= end_addr) {
+				/* Top of vma has been split off since last time */
+				vma->vm_truncate_count = details->truncate_count;
+				return 0;
+			}
+		}
+
+		resume = zap_page_range(vma, start_addr,
+				end_addr - start_addr, details);
+		must_yield = need_resched() ||
+				spin_needbreak(details->i_mmap_lock);
+
+		if (resume >= end_addr) {
+			/* We have now completed this vma: mark it so */
+			vma->vm_truncate_count = details->truncate_count;
+			if (!must_yield)
+				return 0;
+			break;
+		}
+
+		/* Note the restart address in vma's truncate_count field */
+		vma->vm_truncate_count = resume;
+		if (must_yield)
+			break;
+	}
+
+	spin_unlock(details->i_mmap_lock);
+	cond_resched();
+	spin_lock(details->i_mmap_lock);
+	return -EINTR;
+}
+
+/*
+ * Unmap the index range described by @details from every vma the prio
+ * tree @root reports for that range.  The tree walk starts over each
+ * time unmap_mapping_range_vma() returns a negative value; vmas whose
+ * vm_truncate_count already equals details->truncate_count are skipped.
+ */
+static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+		struct zap_details *details)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *vma;
+	pgoff_t vma_first, vma_last;	/* file pages mapped by the vma */
+	pgoff_t zap_first, zap_last;	/* the request, clamped to the vma */
+	unsigned long zap_start, zap_end;
+
+rescan:
+	vma_prio_tree_foreach(vma, &iter, root,
+			details->first_index, details->last_index) {
+		/* Skip quickly over those we have already dealt with */
+		if (vma->vm_truncate_count == details->truncate_count)
+			continue;
+
+		vma_first = vma->vm_pgoff;
+		vma_last = vma_first +
+			((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+
+		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
+		zap_first = details->first_index < vma_first ?
+				vma_first : details->first_index;
+		zap_last = details->last_index > vma_last ?
+				vma_last : details->last_index;
+
+		zap_start = ((zap_first - vma_first) << PAGE_SHIFT) + vma->vm_start;
+		zap_end = ((zap_last - vma_first + 1) << PAGE_SHIFT) + vma->vm_start;
+
+		if (unmap_mapping_range_vma(vma, zap_start, zap_end, details) < 0)
+			goto rescan;
+	}
+}
+
+/*
+ * Unmap from every vma on the nonlinear list @head.
+ *
+ * In nonlinear VMAs there is no correspondence between virtual address
+ * offset and file offset.  So an exhaustive search is made across *all*
+ * the pages in each nonlinear VMA (vm_start to vm_end), not just the
+ * pages whose virtual address lies outside the file truncation point.
+ * The list walk starts over each time unmap_mapping_range_vma() returns
+ * a negative value.
+ */
+static inline void unmap_mapping_range_list(struct list_head *head,
+		struct zap_details *details)
+{
+	struct vm_area_struct *cur;
+
+rescan:
+	list_for_each_entry(cur, head, shared.vm_set.list) {
+		/* Only those we have not already dealt with */
+		if (cur->vm_truncate_count != details->truncate_count) {
+			details->nonlinear_vma = cur;
+			if (unmap_mapping_range_vma(cur, cur->vm_start,
+					cur->vm_end, details) < 0)
+				goto rescan;
+		}
+	}
+}
+
+/**
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * @mapping: the address space whose mmaps are to be unmapped.
+ * @holebegin: byte within the first page to unmap, measured from the
+ * start of the underlying file.  Rounded down to a PAGE_SIZE boundary.
+ * This differs from vmtruncate(), which must keep the partial page;
+ * here partial pages must go.
+ * @holelen: length in bytes of the prospective hole.  Rounded up to a
+ * PAGE_SIZE boundary.  A holelen of zero truncates to the end of the
+ * file.
+ * @even_cows: 1 when truncating a file, so even private COWed pages are
+ * unmapped; 0 when invalidating pagecache, so private data is kept.
+ */
+void unmap_mapping_range(struct address_space *mapping,
+		loff_t const holebegin, loff_t const holelen, int even_cows)
+{
+	struct zap_details details;
+	pgoff_t first_page = holebegin >> PAGE_SHIFT;
+	pgoff_t nr_pages = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	/* Check for overflow. */
+	if (sizeof(holelen) > sizeof(nr_pages)) {
+		long long end_page =
+			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+		if (end_page & ~(long long)ULONG_MAX)
+			nr_pages = ULONG_MAX - first_page + 1;
+	}
+
+	details.i_mmap_lock = &mapping->i_mmap_lock;
+	details.nonlinear_vma = NULL;
+	if (even_cows)
+		details.check_mapping = NULL;
+	else
+		details.check_mapping = mapping;
+	details.first_index = first_page;
+	details.last_index = first_page + nr_pages - 1;
+	/* A wrapped end index (as with a zero holelen) means "to the end" */
+	if (details.last_index < details.first_index)
+		details.last_index = ULONG_MAX;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	/* Protect against endless unmapping loops */
+	mapping->truncate_count++;
+	if (unlikely(is_restart_addr(mapping->truncate_count))) {
+		if (mapping->truncate_count == 0)
+			reset_vma_truncate_counts(mapping);
+		mapping->truncate_count++;
+	}
+	details.truncate_count = mapping->truncate_count;
+
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
+		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
+
+	spin_unlock(&mapping->i_mmap_lock);
+}
+EXPORT_SYMBOL(unmap_mapping_range);
+
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page. Ugly, but necessary.
+ *
+ * Returns 0 on success, -EFBIG when growing past RLIMIT_FSIZE (after
+ * sending SIGXFSZ to current) or past the superblock's s_maxbytes, and
+ * -ETXTBSY when asked to shrink a swapfile.
+ */
+int vmtruncate(struct inode * inode, loff_t offset)
+{
+	if (inode->i_size < offset) {
+		/* Growing the file: only the size limits need checking. */
+		unsigned long fsize_limit =
+			current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+
+		if (fsize_limit != RLIM_INFINITY && offset > fsize_limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+		if (offset > inode->i_sb->s_maxbytes)
+			return -EFBIG;
+		i_size_write(inode, offset);
+	} else {
+		struct address_space *mapping = inode->i_mapping;
+
+		/*
+		 * truncation of in-use swapfiles is disallowed - it would
+		 * cause subsequent swapout to scribble on the now-freed
+		 * blocks.
+		 */
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
+		i_size_write(inode, offset);
+
+		/*
+		 * unmap_mapping_range is called twice.  The first call is
+		 * purely for efficiency, so that truncate_inode_pages does
+		 * fewer single-page unmaps.  But after that first call, and
+		 * before truncate_inode_pages finishes, private pages can
+		 * still be COWed and would remain afterwards, hence the
+		 * second call is needed for correctness.
+		 */
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(mapping, offset);
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+	}
+
+	if (inode->i_op && inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+	return 0;
+}
+EXPORT_SYMBOL(vmtruncate);
+
+/*
+ * Punch a hole in @inode between @offset and @end: unmap the range,
+ * drop its pagecache, unmap again, then call the filesystem's
+ * ->truncate_range().  All of this runs under i_mutex and with
+ * i_alloc_sem held for writing.
+ *
+ * Returns 0, or -ENOSYS if the inode has no ->truncate_range operation.
+ */
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	const loff_t hole_len = end - offset;
+
+	/*
+	 * Without a way for the underlying filesystem to truncate a
+	 * range of blocks (punch a hole) there is nothing to be done:
+	 * fail right away.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+
+	mutex_lock(&inode->i_mutex);
+	down_write(&inode->i_alloc_sem);
+
+	unmap_mapping_range(mapping, offset, hole_len, 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	unmap_mapping_range(mapping, offset, hole_len, 1);
+	inode->i_op->truncate_range(inode, offset, end);
+
+	up_write(&inode->i_alloc_sem);
+	mutex_unlock(&inode->i_mutex);
+
+	return 0;
+}
+
+/*
+ * Handle a fault on a pte that holds a swap entry.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * orig_pte is the pte value the caller read; it is rechecked with
+ * pte_same() under the pte lock before anything is installed.
+ *
+ * Returns a VM_FAULT_* mask: 0 if the pte changed under us or a
+ * migration entry was waited for, VM_FAULT_MAJOR if the page had to be
+ * read in through swapin_readahead(), VM_FAULT_OOM if that read-in or
+ * mem_cgroup_charge() fails, VM_FAULT_SIGBUS if the page is not
+ * uptodate.  For a write fault that cannot be satisfied directly the
+ * result of do_wp_page() is or'ed in (reduced to the error bits if it
+ * failed).
+ */
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	spinlock_t *ptl;
+	struct page *page;
+	swp_entry_t entry;
+	pte_t pte;
+	int ret = 0;
+
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+		goto out;
+
+	entry = pte_to_swp_entry(orig_pte);
+	if (is_migration_entry(entry)) {
+		migration_entry_wait(mm, pmd, address);
+		goto out;
+	}
+	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+	page = lookup_swap_cache(entry);
+	if (!page) {
+		grab_swap_token(); /* Contend for token _before_ read-in */
+		page = swapin_readahead(entry,
+					GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!page) {
+			/*
+			 * Back out if somebody else faulted in this pte
+			 * while we released the pte lock.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (likely(pte_same(*page_table, orig_pte)))
+				ret = VM_FAULT_OOM;
+			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+			goto unlock;
+		}
+
+		/* Had to read the page from swap area: Major fault */
+		ret = VM_FAULT_MAJOR;
+		count_vm_event(PGMAJFAULT);
+	}
+
+	mark_page_accessed(page);
+
+	lock_page(page);
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+
+	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+		ret = VM_FAULT_OOM;
+		unlock_page(page);
+		/*
+		 * The page will not be mapped, so give back the reference
+		 * we hold on it, exactly as the out_nomap path does.
+		 * Without this the page reference is leaked.
+		 */
+		page_cache_release(page);
+		goto out;
+	}
+
+	/*
+	 * Back out if somebody else already faulted in this pte.
+	 */
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*page_table, orig_pte)))
+		goto out_nomap;
+
+	if (unlikely(!PageUptodate(page))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_nomap;
+	}
+
+	/* The page isn't present yet, go ahead with the fault. */
+
+	inc_mm_counter(mm, anon_rss);
+	pte = mk_pte(page, vma->vm_page_prot);
+	if (write_access && can_share_swap_page(page)) {
+		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+		/* Write already granted: no need for do_wp_page() below */
+		write_access = 0;
+	}
+
+	flush_icache_page(vma, page);
+	set_pte_at(mm, address, page_table, pte);
+	page_add_anon_rmap(page, vma, address);
+
+	swap_free(entry);
+	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+		remove_exclusive_swap_page(page);
+	unlock_page(page);
+
+	if (write_access) {
+		/* do_wp_page() unmaps and unlocks the pte for us */
+		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		if (ret & VM_FAULT_ERROR)
+			ret &= VM_FAULT_ERROR;
+		goto out;
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, pte);
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+out:
+	return ret;
+out_nomap:
+	mem_cgroup_uncharge_page(page);
+	pte_unmap_unlock(page_table, ptl);
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+/*
+ * Install a freshly allocated, zeroed private page at @address.
+ *
+ * Entered with non-exclusive mmap_sem (to exclude vma changes, but
+ * allow concurrent faults), and with the pte mapped but not yet locked.
+ * Returns with mmap_sem still held, but the pte unmapped and unlocked.
+ *
+ * Returns VM_FAULT_OOM if anon_vma_prepare(), the page allocation or
+ * mem_cgroup_charge() fails, and 0 otherwise - including the case where
+ * the pte turned out not to be empty any more and the new page was
+ * simply released again.
+ */
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access)
+{
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	/* Allocate our own private page. */
+	pte_unmap(page_table);
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+
+	page = alloc_zeroed_user_highpage_movable(vma, address);
+	if (!page)
+		return VM_FAULT_OOM;
+	__SetPageUptodate(page);
+
+	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+		page_cache_release(page);
+		return VM_FAULT_OOM;
+	}
+
+	entry = mk_pte(page, vma->vm_page_prot);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (pte_none(*page_table)) {
+		inc_mm_counter(mm, anon_rss);
+		SetPageSwapBacked(page);
+		lru_cache_add_active_or_unevictable(page, vma);
+		page_add_new_anon_rmap(page, vma, address);
+		set_pte_at(mm, address, page_table, entry);
+
+		/* No need to invalidate - it was non-present before */
+		update_mmu_cache(vma, address, entry);
+	} else {
+		/* The pte got filled in meanwhile: give our page back */
+		mem_cgroup_uncharge_page(page);
+		page_cache_release(page);
+	}
+	pte_unmap_unlock(page_table, ptl);
+	return 0;
+}
+
+/*
+ * __do_fault() tries to create a new page mapping. It aggressively
+ * tries to share with existing pages, but makes a separate copy if
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte neither mapped nor locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * pgoff and flags are passed to vma->vm_ops->fault() in a struct
+ * vm_fault; orig_pte is the pte value the caller saw, and the new
+ * mapping is only installed if the entry still matches it once the
+ * pte lock is held.
+ *
+ * Returns the value from ->fault(), directly if it carries
+ * VM_FAULT_ERROR or VM_FAULT_NOPAGE bits and otherwise after the page
+ * has been dealt with, except that it is replaced by VM_FAULT_OOM when
+ * preparing the private copy fails, by VM_FAULT_SIGBUS when
+ * ->page_mkwrite() returns a negative value, and by 0 when the page
+ * lost its mapping while unlocked for ->page_mkwrite().
+ */
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ pte_t *page_table;
+ spinlock_t *ptl;
+ struct page *page;
+ pte_t entry;
+ int anon = 0; /* also reused below to mean "release vmf.page on exit" */
+ int charged = 0; /* set once mem_cgroup_charge() succeeded on the copy */
+ struct page *dirty_page = NULL;
+ struct vm_fault vmf;
+ int ret;
+ int page_mkwrite = 0;
+
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = pgoff;
+ vmf.flags = flags;
+ vmf.page = NULL;
+
+ ret = vma->vm_ops->fault(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ return ret;
+
+ /*
+ * For consistency in subsequent calls, make the faulted page always
+ * locked.
+ */
+ if (unlikely(!(ret & VM_FAULT_LOCKED)))
+ lock_page(vmf.page);
+ else
+ VM_BUG_ON(!PageLocked(vmf.page));
+
+ /*
+ * Should we do an early C-O-W break?
+ */
+ page = vmf.page;
+ if (flags & FAULT_FLAG_WRITE) {
+ if (!(vma->vm_flags & VM_SHARED)) {
+ anon = 1;
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, address);
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+ ret = VM_FAULT_OOM;
+ page_cache_release(page);
+ goto out;
+ }
+ charged = 1;
+ /*
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ clear_page_mlock(vmf.page);
+ copy_user_highpage(page, vmf.page, address, vma);
+ __SetPageUptodate(page);
+ } else {
+ /*
+ * If the page will be shareable, see if the backing
+ * address space wants to know that the page is about
+ * to become writable
+ */
+ if (vma->vm_ops->page_mkwrite) {
+ unlock_page(page);
+ if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
+ ret = VM_FAULT_SIGBUS;
+ anon = 1; /* no anon but release vmf.page */
+ goto out_unlocked;
+ }
+ lock_page(page);
+ /*
+ * XXX: this is not quite right (racy vs
+ * invalidate) to unlock and relock the page
+ * like this, however a better fix requires
+ * reworking page_mkwrite locking API, which
+ * is better done later.
+ */
+ if (!page->mapping) {
+ ret = 0;
+ anon = 1; /* no anon but release vmf.page */
+ goto out;
+ }
+ page_mkwrite = 1;
+ }
+ }
+
+ }
+
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ /*
+ * This silly early PAGE_DIRTY setting removes a race
+ * due to the bad i386 page protection. But it's valid
+ * for other architectures too.
+ *
+ * Note that if this is a write fault (FAULT_FLAG_WRITE is set in
+ * flags), we either now have
+ * an exclusive copy of the page, or this is a shared mapping,
+ * so we can make it writable and dirty to avoid having to
+ * handle that later.
+ */
+ /* Only go through if we didn't race with anybody else... */
+ if (likely(pte_same(*page_table, orig_pte))) {
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (flags & FAULT_FLAG_WRITE)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (anon) {
+ inc_mm_counter(mm, anon_rss);
+ SetPageSwapBacked(page);
+ lru_cache_add_active_or_unevictable(page, vma);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter(mm, file_rss);
+ page_add_file_rmap(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
+ get_page(dirty_page); /* dropped by put_page() at the end */
+ }
+ }
+//TODO: is this safe? do_anonymous_page() does it this way.
+ set_pte_at(mm, address, page_table, entry);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, entry);
+ } else {
+ if (charged)
+ mem_cgroup_uncharge_page(page);
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
+ }
+
+ pte_unmap_unlock(page_table, ptl);
+
+out:
+ unlock_page(vmf.page);
+out_unlocked:
+ if (anon)
+ page_cache_release(vmf.page);
+ else if (dirty_page) {
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ put_page(dirty_page);
+ }
+
+ return ret;
+}
+
+/*
+ * Fault on an empty pte of a vma that has a ->fault handler: work out
+ * the file page offset from the position of @address inside the vma,
+ * unmap the pte and let __do_fault() do the rest.
+ */
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	unsigned long page_addr = address & PAGE_MASK;
+	pgoff_t pgoff = vma->vm_pgoff +
+			((page_addr - vma->vm_start) >> PAGE_SHIFT);
+	unsigned int flags = 0;
+
+	if (write_access)
+		flags |= FAULT_FLAG_WRITE;
+
+	pte_unmap(page_table);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+}
+
+/*
+ * Fault of a previously existing named mapping.  The page offset is
+ * recovered from the file pte where possible and the pte repopulated
+ * through __do_fault().  This is what makes nonlinear vmas swappable.
+ *
+ * Entered with non-exclusive mmap_sem (to exclude vma changes, but
+ * allow concurrent faults), and with the pte mapped but not yet locked.
+ * Returns with mmap_sem still held, but the pte unmapped and unlocked.
+ *
+ * Returns 0 if the pte no longer matches @orig_pte, VM_FAULT_OOM if the
+ * vma does not have both VM_NONLINEAR and VM_CAN_NONLINEAR set, and the
+ * result of __do_fault() otherwise.
+ */
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	unsigned int flags = FAULT_FLAG_NONLINEAR;
+	pgoff_t pgoff;
+
+	if (write_access)
+		flags |= FAULT_FLAG_WRITE;
+
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+		return 0;
+
+	if (unlikely(!((vma->vm_flags & VM_NONLINEAR) &&
+			(vma->vm_flags & VM_CAN_NONLINEAR)))) {
+		/*
+		 * Page table corrupted: show pte and kill process.
+		 */
+		print_bad_pte(vma, orig_pte, address);
+		return VM_FAULT_OOM;
+	}
+
+	pgoff = pte_to_pgoff(orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+}
+
+/*
+ * Dispatch a fault on one pte to the matching handler.
+ *
+ * These routines also have to take care of things like marking pages
+ * dirty and/or accessed for architectures that don't do it in hardware
+ * (most RISC architectures).  The early dirtying is also good on the
+ * i386.
+ *
+ * There is also a hook called "update_mmu_cache()" that architectures
+ * with external mmu caches can use to update those (ie the Sparc or
+ * PowerPC hashed page tables that act as extended TLBs).
+ *
+ * Entered with non-exclusive mmap_sem (to exclude vma changes, but
+ * allow concurrent faults), and with the pte mapped but not yet locked.
+ * Returns with mmap_sem still held, but the pte unmapped and unlocked.
+ */
+static inline int handle_pte_fault(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		pte_t *pte, pmd_t *pmd, int write_access)
+{
+	spinlock_t *ptl;
+	pte_t entry = *pte;
+
+	if (!pte_present(entry)) {
+		if (pte_none(entry)) {
+			if (vma->vm_ops && likely(vma->vm_ops->fault))
+				return do_linear_fault(mm, vma, address,
+						pte, pmd, write_access, entry);
+			return do_anonymous_page(mm, vma, address,
+						pte, pmd, write_access);
+		}
+		if (pte_file(entry))
+			return do_nonlinear_fault(mm, vma, address,
+					pte, pmd, write_access, entry);
+		return do_swap_page(mm, vma, address,
+					pte, pmd, write_access, entry);
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (likely(pte_same(*pte, entry))) {
+		if (write_access) {
+			if (!pte_write(entry))
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			entry = pte_mkdirty(entry);
+		}
+		entry = pte_mkyoung(entry);
+		if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
+			update_mmu_cache(vma, address, entry);
+		} else if (write_access) {
+			/*
+			 * Only protection faults need this, but the arch
+			 * code is not yet telling us whether this is a
+			 * protection fault or not.  Testing write_access
+			 * still avoids useless tlb flushes for .text page
+			 * faults with threads.
+			 */
+			flush_tlb_page(vma, address);
+		}
+	}
+	pte_unmap_unlock(pte, ptl);
+	return 0;
+}
+
+/*
+ * Top-level fault handler for @address in @vma.  The mm semaphore is
+ * already held by the time we get here.
+ *
+ * Hugetlb vmas are passed straight to hugetlb_fault().  For everything
+ * else the pud, pmd and pte levels are obtained through pud_alloc(),
+ * pmd_alloc() and pte_alloc_map() (VM_FAULT_OOM if any of them fails)
+ * and the fault is handed to handle_pte_fault().
+ */
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, int write_access)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	__set_current_state(TASK_RUNNING);
+
+	count_vm_event(PGFAULT);
+
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return hugetlb_fault(mm, vma, address, write_access);
+
+	pgdp = pgd_offset(mm, address);
+
+	pudp = pud_alloc(mm, pgdp, address);
+	if (!pudp)
+		return VM_FAULT_OOM;
+
+	pmdp = pmd_alloc(mm, pudp, address);
+	if (!pmdp)
+		return VM_FAULT_OOM;
+
+	ptep = pte_alloc_map(mm, pmdp, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	return handle_pte_fault(mm, vma, address, ptep, pmdp, write_access);
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+/*
+ * Allocate a page upper directory and hook it into @pgd.
+ * The fast path has already been handled in-line.
+ *
+ * Returns -ENOMEM if pud_alloc_one() fails, 0 otherwise - including
+ * when @pgd turned out to be populated already and the new table was
+ * freed again.
+ */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	pud_t *fresh = pud_alloc_one(mm, address);
+
+	if (!fresh)
+		return -ENOMEM;
+
+	smp_wmb(); /* See comment in __pte_alloc */
+
+	spin_lock(&mm->page_table_lock);
+	if (!pgd_present(*pgd)) {
+		pgd_populate(mm, pgd, fresh);
+	} else {
+		/* Another has populated it */
+		pud_free(mm, fresh);
+	}
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_PMD_FOLDED
+/*
+ * Allocate a page middle directory and hook it into @pud.
+ * The fast path has already been handled in-line.
+ *
+ * With __ARCH_HAS_4LEVEL_HACK defined the pgd_* helpers are applied to
+ * @pud in place of the pud_* ones.
+ *
+ * Returns -ENOMEM if pmd_alloc_one() fails, 0 otherwise - including
+ * when the entry turned out to be populated already and the new table
+ * was freed again.
+ */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	pmd_t *fresh = pmd_alloc_one(mm, address);
+
+	if (!fresh)
+		return -ENOMEM;
+
+	smp_wmb(); /* See comment in __pte_alloc */
+
+	spin_lock(&mm->page_table_lock);
+#ifndef __ARCH_HAS_4LEVEL_HACK
+	if (!pud_present(*pud)) {
+		pud_populate(mm, pud, fresh);
+	} else {
+		/* Another has populated it */
+		pmd_free(mm, fresh);
+	}
+#else
+	if (!pgd_present(*pud)) {
+		pgd_populate(mm, pud, fresh);
+	} else {
+		/* Another has populated it */
+		pmd_free(mm, fresh);
+	}
+#endif /* __ARCH_HAS_4LEVEL_HACK */
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+/*
+ * Fault in every page of [addr, end) of the current task via
+ * get_user_pages(), asking for write access when the VMA found for @addr
+ * has VM_WRITE set.
+ *
+ * Returns 0 if all pages were obtained, -ENOMEM if find_vma() finds no VMA,
+ * the negative value from get_user_pages() on error, or -EFAULT when fewer
+ * pages than requested were obtained.
+ */
+int make_pages_present(unsigned long addr, unsigned long end)
+{
+	struct vm_area_struct *area;
+	int nr_got, nr_wanted, want_write;
+
+	area = find_vma(current->mm, addr);
+	if (area == NULL)
+		return -ENOMEM;
+
+	want_write = (area->vm_flags & VM_WRITE) ? 1 : 0;
+	BUG_ON(addr >= end);
+	BUG_ON(end > area->vm_end);
+
+	nr_wanted = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
+	nr_got = get_user_pages(current, current->mm, addr,
+				nr_wanted, want_write, 0, NULL, NULL);
+	if (nr_got < 0)
+		return nr_got;
+	if (nr_got != nr_wanted)
+		return -EFAULT;
+	return 0;
+}
+
+#if !defined(__HAVE_ARCH_GATE_AREA)
+
+#if defined(AT_SYSINFO_EHDR)
+static struct vm_area_struct gate_vma;
+
+/* Initcall that fills in the static gate VMA for the FIXADDR_USER range. */
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_mm = NULL;
+	gate_vma.vm_start = FIXADDR_USER_START;
+	gate_vma.vm_end = FIXADDR_USER_END;
+	gate_vma.vm_page_prot = __P101;
+	/*
+	 * VM_ALWAYSDUMP makes sure the vDSO gets into every core dump.
+	 * Dumping its contents makes post-mortem fully interpretable later
+	 * without matching up the same kernel and hardware config to see
+	 * what PC values meant.
+	 */
+	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC |
+			    VM_ALWAYSDUMP;
+	return 0;
+}
+__initcall(gate_vma_init);
+#endif
+
+/*
+ * Return the gate VMA when AT_SYSINFO_EHDR is defined, NULL otherwise.
+ * @tsk is not used by this generic version.
+ */
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+	struct vm_area_struct *gate = NULL;
+
+#ifdef AT_SYSINFO_EHDR
+	gate = &gate_vma;
+#endif
+	return gate;
+}
+
+/*
+ * Return 1 if @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END) and
+ * AT_SYSINFO_EHDR is defined, 0 otherwise.
+ */
+int in_gate_area_no_task(unsigned long addr)
+{
+	int inside = 0;
+
+#ifdef AT_SYSINFO_EHDR
+	if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END)
+		inside = 1;
+#endif
+	return inside;
+}
+
+#endif /* __HAVE_ARCH_GATE_AREA */
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+/*
+ * Look up the physical address backing @address in a VM_IO / VM_PFNMAP
+ * @vma by walking its page tables.
+ *
+ * On success the page-aligned physical address is returned and the pte's
+ * protection bits are stored in *@prot.  0 is returned (and *@prot is left
+ * untouched) when any level of the walk is missing or bad, when the pmd is
+ * huge, when the pte is not present, or when FOLL_WRITE is set in @flags
+ * and the pte is not writable.
+ */
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+		unsigned long address, unsigned int flags,
+		unsigned long *prot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	resource_size_t phys = 0;
+	spinlock_t *ptl;
+	pte_t *ptep, entry;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		return 0;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		return 0;
+
+	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
+	if (pmd_huge(*pmd))
+		return 0;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		return 0;
+
+	entry = *ptep;
+	if (pte_present(entry) &&
+	    (!(flags & FOLL_WRITE) || pte_write(entry))) {
+		phys = pte_pfn(entry);
+		/* Shift after widening to avoid overflow on PAE */
+		phys <<= PAGE_SHIFT;
+
+		*prot = pgprot_val(pte_pgprot(entry));
+	}
+
+	pte_unmap_unlock(ptep, ptl);
+	return phys;
+}
+
+/*
+ * Copy up to @len bytes between @buf and the physical memory that backs
+ * @addr in a VM_IO / VM_PFNMAP @vma, through a temporary ioremap_prot()
+ * mapping.  @write selects the direction (non-zero: @buf -> memory).
+ *
+ * Only the single page containing @addr is looked up and mapped, so the
+ * transfer is clamped to the end of that page; the number of bytes actually
+ * copied is returned and may be smaller than @len.
+ *
+ * Returns -EINVAL if @vma is not VM_IO / VM_PFNMAP or follow_phys() yields
+ * no address, -ENOMEM if the temporary mapping cannot be set up.
+ */
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+			void *buf, int len, int write)
+{
+	resource_size_t phys_addr;
+	unsigned long prot = 0;
+	void *maddr;
+	int offset = addr & (PAGE_SIZE-1);
+	int room = PAGE_SIZE - offset;
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+	/* follow_phys() takes FOLL_* flags, not a boolean */
+	phys_addr = follow_phys(vma, addr, write ? FOLL_WRITE : 0, &prot);
+
+	if (!phys_addr)
+		return -EINVAL;
+
+	/* Never copy beyond the one page that is mapped below */
+	if (len > room)
+		len = room;
+
+	maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+	if (!maddr)
+		return -ENOMEM;
+
+	if (write)
+		memcpy_toio(maddr + offset, buf, len);
+	else
+		memcpy_fromio(buf, maddr + offset, len);
+	iounmap(maddr);
+
+	return len;
+}
+#endif
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ *
+ * Copies up to @len bytes between @buf and @tsk's memory at @addr; @write
+ * selects the direction (non-zero: @buf -> task memory).  Returns the
+ * number of bytes transferred, which is 0 if the task has no mm and short
+ * if a page could not be accessed.
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	void *old_buf = buf;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	/* ignore errors, just check how much was successfully transferred */
+	while (len) {
+		int bytes, ret, offset;
+		void *maddr;
+		struct page *page = NULL;
+
+		ret = get_user_pages(tsk, mm, addr, 1,
+				write, 1, &page, &vma);
+		if (ret <= 0) {
+			/*
+			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
+			 * we can access using slightly different code.
+			 */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+			vma = find_vma(mm, addr);
+			/*
+			 * find_vma() returns the first VMA ending above addr,
+			 * which may start above addr too; such a VMA does not
+			 * cover the address and must not be handed to ->access.
+			 */
+			if (!vma || vma->vm_start > addr)
+				break;
+			if (vma->vm_ops && vma->vm_ops->access)
+				ret = vma->vm_ops->access(vma, addr, buf,
+							  len, write);
+			if (ret <= 0)
+#endif
+				break;
+			bytes = ret;
+		} else {
+			/* Transfer at most up to the end of this page */
+			bytes = len;
+			offset = addr & (PAGE_SIZE-1);
+			if (bytes > PAGE_SIZE-offset)
+				bytes = PAGE_SIZE-offset;
+
+			maddr = kmap(page);
+			if (write) {
+				copy_to_user_page(vma, page, addr,
+						  maddr + offset, buf, bytes);
+				set_page_dirty_lock(page);
+			} else {
+				copy_from_user_page(vma, page, addr,
+						    buf, maddr + offset, bytes);
+			}
+			kunmap(page);
+			page_cache_release(page);
+		}
+		len -= bytes;
+		buf += bytes;
+		addr += bytes;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	return buf - old_buf;
+}
+
+/*
+ * Print the name of a VMA.
+ *
+ * Looks up the VMA found for @ip in the current task and, if it is file
+ * backed, prints @prefix, the last path component of the file, and the
+ * VMA's start and length.  Nothing is printed when preempt_count() is
+ * non-zero, when there is no file-backed VMA, or when the scratch page
+ * cannot be allocated.
+ */
+void print_vma_addr(char *prefix, unsigned long ip)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	char *page_buf, *name, *slash;
+
+	/*
+	 * Do not print if we are in atomic
+	 * contexts (in exception stacks, etc.):
+	 */
+	if (preempt_count())
+		return;
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_vma(mm, ip);
+	if (!vma || !vma->vm_file)
+		goto unlock;
+
+	page_buf = (char *)__get_free_page(GFP_KERNEL);
+	if (!page_buf)
+		goto unlock;
+
+	name = d_path(&vma->vm_file->f_path, page_buf, PAGE_SIZE);
+	if (IS_ERR(name))
+		name = "?";
+
+	/* Keep only what follows the last '/' */
+	slash = strrchr(name, '/');
+	if (slash)
+		name = slash + 1;
+
+	printk("%s%s[%lx+%lx]", prefix, name,
+			vma->vm_start,
+			vma->vm_end - vma->vm_start);
+	free_page((unsigned long)page_buf);
+
+unlock:
+	up_read(&mm->mmap_sem);
+}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 0000000..b173711
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,867 @@
+/*
+ * linux/mm/memory_hotplug.c
+ *
+ * Copyright (C)
+ */
+
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/pfn.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+/*
+ * Allocate a busy "System RAM" resource for [start, start + size) and
+ * request it from iomem_resource.
+ *
+ * Returns the new resource, or NULL (after printing a message and freeing
+ * the allocation) if request_resource() fails.  Allocation failure trips
+ * BUG_ON().
+ */
+static struct resource *register_memory_resource(u64 start, u64 size)
+{
+	struct resource *ram = kzalloc(sizeof(struct resource), GFP_KERNEL);
+
+	BUG_ON(!ram);
+
+	ram->name = "System RAM";
+	ram->start = start;
+	ram->end = start + size - 1;
+	ram->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+	if (request_resource(&iomem_resource, ram) >= 0)
+		return ram;
+
+	printk("System RAM resource %llx - %llx cannot be added\n",
+	       (unsigned long long)ram->start, (unsigned long long)ram->end);
+	kfree(ram);
+	return NULL;
+}
+
+/* Release and free @res; a NULL @res is ignored. */
+static void release_memory_resource(struct resource *res)
+{
+	if (res) {
+		release_resource(res);
+		kfree(res);
+	}
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Tag @page as holding boot-time metadata: @type is stored in _mapcount,
+ * @info in the page's private field (with PagePrivate set), and _count is
+ * raised by one.
+ */
+static void get_page_bootmem(unsigned long info, struct page *page, int type)
+{
+ atomic_set(&page->_mapcount, type);
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ atomic_inc(&page->_count);
+}
+
+/*
+ * Drop one reference on @page.  When the decremented _count reaches 1 the
+ * private data and mapcount are reset and the page is handed to
+ * __free_pages_bootmem().  The value stored in _mapcount must be below -1,
+ * otherwise BUG_ON() fires.
+ */
+void put_page_bootmem(struct page *page)
+{
+	int info_type = atomic_read(&page->_mapcount);
+
+	BUG_ON(info_type >= -1);
+
+	if (atomic_dec_return(&page->_count) != 1)
+		return;
+
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	reset_page_mapcount(page);
+	__free_pages_bootmem(page, 0);
+}
+
+/*
+ * Tag the pages that hold the memmap and the usemap of the section
+ * containing @start_pfn via get_page_bootmem().  Does nothing if
+ * @start_pfn is not a valid pfn.
+ */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+	unsigned long sec_nr, nr, idx;
+	unsigned long *usemap;
+	struct mem_section *ms;
+	struct page *memmap, *pg;
+
+	if (!pfn_valid(start_pfn))
+		return;
+
+	sec_nr = pfn_to_section_nr(start_pfn);
+	ms = __nr_to_section(sec_nr);
+
+	/*
+	 * Pages backing the section's memmap.
+	 * XXX: need more consideration for sparse_vmemmap...
+	 */
+	memmap = sparse_decode_mem_map(ms->section_mem_map, sec_nr);
+	pg = virt_to_page(memmap);
+	nr = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION) >> PAGE_SHIFT;
+	for (idx = 0; idx < nr; idx++)
+		get_page_bootmem(sec_nr, pg + idx, SECTION_INFO);
+
+	/* Pages backing the section's usemap (pageblock flags). */
+	usemap = ms->pageblock_flags;
+	pg = virt_to_page(usemap);
+	nr = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+	for (idx = 0; idx < nr; idx++)
+		get_page_bootmem(sec_nr, pg + idx, MIX_SECTION_INFO);
+}
+
+/*
+ * Tag the pages that hold @pgdat itself and the wait tables of its zones
+ * as NODE_INFO, then register the memmap/usemap pages of every section in
+ * the node's spanned pfn range.
+ */
+void register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+	unsigned long idx, nr, pfn, last_pfn;
+	int nid = pgdat->node_id;
+	struct page *pg;
+	int z;
+
+	/* The pglist_data structure itself */
+	nr = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+	pg = virt_to_page(pgdat);
+	for (idx = 0; idx < nr; idx++)
+		get_page_bootmem(nid, pg + idx, NODE_INFO);
+
+	/* Wait tables of all zones except the last one */
+	for (z = 0; z < MAX_NR_ZONES - 1; z++) {
+		struct zone *zone = &pgdat->node_zones[z];
+
+		if (!zone->wait_table)
+			continue;
+
+		nr = zone->wait_table_hash_nr_entries
+			* sizeof(wait_queue_head_t);
+		nr = PAGE_ALIGN(nr) >> PAGE_SHIFT;
+		pg = virt_to_page(zone->wait_table);
+		for (idx = 0; idx < nr; idx++)
+			get_page_bootmem(nid, pg + idx, NODE_INFO);
+	}
+
+	/* register_section info */
+	pfn = pgdat->node_start_pfn;
+	last_pfn = pfn + pgdat->node_spanned_pages;
+	while (pfn < last_pfn) {
+		register_page_bootmem_info_section(pfn);
+		pfn += PAGES_PER_SECTION;
+	}
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * Extend @zone's span so that it also covers [start_pfn, end_pfn).
+ * The update is done under the zone span write lock.
+ */
+static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
+			   unsigned long end_pfn)
+{
+	unsigned long span_end;
+
+	zone_span_writelock(zone);
+
+	/* New end is the larger of the current end and @end_pfn */
+	span_end = zone->zone_start_pfn + zone->spanned_pages;
+	if (end_pfn > span_end)
+		span_end = end_pfn;
+
+	if (start_pfn < zone->zone_start_pfn)
+		zone->zone_start_pfn = start_pfn;
+
+	zone->spanned_pages = span_end - zone->zone_start_pfn;
+
+	zone_span_writeunlock(zone);
+}
+
+/* Extend @pgdat's spanned range so that it also covers [start_pfn, end_pfn). */
+static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
+			    unsigned long end_pfn)
+{
+	unsigned long span_end;
+
+	/* New end is the larger of the current end and @end_pfn */
+	span_end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	if (end_pfn > span_end)
+		span_end = end_pfn;
+
+	if (start_pfn < pgdat->node_start_pfn)
+		pgdat->node_start_pfn = start_pfn;
+
+	pgdat->node_spanned_pages = span_end - pgdat->node_start_pfn;
+}
+
+/*
+ * Make @zone cover the section starting at @phys_start_pfn: initialise the
+ * zone first if it has no wait table yet, grow the zone and node spans
+ * under the pgdat resize lock, then initialise the section's memmap.
+ *
+ * Returns 0, or the error from init_currently_empty_zone().
+ */
+static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int section_pages = PAGES_PER_SECTION;
+	int zone_idx = zone - pgdat->node_zones;
+	int nid = pgdat->node_id;
+	unsigned long flags;
+
+	if (!zone->wait_table) {
+		int err = init_currently_empty_zone(zone, phys_start_pfn,
+						    section_pages,
+						    MEMMAP_HOTPLUG);
+		if (err)
+			return err;
+	}
+
+	pgdat_resize_lock(pgdat, &flags);
+	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + section_pages);
+	grow_pgdat_span(pgdat, phys_start_pfn,
+			phys_start_pfn + section_pages);
+	pgdat_resize_unlock(pgdat, &flags);
+
+	memmap_init_zone(section_pages, nid, zone_idx,
+			 phys_start_pfn, MEMMAP_HOTPLUG);
+	return 0;
+}
+
+/*
+ * Add the section starting at @phys_start_pfn to @zone.
+ *
+ * Returns -EEXIST if the pfn is already valid, a negative error from
+ * sparse_add_one_section() or __add_zone(), or otherwise the result of
+ * register_new_memory().
+ */
+static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+	int err;
+
+	if (pfn_valid(phys_start_pfn))
+		return -EEXIST;
+
+	err = sparse_add_one_section(zone, phys_start_pfn, PAGES_PER_SECTION);
+	if (err < 0)
+		return err;
+
+	err = __add_zone(zone, phys_start_pfn);
+	if (err < 0)
+		return err;
+
+	return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * XXX: Freeing the memmap with vmemmap is not implemented yet, so section
+ * removal always fails with -EBUSY.  This should be removed later.
+ */
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+	return -EBUSY;
+}
+#else
+/*
+ * Unregister the memory section @ms and remove it from @zone under the
+ * pgdat resize lock.
+ *
+ * Returns 0 on success, -EINVAL if @ms is not a valid section, or the
+ * error from unregister_memory_section().
+ */
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long flags;
+	int err;
+
+	if (!valid_section(ms))
+		return -EINVAL;
+
+	err = unregister_memory_section(ms);
+	if (err)
+		return err;
+
+	pgdat_resize_lock(pgdat, &flags);
+	sparse_remove_one_section(zone, ms);
+	pgdat_resize_unlock(pgdat, &flags);
+
+	return 0;
+}
+#endif
+
+/*
+ * Reasonably generic function for adding memory. It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+		      unsigned long nr_pages)
+{
+	unsigned long sec;
+	int first_sec, last_sec;
+	int err = 0;
+
+	/* during initialize mem_map, align hot-added range to section */
+	first_sec = pfn_to_section_nr(phys_start_pfn);
+	last_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+
+	for (sec = first_sec; sec <= last_sec; sec++) {
+		err = __add_section(zone, sec << PFN_SECTION_SHIFT);
+
+		/*
+		 * EEXIST is finally dealt with by ioresource collision
+		 * check. see add_memory() => register_memory_resource()
+		 * Warning will be printed if there is collision.
+		 */
+		if (err == -EEXIST)
+			err = 0;
+		if (err)
+			break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(__add_pages);
+
+/**
+ * __remove_pages() - remove sections of pages from a zone
+ * @zone: zone from which pages need to be removed
+ * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @nr_pages: number of pages to remove (must be multiple of section size)
+ *
+ * Generic helper function to remove section mappings and sysfs entries
+ * for the section of the memory we are removing. Caller needs to make
+ * sure that pages are marked reserved and zones are adjust properly by
+ * calling offline_pages().
+ */
+int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+		   unsigned long nr_pages)
+{
+	unsigned long i;
+	int sections_to_remove;
+	/* int, not unsigned long: this carries negative errno values */
+	int ret = 0;
+
+	/*
+	 * We can only remove entire sections
+	 */
+	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
+	BUG_ON(nr_pages % PAGES_PER_SECTION);
+
+	sections_to_remove = nr_pages / PAGES_PER_SECTION;
+	for (i = 0; i < sections_to_remove; i++) {
+		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+
+		/*
+		 * Widen before shifting so the physical address cannot
+		 * overflow unsigned long when resource_size_t is wider.
+		 */
+		release_mem_region((resource_size_t)pfn << PAGE_SHIFT,
+				   PAGES_PER_SECTION << PAGE_SHIFT);
+		ret = __remove_section(zone, __pfn_to_section(pfn));
+		if (ret)
+			break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__remove_pages);
+
+/*
+ * Account @page in the global page counters (and, under CONFIG_FLATMEM,
+ * in max_mapnr), then clear its reserved bit, initialise its count and
+ * release it with __free_page().
+ */
+void online_page(struct page *page)
+{
+	num_physpages++;
+	totalram_pages++;
+
+#ifdef CONFIG_HIGHMEM
+	if (PageHighMem(page))
+		totalhigh_pages++;
+#endif
+
+#ifdef CONFIG_FLATMEM
+	max_mapnr = max(page_to_pfn(page), max_mapnr);
+#endif
+
+	ClearPageReserved(page);
+	init_page_count(page);
+	__free_page(page);
+}
+
+/*
+ * walk_memory_resource() callback: if the first page of the range is
+ * reserved, online every page in [start_pfn, start_pfn + nr_pages) and add
+ * the number of pages onlined to the unsigned long that @arg points to.
+ * Always returns 0.
+ */
+static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
+			      void *arg)
+{
+	unsigned long *total = arg;
+	unsigned long done = *total;
+	unsigned long off;
+
+	if (PageReserved(pfn_to_page(start_pfn))) {
+		for (off = 0; off < nr_pages; off++) {
+			online_page(pfn_to_page(start_pfn + off));
+			done++;
+		}
+	}
+
+	*total = done;
+	return 0;
+}
+
+
+/*
+ * Bring @nr_pages pages starting at @pfn online.
+ *
+ * MEM_GOING_ONLINE is sent first; MEM_CANCEL_ONLINE is sent if either the
+ * notifier chain or the resource walk fails.  On success the zone and node
+ * present-page counts are updated and MEM_ONLINE is sent when at least one
+ * page was onlined.
+ *
+ * Returns 0 on success, or the error from the notifier chain or from
+ * walk_memory_resource().
+ */
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long onlined_pages = 0;
+ struct zone *zone;
+ int need_zonelists_rebuild = 0;
+ int nid;
+ int ret;
+ struct memory_notify arg;
+
+ arg.start_pfn = pfn;
+ arg.nr_pages = nr_pages;
+ arg.status_change_nid = -1;
+
+ /* Report the node id to notifiers only if the node has no present pages yet */
+ nid = page_to_nid(pfn_to_page(pfn));
+ if (node_present_pages(nid) == 0)
+ arg.status_change_nid = nid;
+
+ ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
+ }
+ /*
+ * This doesn't need a lock to do pfn_to_page().
+ * The section can't be removed here because of the
+ * memory_block->state_mutex.
+ */
+ zone = page_zone(pfn_to_page(pfn));
+ /*
+ * If this zone is not populated, then it is not in zonelist.
+ * This means the page allocator ignores this zone.
+ * So, zonelist must be updated after online.
+ */
+ if (!populated_zone(zone))
+ need_zonelists_rebuild = 1;
+
+ /* online_pages_range() accumulates the page count into onlined_pages */
+ ret = walk_memory_resource(pfn, nr_pages, &onlined_pages,
+ online_pages_range);
+ if (ret) {
+ printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
+ nr_pages, pfn);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
+ }
+
+ zone->present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += onlined_pages;
+
+ setup_per_zone_pages_min();
+ if (onlined_pages) {
+ kswapd_run(zone_to_nid(zone));
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+ }
+
+ if (need_zonelists_rebuild)
+ build_all_zonelists();
+ else
+ vm_total_pages = nr_free_pagecache_pages();
+
+ writeback_set_ratelimit();
+
+ if (onlined_pages)
+ memory_notify(MEM_ONLINE, &arg);
+
+ return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+
+/*
+ * Allocate and install the node data for hot-added node @nid and set up
+ * its zones as empty, starting at the pfn of @start.
+ *
+ * Returns the new pgdat, or NULL if arch_alloc_nodedata() fails.
+ */
+static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+{
+	unsigned long zones_size[MAX_NR_ZONES] = {0};
+	unsigned long zholes_size[MAX_NR_ZONES] = {0};
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	struct pglist_data *pgdat;
+
+	pgdat = arch_alloc_nodedata(nid);
+	if (pgdat) {
+		/* we can use NODE_DATA(nid) after this call */
+		arch_refresh_nodedata(nid, pgdat);
+
+		/* all zones start empty: there are no present pages yet */
+		free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+	}
+
+	return pgdat;
+}
+
+/* Undo hotadd_new_pgdat(): clear the node data for @nid and free @pgdat. */
+static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+{
+	arch_refresh_nodedata(nid, NULL);
+	arch_free_nodedata(pgdat);
+}
+
+
+/*
+ * Hot-add the physical range [start, start + size) to node @nid.
+ *
+ * Registers the range as a "System RAM" resource, creates the node data if
+ * the node is not online yet, calls the arch hot-add hook and marks the
+ * node online.
+ *
+ * Returns 0 on success, -EEXIST if the resource cannot be registered,
+ * -ENOMEM if the node data cannot be allocated, or the negative error from
+ * arch_add_memory().  On every failure after the resource was registered
+ * the resource is released again.
+ *
+ * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
+ */
+int __ref add_memory(int nid, u64 start, u64 size)
+{
+	pg_data_t *pgdat = NULL;
+	int new_pgdat = 0;
+	struct resource *res;
+	int ret;
+
+	res = register_memory_resource(start, size);
+	if (!res)
+		return -EEXIST;
+
+	if (!node_online(nid)) {
+		pgdat = hotadd_new_pgdat(nid, start);
+		if (!pgdat) {
+			/* go through the error path so that res is released */
+			ret = -ENOMEM;
+			goto error;
+		}
+		new_pgdat = 1;
+	}
+
+	/* call arch's memory hotadd */
+	ret = arch_add_memory(nid, start, size);
+
+	if (ret < 0)
+		goto error;
+
+	/* we online node here. we can't roll back from here. */
+	node_set_online(nid);
+
+	if (new_pgdat) {
+		ret = register_one_node(nid);
+		/*
+		 * If sysfs file of new node can't create, cpu on the node
+		 * can't be hot-added. There is no rollback way now.
+		 * So, check by BUG_ON() to catch it reluctantly..
+		 */
+		BUG_ON(ret);
+	}
+
+	return ret;
+error:
+	/* rollback pgdat allocation and others */
+	if (new_pgdat)
+		rollback_node_hotadd(nid, pgdat);
+	release_memory_resource(res);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
+ * set and the size of the free page is given by page_order(). Using this,
+ * the function determines if the pageblock contains only free pages.
+ * Due to buddy constraints, a free page at least the size of a pageblock will
+ * be located at the start of the pageblock.
+ */
+static inline int pageblock_free(struct page *page)
+{
+	/* Only a buddy page carries a meaningful order */
+	if (!PageBuddy(page))
+		return 0;
+
+	return page_order(page) >= pageblock_order;
+}
+
+/*
+ * Return the start of the next active pageblock after @page, which must be
+ * pageblock-aligned (enforced with BUG_ON).
+ */
+static struct page *next_active_pageblock(struct page *page)
+{
+	/* Always advance by at least one pageblock */
+	int stride = 1;
+
+	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+
+	/* A fully free pageblock lets us jump to the end of the free page */
+	if (pageblock_free(page))
+		stride += page_order(page) - pageblock_order;
+
+	return page + (stride * pageblock_nr_pages);
+}
+
+/*
+ * Checks if this range of memory is likely to be hot-removable.
+ *
+ * Only the starting page of each pageblock in the range is examined.
+ * Returns 1 if every examined pageblock qualifies, 0 otherwise.
+ */
+int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct page *cur = pfn_to_page(start_pfn);
+	struct page *limit = cur + nr_pages;
+
+	while (cur < limit) {
+		int mt = get_pageblock_migratetype(cur);
+
+		/*
+		 * Only pageblocks that are MOVABLE or entirely free are
+		 * considered removable.
+		 */
+		if (mt != MIGRATE_MOVABLE && !pageblock_free(cur))
+			return 0;
+
+		/*
+		 * A pageblock starting with a PageReserved page is not
+		 * considered removable.
+		 */
+		if (PageReserved(cur))
+			return 0;
+
+		cur = next_active_pageblock(cur);
+	}
+
+	/* All pageblocks in the memory block are likely to be hot-removable */
+	return 1;
+}
+
+/*
+ * Confirm that all pages in the range [start, end) belong to the same zone.
+ */
+static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct zone *zone = NULL;
+	unsigned long base;
+
+	/* Sample the first valid pfn of each MAX_ORDER_NR_PAGES chunk */
+	for (base = start_pfn; base < end_pfn; base += MAX_ORDER_NR_PAGES) {
+		struct zone *this_zone;
+		int off = 0;
+
+		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
+		while ((off < MAX_ORDER_NR_PAGES) &&
+		       !pfn_valid_within(base + off))
+			off++;
+		if (off == MAX_ORDER_NR_PAGES)
+			continue;	/* chunk has no valid pfn at all */
+
+		this_zone = page_zone(pfn_to_page(base + off));
+		if (zone && this_zone != zone)
+			return 0;
+		zone = this_zone;
+	}
+	return 1;
+}
+
+/*
+ * Scanning pfn is much easier than scanning lru list.
+ * Scan pfns in [start, end) and return the first one whose page is on the
+ * LRU, or 0 if there is none.
+ *
+ * NOTE(review): the pfn is returned through an int, so a pfn that does not
+ * fit in an int would be truncated, and pfn 0 cannot be told apart from
+ * "not found" - confirm callers never depend on either.
+ */
+int scan_lru_pages(unsigned long start, unsigned long end)
+{
+	unsigned long cur;
+
+	for (cur = start; cur < end; cur++) {
+		if (!pfn_valid(cur))
+			continue;
+		if (PageLRU(pfn_to_page(cur)))
+			return cur;
+	}
+	return 0;
+}
+
+/*
+ * Allocation callback handed to migrate_pages() by do_migrate_range().
+ * All three arguments are ignored; a single GFP_HIGHUSER_PAGECACHE page is
+ * returned.  This should be improved.
+ */
+static struct page *hotremove_migrate_alloc(struct page *page,
+					    unsigned long private, int **x)
+{
+	return alloc_page(GFP_HIGHUSER_PAGECACHE);
+}
+
+
+/* Upper bound on the number of pages isolated per do_migrate_range() call */
+#define NR_OFFLINE_AT_ONCE_PAGES (256)
+/*
+ * Isolate up to NR_OFFLINE_AT_ONCE_PAGES in-use pages of
+ * [start_pfn, end_pfn) from the LRU and hand them to migrate_pages().
+ *
+ * Returns 0 if no page was isolated, -EBUSY if a page with a non-zero
+ * page_count() could not be isolated (pages already isolated are put
+ * back), otherwise whatever migrate_pages() returns.
+ */
+static int
+do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+ struct page *page;
+ int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
+ int not_managed = 0;
+ int ret = 0;
+ LIST_HEAD(source);
+
+ for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (!page_count(page))
+ continue;
+ /*
+ * We can skip free pages. And we can only deal with pages on
+ * LRU.
+ */
+ ret = isolate_lru_page(page);
+ if (!ret) { /* Success */
+ list_add_tail(&page->lru, &source);
+ move_pages--;
+ } else {
+ /* Because we don't hold the big zone->lock, we should
+ check this again here. */
+ if (page_count(page))
+ not_managed++;
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_INFO "removing from LRU failed"
+ " %lx/%d/%lx\n",
+ pfn, page_count(page), page->flags);
+#endif
+ }
+ }
+ /* Any in-use page we could not isolate makes the whole pass fail */
+ ret = -EBUSY;
+ if (not_managed) {
+ if (!list_empty(&source))
+ putback_lru_pages(&source);
+ goto out;
+ }
+ ret = 0;
+ if (list_empty(&source))
+ goto out;
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
+
+out:
+ return ret;
+}
+
+/*
+ * walk_memory_resource() callback: remove the range from free_area[] and
+ * mark all its pages as Reserved.  @data is unused; always returns 0.
+ */
+static int
+offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
+			  void *data)
+{
+	unsigned long end = start + nr_pages;
+
+	__offline_isolated_pages(start, end);
+	return 0;
+}
+
+/*
+ * Run offline_isolated_pages_cb() over every memory resource range found
+ * in [start_pfn, end_pfn).
+ */
+static void
+offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long nr_pages = end_pfn - start_pfn;
+
+	walk_memory_resource(start_pfn, nr_pages, NULL,
+			     offline_isolated_pages_cb);
+}
+
+/*
+ * Check that all pages in the range, recorded as memory resource, are isolated.
+ */
+static int
+check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
+			void *data)
+{
+	long *total = data;
+	int ret;
+
+	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+	/* Count the range only when test_pages_isolated() returned 0 */
+	if (ret == 0)
+		*total += (long)nr_pages;
+
+	return ret;
+}
+
+/*
+ * Return the number of pages counted by check_pages_isolated_cb() over
+ * [start_pfn, end_pfn), or the negative value returned by
+ * walk_memory_resource().
+ */
+static long
+check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
+{
+	long total = 0;
+	int ret;
+
+	ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &total,
+				   check_pages_isolated_cb);
+
+	return ret < 0 ? (long)ret : total;
+}
+
+/*
+ * Try to take the pfn range [start_pfn, end_pfn) offline.
+ *
+ * Both ends must be aligned to pageblock_nr_pages and the range must pass
+ * test_pages_in_a_zone(), otherwise -EINVAL is returned.  @timeout is added
+ * to jiffies to form a deadline that is checked at the top of every retry
+ * (-EAGAIN once it has passed); a pending signal aborts with -EINTR.
+ *
+ * Returns 0 on success.  On failure after the range was isolated,
+ * MEM_CANCEL_OFFLINE is sent and the isolation is undone.
+ */
+int offline_pages(unsigned long start_pfn,
+ unsigned long end_pfn, unsigned long timeout)
+{
+ unsigned long pfn, nr_pages, expire;
+ long offlined_pages;
+ int ret, drain, retry_max, node;
+ struct zone *zone;
+ struct memory_notify arg;
+
+ BUG_ON(start_pfn >= end_pfn);
+ /* at least, alignment against pageblock is necessary */
+ if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
+ return -EINVAL;
+ if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
+ return -EINVAL;
+ /* This makes hotplug much easier...and readable.
+ we assume this for now. */
+ if (!test_pages_in_a_zone(start_pfn, end_pfn))
+ return -EINVAL;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ node = zone_to_nid(zone);
+ nr_pages = end_pfn - start_pfn;
+
+ /* set above range as isolated */
+ ret = start_isolate_page_range(start_pfn, end_pfn);
+ if (ret)
+ return ret;
+
+ arg.start_pfn = start_pfn;
+ arg.nr_pages = nr_pages;
+ arg.status_change_nid = -1;
+ /* Report the node id only if this range covers all of its present pages */
+ if (nr_pages >= node_present_pages(node))
+ arg.status_change_nid = node;
+
+ ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_removal;
+
+ pfn = start_pfn;
+ expire = jiffies + timeout;
+ drain = 0;
+ retry_max = 5;
+repeat:
+ /* start memory hot removal */
+ ret = -EAGAIN;
+ if (time_after(jiffies, expire))
+ goto failed_removal;
+ ret = -EINTR;
+ if (signal_pending(current))
+ goto failed_removal;
+ ret = 0;
+ if (drain) {
+ lru_add_drain_all();
+ flush_scheduled_work();
+ cond_resched();
+ drain_all_pages();
+ }
+
+ pfn = scan_lru_pages(start_pfn, end_pfn);
+ if (pfn) { /* We have page on LRU */
+ ret = do_migrate_range(pfn, end_pfn);
+ if (!ret) {
+ drain = 1;
+ goto repeat;
+ } else {
+ /* Only negative results count against retry_max */
+ if (ret < 0)
+ if (--retry_max == 0)
+ goto failed_removal;
+ yield();
+ drain = 1;
+ goto repeat;
+ }
+ }
+ /* drain all zone's lru pagevec, this is asynchronous... */
+ lru_add_drain_all();
+ flush_scheduled_work();
+ yield();
+ /* drain pcp pages, this is synchronous. */
+ drain_all_pages();
+ /* check again */
+ offlined_pages = check_pages_isolated(start_pfn, end_pfn);
+ if (offlined_pages < 0) {
+ ret = -EBUSY;
+ goto failed_removal;
+ }
+ printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
+ /* Ok, all of our target is isolated.
+ We cannot do rollback at this point. */
+ offline_isolated_pages(start_pfn, end_pfn);
+ /* reset pagetype flags and makes migrate type to be MOVABLE */
+ undo_isolate_page_range(start_pfn, end_pfn);
+ /* removal success */
+ zone->present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= offlined_pages;
+ totalram_pages -= offlined_pages;
+ num_physpages -= offlined_pages;
+
+ vm_total_pages = nr_free_pagecache_pages();
+ writeback_set_ratelimit();
+
+ memory_notify(MEM_OFFLINE, &arg);
+ return 0;
+
+failed_removal:
+ printk(KERN_INFO "memory offlining %lx to %lx failed\n",
+ start_pfn, end_pfn);
+ memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ /* pushback to free area */
+ undo_isolate_page_range(start_pfn, end_pfn);
+
+ return ret;
+}
+
+/*
+ * Offline the memory at physical address @start spanning @size bytes,
+ * giving offline_pages() a timeout of 120 * HZ jiffies.
+ */
+int remove_memory(u64 start, u64 size)
+{
+	unsigned long first_pfn = PFN_DOWN(start);
+	unsigned long last_pfn = first_pfn + PFN_DOWN(size);
+
+	return offline_pages(first_pfn, last_pfn, 120 * HZ);
+}
+#else
+/* Without CONFIG_MEMORY_HOTREMOVE removal always fails with -EINVAL. */
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+EXPORT_SYMBOL_GPL(remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
new file mode 100644
index 0000000..ac8f8f3
--- /dev/null
+++ b/mm/mempolicy.c
@@ -0,0 +1,2338 @@
+/*
+ * Simple NUMA memory policy for the Linux kernel.
+ *
+ * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
+ * Subject to the GNU Public License, version 2.
+ *
+ * NUMA policy allows the user to give hints in which node(s) memory should
+ * be allocated.
+ *
+ * Support four policies per VMA and per process:
+ *
+ * The VMA policy has priority over the process policy for a page fault.
+ *
+ * interleave Allocate memory interleaved over a set of nodes,
+ * with normal fallback if it fails.
+ * For VMA based allocations this interleaves based on the
+ * offset into the backing object or offset into the mapping
+ * for anonymous memory. For process policy a process counter
+ * is used.
+ *
+ * bind Only allocate memory on a specific set of nodes,
+ * no fallback.
+ * FIXME: memory is allocated starting with the first node
+ * to the last. It would be better if bind would truly restrict
+ * the allocation to memory nodes instead
+ *
+ * preferred Try a specific node first before normal fallback.
+ * As a special case node -1 here means do the allocation
+ * on the local CPU. This is normally identical to default,
+ * but useful to set in a VMA when you have a non default
+ * process policy.
+ *
+ * default Allocate on the local node first, or when on a VMA
+ * use the process policy. This is what Linux always did
+ * in a NUMA aware kernel and still does by, ahem, default.
+ *
+ * The process policy is applied for most non interrupt memory allocations
+ * in that process' context. Interrupts ignore the policies and always
+ * try to allocate on the local CPU. The VMA policy is only applied for memory
+ * allocations for a VMA in the VM.
+ *
+ * Currently there are a few corner cases in swapping where the policy
+ * is not applied, but the majority should be handled. When process policy
+ * is used it is not remembered over swap outs/swap ins.
+ *
+ * Only the highest zone in the zone hierarchy gets policied. Allocations
+ * requesting a lower zone just use default policy. This implies that
+ * on systems with highmem kernel lowmem allocations don't get policied.
+ * Same with GFP_DMA allocations.
+ *
+ * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * all users and remembered even when nobody has memory mapped.
+ */
+
+/* Notebook:
+ fix mmap readahead to honour policy and enable policy for any page cache
+ object
+ statistics for bigpages
+ global policy for page cache? currently it uses process policy. Requires
+ first item above.
+ handle mremap for shared memory (currently ignored for the policy)
+ grows down?
+ make bind policy root only? It can trigger oom much faster and the
+ kernel is not always grateful with that.
+*/
+
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/nodemask.h>
+#include <linux/cpuset.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/migrate.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/ctype.h>
+
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * Internal flags, built from MPOL_MF_INTERNAL. They are OR-ed into the
+ * 'flags' argument of check_range() by callers in this file.
+ */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
+
+/* Slab cache from which struct mempolicy objects are allocated and freed. */
+static struct kmem_cache *policy_cache;
+/* NOTE(review): not used in this part of the file; presumably a second slab cache -- confirm. */
+static struct kmem_cache *sn_cache;
+
+/* Highest zone. A specific allocation for a zone below that is not
+ policied. */
+enum zone_type policy_zone = 0;
+
+/*
+ * run-time system-wide default policy => local allocation
+ */
+struct mempolicy default_policy = {
+ .refcnt = ATOMIC_INIT(1), /* never free it */
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL,
+};
+
+/*
+ * Per-mode operations, indexed by policy mode. This is only the forward
+ * declaration; the table itself is defined after the rebind helpers.
+ * create: called by mpol_new() to fill in the mode-specific part of a
+ * freshly allocated policy; returns 0 or a negative errno.
+ * rebind: called by mpol_rebind_policy() to adapt a policy to a new
+ * nodemask.
+ */
+static const struct mempolicy_operations {
+ int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+} mpol_ops[MPOL_MAX];
+
+/*
+ * Check that the nodemask contains at least one populated zone.
+ *
+ * Walks every node set in @nodemask and, for each, the zones with index
+ * 0..policy_zone.  Returns 1 as soon as a zone with present_pages > 0 is
+ * found, and 0 otherwise (which includes an empty mask).
+ */
+static int is_valid_nodemask(const nodemask_t *nodemask)
+{
+	int nd, k;
+
+	for_each_node_mask(nd, *nodemask) {
+		struct zone *z;
+
+		for (k = 0; k <= policy_zone; k++) {
+			z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Non-zero when the policy carries MPOL_F_STATIC_NODES or
+ * MPOL_F_RELATIVE_NODES; zero otherwise.
+ */
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+	const int user_mask_flags = MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES;
+
+	return pol->flags & user_mask_flags;
+}
+
+/*
+ * Compute *ret from *orig relative to *rel: *orig is first passed through
+ * nodes_fold() with the number of nodes set in *rel, and the folded mask
+ * is then passed through nodes_onto() with *rel.
+ */
+static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
+		const nodemask_t *rel)
+{
+	const int nr_rel = nodes_weight(*rel);
+	nodemask_t folded;
+
+	nodes_fold(folded, *orig, nr_rel);
+	nodes_onto(*ret, folded, *rel);
+}
+
+/*
+ * create() hook for MPOL_INTERLEAVE: store @nodes in the policy.
+ * Returns 0, or -EINVAL when the mask is empty.
+ */
+static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+{
+	if (!nodes_empty(*nodes)) {
+		pol->v.nodes = *nodes;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * create() hook for MPOL_PREFERRED.
+ *
+ * A NULL @nodes selects local allocation (MPOL_F_LOCAL is set).  A
+ * non-NULL but empty mask is rejected with -EINVAL.  Otherwise the first
+ * node of the mask becomes the preferred node.  Returns 0 on success.
+ */
+static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
+{
+	if (!nodes) {
+		/* local allocation */
+		pol->flags |= MPOL_F_LOCAL;
+		return 0;
+	}
+
+	if (nodes_empty(*nodes))
+		return -EINVAL;	/* no allowed nodes */
+
+	pol->v.preferred_node = first_node(*nodes);
+	return 0;
+}
+
+/*
+ * create() hook for MPOL_BIND: store @nodes in the policy.
+ * Returns 0, or -EINVAL when is_valid_nodemask() rejects the mask.
+ */
+static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
+{
+	if (is_valid_nodemask(nodes)) {
+		pol->v.nodes = *nodes;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * Create a new policy.
+ *
+ * @mode: policy mode; also the index into mpol_ops[] for the create() hook
+ * @flags: mode flags stored in the new policy
+ * @nodes: caller-supplied nodemask; may be NULL only for MPOL_DEFAULT
+ *
+ * Returns NULL for MPOL_DEFAULT with no nodes set, a new policy with
+ * refcnt 1 on success, or an ERR_PTR(): -EINVAL for a bad mode/nodemask
+ * combination, -ENOMEM when the slab allocation fails, or the negative
+ * value returned by the create() hook.
+ */
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
+{
+ struct mempolicy *policy;
+ nodemask_t cpuset_context_nmask;
+ int ret;
+
+ pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
+
+ if (mode == MPOL_DEFAULT) {
+ if (nodes && !nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ return NULL; /* simply delete any existing policy */
+ }
+ /* every non-default mode dereferences 'nodes' below */
+ VM_BUG_ON(!nodes);
+
+ /*
+ * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
+ * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
+ * All other modes require a valid pointer to a non-empty nodemask.
+ */
+ if (mode == MPOL_PREFERRED) {
+ if (nodes_empty(*nodes)) {
+ if (((flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES)))
+ return ERR_PTR(-EINVAL);
+ nodes = NULL; /* flag local alloc */
+ }
+ } else if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!policy)
+ return ERR_PTR(-ENOMEM);
+ atomic_set(&policy->refcnt, 1);
+ policy->mode = mode;
+ policy->flags = flags;
+
+ if (nodes) {
+ /*
+ * cpuset related setup doesn't apply to local allocation
+ */
+ cpuset_update_task_memory_state();
+ /*
+ * cpuset_context_nmask is what the create() hook receives:
+ * the caller's mask either mapped relative to, or intersected
+ * with, the current task's allowed memory nodes.
+ */
+ if (flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+ &cpuset_current_mems_allowed);
+ else
+ nodes_and(cpuset_context_nmask, *nodes,
+ cpuset_current_mems_allowed);
+ /*
+ * policy->w holds either the caller's original mask (static or
+ * relative policies) or the cpuset mask in effect at creation;
+ * the rebind hooks read it back later.
+ */
+ if (mpol_store_user_nodemask(policy))
+ policy->w.user_nodemask = *nodes;
+ else
+ policy->w.cpuset_mems_allowed =
+ cpuset_mems_allowed(current);
+ }
+
+ /* a NULL mask tells the create() hook to use local allocation */
+ ret = mpol_ops[mode].create(policy,
+ nodes ? &cpuset_context_nmask : NULL);
+ if (ret < 0) {
+ kmem_cache_free(policy_cache, policy);
+ return ERR_PTR(ret);
+ }
+ return policy;
+}
+
+/*
+ * Slow path of a mpol destructor: drop one reference on @p and return it
+ * to policy_cache once the count reaches zero.
+ */
+void __mpol_put(struct mempolicy *p)
+{
+	if (atomic_dec_and_test(&p->refcnt))
+		kmem_cache_free(policy_cache, p);
+}
+
+/* rebind() hook for MPOL_DEFAULT: intentionally does nothing. */
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+{
+}
+
+/*
+ * rebind() hook shared by MPOL_INTERLEAVE and MPOL_BIND: recompute
+ * pol->v.nodes for the new nodemask @nodes.
+ *
+ * Static policies intersect the saved user mask with @nodes, relative
+ * policies re-derive the mask through mpol_relative_nodemask(), and all
+ * others remap the current nodes from the previously recorded cpuset
+ * mask onto @nodes and record @nodes as the new cpuset mask.
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol,
+ const nodemask_t *nodes)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES)
+ nodes_and(tmp, pol->w.user_nodemask, *nodes);
+ else if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ else {
+ nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+
+ pol->v.nodes = tmp;
+ /*
+ * If current's il_next is not in the new mask, move it: first to the
+ * next node of the mask, then to the mask's first node, and finally
+ * to the local node when the mask is empty.
+ */
+ if (!node_isset(current->il_next, tmp)) {
+ current->il_next = next_node(current->il_next, tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = first_node(tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = numa_node_id();
+ }
+}
+
+/*
+ * rebind() hook for MPOL_PREFERRED: recompute the preferred node for the
+ * new nodemask @nodes. A policy that is MPOL_F_LOCAL and neither static
+ * nor relative is left untouched.
+ */
+static void mpol_rebind_preferred(struct mempolicy *pol,
+ const nodemask_t *nodes)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES) {
+ int node = first_node(pol->w.user_nodemask);
+
+ /*
+ * Use the user's node when @nodes contains it; otherwise
+ * fall back to local allocation.
+ */
+ if (node_isset(node, *nodes)) {
+ pol->v.preferred_node = node;
+ pol->flags &= ~MPOL_F_LOCAL;
+ } else
+ pol->flags |= MPOL_F_LOCAL;
+ } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ pol->v.preferred_node = first_node(tmp);
+ } else if (!(pol->flags & MPOL_F_LOCAL)) {
+ /* remap from the recorded cpuset mask onto the new one */
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+}
+
+/*
+ * Migrate a policy to a different set of nodes.
+ *
+ * A NULL policy is ignored.  A policy that does not keep a user nodemask
+ * and whose recorded cpuset mask already equals @newmask needs no work;
+ * in every other case the mode's rebind() hook is invoked.
+ */
+static void mpol_rebind_policy(struct mempolicy *pol,
+		const nodemask_t *newmask)
+{
+	if (!pol)
+		return;
+
+	if (mpol_store_user_nodemask(pol) ||
+	    !nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
+		mpol_ops[pol->mode].rebind(pol, newmask);
+}
+
+/*
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ *
+ * @tsk: task whose ->mempolicy is rebound (a NULL policy is a no-op)
+ * @new: nodemask to rebind to
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+ mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind the policy of every vma in @mm to the nodemask @new.
+ *
+ * The caller must hold a reference to @mm.  mm->mmap_sem is taken for
+ * writing around the walk of the vma list.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+	struct vm_area_struct *cur;
+
+	down_write(&mm->mmap_sem);
+	cur = mm->mmap;
+	while (cur) {
+		mpol_rebind_policy(cur->vm_policy, new);
+		cur = cur->vm_next;
+	}
+	up_write(&mm->mmap_sem);
+}
+
+/*
+ * Per-mode operation table, indexed by policy mode.
+ * MPOL_DEFAULT has no create() hook: mpol_new() returns before reaching
+ * the create() call for that mode.
+ */
+static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
+ [MPOL_DEFAULT] = {
+ .rebind = mpol_rebind_default,
+ },
+ [MPOL_INTERLEAVE] = {
+ .create = mpol_new_interleave,
+ .rebind = mpol_rebind_nodemask,
+ },
+ [MPOL_PREFERRED] = {
+ .create = mpol_new_preferred,
+ .rebind = mpol_rebind_preferred,
+ },
+ [MPOL_BIND] = {
+ .create = mpol_new_bind,
+ .rebind = mpol_rebind_nodemask,
+ },
+};
+
+/* Forward declarations for the per-page callbacks used by check_pte_range(). */
+static void gather_stats(struct page *, void *, int pte_dirty);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
+
+/*
+ * Scan through pages checking if pages follow certain conditions.
+ *
+ * Walks the ptes of @pmd covering [@addr, @end) with the pte lock held.
+ * Pages that are not present, have no normal page, are reserved, or pass
+ * the node test are skipped. For each remaining page:
+ * - MPOL_MF_STATS: gather_stats() is called with @private;
+ * - MPOL_MF_MOVE / MPOL_MF_MOVE_ALL: migrate_page_add() is called with
+ * @private as the page list;
+ * - otherwise the walk stops at that page.
+ *
+ * Returns non-zero when the walk stopped before @end, 0 otherwise.
+ */
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
+{
+ pte_t *orig_pte;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ do {
+ struct page *page;
+ int nid;
+
+ /*
+ * 'continue' jumps to the while condition below, which
+ * advances both pte and addr.
+ */
+ if (!pte_present(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
+ continue;
+ /*
+ * The check for PageReserved here is important to avoid
+ * handling zero pages and other pages that may have been
+ * marked special by the system.
+ *
+ * If the PageReserved would not be checked here then f.e.
+ * the location of the zero page could have an influence
+ * on MPOL_MF_STRICT, zero pages would be counted for
+ * the per node stats, and there would be useless attempts
+ * to put zero pages on the migration list.
+ */
+ if (PageReserved(page))
+ continue;
+ nid = page_to_nid(page);
+ /*
+ * Without MPOL_MF_INVERT a page is skipped when its node is
+ * NOT in @nodes; with MPOL_MF_INVERT it is skipped when its
+ * node IS in @nodes.
+ */
+ if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ continue;
+
+ if (flags & MPOL_MF_STATS)
+ gather_stats(page, private, pte_dirty(*pte));
+ else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, private, flags);
+ else
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ /* unlock with the pte returned by the lock call, not the cursor */
+ pte_unmap_unlock(orig_pte, ptl);
+ return addr != end;
+}
+
+/*
+ * Walk the pmd entries of @pud covering [@addr, @end) and run
+ * check_pte_range() on each populated one. Returns -EIO as soon as
+ * check_pte_range() reports an early stop, 0 otherwise.
+ */
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ /* 'continue' still advances pmd and addr via the loop condition */
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ if (check_pte_range(vma, pmd, addr, next, nodes,
+ flags, private))
+ return -EIO;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+/*
+ * Walk the pud entries of @pgd covering [@addr, @end) and run
+ * check_pmd_range() on each populated one. Returns -EIO if any lower
+ * level reports a failure, 0 otherwise.
+ */
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ /* 'continue' still advances pud and addr via the loop condition */
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ if (check_pmd_range(vma, pud, addr, next, nodes,
+ flags, private))
+ return -EIO;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+/*
+ * Top of the page-table walk: visit the pgd entries of vma->vm_mm that
+ * cover [@addr, @end) and run check_pud_range() on each populated one.
+ * Returns -EIO if any lower level reports a failure, 0 otherwise.
+ */
+static inline int check_pgd_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ /* 'continue' still advances pgd and addr via the loop condition */
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ if (check_pud_range(vma, pgd, addr, next, nodes,
+ flags, private))
+ return -EIO;
+ } while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ *
+ * Walks the vmas of @mm overlapping [@start, @end). @private is handed
+ * unchanged to the page-table walk. Returns the first vma of the range,
+ * or an ERR_PTR(): -EFAULT when no vma is found at or after @start or
+ * (unless MPOL_MF_DISCONTIG_OK is set) when the range has a hole, or the
+ * error returned by check_pgd_range().
+ */
+static struct vm_area_struct *
+check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags, void *private)
+{
+ int err;
+ struct vm_area_struct *first, *vma, *prev;
+
+
+ first = find_vma(mm, start);
+ if (!first)
+ return ERR_PTR(-EFAULT);
+ prev = NULL;
+ for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+ /* range extends past the last vma */
+ if (!vma->vm_next && vma->vm_end < end)
+ return ERR_PTR(-EFAULT);
+ /* gap between two consecutive vmas */
+ if (prev && prev->vm_end < vma->vm_start)
+ return ERR_PTR(-EFAULT);
+ }
+ /*
+ * Page tables are walked only for non-hugetlb vmas, and only
+ * when MPOL_MF_STRICT is set or a move was requested on a
+ * migratable vma.
+ */
+ if (!is_vm_hugetlb_page(vma) &&
+ ((flags & MPOL_MF_STRICT) ||
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))) {
+ unsigned long endvma = vma->vm_end;
+
+ /* clamp the walk to the intersection of vma and range */
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ err = check_pgd_range(vma, start, endvma, nodes,
+ flags, private);
+ if (err) {
+ first = ERR_PTR(err);
+ break;
+ }
+ }
+ prev = vma;
+ }
+ return first;
+}
+
+/*
+ * Apply policy to a single VMA.
+ *
+ * When the vma provides a set_policy operation it is called first, and a
+ * non-zero result is returned without touching vma->vm_policy.  Otherwise
+ * a reference is taken on @new, it is installed as the vma's policy, and
+ * the reference on the previous policy is dropped.  Returns 0 on success.
+ */
+static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct mempolicy *prev_policy = vma->vm_policy;
+	int rc;
+
+	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
+		 vma->vm_ops, vma->vm_file,
+		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy) {
+		rc = vma->vm_ops->set_policy(vma, new);
+		if (rc)
+			return rc;
+	}
+
+	mpol_get(new);
+	vma->vm_policy = new;
+	mpol_put(prev_policy);
+	return 0;
+}
+
+/*
+ * Step 2: apply policy to a range and do splits.
+ *
+ * Starting at @vma, visits every vma that begins before @end, splits it
+ * at @start and/or @end where it straddles the boundary, and installs
+ * @new with policy_vma(). Stops at the first error and returns it;
+ * returns 0 when all vmas were updated.
+ */
+static int mbind_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct mempolicy *new)
+{
+ struct vm_area_struct *next;
+ int err;
+
+ err = 0;
+ for (; vma && vma->vm_start < end; vma = next) {
+ /* read the successor before any split of this vma */
+ next = vma->vm_next;
+ if (vma->vm_start < start)
+ err = split_vma(vma->vm_mm, vma, start, 1);
+ if (!err && vma->vm_end > end)
+ err = split_vma(vma->vm_mm, vma, end, 0);
+ if (!err)
+ err = policy_vma(vma, new);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/*
+ * Keep the PF_MEMPOLICY bit of p->flags in sync with p->mempolicy: the
+ * bit is set when the task has a mempolicy and cleared when it has none,
+ * so allocation hot paths can test a flag bit (possibly together with
+ * other PF_* bits) instead of the pointer.
+ *
+ * Changing the flags of a task that is already visible on the task list
+ * is not safe.  Callers outside this file must therefore only pass a
+ * freshly forked child that is not yet visible -- hence the odd name.
+ * Passing current is also safe; that is what the static wrapper
+ * mpol_set_task_struct_flag() does for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (!p->mempolicy) {
+		p->flags &= ~PF_MEMPOLICY;
+		return;
+	}
+	p->flags |= PF_MEMPOLICY;
+}
+
+/* Update the PF_MEMPOLICY flag of the current task to match its mempolicy. */
+static void mpol_set_task_struct_flag(void)
+{
+ mpol_fix_fork_child_flag(current);
+}
+
+/*
+ * Set the process memory policy.
+ *
+ * Builds a policy from @mode, @flags and @nodes with mpol_new() and
+ * installs it as current->mempolicy, dropping the reference on the
+ * previous one. Returns 0 on success or the negative errno encoded in
+ * mpol_new()'s ERR_PTR.
+ */
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
+{
+ struct mempolicy *new;
+ struct mm_struct *mm = current->mm;
+
+ new = mpol_new(mode, flags, nodes);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ /*
+ * prevent changing our mempolicy while show_numa_maps()
+ * is using it.
+ * Note: do_set_mempolicy() can be called at init time
+ * with no 'mm'.
+ */
+ if (mm)
+ down_write(&mm->mmap_sem);
+ mpol_put(current->mempolicy);
+ current->mempolicy = new;
+ mpol_set_task_struct_flag();
+ /* restart interleaving at the first node of a non-empty mask */
+ if (new && new->mode == MPOL_INTERLEAVE &&
+ nodes_weight(new->v.nodes))
+ current->il_next = first_node(new->v.nodes);
+ if (mm)
+ up_write(&mm->mmap_sem);
+
+ return 0;
+}
+
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ *
+ * @p: policy to describe; must not be NULL
+ * @nodes: output mask, always cleared first
+ *
+ * The mask stays empty for &default_policy and for a preferred policy
+ * using local allocation. Any mode other than BIND, INTERLEAVE or
+ * PREFERRED is a BUG().
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
+{
+ nodes_clear(*nodes);
+ if (p == &default_policy)
+ return;
+
+ switch (p->mode) {
+ case MPOL_BIND:
+ /* Fall through */
+ case MPOL_INTERLEAVE:
+ *nodes = p->v.nodes;
+ break;
+ case MPOL_PREFERRED:
+ if (!(p->flags & MPOL_F_LOCAL))
+ node_set(p->v.preferred_node, *nodes);
+ /* else return empty node mask for local allocation */
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Return the node id of the page backing @addr in @mm.
+ *
+ * One page is looked up with get_user_pages() at the page-aligned
+ * address.  A negative result from get_user_pages() is returned as is;
+ * otherwise the page's node id is returned and the page reference is
+ * dropped again.
+ */
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
+{
+	struct page *page;
+	int ret;
+
+	ret = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &page, NULL);
+	if (ret < 0)
+		return ret;
+
+	ret = page_to_nid(page);
+	put_page(page);
+	return ret;
+}
+
+/*
+ * Retrieve NUMA policy.
+ *
+ * @policy: output; receives the policy mode (with user-visible mode flags
+ * OR-ed in), or a node id when MPOL_F_NODE is set
+ * @nmask: output nodemask; dereferenced unconditionally for
+ * MPOL_F_MEMS_ALLOWED, otherwise filled only when non-NULL
+ * @addr: address to query with MPOL_F_ADDR; must be 0 without it
+ * @flags: any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
+ *
+ * Returns 0 on success, -EINVAL for bad flag/address combinations,
+ * -EFAULT when no vma covers @addr, or the error from lookup_node().
+ */
+static long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
+{
+ int err;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct mempolicy *pol = current->mempolicy;
+
+ cpuset_update_task_memory_state();
+ if (flags &
+ ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
+ return -EINVAL;
+
+ /* MPOL_F_MEMS_ALLOWED is exclusive with the other two flags */
+ if (flags & MPOL_F_MEMS_ALLOWED) {
+ if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
+ return -EINVAL;
+ *policy = 0; /* just so it's initialized */
+ *nmask = cpuset_current_mems_allowed;
+ return 0;
+ }
+
+ if (flags & MPOL_F_ADDR) {
+ /*
+ * Do NOT fall back to task policy if the
+ * vma/shared policy at addr is NULL. We
+ * want to return MPOL_DEFAULT in this case.
+ */
+ down_read(&mm->mmap_sem);
+ vma = find_vma_intersection(mm, addr, addr+1);
+ if (!vma) {
+ up_read(&mm->mmap_sem);
+ return -EFAULT;
+ }
+ if (vma->vm_ops && vma->vm_ops->get_policy)
+ pol = vma->vm_ops->get_policy(vma, addr);
+ else
+ pol = vma->vm_policy;
+ } else if (addr)
+ return -EINVAL;
+
+ if (!pol)
+ pol = &default_policy; /* indicates default behavior */
+
+ if (flags & MPOL_F_NODE) {
+ if (flags & MPOL_F_ADDR) {
+ /* report the node of the page backing addr */
+ err = lookup_node(mm, addr);
+ if (err < 0)
+ goto out;
+ *policy = err;
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_INTERLEAVE) {
+ /* report the task's next interleave node */
+ *policy = current->il_next;
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+ } else {
+ *policy = pol == &default_policy ? MPOL_DEFAULT :
+ pol->mode;
+ /*
+ * Internal mempolicy flags must be masked off before exposing
+ * the policy to userspace.
+ */
+ *policy |= (pol->flags & MPOL_MODE_FLAGS);
+ }
+
+ /*
+ * vma is non-NULL only while mmap_sem is held (MPOL_F_ADDR path);
+ * clearing it here keeps the 'out' path from unlocking twice.
+ */
+ if (vma) {
+ up_read(&current->mm->mmap_sem);
+ vma = NULL;
+ }
+
+ err = 0;
+ if (nmask)
+ get_policy_nodemask(pol, nmask);
+
+ out:
+ mpol_cond_put(pol);
+ if (vma)
+ up_read(&current->mm->mmap_sem);
+ return err;
+}
+
+#ifdef CONFIG_MIGRATION
+/*
+ * page migration
+ */
+/*
+ * Queue @page on @pagelist for migration.
+ *
+ * Unless MPOL_MF_MOVE_ALL is set, a page whose mapcount is not exactly 1
+ * is left alone so that pages shared with others are not migrated.  The
+ * page is added to the tail of the list only when isolate_lru_page()
+ * returns 0.
+ */
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+		unsigned long flags)
+{
+	if (!(flags & MPOL_MF_MOVE_ALL) && page_mapcount(page) != 1)
+		return;
+
+	if (isolate_lru_page(page))
+		return;
+
+	list_add_tail(&page->lru, pagelist);
+}
+
+/*
+ * Page allocation callback passed to migrate_pages() by migrate_to_node():
+ * allocates one order-0 GFP_HIGHUSER_MOVABLE page on @node. @page and @x
+ * are not used.
+ */
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+{
+ return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
+
+/*
+ * Migrate the pages of @mm that sit on node @source over to node @dest.
+ *
+ * check_range() is run from the start of the first vma up to TASK_SIZE
+ * with a mask holding only @source, collecting candidate pages on a
+ * local list; its return value is not examined.  Returns 0 when nothing
+ * was collected, otherwise the result of migrate_pages() (an error or
+ * the number of pages not migrated).
+ */
+static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+		int flags)
+{
+	nodemask_t source_only;
+	LIST_HEAD(to_move);
+
+	nodes_clear(source_only);
+	node_set(source, source_only);
+
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &source_only,
+		    flags | MPOL_MF_DISCONTIG_OK, &to_move);
+
+	if (list_empty(&to_move))
+		return 0;
+
+	return migrate_pages(&to_move, new_node_page, dest);
+}
+
+/*
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
+ *
+ * @mm: address space whose pages are moved; mmap_sem is taken for reading
+ * @from_nodes: source nodes
+ * @to_nodes: destination nodes; each source is mapped with node_remap()
+ * @flags: passed on to migrate_vmas() and migrate_to_node()
+ *
+ * Returns the number of page that could not be moved, or a negative
+ * error from migrate_prep(), migrate_vmas() or migrate_to_node().
+ */
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ int busy = 0;
+ int err;
+ nodemask_t tmp;
+
+ err = migrate_prep();
+ if (err)
+ return err;
+
+ down_read(&mm->mmap_sem);
+
+ err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+ if (err)
+ goto out;
+
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
+
+ tmp = *from_nodes;
+ while (!nodes_empty(tmp)) {
+ int s,d;
+ int source = -1;
+ int dest = 0;
+
+ for_each_node_mask(s, tmp) {
+ d = node_remap(s, *from_nodes, *to_nodes);
+ if (s == d)
+ continue;
+
+ source = s; /* Node moved. Memorize */
+ dest = d;
+
+ /* dest not in remaining from nodes? */
+ if (!node_isset(dest, tmp))
+ break;
+ }
+ /* no remaining node maps to a different node: done */
+ if (source == -1)
+ break;
+
+ node_clear(source, tmp);
+ err = migrate_to_node(mm, source, dest, flags);
+ /* positive results are counts of pages left behind */
+ if (err > 0)
+ busy += err;
+ if (err < 0)
+ break;
+ }
+out:
+ up_read(&mm->mmap_sem);
+ if (err < 0)
+ return err;
+ return busy;
+
+}
+
+/*
+ * Allocate a new page for page migration based on vma policy.
+ * Start assuming that page is mapped by vma pointed to by @private.
+ * Search forward from there, if not. N.B., this assumes that the
+ * list of pages handed to migrate_pages()--which is how we get here--
+ * is in virtual address order.
+ *
+ * @page: page being migrated
+ * @private: struct vm_area_struct pointer cast to unsigned long
+ * @x: unused
+ */
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ struct vm_area_struct *vma = (struct vm_area_struct *)private;
+ unsigned long uninitialized_var(address);
+
+ /* find the first vma, from 'private' onwards, that maps the page */
+ while (vma) {
+ address = page_address_in_vma(page, vma);
+ if (address != -EFAULT)
+ break;
+ vma = vma->vm_next;
+ }
+
+ /*
+ * if !vma, alloc_page_vma() will use task or system default policy
+ */
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+}
+#else
+
+/* !CONFIG_MIGRATION stub: nothing is ever queued for migration. */
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+}
+
+/* !CONFIG_MIGRATION stub: page migration is unavailable, always -ENOSYS. */
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ return -ENOSYS;
+}
+
+/* !CONFIG_MIGRATION stub: never allocates a page, always returns NULL. */
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ return NULL;
+}
+#endif
+
+/*
+ * Apply a memory policy to the address range [start, start + len) of the
+ * current process.
+ *
+ * @start:      page-aligned start address
+ * @len:        length in bytes; rounded up to a multiple of PAGE_SIZE
+ * @mode:       policy mode handed to mpol_new()
+ * @mode_flags: mode flags handed to mpol_new()
+ * @nmask:      nodemask for the policy and for the range check
+ * @flags:      any of MPOL_MF_STRICT, MPOL_MF_MOVE, MPOL_MF_MOVE_ALL
+ *
+ * Returns 0 on success (including an empty range), -EINVAL for bad
+ * flags, an unaligned start or a wrapping range, -EPERM when
+ * MPOL_MF_MOVE_ALL is requested without CAP_SYS_NICE, -EIO when
+ * MPOL_MF_STRICT is set and pages could not be migrated, or an error
+ * from mpol_new(), migrate_prep(), check_range() or mbind_range().
+ *
+ * Every exit taken after mpol_new() has succeeded goes through mpol_out
+ * so that the reference on the new policy is dropped.
+ */
+static long do_mbind(unsigned long start, unsigned long len,
+		unsigned short mode, unsigned short mode_flags,
+		nodemask_t *nmask, unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct mempolicy *new;
+	unsigned long end;
+	int err;
+	LIST_HEAD(pagelist);
+
+	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
+				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (mode == MPOL_DEFAULT)
+		flags &= ~MPOL_MF_STRICT;
+
+	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	end = start + len;
+
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+
+	new = mpol_new(mode, mode_flags, nmask);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
+	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+		 start, start + len, mode, mode_flags,
+		 nmask ? nodes_addr(*nmask)[0] : -1);
+
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+		err = migrate_prep();
+		if (err)
+			goto mpol_out;	/* do not leak 'new' */
+	}
+	down_write(&mm->mmap_sem);
+	/* MPOL_MF_INVERT: look for pages that are NOT on nmask */
+	vma = check_range(mm, start, end, nmask,
+			  flags | MPOL_MF_INVERT, &pagelist);
+
+	err = PTR_ERR(vma);
+	if (!IS_ERR(vma)) {
+		int nr_failed = 0;
+
+		err = mbind_range(vma, start, end, new);
+
+		if (!list_empty(&pagelist))
+			nr_failed = migrate_pages(&pagelist, new_vma_page,
+						  (unsigned long)vma);
+
+		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+
+	up_write(&mm->mmap_sem);
+mpol_out:
+	mpol_put(new);
+	return err;
+}
+
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/*
+ * Copy a node mask from user space.
+ *
+ * @nodes: kernel mask to fill; always cleared first
+ * @nmask: user bitmap, may be NULL
+ * @maxnode: one more than the number of bits to read from @nmask
+ *
+ * Returns 0 on success (also when @nmask is NULL or maxnode is 1, in
+ * which case *nodes stays empty), -EINVAL when the request is too large
+ * or sets bits beyond MAX_NUMNODES, -EFAULT on a user access failure.
+ */
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long k;
+ unsigned long nlongs;
+ unsigned long endmask;
+
+ /* from here on maxnode is the number of bits to read */
+ --maxnode;
+ nodes_clear(*nodes);
+ if (maxnode == 0 || !nmask)
+ return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
+
+ nlongs = BITS_TO_LONGS(maxnode);
+ /* endmask selects the valid bits of the last word */
+ if ((maxnode % BITS_PER_LONG) == 0)
+ endmask = ~0UL;
+ else
+ endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ /* When the user specified more nodes than supported just check
+ if the non supported part is all zero. */
+ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ if (nlongs > PAGE_SIZE/sizeof(long))
+ return -EINVAL;
+ for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+ unsigned long t;
+ if (get_user(t, nmask + k))
+ return -EFAULT;
+ if (k == nlongs - 1) {
+ if (t & endmask)
+ return -EINVAL;
+ } else if (t)
+ return -EINVAL;
+ }
+ /* only the supported part is copied below, all bits valid */
+ nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+ endmask = ~0UL;
+ }
+
+ if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+ return -EFAULT;
+ nodes_addr(*nodes)[nlongs-1] &= endmask;
+ return 0;
+}
+
+/*
+ * Copy a kernel node mask to user space
+ *
+ * @mask: user buffer
+ * @maxnode: one more than the number of bits the user buffer holds
+ * @nodes: kernel mask to copy out
+ *
+ * The user size is maxnode-1 bits rounded up to a multiple of 64. When
+ * it exceeds the size of the kernel mask, the excess user bytes are
+ * zeroed and only the kernel mask is copied. Returns 0, -EINVAL when
+ * the user size exceeds PAGE_SIZE bytes, or -EFAULT on a user access
+ * failure.
+ */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+ nodemask_t *nodes)
+{
+ unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+ const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+ if (copy > nbytes) {
+ if (copy > PAGE_SIZE)
+ return -EINVAL;
+ if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+ return -EFAULT;
+ copy = nbytes;
+ }
+ return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+/*
+ * mbind system call: split @mode into the policy mode and its
+ * MPOL_MODE_FLAGS bits, validate them, fetch the user nodemask and hand
+ * everything to do_mbind(). Returns -EINVAL for a mode >= MPOL_MAX or
+ * when both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, the
+ * error from get_nodes(), or the result of do_mbind().
+ */
+SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
+ unsigned long, mode, unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned, flags)
+{
+ nodemask_t nodes;
+ int err;
+ unsigned short mode_flags;
+
+ mode_flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if (mode >= MPOL_MAX)
+ return -EINVAL;
+ /* static and relative nodes are mutually exclusive */
+ if ((mode_flags & MPOL_F_STATIC_NODES) &&
+ (mode_flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_mbind(start, len, mode, mode_flags, &nodes, flags);
+}
+
+/*
+ * Set the process memory policy.
+ *
+ * Splits @mode into the policy mode and its MPOL_MODE_FLAGS bits,
+ * validates them, fetches the user nodemask and calls do_set_mempolicy().
+ * Returns -EINVAL for a mode outside [0, MPOL_MAX) or when both
+ * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, the error from
+ * get_nodes(), or the result of do_set_mempolicy().
+ */
+SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
+ unsigned long, maxnode)
+{
+ int err;
+ nodemask_t nodes;
+ unsigned short flags;
+
+ flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ /* the unsigned compare also rejects negative modes */
+ if ((unsigned int)mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_set_mempolicy(mode, flags, &nodes);
+}
+
+/*
+ * migrate_pages system call: move the pages of process @pid (0 means the
+ * caller) from the nodes in @old_nodes to the nodes in @new_nodes.
+ *
+ * Returns the result of do_migrate_pages() on success, or:
+ *   the error from get_nodes() for a bad user nodemask,
+ *   -ESRCH  when no task with that pid exists,
+ *   -EINVAL when the task has no mm or @new_nodes contains a node that is
+ *           not in node_states[N_HIGH_MEMORY],
+ *   -EPERM  when the caller may not modify the target or may not use the
+ *           requested nodes,
+ *   the error from security_task_movememory().
+ *
+ * A reference on the task is taken while tasklist_lock is still held,
+ * because the task is dereferenced (uid checks, cpuset lookup, security
+ * hook) after the lock has been dropped.
+ */
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+		const unsigned long __user *, old_nodes,
+		const unsigned long __user *, new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	/* pin the task: it is used below without tasklist_lock */
+	get_task_struct(task);
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm) {
+		err = -EINVAL;
+		goto out_put_task;
+	}
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = security_task_movememory(task);
+	if (err)
+		goto out;
+
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+out:
+	mmput(mm);
+out_put_task:
+	put_task_struct(task);
+	return err;
+}
+
+
+/*
+ * Retrieve NUMA policy.
+ *
+ * Calls do_get_mempolicy() and copies the results to user space: the
+ * policy value to @policy when it is non-NULL, the nodemask to @nmask
+ * when it is non-NULL. Returns -EINVAL when @nmask is given but
+ * @maxnode is smaller than MAX_NUMNODES, -EFAULT when @policy cannot be
+ * written, or the error from do_get_mempolicy() or copy_nodes_to_user().
+ */
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ unsigned long __user *, nmask, unsigned long, maxnode,
+ unsigned long, addr, unsigned long, flags)
+{
+ int err;
+ int uninitialized_var(pval);
+ nodemask_t nodes;
+
+ if (nmask != NULL && maxnode < MAX_NUMNODES)
+ return -EINVAL;
+
+ /* a kernel-side mask is always passed, even when nmask is NULL */
+ err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+ if (err)
+ return err;
+
+ if (policy && put_user(pval, policy))
+ return -EFAULT;
+
+ if (nmask)
+ err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+
+/*
+ * Compat entry point for get_mempolicy.
+ *
+ * When @nmask is given, the native syscall writes its bitmap into a
+ * scratch area obtained from compat_alloc_user_space(); that bitmap is
+ * then read back, the caller's compat bitmap is zeroed, and the bits are
+ * stored with compat_put_bitmap().
+ * NOTE(review): on a failed copy the return value is the OR of the
+ * helpers' return values, not necessarily a negative errno -- confirm
+ * this is intended.
+ */
+asmlinkage long compat_sys_get_mempolicy(int __user *policy,
+ compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode,
+ compat_ulong_t addr, compat_ulong_t flags)
+{
+ long err;
+ unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+ /* cap the bit count at what the kernel mask can hold */
+ nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+ if (nmask)
+ nm = compat_alloc_user_space(alloc_size);
+
+ err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
+
+ if (!err && nmask) {
+ err = copy_from_user(bm, nm, alloc_size);
+ /* ensure entire bitmap is zeroed */
+ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
+ err |= compat_put_bitmap(nmask, bm, nr_bits);
+ }
+
+ return err;
+}
+
+/*
+ * 32-bit compat entry point for set_mempolicy(2): widen the compat node mask
+ * into an unsigned long bitmap staged in the compat user-space scratch area,
+ * then hand over to the native syscall.
+ */
+asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
+				     compat_ulong_t maxnode)
+{
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	unsigned long alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask) {
+		long err = compat_get_bitmap(bm, nmask, nr_bits);
+
+		nm = compat_alloc_user_space(alloc_size);
+		err |= copy_to_user(nm, bm, alloc_size);
+		if (err)
+			return -EFAULT;
+	}
+
+	return sys_set_mempolicy(mode, nm, nr_bits+1);
+}
+
+/*
+ * 32-bit compat entry point for mbind(2): widen the compat node mask into a
+ * native nodemask staged in the compat user-space scratch area, then hand
+ * over to the native syscall.
+ */
+asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
+			     compat_ulong_t mode, compat_ulong_t __user *nmask,
+			     compat_ulong_t maxnode, compat_ulong_t flags)
+{
+	nodemask_t bm;
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	unsigned long alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask) {
+		long err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
+
+		nm = compat_alloc_user_space(alloc_size);
+		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
+		if (err)
+			return -EFAULT;
+	}
+
+	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
+}
+
+#endif
+
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task whose mempolicy is the fallback when the vma supplies none
+ * @vma  - virtual memory area whose policy is sought; may be NULL
+ * @addr - address in @vma for shared policy lookup
+ *
+ * Returns the effective policy for @vma at @addr, in this order of
+ * preference: the vma's own policy (obtained through its get_policy() vm_op
+ * when it has one, otherwise vma->vm_policy), then @task's mempolicy, then
+ * the system default policy.
+ *
+ * Task mempolicies and non-shared vma policies are protected by the task's
+ * mmap_sem, which the caller must hold for read.
+ * Shared policies [those marked as MPOL_F_SHARED] carry an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task.  The caller is responsible for dropping that
+ * extra reference.
+ */
+static struct mempolicy *get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *task_pol = task->mempolicy;
+	struct mempolicy *vma_pol = NULL;
+
+	if (vma) {
+		if (vma->vm_ops && vma->vm_ops->get_policy)
+			vma_pol = vma->vm_ops->get_policy(vma, addr);
+		else
+			vma_pol = vma->vm_policy;
+	}
+
+	if (vma_pol)
+		return vma_pol;
+	if (task_pol)
+		return task_pol;
+	return &default_policy;
+}
+
+/*
+ * Return the nodemask that page allocation should filter nodes with for
+ * @policy, or NULL when no filtering applies.
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+{
+	/* Only MPOL_BIND restricts the allocation by nodemask */
+	if (likely(policy->mode != MPOL_BIND))
+		return NULL;
+
+	/* Lower zones don't get a nodemask applied for MPOL_BIND */
+	if (gfp_zone(gfp) < policy_zone)
+		return NULL;
+
+	if (!cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+		return NULL;
+
+	return &policy->v.nodes;
+}
+
+/*
+ * Return the zonelist, selected by @gfp, of the node that @policy starts
+ * allocating from.  The local node is used unless the policy says otherwise.
+ */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+{
+	int nd = numa_node_id();
+
+	switch (policy->mode) {
+	case MPOL_INTERLEAVE:	/* should not happen */
+		break;
+
+	case MPOL_PREFERRED:
+		/* MPOL_F_LOCAL means "prefer the local node": keep nd as is */
+		if (policy->flags & MPOL_F_LOCAL)
+			break;
+		nd = policy->v.preferred_node;
+		break;
+
+	case MPOL_BIND:
+		/*
+		 * MPOL_BIND allocations normally start at the local node and
+		 * are filtered by the nodemask.  With __GFP_THISNODE and a
+		 * local node that is not in the mask, start from the first
+		 * node of the mask instead.
+		 */
+		if (unlikely(gfp & __GFP_THISNODE) &&
+				unlikely(!node_isset(nd, policy->v.nodes)))
+			nd = first_node(policy->v.nodes);
+		break;
+
+	default:
+		BUG();
+	}
+
+	return node_zonelist(nd, gfp);
+}
+
+/*
+ * Do dynamic interleaving for a process: hand out the node recorded in
+ * current->il_next and advance il_next to the following node of the policy
+ * mask, wrapping around to the first one.
+ */
+static unsigned interleave_nodes(struct mempolicy *policy)
+{
+	struct task_struct *me = current;
+	unsigned nid = me->il_next;
+	unsigned next = next_node(nid, policy->v.nodes);
+
+	/* Ran off the end of the mask: wrap around */
+	if (next >= MAX_NUMNODES)
+		next = first_node(policy->v.nodes);
+
+	/* Leave il_next untouched when the mask yields no node at all */
+	if (next < MAX_NUMNODES)
+		me->il_next = next;
+
+	return nid;
+}
+
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ * @policy must be protected from freeing by the caller.  If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change its policy.  The system default policy requires no
+ * such protection.
+ *
+ * Returns a node id; the local node is used for a NULL or MPOL_F_LOCAL
+ * policy, and as the fallback when an MPOL_BIND mask matches no zone.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	if (!policy || policy->flags & MPOL_F_LOCAL)
+		return numa_node_id();
+
+	switch (policy->mode) {
+	case MPOL_PREFERRED:
+		/*
+		 * handled MPOL_F_LOCAL above
+		 */
+		return policy->v.preferred_node;
+
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND: {
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		struct zonelist *zonelist;
+		struct zone *zone = NULL;
+		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		(void)first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes,
+							&zone);
+		/*
+		 * No zone usable for GFP_KERNEL may exist on any node of the
+		 * mask (e.g. nodes holding only higher zones), in which case
+		 * @zone comes back NULL: fall back to the local node rather
+		 * than dereference it.
+		 */
+		return zone ? zone->node : numa_node_id();
+	}
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Do static interleaving for a VMA with known offset: pick the
+ * (@off modulo number-of-nodes)'th node set in the policy mask.
+ * Falls back to the local node when the mask is empty.
+ */
+static unsigned offset_il_node(struct mempolicy *pol,
+		struct vm_area_struct *vma, unsigned long off)
+{
+	unsigned nnodes = nodes_weight(pol->v.nodes);
+	unsigned target;
+	int nid = -1;
+	int c;
+
+	if (nnodes == 0)
+		return numa_node_id();
+
+	target = (unsigned int)off % nnodes;
+
+	/* Step through the set bits: target + 1 steps starting before bit 0 */
+	for (c = 0; c <= target; c++)
+		nid = next_node(nid, pol->v.nodes);
+
+	return nid;
+}
+
+/*
+ * Determine a node number for interleave: static (offset based) when a vma
+ * is available, dynamic (per-task round robin) otherwise.
+ */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	unsigned long off;
+
+	if (!vma)
+		return interleave_nodes(pol);
+
+	/*
+	 * vm_pgoff counts small pages.  For small pages @shift equals
+	 * PAGE_SHIFT and the first shift below is a no-op; for huge pages it
+	 * drops the always-zero low bits so that the offset is expressed in
+	 * units of the (huge) page size.
+	 */
+	BUG_ON(shift < PAGE_SHIFT);
+	off = (vma->vm_pgoff >> (shift - PAGE_SHIFT)) +
+		((addr - vma->vm_start) >> shift);
+
+	return offset_il_node(pol, vma, off);
+}
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
+ *
+ * Returns a zonelist suitable for a huge page allocation.  The effective
+ * policy is handed back through @mpol so the caller can conditionally unref
+ * it after the allocation.  For an MPOL_BIND policy, *@nodemask is pointed
+ * at the policy's node mask for filtering the zonelist; otherwise it is
+ * set to NULL.
+ */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+				gfp_t gfp_flags, struct mempolicy **mpol,
+				nodemask_t **nodemask)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
+
+	*mpol = pol;
+	*nodemask = NULL;	/* assume !MPOL_BIND */
+
+	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+		unsigned nid = interleave_nid(pol, vma, addr,
+					huge_page_shift(hstate_vma(vma)));
+
+		return node_zonelist(nid, gfp_flags);
+	}
+
+	zl = policy_zonelist(gfp_flags, pol);
+	if (pol->mode == MPOL_BIND)
+		*nodemask = &pol->v.nodes;
+	return zl;
+}
+#endif
+
+/*
+ * Allocate a page in interleaved policy.
+ * This has its own path because of the accounting: NUMA_INTERLEAVE_HIT is
+ * bumped when the page came from the first zone of node @nid's zonelist.
+ */
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+					unsigned nid)
+{
+	struct zonelist *zl = node_zonelist(nid, gfp);
+	struct page *page = __alloc_pages(gfp, order, zl);
+
+	if (!page)
+		return NULL;
+
+	if (page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
+		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
+
+	return page;
+}
+
+/**
+ * 	alloc_page_vma	- Allocate a page for a VMA.
+ *
+ * 	@gfp:
+ *      %GFP_USER    user allocation.
+ *      %GFP_KERNEL  kernel allocations,
+ *      %GFP_HIGHMEM highmem/user allocations,
+ *      %GFP_FS      allocation should not call back into a file system.
+ *      %GFP_ATOMIC  don't sleep.
+ *
+ * 	@vma:  Pointer to VMA or NULL if not available.
+ *	@addr: Virtual Address of the allocation. Must be inside the VMA.
+ *
+ * 	Allocates a single page from the kernel page pool, applying the NUMA
+ * 	policy associated with the VMA or, failing that, with the current
+ * 	process.  When @vma is not NULL the caller must hold down_read on the
+ * 	mmap_sem of the VMA's mm_struct so that the VMA cannot go away.
+ * 	Should be used for all allocations of pages that will be mapped into
+ * 	user space.  Returns NULL when no page can be allocated.
+ */
+struct page *
+alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
+	struct page *page;
+	int drop_ref;
+
+	cpuset_update_task_memory_state();
+
+	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+		unsigned nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+
+		/* The policy is no longer needed once the node is chosen */
+		mpol_cond_put(pol);
+		return alloc_page_interleave(gfp, 0, nid);
+	}
+
+	zl = policy_zonelist(gfp, pol);
+
+	/*
+	 * Ref counted shared policies (slow path) must be released after the
+	 * allocation; default and task policies (fast path) need nothing.
+	 */
+	drop_ref = mpol_needs_cond_ref(pol);
+	page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+	if (unlikely(drop_ref))
+		__mpol_put(pol);
+
+	return page;
+}
+
+/**
+ * 	alloc_pages_current - Allocate pages.
+ *
+ *	@gfp:
+ *		%GFP_USER   user allocation,
+ *      	%GFP_KERNEL kernel allocation,
+ *      	%GFP_HIGHMEM highmem allocation,
+ *      	%GFP_FS     don't call back into a file system.
+ *      	%GFP_ATOMIC don't sleep.
+ *	@order: Power of two of allocation size in pages. 0 is a single page.
+ *
+ *	Allocate pages from the kernel page pool.  Outside interrupt context
+ *	(and without __GFP_THISNODE) the current process' NUMA policy is
+ *	applied; otherwise the system default policy is used.
+ *	Returns NULL when no page can be allocated.
+ *
+ *	cpuset_update_task_memory_state() is only called when
+ *	1) it's ok to take cpuset_sem (the allocation can WAIT), and
+ *	2) we are allocating for the current task (not in interrupt).
+ */
+struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+{
+	struct mempolicy *pol = current->mempolicy;
+	struct zonelist *zl;
+	nodemask_t *mask;
+
+	if ((gfp & __GFP_WAIT) && !in_interrupt())
+		cpuset_update_task_memory_state();
+
+	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
+		pol = &default_policy;
+
+	/*
+	 * Neither current->mempolicy nor the system default_policy needs
+	 * reference counting here.
+	 */
+	if (pol->mode == MPOL_INTERLEAVE) {
+		unsigned nid = interleave_nodes(pol);
+
+		return alloc_page_interleave(gfp, order, nid);
+	}
+
+	zl = policy_zonelist(gfp, pol);
+	mask = policy_nodemask(gfp, pol);
+	return __alloc_pages_nodemask(gfp, order, zl, mask);
+}
+EXPORT_SYMBOL(alloc_pages_current);
+
+/*
+ * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after their cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+
+/*
+ * Slow path of a mempolicy duplicate.
+ * Returns a copy of @old holding a single reference, or ERR_PTR(-ENOMEM).
+ * If the current cpuset is being rebound, @old is first rebound to the
+ * task's allowed memory nodes so that the copy inherits the new binding.
+ */
+struct mempolicy *__mpol_dup(struct mempolicy *old)
+{
+	struct mempolicy *copy;
+
+	copy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (copy == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (current_cpuset_is_being_rebound()) {
+		nodemask_t allowed = cpuset_mems_allowed(current);
+
+		mpol_rebind_policy(old, &allowed);
+	}
+
+	*copy = *old;
+	atomic_set(&copy->refcnt, 1);
+	return copy;
+}
+
+/*
+ * If *frompol needs [has] an extra ref, copy *frompol into *tompol, clear
+ * the MPOL_F_SHARED flag in the copy so that it no longer asks for a
+ * conditional unref, and [NOTE!!!] drop the extra ref on *frompol.  It is
+ * not safe to reference *frompol directly after return; use the returned
+ * pointer instead.  A policy without an extra ref is returned as is.
+ *
+ * This allows one policy lookup to serve, e.g., multiple allocations even
+ * when the lookup returned a policy with an extra ref.
+ * shmem_readahead needs this.
+ */
+struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
+						struct mempolicy *frompol)
+{
+	if (mpol_needs_cond_ref(frompol)) {
+		*tompol = *frompol;
+		tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
+		__mpol_put(frompol);
+		return tompol;
+	}
+
+	return frompol;
+}
+
+/*
+ * Return 1 when @a and @b carry the same flags and, if @a stores a user
+ * nodemask, the same user nodemask; 0 otherwise.
+ */
+static int mpol_match_intent(const struct mempolicy *a,
+			     const struct mempolicy *b)
+{
+	if (a->flags != b->flags)
+		return 0;
+
+	if (mpol_store_user_nodemask(a))
+		return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
+
+	return 1;
+}
+
+/*
+ * Slow path of a mempolicy comparison.
+ * Returns non-zero when @a and @b describe the same policy.  A NULL policy
+ * never compares equal here.
+ */
+int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
+{
+	if (!a || !b || a->mode != b->mode)
+		return 0;
+
+	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+		return 0;
+
+	switch (a->mode) {
+	case MPOL_PREFERRED:
+		if (a->v.preferred_node != b->v.preferred_node)
+			return 0;
+		return a->flags == b->flags;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		return nodes_equal(a->v.nodes, b->v.nodes);
+
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+/*
+ * Shared memory backing store policy support.
+ *
+ * Remember policies even when nobody has shared memory mapped.
+ * The policies are kept in Red-Black tree linked from the inode.
+ * They are protected by the sp->lock spinlock, which should be held
+ * for any accesses to the tree.
+ */
+
+/*
+ * Look up the first (lowest) node of @sp intersecting [start, end).
+ * Returns NULL when nothing overlaps.
+ * Caller holds sp->lock.
+ */
+static struct sp_node *
+sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+{
+	struct rb_node *hit = sp->root.rb_node;
+	struct rb_node *prev;
+
+	/* Descend the tree until some node overlapping the range is found */
+	while (hit) {
+		struct sp_node *cur = rb_entry(hit, struct sp_node, nd);
+
+		if (start >= cur->end)
+			hit = hit->rb_right;
+		else if (end <= cur->start)
+			hit = hit->rb_left;
+		else
+			break;
+	}
+	if (!hit)
+		return NULL;
+
+	/* Step back while the predecessor still reaches past @start */
+	while ((prev = rb_prev(hit)) != NULL) {
+		if (rb_entry(prev, struct sp_node, nd)->end <= start)
+			break;
+		hit = prev;
+	}
+
+	return rb_entry(hit, struct sp_node, nd);
+}
+
+/*
+ * Insert a new shared policy node into the tree.  The node must not be
+ * contained in the range of an existing node (BUG otherwise).
+ * Caller holds sp->lock.
+ */
+static void sp_insert(struct shared_policy *sp, struct sp_node *new)
+{
+	struct rb_node **link = &sp->root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct sp_node *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct sp_node, nd);
+
+		if (new->start < cur->start)
+			link = &parent->rb_left;
+		else if (new->end > cur->end)
+			link = &parent->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->nd, parent, link);
+	rb_insert_color(&new->nd, &sp->root);
+	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
+		 new->policy ? new->policy->mode : 0);
+}
+
+/*
+ * Find the shared policy covering index @idx.
+ * Returns the policy with an extra reference taken, or NULL when the tree
+ * is empty or no node covers @idx.
+ */
+struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+	struct mempolicy *pol = NULL;
+	struct sp_node *sn;
+
+	/* Empty tree: nothing to find, skip taking the lock */
+	if (!sp->root.rb_node)
+		return NULL;
+
+	spin_lock(&sp->lock);
+	sn = sp_lookup(sp, idx, idx + 1);
+	if (sn) {
+		pol = sn->policy;
+		mpol_get(pol);
+	}
+	spin_unlock(&sp->lock);
+
+	return pol;
+}
+
+/*
+ * Unlink node @n from @sp's tree, drop its policy reference and free it.
+ * @n must not be used afterwards.
+ */
+static void sp_delete(struct shared_policy *sp, struct sp_node *n)
+{
+	/* Same "start-end" layout as the message printed by sp_insert() */
+	pr_debug("deleting %lx-%lx\n", n->start, n->end);
+	rb_erase(&n->nd, &sp->root);
+	mpol_put(n->policy);
+	kmem_cache_free(sn_cache, n);
+}
+
+/*
+ * Allocate a tree node covering [start, end) that refers to @pol.
+ * Takes a reference on @pol and marks it MPOL_F_SHARED.  Returns NULL if
+ * the node cannot be allocated (in which case @pol is left untouched).
+ * Allocates with GFP_KERNEL.
+ */
+static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+				struct mempolicy *pol)
+{
+	struct sp_node *node;
+
+	node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	if (node) {
+		mpol_get(pol);
+		pol->flags |= MPOL_F_SHARED;	/* for unref */
+		node->start = start;
+		node->end = end;
+		node->policy = pol;
+	}
+
+	return node;
+}
+
+/*
+ * Replace a policy range.
+ *
+ * Clears [start, end) of existing nodes in @sp -- deleting nodes that lie
+ * entirely inside the range, trimming nodes that overlap one edge, and
+ * splitting a node that spans the whole range -- and then inserts @new if
+ * it is non-NULL.
+ *
+ * Returns 0 on success, or -ENOMEM if the extra node needed for a split
+ * could not be allocated; in that case @new has not been inserted.
+ */
+static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
+ unsigned long end, struct sp_node *new)
+{
+ struct sp_node *n, *new2 = NULL;
+
+restart:
+ spin_lock(&sp->lock);
+ n = sp_lookup(sp, start, end);
+ /* Take care of old policies in the same range. */
+ while (n && n->start < end) {
+ struct rb_node *next = rb_next(&n->nd); /* fetch the successor first: n may be freed by sp_delete() below */
+ if (n->start >= start) {
+ if (n->end <= end)
+ sp_delete(sp, n);
+ else
+ n->start = end;
+ } else {
+ /* Old policy spanning whole new range. */
+ if (n->end > end) {
+ if (!new2) {
+ spin_unlock(&sp->lock); /* sp_alloc() allocates with GFP_KERNEL, so the spinlock is dropped first */
+ new2 = sp_alloc(end, n->end, n->policy); /* NOTE(review): n is read here after sp->lock was released -- confirm nothing can free or change it concurrently */
+ if (!new2)
+ return -ENOMEM;
+ goto restart; /* the tree was unlocked meanwhile: redo the lookup from scratch */
+ }
+ n->end = start;
+ sp_insert(sp, new2);
+ new2 = NULL;
+ break;
+ } else
+ n->end = start;
+ }
+ if (!next)
+ break;
+ n = rb_entry(next, struct sp_node, nd);
+ }
+ if (new)
+ sp_insert(sp, new);
+ spin_unlock(&sp->lock);
+ if (new2) { /* a split node was preallocated but not used after the restart */
+ mpol_put(new2->policy);
+ kmem_cache_free(sn_cache, new2);
+ }
+ return 0;
+}
+
+/**
+ * mpol_shared_policy_init - initialize shared policy for inode
+ * @sp: pointer to inode shared policy
+ * @mpol:  struct mempolicy to install, or NULL for none
+ *
+ * Initializes @sp's tree and lock, then installs a contextualized copy of a
+ * non-NULL @mpol covering the whole file.
+ * On entry, the current task holds a reference on a non-NULL @mpol; that
+ * reference is released here.
+ */
+void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
+{
+	struct vm_area_struct pvma;
+	struct mempolicy *new;
+
+	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
+	spin_lock_init(&sp->lock);
+
+	if (!mpol)
+		return;
+
+	/* contextualize the tmpfs mount point mempolicy */
+	new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+	mpol_put(mpol);	/* drop our ref on sb mpol */
+	if (IS_ERR(new))
+		return;		/* no valid nodemask intersection */
+
+	/* Build a pseudo-vma that carries nothing but the range to cover */
+	memset(&pvma, 0, sizeof(pvma));
+	pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
+	mpol_set_shared_policy(sp, &pvma, new);	/* adds ref */
+	mpol_put(new);			/* drop initial ref */
+}
+
+/*
+ * Set (or, with a NULL @npol, clear) the shared policy for the page offset
+ * range described by @vma: [vm_pgoff, vm_pgoff + vma_pages(vma)).
+ *
+ * A tree node referring to @npol is allocated here and takes its own
+ * reference on the policy; the caller keeps its reference.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+int mpol_set_shared_policy(struct shared_policy *info,
+			struct vm_area_struct *vma, struct mempolicy *npol)
+{
+	int err;
+	struct sp_node *new = NULL;
+	unsigned long sz = vma_pages(vma);
+
+	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
+		 vma->vm_pgoff,
+		 sz, npol ? npol->mode : -1,
+		 npol ? npol->flags : -1,
+		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
+
+	if (npol) {
+		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+		if (!new)
+			return -ENOMEM;
+	}
+	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+	if (err && new) {
+		/*
+		 * The node never made it into the tree.  sp_alloc() took a
+		 * reference on the policy for it; drop that reference along
+		 * with the node, or it is leaked.
+		 */
+		mpol_put(new->policy);
+		kmem_cache_free(sn_cache, new);
+	}
+	return err;
+}
+
+/*
+ * Free a backing policy store on inode delete: every node is unlinked, its
+ * policy reference dropped and the node freed.
+ */
+void mpol_free_shared_policy(struct shared_policy *p)
+{
+	struct rb_node *cur, *next;
+
+	if (!p->root.rb_node)
+		return;
+
+	spin_lock(&p->lock);
+	for (cur = rb_first(&p->root); cur; cur = next) {
+		struct sp_node *n = rb_entry(cur, struct sp_node, nd);
+
+		/* Pick up the successor before the node goes away */
+		next = rb_next(cur);
+		rb_erase(cur, &p->root);
+		mpol_put(n->policy);
+		kmem_cache_free(sn_cache, n);
+	}
+	spin_unlock(&p->lock);
+}
+
+/*
+ * Boot-time setup: create the slab caches for mempolicies and shared policy
+ * nodes, then switch the current task to an interleave policy across all
+ * suitably sized nodes.
+ *
+ * assumes fs == KERNEL_DS
+ */
+void __init numa_policy_init(void)
+{
+	nodemask_t interleave_nodes;
+	unsigned long largest = 0;
+	int nid, prefer = 0;
+
+	policy_cache = kmem_cache_create("numa_policy",
+					 sizeof(struct mempolicy),
+					 0, SLAB_PANIC, NULL);
+
+	sn_cache = kmem_cache_create("shared_policy_node",
+				     sizeof(struct sp_node),
+				     0, SLAB_PANIC, NULL);
+
+	/*
+	 * Set interleaving policy for system init. Interleaving is only
+	 * enabled across suitably sized nodes (default is >= 16MB), or
+	 * fall back to the largest node if they're all smaller.
+	 */
+	nodes_clear(interleave_nodes);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long total_pages = node_present_pages(nid);
+
+		/* Preserve the largest node */
+		if (largest < total_pages) {
+			largest = total_pages;
+			prefer = nid;
+		}
+
+		/*
+		 * Interleave this node?  The threshold is compared in pages:
+		 * converting the node size to bytes instead
+		 * (total_pages << PAGE_SHIFT) can wrap an unsigned long on
+		 * 32-bit for a node of 4GB or more, which would wrongly
+		 * exclude the largest nodes.
+		 */
+		if (total_pages >= ((16UL << 20) >> PAGE_SHIFT))
+			node_set(nid, interleave_nodes);
+	}
+
+	/* All too small, use the largest */
+	if (unlikely(nodes_empty(interleave_nodes)))
+		node_set(prefer, interleave_nodes);
+
+	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
+		printk("numa_policy_init: interleaving failed\n");
+}
+
+/*
+ * Reset the policy of the current process to the default.
+ * Any error from do_set_mempolicy() is ignored.
+ */
+void numa_default_policy(void)
+{
+	(void)do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
+}
+
+/*
+ * Parse and format mempolicy from/to strings
+ */
+
+/*
+ * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
+ * Used only for mpol_parse_str() and mpol_to_str()
+ *
+ * policy_types[] is indexed by mode value: mpol_parse_str() takes the index
+ * of the matching string (0..MPOL_LOCAL) as the mode, and mpol_to_str()
+ * prints policy_types[mode].  The strings must therefore stay in the order
+ * of the numeric MPOL_* mode values, with "local" in the MPOL_LOCAL slot
+ * right after the interleave entry.
+ */
+#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
+static const char * const policy_types[] =
+ { "default", "prefer", "bind", "interleave", "local" };
+
+
+#ifdef CONFIG_TMPFS
+/**
+ * mpol_parse_str - parse string to mempolicy
+ * @str:  string containing mempolicy to parse
+ * @mpol:  pointer to struct mempolicy pointer, returned on success.
+ * @no_context:  flag whether to "contextualize" the mempolicy
+ *
+ * Format of input:
+ *	<mode>[=<flags>][:<nodelist>]
+ *
+ * if @no_context is true, save the input nodemask in w.user_nodemask in
+ * the returned mempolicy.  This will be used to "clone" the mempolicy in
+ * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
+ * mount option.  Note that if 'static' or 'relative' mode flags were
+ * specified, the input nodemask will already have been saved.  Saving
+ * it again is redundant, but safe.
+ *
+ * @str is split in place while parsing and restored before returning, so
+ * that the caller can still print it in an error message.
+ *
+ * On success, returns 0 and stores the result of mpol_new() in *@mpol,
+ * else returns 1 and leaves *@mpol untouched.  No policy object is
+ * allocated on any failure path.
+ */
+int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+{
+	struct mempolicy *new = NULL;
+	unsigned short uninitialized_var(mode);
+	unsigned short mode_flags = 0;
+	nodemask_t nodes;
+	char *nodelist = strchr(str, ':');
+	char *flags_sep = strchr(str, '=');
+	char *flags = NULL;
+	int i;
+	int err = 1;
+
+	if (nodelist) {
+		/* NUL-terminate mode or flags string */
+		*nodelist++ = '\0';
+		if (nodelist_parse(nodelist, nodes))
+			goto out;
+		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
+			goto out;
+	} else
+		nodes_clear(nodes);
+
+	/*
+	 * @flags only becomes non-NULL once the '=' has really been
+	 * overwritten, so the restore at "out" never writes into a string
+	 * that was not split (the nodelist checks above may bail out first).
+	 */
+	if (flags_sep) {
+		*flags_sep = '\0';	/* terminate mode string */
+		flags = flags_sep + 1;
+	}
+
+	for (i = 0; i <= MPOL_LOCAL; i++) {
+		if (!strcmp(str, policy_types[i])) {
+			mode = i;
+			break;
+		}
+	}
+	if (i > MPOL_LOCAL)
+		goto out;
+
+	switch (mode) {
+	case MPOL_PREFERRED: {
+		/*
+		 * Insist on a nodelist of one node only
+		 */
+		char *rest;
+
+		if (!nodelist)
+			goto out;
+		rest = nodelist;
+		while (isdigit(*rest))
+			rest++;
+		if (*rest)
+			goto out;
+		break;
+	}
+	case MPOL_INTERLEAVE:
+		/*
+		 * Default to online nodes with memory if no nodelist
+		 */
+		if (!nodelist)
+			nodes = node_states[N_HIGH_MEMORY];
+		break;
+	case MPOL_LOCAL:
+		/*
+		 * Don't allow a nodelist;  mpol_new() checks flags
+		 */
+		if (nodelist)
+			goto out;
+		mode = MPOL_PREFERRED;
+		break;
+
+	/*
+	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
+	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
+	 */
+	}
+
+	if (flags) {
+		/*
+		 * Currently, we only support two mutually exclusive
+		 * mode flags.
+		 */
+		if (!strcmp(flags, "static"))
+			mode_flags |= MPOL_F_STATIC_NODES;
+		else if (!strcmp(flags, "relative"))
+			mode_flags |= MPOL_F_RELATIVE_NODES;
+		else
+			goto out;	/* fail before allocating anything */
+	}
+
+	new = mpol_new(mode, mode_flags, &nodes);
+	if (IS_ERR(new))
+		goto out;
+
+	/*
+	 * NOTE(review): mpol_new() is not visible from here; it presumably
+	 * returns NULL rather than an object for the default mode -- confirm.
+	 * Guard the store so a NULL result is passed through, not dereferenced.
+	 */
+	if (no_context && new)
+		new->w.user_nodemask = nodes;	/* save for contextualization */
+
+	/* Every mode that got this far was accepted by mpol_new() */
+	err = 0;
+
+out:
+	/* Restore string for error message */
+	if (nodelist)
+		*--nodelist = ':';
+	if (flags)
+		*--flags = '=';
+	if (!err)
+		*mpol = new;
+	return err;
+}
+#endif /* CONFIG_TMPFS */
+
+/**
+ * mpol_to_str - format a mempolicy structure for printing
+ * @buffer:  to contain formatted mempolicy string
+ * @maxlen:  length of @buffer
+ * @pol:  pointer to mempolicy to be formatted
+ * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
+ *
+ * Convert a mempolicy into a string of the form
+ *	<mode>[=<flags>][:<nodelist>]
+ * A NULL @pol or the system default policy is printed as "default".
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+{
+	char *const end = buffer + maxlen;
+	char *cur = buffer;
+	unsigned short flags = pol ? pol->flags : 0;
+	unsigned short mode;
+	nodemask_t nodes;
+	int len;
+
+	/*
+	 * Sanity check:  room for longest mode, flag and some nodes
+	 */
+	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+
+	mode = (!pol || pol == &default_policy) ? MPOL_DEFAULT : pol->mode;
+
+	/* Work out which nodes (if any) to print for this mode */
+	nodes_clear(nodes);
+	switch (mode) {
+	case MPOL_DEFAULT:
+		break;
+
+	case MPOL_PREFERRED:
+		if (flags & MPOL_F_LOCAL)
+			mode = MPOL_LOCAL;	/* pseudo-policy */
+		else
+			node_set(pol->v.preferred_node, nodes);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		nodes = no_context ? pol->w.user_nodemask : pol->v.nodes;
+		break;
+
+	default:
+		BUG();
+	}
+
+	/* <mode> */
+	len = strlen(policy_types[mode]);
+	if (end < cur + len + 1)
+		return -ENOSPC;
+	strcpy(cur, policy_types[mode]);
+	cur += len;
+
+	/* [=<flags>] */
+	if (flags & MPOL_MODE_FLAGS) {
+		if (end < cur + 2)
+			return -ENOSPC;
+		*cur++ = '=';
+
+		/*
+		 * Currently, the only defined flags are mutually exclusive
+		 */
+		if (flags & MPOL_F_STATIC_NODES)
+			cur += snprintf(cur, end - cur, "static");
+		else if (flags & MPOL_F_RELATIVE_NODES)
+			cur += snprintf(cur, end - cur, "relative");
+	}
+
+	/* [:<nodelist>] */
+	if (!nodes_empty(nodes)) {
+		if (end < cur + 2)
+			return -ENOSPC;
+		*cur++ = ':';
+		cur += nodelist_scnprintf(cur, end - cur, nodes);
+	}
+
+	return cur - buffer;
+}
+
+struct numa_maps { /* per-VMA page counters filled in by gather_stats() and printed by show_numa_map() */
+ unsigned long pages; /* every page passed to gather_stats() */
+ unsigned long anon; /* pages for which PageAnon() is true */
+ unsigned long active; /* pages that are PageActive() or PageUnevictable() */
+ unsigned long writeback; /* pages for which PageWriteback() is true */
+ unsigned long mapcount_max; /* largest page_mapcount() seen */
+ unsigned long dirty; /* pages with a dirty pte or PageDirty() */
+ unsigned long swapcache; /* pages for which PageSwapCache() is true */
+ unsigned long node[MAX_NUMNODES]; /* page count per node id, indexed by page_to_nid() */
+};
+
+/*
+ * Account one page in the struct numa_maps passed as @private.
+ * @pte_dirty is the dirty state of the pte that maps the page.
+ */
+static void gather_stats(struct page *page, void *private, int pte_dirty)
+{
+	struct numa_maps *md = private;
+	int count = page_mapcount(page);
+
+	md->pages++;
+	md->node[page_to_nid(page)]++;
+
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
+	/* The remaining counters are independent of each other */
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
+	if (PageSwapCache(page))
+		md->swapcache++;
+	if (PageActive(page) || PageUnevictable(page))
+		md->active++;
+	if (PageWriteback(page))
+		md->writeback++;
+	if (PageAnon(page))
+		md->anon++;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Walk [start, end) of a hugetlb @vma in huge page sized steps and feed
+ * every mapped huge page to gather_stats().
+ */
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned long step = huge_page_size(h);
+	unsigned long addr = start;
+
+	while (addr < end) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm,
+						addr & huge_page_mask(h));
+
+		if (ptep) {
+			pte_t pte = *ptep;
+
+			if (!pte_none(pte)) {
+				struct page *page = pte_page(pte);
+
+				if (page)
+					gather_stats(page, md,
+							pte_dirty(*ptep));
+			}
+		}
+		addr += step;
+	}
+}
+#else
+/* Without hugetlb page support there is nothing to account */
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ *
+ * seq_file show routine: @v is the VMA to describe and @m->private the
+ * proc_maps_private of the task being inspected.  One line is emitted per
+ * VMA: start address, policy string, an optional "file=", "heap" or "stack"
+ * tag, "huge" for hugetlb VMAs, and then the non-zero page statistics
+ * collected into a temporary struct numa_maps.
+ *
+ * Always returns 0.
+ */
+int show_numa_map(struct seq_file *m, void *v)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma = v;
+ struct numa_maps *md;
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mempolicy *pol;
+ int n;
+ char buffer[50];
+
+ if (!mm)
+ return 0; /* a VMA without an mm produces no output */
+
+ md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
+ if (!md)
+ return 0; /* allocation failure: the VMA is silently skipped, no error is reported */
+
+ pol = get_vma_policy(priv->task, vma, vma->vm_start);
+ mpol_to_str(buffer, sizeof(buffer), pol, 0); /* return value (length or negative error) is not checked */
+ mpol_cond_put(pol); /* release the extra reference a shared policy lookup may carry */
+
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+ if (file) {
+ seq_printf(m, " file=");
+ seq_path(m, &file->f_path, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_printf(m, " heap");
+ } else if (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack) {
+ seq_printf(m, " stack");
+ }
+
+ if (is_vm_hugetlb_page(vma)) {
+ check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+ seq_printf(m, " huge");
+ } else {
+ check_pgd_range(vma, vma->vm_start, vma->vm_end,
+ &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
+ }
+
+ if (!md->pages)
+ goto out; /* nothing was counted: print no statistics */
+
+ if (md->anon)
+ seq_printf(m," anon=%lu",md->anon);
+
+ if (md->dirty)
+ seq_printf(m," dirty=%lu",md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m," swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m," active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m," writeback=%lu", md->writeback);
+
+ for_each_node_state(n, N_HIGH_MEMORY)
+ if (md->node[n])
+ seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+ seq_putc(m, '\n');
+ kfree(md);
+
+ if (m->count < m->size) /* NOTE(review): presumably only records the position when the line fit into the seq_file buffer -- confirm against the seq_file iterator */
+ m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
+ return 0;
+}
diff --git a/mm/mempool.c b/mm/mempool.c
new file mode 100644
index 0000000..a46eb1b
--- /dev/null
+++ b/mm/mempool.c
@@ -0,0 +1,340 @@
+/*
+ * linux/mm/mempool.c
+ *
+ * memory buffer pool support. Such pools are mostly used
+ * for guaranteed, deadlock-free memory allocations during
+ * extreme VM load.
+ *
+ * started by Ingo Molnar, Copyright (C) 2001
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+
+/* Store @element in the next free slot of @pool; BUGs if the pool is full. */
+static void add_element(mempool_t *pool, void *element)
+{
+	int slot = pool->curr_nr;
+
+	BUG_ON(slot >= pool->min_nr);
+	pool->elements[slot] = element;
+	pool->curr_nr = slot + 1;
+}
+
+/* Take the most recently stored element out of @pool; BUGs if it is empty. */
+static void *remove_element(mempool_t *pool)
+{
+	int last;
+
+	BUG_ON(pool->curr_nr <= 0);
+	last = pool->curr_nr - 1;
+	pool->curr_nr = last;
+	return pool->elements[last];
+}
+
+/*
+ * Release every element still held by @pool through the user's free
+ * callback, then free the slot array and the pool descriptor itself.
+ */
+static void free_pool(mempool_t *pool)
+{
+	while (pool->curr_nr != 0) {
+		void *victim = remove_element(pool);
+
+		pool->free(victim, pool->pool_data);
+	}
+
+	kfree(pool->elements);
+	kfree(pool);
+}
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * this function creates and allocates a guaranteed size, preallocated
+ * memory pool. The pool can be used from the mempool_alloc() and mempool_free()
+ * functions. This function might sleep. Both the alloc_fn() and the free_fn()
+ * functions might sleep - as long as the mempool_alloc() function is not called
+ * from IRQ contexts.
+ */
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+			  mempool_free_t *free_fn, void *pool_data)
+{
+	/* Thin wrapper: node id -1 is passed through to mempool_create_node(). */
+	mempool_t *pool;
+
+	pool = mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, -1);
+	return pool;
+}
+EXPORT_SYMBOL(mempool_create);
+
+/*
+ * Create a memory pool, allocating the pool descriptor and its element
+ * array with kmalloc_node() on @node_id.
+ *
+ * @min_nr:    number of elements to preallocate and keep in reserve.
+ * @alloc_fn:  element-allocation callback.
+ * @free_fn:   element-freeing callback.
+ * @pool_data: private data handed to both callbacks.
+ * @node_id:   node id passed to kmalloc_node().
+ *
+ * Returns the new pool, or NULL if the descriptor, the element array or
+ * any of the @min_nr preallocated elements could not be allocated.
+ */
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data, int node_id)
+{
+ mempool_t *pool;
+ /* __GFP_ZERO: curr_nr starts at 0, which the fill loop below relies on. */
+ pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id);
+ if (!pool)
+ return NULL;
+ pool->elements = kmalloc_node(min_nr * sizeof(void *),
+ GFP_KERNEL, node_id);
+ if (!pool->elements) {
+ kfree(pool);
+ return NULL;
+ }
+ spin_lock_init(&pool->lock);
+ pool->min_nr = min_nr;
+ pool->pool_data = pool_data;
+ init_waitqueue_head(&pool->wait);
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+ */
+ while (pool->curr_nr < pool->min_nr) {
+ void *element;
+
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
+ if (unlikely(!element)) {
+ /* free_pool() also releases the elements added so far. */
+ free_pool(pool);
+ return NULL;
+ }
+ add_element(pool, element);
+ }
+ return pool;
+}
+EXPORT_SYMBOL(mempool_create_node);
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @gfp_mask: the usual allocation bitmask.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
+{
+ void *element;
+ void **new_elements;
+ unsigned long flags;
+
+ BUG_ON(new_min_nr <= 0);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (new_min_nr <= pool->min_nr) {
+ /*
+ * Shrink (or no-op): release surplus elements one at a time,
+ * dropping the lock around each call to the free callback.
+ */
+ while (new_min_nr < pool->curr_nr) {
+ element = remove_element(pool);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ pool->free(element, pool->pool_data);
+ spin_lock_irqsave(&pool->lock, flags);
+ }
+ pool->min_nr = new_min_nr;
+ goto out_unlock;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /* Grow the pool */
+ /* The new slot array is allocated without holding pool->lock. */
+ new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+ if (!new_elements)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (unlikely(new_min_nr <= pool->min_nr)) {
+ /* Raced, other resize will do our work */
+ spin_unlock_irqrestore(&pool->lock, flags);
+ kfree(new_elements);
+ goto out;
+ }
+ /* Carry the currently held elements over into the larger array. */
+ memcpy(new_elements, pool->elements,
+ pool->curr_nr * sizeof(*new_elements));
+ kfree(pool->elements);
+ pool->elements = new_elements;
+ pool->min_nr = new_min_nr;
+
+ /*
+ * Try to fill the new slots. The lock is dropped around each
+ * allocation; a failed allocation ends the function with return
+ * value 0 and leaves the pool below min_nr.
+ */
+ while (pool->curr_nr < pool->min_nr) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (!element)
+ goto out;
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->curr_nr < pool->min_nr) {
+ add_element(pool, element);
+ } else {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ pool->free(element, pool->pool_data); /* Raced */
+ goto out;
+ }
+ }
+out_unlock:
+ spin_unlock_irqrestore(&pool->lock, flags);
+out:
+ return 0;
+}
+EXPORT_SYMBOL(mempool_resize);
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps. The caller
+ * has to guarantee that all elements have been returned to the pool (ie:
+ * freed) prior to calling mempool_destroy().
+ */
+void mempool_destroy(mempool_t *pool)
+{
+	/*
+	 * Tolerate a NULL pool, like kfree(NULL), so that error-unwind
+	 * paths can call mempool_destroy() unconditionally instead of
+	 * dereferencing a NULL pointer here.
+	 */
+	if (unlikely(!pool))
+		return;
+
+	/* Check for outstanding elements */
+	BUG_ON(pool->curr_nr != pool->min_nr);
+	free_pool(pool);
+}
+EXPORT_SYMBOL(mempool_destroy);
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @gfp_mask: the usual allocation bitmask.
+ *
+ * this function only sleeps if the alloc_fn() function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process contexts. (it might
+ * fail if called from an IRQ context.)
+ */
+void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+{
+ void *element;
+ unsigned long flags;
+ wait_queue_t wait;
+ gfp_t gfp_temp;
+
+ might_sleep_if(gfp_mask & __GFP_WAIT);
+
+ gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
+ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
+ gfp_mask |= __GFP_NOWARN; /* failures are OK */
+
+ /* The first attempt runs with __GFP_WAIT and __GFP_IO masked off. */
+ gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
+
+repeat_alloc:
+
+ element = pool->alloc(gfp_temp, pool->pool_data);
+ if (likely(element != NULL))
+ return element;
+
+ /* The allocator failed: fall back to an element held by the pool. */
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr)) {
+ element = remove_element(pool);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return element;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /* We must not sleep in the GFP_ATOMIC case */
+ if (!(gfp_mask & __GFP_WAIT))
+ return NULL;
+
+ /* Now start performing page reclaim */
+ /* Later attempts use the caller's full mask (plus the flags set above). */
+ gfp_temp = gfp_mask;
+ init_wait(&wait);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+ smp_mb();
+ /* Re-check after queueing on pool->wait; sleep only if still empty. */
+ if (!pool->curr_nr) {
+ /*
+ * FIXME: this should be io_schedule(). The timeout is there
+ * as a workaround for some DM problems in 2.6.18.
+ */
+ io_schedule_timeout(5*HZ);
+ }
+ finish_wait(&pool->wait, &wait);
+
+ goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element: pool element pointer.
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+ unsigned long flags;
+
+ /* Freeing NULL is a no-op. */
+ if (unlikely(element == NULL))
+ return;
+
+ smp_mb();
+ /*
+ * Unlocked peek first, then re-check under pool->lock before the
+ * element is actually put back into the pool.
+ */
+ if (pool->curr_nr < pool->min_nr) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->curr_nr < pool->min_nr) {
+ add_element(pool, element);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ /* A waiter on pool->wait may now be able to proceed. */
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+ /* Pool is already at min_nr: give the element to the free callback. */
+ pool->free(element, pool->pool_data);
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ */
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
+{
+	/* pool_data carries the kmem_cache the elements are taken from. */
+	return kmem_cache_alloc((struct kmem_cache *)pool_data, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+	/* pool_data carries the kmem_cache the element is returned to. */
+	kmem_cache_free((struct kmem_cache *)pool_data, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	/* The element size is smuggled in through the pool_data pointer. */
+	return kmalloc((size_t)(long)pool_data, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+/* Like mempool_kmalloc(), but the element is allocated with kzalloc(). */
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
+{
+	return kzalloc((size_t) pool_data, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kzalloc);
+
+/* Free callback that simply kfree()s @element; @pool_data is not used. */
+void mempool_kfree(void *element, void *pool_data)
+{
+ kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+	/* The allocation order is encoded in the pool_data pointer value. */
+	return alloc_pages(gfp_mask, (int)(long)pool_data);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+	/* The allocation order is encoded in the pool_data pointer value. */
+	__free_pages(element, (int)(long)pool_data);
+}
+EXPORT_SYMBOL(mempool_free_pages);
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 0000000..7fc57cc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,1151 @@
+/*
+ * Memory Migration functionality - linux/mm/migrate.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter
+ */
+
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/mm_inline.h>
+#include <linux/nsproxy.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
+#include <linux/security.h>
+#include <linux/memcontrol.h>
+#include <linux/syscalls.h>
+
+#include "internal.h"
+
+/* The struct page whose lru member is the entry just before @_head. */
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page().
+ */
+int migrate_prep(void)
+{
+ /*
+ * Clear the LRU lists so pages can be isolated.
+ * Note that pages may be moved off the LRU after we have
+ * drained them. Those pages will fail to migrate like other
+ * pages that may be busy.
+ */
+ lru_add_drain_all();
+
+ /* Always succeeds. */
+ return 0;
+}
+
+/*
+ * Add isolated pages on the list back to the LRU under page lock
+ * to avoid leaking evictable pages back onto unevictable list.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	int nr_putback = 0;
+
+	/* Pop pages off the front of the list until it is empty. */
+	while (!list_empty(l)) {
+		struct page *victim = list_entry(l->next, struct page, lru);
+
+		list_del(&victim->lru);
+		putback_lru_page(victim);
+		nr_putback++;
+	}
+
+	return nr_putback;
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+static void remove_migration_pte(struct vm_area_struct *vma,
+ struct page *old, struct page *new)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ swp_entry_t entry;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ unsigned long addr = page_address_in_vma(new, vma);
+
+ /* page_address_in_vma() found no address for the page in this vma. */
+ if (addr == -EFAULT)
+ return;
+
+ /* Walk the page tables; give up silently if any level is absent. */
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return;
+
+ ptep = pte_offset_map(pmd, addr);
+
+ /* Unlocked peek; the pte is checked again below under the pte lock. */
+ if (!is_swap_pte(*ptep)) {
+ pte_unmap(ptep);
+ return;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ pte = *ptep;
+ if (!is_swap_pte(pte))
+ goto out;
+
+ entry = pte_to_swp_entry(pte);
+
+ /* Only touch migration entries that refer to the old page. */
+ if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+ goto out;
+
+ /*
+ * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
+ * Failure is not an option here: we're now expected to remove every
+ * migration pte, and will cause crashes otherwise. Normally this
+ * is not an issue: mem_cgroup_prepare_migration bumped up the old
+ * page_cgroup count for safety, that's now attached to the new page,
+ * so this charge should just be another incrementation of the count,
+ * to keep in balance with rmap.c's mem_cgroup_uncharging. But if
+ * there's been a force_empty, those reference counts may no longer
+ * be reliable, and this charge can actually fail: oh well, we don't
+ * make the situation any worse by proceeding as if it had succeeded.
+ */
+ mem_cgroup_charge(new, mm, GFP_ATOMIC);
+
+ /* Build a present pte for the new page; the mapping takes a reference. */
+ get_page(new);
+ pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+ /* Carry over write permission recorded in the migration entry. */
+ if (is_write_migration_entry(entry))
+ pte = pte_mkwrite(pte);
+ flush_cache_page(vma, addr, pte_pfn(pte));
+ set_pte_at(mm, addr, ptep, pte);
+
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, addr);
+ else
+ page_add_file_rmap(new);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, addr, pte);
+
+out:
+ pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Note that remove_file_migration_ptes will only work on regular mappings,
+ * Nonlinear mappings do not use migration entries.
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+ struct vm_area_struct *vma;
+ struct address_space *mapping = page_mapping(new);
+ struct prio_tree_iter iter;
+ pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+ /* No address space for the new page: nothing to walk. */
+ if (!mapping)
+ return;
+
+ spin_lock(&mapping->i_mmap_lock);
+
+ /* Visit every vma in i_mmap that covers the page's file offset. */
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+ remove_migration_pte(vma, old, new);
+
+ spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * Must hold mmap_sem lock on at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
+ */
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
+{
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+ unsigned long mapping;
+
+ mapping = (unsigned long)new->mapping;
+
+ /* Nothing to do unless new->mapping carries the PAGE_MAPPING_ANON flag. */
+ if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+ return;
+
+ /*
+ * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+ */
+ /* Subtract the flag to recover the anon_vma pointer. */
+ anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+ spin_lock(&anon_vma->lock);
+
+ /* Visit every vma linked on this anon_vma. */
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+ remove_migration_pte(vma, old, new);
+
+ spin_unlock(&anon_vma->lock);
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	/* Dispatch on the kind of the new page: file-backed or anonymous. */
+	if (!PageAnon(new)) {
+		remove_file_migration_ptes(old, new);
+		return;
+	}
+
+	remove_anon_migration_ptes(old, new);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ *
+ * This function is called from do_swap_page().
+ */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ swp_entry_t entry;
+ struct page *page;
+
+ /* Re-read the pte under its lock; it may no longer be a migration entry. */
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = *ptep;
+ if (!is_swap_pte(pte))
+ goto out;
+
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto out;
+
+ page = migration_entry_to_page(entry);
+
+ /*
+ * Once radix-tree replacement of page migration started, page_count
+ * *must* be zero. And, we don't want to call wait_on_page_locked()
+ * against a page without get_page().
+ * So, we use get_page_unless_zero(), here. Even failed, page fault
+ * will occur again.
+ */
+ if (!get_page_unless_zero(page))
+ goto out;
+ /* Drop the pte lock before waiting for the page lock to be released. */
+ pte_unmap_unlock(ptep, ptl);
+ wait_on_page_locked(page);
+ put_page(page);
+ return;
+out:
+ pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Replace the page in the mapping.
+ *
+ * The number of remaining references must be:
+ * 1 for anonymous pages without a mapping
+ * 2 for pages with a mapping
+ * 3 for pages with a mapping and PagePrivate set.
+ */
+/*
+ * Returns 0 on success, or -EAGAIN when the reference count is not the
+ * expected one, the radix tree slot no longer holds @page, or the
+ * references could not be frozen.
+ */
+static int migrate_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int expected_count;
+ void **pslot;
+
+ if (!mapping) {
+ /* Anonymous page without mapping */
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return 0;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
+
+ /* 2 references, plus one more when PagePrivate is set. */
+ expected_count = 2 + !!PagePrivate(page);
+ if (page_count(page) != expected_count ||
+ (struct page *)radix_tree_deref_slot(pslot) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
+ * Now we know that no one else is looking at the page.
+ */
+ get_page(newpage); /* add cache reference */
+#ifdef CONFIG_SWAP
+ /* A swap cache page hands its flag and private value to the new page. */
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+#endif
+
+ radix_tree_replace_slot(pslot, newpage);
+
+ page_unfreeze_refs(page, expected_count);
+ /*
+ * Drop cache reference from old page.
+ * We know this isn't the last reference.
+ */
+ __put_page(page);
+
+ /*
+ * If moved to a different zone then also account
+ * the page for that zone. Other VM counters will be
+ * taken care of when we establish references to the
+ * new page and drop references to the old page.
+ *
+ * Note that anonymous pages are accounted for
+ * via NR_FILE_PAGES and NR_ANON_PAGES if they
+ * are mapped to swap space.
+ */
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ return 0;
+}
+
+/*
+ * Copy the page to its new location
+ */
+static void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ int anon;
+
+ copy_highpage(newpage, page);
+
+ /* Propagate the state flags of the old page to the new one. */
+ if (PageError(page))
+ SetPageError(newpage);
+ if (PageReferenced(page))
+ SetPageReferenced(newpage);
+ if (PageUptodate(page))
+ SetPageUptodate(newpage);
+ if (TestClearPageActive(page)) {
+ VM_BUG_ON(PageUnevictable(page));
+ SetPageActive(newpage);
+ } else
+ unevictable_migrate_page(newpage, page);
+ if (PageChecked(page))
+ SetPageChecked(newpage);
+ if (PageMappedToDisk(page))
+ SetPageMappedToDisk(newpage);
+
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ /*
+ * Want to mark the page and the radix tree as dirty, and
+ * redo the accounting that clear_page_dirty_for_io undid,
+ * but we can't use set_page_dirty because that function
+ * is actually a signal that all of the page has become dirty.
+ * Whereas only part of our page may be dirty.
+ */
+ __set_page_dirty_nobuffers(newpage);
+ }
+
+ mlock_migrate_page(newpage, page);
+
+ /* Strip the old page of swap cache flag, private data and mapping. */
+#ifdef CONFIG_SWAP
+ ClearPageSwapCache(page);
+#endif
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* page->mapping contains a flag for PageAnon() */
+ anon = PageAnon(page);
+ page->mapping = NULL;
+
+ if (!anon) /* This page was removed from radix-tree. */
+ mem_cgroup_uncharge_cache_page(page);
+
+ /*
+ * If any waiters have accumulated on the new page then
+ * wake them up.
+ */
+ if (PageWriteback(newpage))
+ end_page_writeback(newpage);
+}
+
+/************************************************************
+ * Migration functions
+ ***********************************************************/
+
+/* Always fail migration. Used for mappings that are not movable */
+int fail_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ /* All arguments are ignored; the result is unconditionally -EIO. */
+ return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page)
+{
+	int err;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(page));
+
+	/* Copy contents and state only once the mapping has been switched. */
+	err = migrate_page_move_mapping(mapping, newpage, page);
+	if (!err)
+		migrate_page_copy(newpage, page);
+
+	return err;
+}
+EXPORT_SYMBOL(migrate_page);
+
+#ifdef CONFIG_BLOCK
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ struct buffer_head *bh, *head;
+ int rc;
+
+ /* Without buffers this is just a plain page migration. */
+ if (!page_has_buffers(page))
+ return migrate_page(mapping, newpage, page);
+
+ head = page_buffers(page);
+
+ rc = migrate_page_move_mapping(mapping, newpage, page);
+
+ if (rc)
+ return rc;
+
+ /* Take a reference on, and lock, every buffer in the page's ring. */
+ bh = head;
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ /* Move the private value, and one page reference, to the new page. */
+ ClearPagePrivate(page);
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ put_page(page);
+ get_page(newpage);
+
+ /* Point every buffer at the new page, keeping its offset. */
+ bh = head;
+ do {
+ set_bh_page(bh, newpage, bh_offset(bh));
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ SetPagePrivate(newpage);
+
+ migrate_page_copy(newpage, page);
+
+ /* Undo the locking and the references taken above. */
+ bh = head;
+ do {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+#endif
+
+/*
+ * Writeback a page to clean the dirty state
+ */
+/*
+ * Never returns 0: the result is -EINVAL when the mapping has no
+ * writepage method, -EIO when writepage reports an error, and -EAGAIN
+ * otherwise.
+ */
+static int writeout(struct address_space *mapping, struct page *page)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1
+ };
+ int rc;
+
+ if (!mapping->a_ops->writepage)
+ /* No write method for the address space */
+ return -EINVAL;
+
+ if (!clear_page_dirty_for_io(page))
+ /* Someone else already triggered a write */
+ return -EAGAIN;
+
+ /*
+ * A dirty page may imply that the underlying filesystem has
+ * the page on some queue. So the page must be clean for
+ * migration. Writeout may mean we lose the lock and the
+ * page state is no longer what we checked for earlier.
+ * At this point we know that the migration attempt cannot
+ * be successful.
+ */
+ remove_migration_ptes(page, page);
+
+ rc = mapping->a_ops->writepage(page, &wbc);
+
+ if (rc != AOP_WRITEPAGE_ACTIVATE)
+ /* unlocked. Relock */
+ lock_page(page);
+
+ return (rc < 0) ? -EIO : -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ /* Dirty pages are handed to writeout(); its result is returned as is. */
+ if (PageDirty(page))
+ return writeout(mapping, page);
+
+ /*
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (PagePrivate(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
+ return -EAGAIN;
+
+ return migrate_page(mapping, newpage, page);
+}
+
+/*
+ * Move a page to a newly allocated page
+ * The page is locked and all ptes have been successfully removed.
+ *
+ * The new page will have replaced the old page if this function
+ * is successful.
+ *
+ * Return value:
+ * < 0 - error code
+ * == 0 - success
+ */
+static int move_to_new_page(struct page *newpage, struct page *page)
+{
+ struct address_space *mapping;
+ int rc;
+
+ /*
+ * Block others from accessing the page when we get around to
+ * establishing additional references. We are the only one
+ * holding a reference to the new page at this point.
+ */
+ if (!trylock_page(newpage))
+ BUG();
+
+ /* Prepare mapping for the new page.*/
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ SetPageSwapBacked(newpage);
+
+ /* Choose the migration routine according to the page's mapping. */
+ mapping = page_mapping(page);
+ if (!mapping)
+ rc = migrate_page(mapping, newpage, page);
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page);
+
+ /*
+ * On success point the migration ptes at the new page; on failure
+ * undo the mapping assignment made above.
+ */
+ if (!rc) {
+ remove_migration_ptes(page, newpage);
+ } else
+ newpage->mapping = NULL;
+
+ unlock_page(newpage);
+
+ return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+/*
+ * @get_new_page: callback that supplies the target page and may set *result.
+ * @private:      opaque value passed through to @get_new_page.
+ * @page:         the page to migrate.
+ * @force:        when zero, a page that cannot be locked immediately or is
+ *                under writeback is not waited for (rc stays -EAGAIN).
+ *
+ * Returns 0, -ENOMEM, -EAGAIN or the error from move_to_new_page(). The
+ * same value (or the new page's node id on success) is stored through the
+ * result pointer when the callback provided one.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+ int rcu_locked = 0;
+ int charge = 0;
+
+ if (!newpage)
+ return -ENOMEM;
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ goto move_newpage;
+ }
+
+ charge = mem_cgroup_prepare_migration(page, newpage);
+ if (charge == -ENOMEM) {
+ rc = -ENOMEM;
+ goto move_newpage;
+ }
+ /* prepare cgroup just returns 0 or -ENOMEM */
+ BUG_ON(charge);
+
+ /* From here on the default outcome is "retry later". */
+ rc = -EAGAIN;
+ if (!trylock_page(page)) {
+ if (!force)
+ goto move_newpage;
+ lock_page(page);
+ }
+
+ if (PageWriteback(page)) {
+ if (!force)
+ goto unlock;
+ wait_on_page_writeback(page);
+ }
+ /*
+ * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrate a page.
+ * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * of migration. File cache pages are no problem because of page_lock()
+ * File Caches may use write_page() or lock_page() in migration, then,
+ * just care Anon page here.
+ */
+ if (PageAnon(page)) {
+ rcu_read_lock();
+ rcu_locked = 1;
+ }
+
+ /*
+ * Corner case handling:
+ * 1. When a new swap-cache page is read into, it is added to the LRU
+ * and treated as swapcache but it has no rmap yet.
+ * Calling try_to_unmap() against a page->mapping==NULL page will
+ * trigger a BUG. So handle it here.
+ * 2. An orphaned page (see truncate_complete_page) might have
+ * fs-private metadata. The page can be picked up due to memory
+ * offlining. Everywhere else except page reclaim, the page is
+ * invisible to the vm, so the page can not be migrated. So try to
+ * free the metadata, so the page can be freed.
+ */
+ if (!page->mapping) {
+ if (!PageAnon(page) && PagePrivate(page)) {
+ /*
+ * Go direct to try_to_free_buffers() here because
+ * a) that's what try_to_release_page() would do anyway
+ * b) we may be under rcu_read_lock() here, so we can't
+ * use GFP_KERNEL which is what try_to_release_page()
+ * needs to be effective.
+ */
+ try_to_free_buffers(page);
+ }
+ goto rcu_unlock;
+ }
+
+ /* Establish migration ptes or remove ptes */
+ try_to_unmap(page, 1);
+
+ /* Only move the page if unmapping removed every mapping. */
+ if (!page_mapped(page))
+ rc = move_to_new_page(newpage, page);
+
+ /* Not migrated: turn the migration ptes back into ptes for the old page. */
+ if (rc)
+ remove_migration_ptes(page, page);
+rcu_unlock:
+ if (rcu_locked)
+ rcu_read_unlock();
+
+unlock:
+ unlock_page(page);
+
+ if (rc != -EAGAIN) {
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kept its references and be
+ * restored.
+ */
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+move_newpage:
+ if (!charge)
+ mem_cgroup_end_migration(newpage);
+
+ /*
+ * Move the new page to the LRU. If migration was not successful
+ * then this will free the page.
+ */
+ putback_lru_page(newpage);
+
+ /* Report the outcome through the pointer supplied by get_new_page(). */
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(newpage);
+ }
+ return rc;
+}
+
+/*
+ * migrate_pages
+ *
+ * The function takes one list of pages to migrate and a function
+ * that determines from the page to be migrated and the private data
+ * the target of the move and allocates the page.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because the list has become empty
+ * or no retryable pages exist anymore. All pages will be
+ * returned to the LRU or freed.
+ *
+ * Return: Number of pages not migrated or error code.
+ */
+int migrate_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private)
+{
+ int retry = 1;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int swapwrite = current->flags & PF_SWAPWRITE;
+ int rc;
+
+ /* Set PF_SWAPWRITE for the duration of the call if it was not set. */
+ if (!swapwrite)
+ current->flags |= PF_SWAPWRITE;
+
+ /* Up to 10 passes, stopping early once a pass leaves nothing to retry. */
+ for(pass = 0; pass < 10 && retry; pass++) {
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ cond_resched();
+
+ /* The force argument is set from the fourth pass onwards. */
+ rc = unmap_and_move(get_new_page, private,
+ page, pass > 2);
+
+ switch(rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ retry++;
+ break;
+ case 0:
+ break;
+ default:
+ /* Permanent failure */
+ nr_failed++;
+ break;
+ }
+ }
+ }
+ rc = 0;
+out:
+ /* Restore the flag only if it was set by this function. */
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ /* Whatever is still on the list goes back to the LRU. */
+ putback_lru_pages(from);
+
+ if (rc)
+ return rc;
+
+ return nr_failed + retry;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+ unsigned long addr; /* user virtual address of the page to move */
+ struct page *page; /* page found at addr; filled in while building the list */
+ int node; /* target node; MAX_NUMNODES marks the end of the array */
+ int status; /* per-page result: a node id or a negative errno */
+};
+
+/*
+ * Allocation callback: @private is the page_to_node array. Finds the
+ * entry whose page is @p, points *result at that entry's status field
+ * and allocates a page on the entry's node. Returns NULL if @p is not
+ * in the array.
+ */
+static struct page *new_page_node(struct page *p, unsigned long private,
+ int **result)
+{
+ struct page_to_node *pm = (struct page_to_node *)private;
+
+ /* Linear scan up to the MAX_NUMNODES end marker. */
+ while (pm->node != MAX_NUMNODES && pm->page != p)
+ pm++;
+
+ if (pm->node == MAX_NUMNODES)
+ return NULL;
+
+ *result = &pm->status;
+
+ return alloc_pages_node(pm->node,
+ GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
+}
+
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ * The pm array ends with node = MAX_NUMNODES.
+ */
+/*
+ * @mm:          address space the addresses in @pm belong to.
+ * @pm:          array terminated by an entry with node == MAX_NUMNODES.
+ * @migrate_all: when zero, pages mapped more than once get -EACCES.
+ *
+ * Each entry's status is set to the page's current node or a negative
+ * errno. Returns 0 if nothing had to be migrated, otherwise the result
+ * of migrate_pages().
+ */
+static int do_move_page_to_node_array(struct mm_struct *mm,
+				      struct page_to_node *pm,
+				      int migrate_all)
+{
+	int err;
+	struct page_to_node *pp;
+	LIST_HEAD(pagelist);
+
+	migrate_prep();
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Build a list of pages to migrate
+	 */
+	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+
+		/*
+		 * A valid page pointer that will not match any of the
+		 * pages that will be moved.
+		 */
+		pp->page = ZERO_PAGE(0);
+
+		err = -EFAULT;
+		vma = find_vma(mm, pp->addr);
+		/*
+		 * find_vma() returns the first vma that ends above the
+		 * address, so the vma may start above it too. Such an
+		 * address is not mapped at all and must yield -EFAULT.
+		 */
+		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
+			goto set_status;
+
+		page = follow_page(vma, pp->addr, FOLL_GET);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		if (PageReserved(page))		/* Check for zero page */
+			goto put_and_set;
+
+		pp->page = page;
+		err = page_to_nid(page);
+
+		if (err == pp->node)
+			/*
+			 * Node already in the right place
+			 */
+			goto put_and_set;
+
+		err = -EACCES;
+		if (page_mapcount(page) > 1 &&
+				!migrate_all)
+			goto put_and_set;
+
+		err = isolate_lru_page(page);
+		if (!err)
+			list_add_tail(&page->lru, &pagelist);
+put_and_set:
+		/*
+		 * Either remove the duplicate refcount from
+		 * isolate_lru_page() or drop the page ref if it was
+		 * not isolated.
+		 */
+		put_page(page);
+set_status:
+		pp->status = err;
+	}
+
+	err = 0;
+	if (!list_empty(&pagelist))
+		err = migrate_pages(&pagelist, new_page_node,
+				(unsigned long)pm);
+
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Migrate an array of page address onto an array of nodes and fill
+ * the corresponding array of status.
+ */
+/*
+ * @mm:       address space the pages belong to.
+ * @task:     task whose cpuset limits the allowed target nodes.
+ * @nr_pages: number of entries in @pages, @nodes and @status.
+ * @pages:    user array of page addresses.
+ * @nodes:    user array of target nodes, or NULL.
+ * @status:   user array receiving the per-page result.
+ * @flags:    MPOL_MF_MOVE_ALL also moves pages mapped more than once.
+ *
+ * Returns -E2BIG, -ENOMEM, -EFAULT, -ENODEV or -EACCES for bad arguments,
+ * otherwise the result of do_move_page_to_node_array(), or -EFAULT if
+ * the status array cannot be written.
+ */
+static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+			 unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 const int __user *nodes,
+			 int __user *status, int flags)
+{
+	struct page_to_node *pm = NULL;
+	nodemask_t task_nodes;
+	int err = 0;
+	unsigned long i;	/* same type as nr_pages, so it cannot overflow */
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	/* Limit nr_pages so that the multiplication may not overflow */
+	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+		err = -E2BIG;
+		goto out;
+	}
+
+	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+	if (!pm) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Get parameters from user space and initialize the pm
+	 * array. Return various errors if the user did something wrong.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		const void __user *p;
+
+		err = -EFAULT;
+		if (get_user(p, pages + i))
+			goto out_pm;
+
+		pm[i].addr = (unsigned long)p;
+		if (nodes) {
+			int node;
+
+			if (get_user(node, nodes + i))
+				goto out_pm;
+
+			err = -ENODEV;
+			/*
+			 * The node number comes straight from user space:
+			 * reject anything outside the node bitmaps before
+			 * using it as a bit index below.
+			 */
+			if (node < 0 || node >= MAX_NUMNODES)
+				goto out_pm;
+
+			if (!node_state(node, N_HIGH_MEMORY))
+				goto out_pm;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out_pm;
+
+			pm[i].node = node;
+		} else
+			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
+	}
+	/* End marker */
+	pm[nr_pages].node = MAX_NUMNODES;
+
+	err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
+	if (err >= 0)
+		/* Return status information */
+		for (i = 0; i < nr_pages; i++)
+			if (put_user(pm[i].status, status + i))
+				err = -EFAULT;
+
+out_pm:
+	vfree(pm);
+out:
+	return err;
+}
+
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ *
+ * @mm:       address space to look the addresses up in
+ * @nr_pages: number of entries in @pages and @status
+ * @pages:    kernel copy of the addresses to query
+ * @status:   kernel array receiving, per address, the node id of the page
+ *            or a negative errno (-EFAULT: not inside any vma, -ENOENT: no
+ *            page or a reserved page, or the error from follow_page())
+ */
+static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
+				const void __user **pages, int *status)
+{
+	unsigned long i;
+
+	down_read(&mm->mmap_sem);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long addr = (unsigned long)(*pages);
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err = -EFAULT;
+
+		/*
+		 * find_vma() may return a vma that begins above addr; an
+		 * address in such a hole is not mapped and is reported as
+		 * -EFAULT rather than being looked up outside of the vma.
+		 */
+		vma = find_vma(mm, addr);
+		if (!vma || addr < vma->vm_start)
+			goto set_status;
+
+		page = follow_page(vma, addr, 0);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
+		err = -ENOENT;
+		/* Use PageReserved to check for zero page */
+		if (!page || PageReserved(page))
+			goto set_status;
+
+		err = page_to_nid(page);
+set_status:
+		*status = err;
+
+		pages++;
+		status++;
+	}
+
+	up_read(&mm->mmap_sem);
+}
+
+/*
+ * Report, for a user-supplied array of addresses, the node each page
+ * lives on, writing the answers to a user-supplied status array.
+ *
+ * The work is done in batches of at most DO_PAGES_STAT_CHUNK_NR entries
+ * that are bounced through on-stack buffers.  Returns 0 on success or
+ * -EFAULT when either user array cannot be accessed.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 int __user *status)
+{
+#define DO_PAGES_STAT_CHUNK_NR 16
+	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
+	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+	unsigned long done = 0;
+
+	while (done < nr_pages) {
+		/* The final batch may be shorter than a full chunk. */
+		unsigned long batch = nr_pages - done;
+
+		if (batch > DO_PAGES_STAT_CHUNK_NR)
+			batch = DO_PAGES_STAT_CHUNK_NR;
+
+		if (copy_from_user(chunk_pages, &pages[done],
+				   batch * sizeof(*chunk_pages)))
+			return -EFAULT;
+
+		do_pages_stat_array(mm, batch, chunk_pages, chunk_status);
+
+		if (copy_to_user(&status[done], chunk_status,
+				 batch * sizeof(*chunk_status)))
+			return -EFAULT;
+
+		done += batch;
+	}
+
+	return 0;
+}
+
+/*
+ * Move a list of pages in the address space of the process selected by
+ * @pid (the calling process when @pid is 0), or, when @nodes is NULL,
+ * only report the node each page currently resides on.
+ *
+ * Returns -EINVAL for unknown flags or a task without an mm, -EPERM when
+ * the caller lacks the required rights, -ESRCH when no task matches @pid,
+ * otherwise the result of do_pages_move() / do_pages_stat().
+ */
+SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
+		const void __user * __user *, pages,
+		const int __user *, nodes,
+		int __user *, status, int, flags)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int err;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	/*
+	 * The task is dereferenced below after tasklist_lock has been
+	 * dropped, so pin it while the lock is still held.
+	 */
+	get_task_struct(task);
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm) {
+		err = -EINVAL;
+		goto out_task;
+	}
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = security_task_movememory(task);
+	if (err)
+		goto out;
+
+	if (nodes) {
+		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
+				    flags);
+	} else {
+		err = do_pages_stat(mm, nr_pages, pages, status);
+	}
+
+out:
+	mmput(mm);
+out_task:
+	put_task_struct(task);
+	return err;
+}
+
+/*
+ * Call migration functions in the vma_ops that may prepare
+ * memory in a vm for migration. migration functions may perform
+ * the migration for vmas that do not have an underlying page struct.
+ *
+ * Every vma on mm->mmap is visited, including the last one, and an empty
+ * list is handled.  The walk stops at the first vma whose ->migrate()
+ * returns non-zero; that value is returned, otherwise 0.
+ */
+int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
+	const nodemask_t *from, unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	int err = 0;
+
+	/*
+	 * Test vma itself, not vma->vm_next: the latter dereferences a
+	 * NULL list head and never offers the final vma to ->migrate().
+	 */
+	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
+		if (vma->vm_ops && vma->vm_ops->migrate)
+			err = vma->vm_ops->migrate(vma, to, from, flags);
+	}
+	return err;
+}
+#endif
diff --git a/mm/mincore.c b/mm/mincore.c
new file mode 100644
index 0000000..8cb508f
--- /dev/null
+++ b/mm/mincore.c
@@ -0,0 +1,229 @@
+/*
+ * linux/mm/mincore.c
+ *
+ * Copyright (C) 1994-2006 Linus Torvalds
+ */
+
+/*
+ * The mincore() system call.
+ */
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/syscalls.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+{
+	struct page *page;
+	unsigned char uptodate;
+
+	/*
+	 * A page that tmpfs has swapped out of a file does not leave a
+	 * swp_entry_t in the ptes of processes mapping that file; the pte
+	 * looks like any other file mapping (marked !present and faulted
+	 * in with tmpfs's .fault), so swapped out tmpfs mappings end up
+	 * being tested here.
+	 *
+	 * Note that once tmpfs has moved the page from pagecache into
+	 * swapcache it is still in core, yet find_get_page() will not
+	 * find it.  No big deal.
+	 */
+	page = find_get_page(mapping, pgoff);
+	if (!page)
+		return 0;
+
+	/* Sample the flag, then drop the reference find_get_page() took. */
+	uptodate = PageUptodate(page);
+	page_cache_release(page);
+
+	return uptodate;
+}
+
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ *
+ * @addr: first user address to report on
+ * @vec: kernel buffer receiving one residency byte per page
+ * @pages: upper bound on the number of pages to report
+ *
+ * Fills vec[0..nr-1] and returns nr, the number of pages handled by this
+ * call; nr is limited by the end of the last-level PTE array, the end of
+ * the vma and @pages. Returns -ENOMEM when @addr is not inside any vma.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ unsigned long nr;
+ int i;
+ pgoff_t pgoff;
+ struct vm_area_struct *vma = find_vma(current->mm, addr);
+
+ /*
+ * find_vma() didn't find anything above us, or we're
+ * in an unmapped hole in the address space: ENOMEM.
+ */
+ if (!vma || addr < vma->vm_start)
+ return -ENOMEM;
+
+ /*
+ * Calculate how many pages there are left in the last level of the
+ * PTE array for our address.
+ */
+ nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
+
+ /*
+ * Don't overrun this vma
+ */
+ nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
+
+ /*
+ * Don't return more than the caller asked for
+ */
+ nr = min(nr, pages);
+
+ /*
+ * Walk down the page table levels; if any level is empty there are
+ * no ptes to inspect and the answer comes from the none_mapped path.
+ */
+ pgd = pgd_offset(vma->vm_mm, addr);
+ if (pgd_none_or_clear_bad(pgd))
+ goto none_mapped;
+ pud = pud_offset(pgd, addr);
+ if (pud_none_or_clear_bad(pud))
+ goto none_mapped;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none_or_clear_bad(pmd))
+ goto none_mapped;
+
+ /* Inspect nr consecutive ptes with the pte lock held. */
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
+ unsigned char present;
+ pte_t pte = *ptep;
+
+ if (pte_present(pte)) {
+ present = 1;
+
+ } else if (pte_none(pte)) {
+ /* Empty pte: for a file mapping ask the page cache. */
+ if (vma->vm_file) {
+ pgoff = linear_page_index(vma, addr);
+ present = mincore_page(vma->vm_file->f_mapping,
+ pgoff);
+ } else
+ present = 0;
+
+ } else if (pte_file(pte)) {
+ /* The file offset is taken from the pte itself. */
+ pgoff = pte_to_pgoff(pte);
+ present = mincore_page(vma->vm_file->f_mapping, pgoff);
+
+ } else { /* pte is a swap entry */
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ if (is_migration_entry(entry)) {
+ /* migration entries are always uptodate */
+ present = 1;
+ } else {
+#ifdef CONFIG_SWAP
+ /* Look the entry up in swapper_space. */
+ pgoff = entry.val;
+ present = mincore_page(&swapper_space, pgoff);
+#else
+ WARN_ON(1);
+ present = 1;
+#endif
+ }
+ }
+
+ vec[i] = present;
+ }
+ /* ptep was advanced once per pte examined, hence ptep-1 here. */
+ pte_unmap_unlock(ptep-1, ptl);
+
+ return nr;
+
+none_mapped:
+ /*
+ * No page table for this range: file mappings are answered from
+ * the page cache, everything else is reported as not resident.
+ */
+ if (vma->vm_file) {
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
+ }
+
+ return nr;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes. The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information. Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - vec points to an illegal address
+ * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
+ * -ENOMEM - Addresses in the range [addr, addr + len] are
+ * invalid for the address space of this process, or
+ * specify one or more pages which are not currently
+ * mapped
+ * -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+		unsigned char __user *, vec)
+{
+	unsigned long pages;
+	unsigned char *tmp;
+	long retval = 0;
+
+	/* The start address has to be page-aligned... */
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
+
+	/* ...and the range has to be a valid user-space range. */
+	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+		return -ENOMEM;
+
+	/*
+	 * Round len up to whole pages by hand; this also avoids any
+	 * overflows on PAGE_CACHE_ALIGN.
+	 */
+	pages = len >> PAGE_SHIFT;
+	if (len & ~PAGE_MASK)
+		pages++;
+
+	if (!access_ok(VERIFY_WRITE, vec, pages))
+		return -EFAULT;
+
+	tmp = (void *) __get_free_page(GFP_USER);
+	if (!tmp)
+		return -EAGAIN;
+
+	while (pages) {
+		long chunk;
+
+		/*
+		 * The bounce buffer is one page, so ask for at most
+		 * PAGE_SIZE entries per round.
+		 */
+		down_read(&current->mm->mmap_sem);
+		chunk = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+		up_read(&current->mm->mmap_sem);
+
+		if (chunk <= 0) {
+			retval = chunk;
+			break;
+		}
+		if (copy_to_user(vec, tmp, chunk)) {
+			retval = -EFAULT;
+			break;
+		}
+
+		pages -= chunk;
+		vec += chunk;
+		start += chunk << PAGE_SHIFT;
+	}
+
+	free_page((unsigned long) tmp);
+	return retval;
+}
diff --git a/mm/mlock.c b/mm/mlock.c
new file mode 100644
index 0000000..64dca47
--- /dev/null
+++ b/mm/mlock.c
@@ -0,0 +1,629 @@
+/*
+ * linux/mm/mlock.c
+ *
+ * (C) Copyright 1995 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
+ */
+
+#include <linux/capability.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+
+#include "internal.h"
+
+/*
+ * Returns 1 when the caller has CAP_IPC_LOCK or a non-zero
+ * RLIMIT_MEMLOCK soft limit, 0 otherwise.
+ */
+int can_do_mlock(void)
+{
+	return capable(CAP_IPC_LOCK) ||
+	       current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0;
+}
+EXPORT_SYMBOL(can_do_mlock);
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_sem for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
+
+/*
+ * LRU accounting for clear_page_mlock()
+ */
+void __clear_page_mlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+
+	/* No mapping (truncated ?): nothing to account. */
+	if (!page->mapping)
+		return;
+
+	dec_zone_page_state(page, NR_MLOCK);
+	count_vm_event(UNEVICTABLE_PGCLEARED);
+
+	if (isolate_lru_page(page) == 0) {
+		/* Isolated it ourselves: hand it straight back to the LRU. */
+		putback_lru_page(page);
+		return;
+	}
+
+	/*
+	 * We lost the race. the page already moved to evictable list.
+	 */
+	if (PageUnevictable(page))
+		count_vm_event(UNEVICTABLE_PGSTRANDED);
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ */
+void mlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	/* Already flagged as mlocked: the accounting was done earlier. */
+	if (TestSetPageMlocked(page))
+		return;
+
+	inc_zone_page_state(page, NR_MLOCK);
+	count_vm_event(UNEVICTABLE_PGMLOCKED);
+
+	/* Isolate and put back, but only if the isolation succeeded. */
+	if (isolate_lru_page(page) == 0)
+		putback_lru_page(page);
+}
+
+/*
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ *
+ * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
+ * [in try_to_munlock()] and then attempt to isolate the page. We must
+ * isolate the page to keep others from messing with its unevictable
+ * and mlocked state while trying to munlock. However, we pre-clear the
+ * mlocked state anyway as we might lose the isolation race and we might
+ * not get another chance to clear PageMlocked. If we successfully
+ * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
+ * mapping the page, it will restore the PageMlocked state, unless the page
+ * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
+ * perhaps redundantly.
+ * If we lose the isolation race, and the page is mapped by other VM_LOCKED
+ * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
+ * either of which will restore the PageMlocked state by calling
+ * mlock_vma_page() above, if it can grab the vma's mmap sem.
+ */
+static void munlock_vma_page(struct page *page)
+{
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Nothing to undo when the page was not flagged as mlocked. */
+	if (!TestClearPageMlocked(page))
+		return;
+
+	dec_zone_page_state(page, NR_MLOCK);
+
+	if (isolate_lru_page(page)) {
+		/*
+		 * We lost the race. let try_to_unmap() deal
+		 * with it. At least we get the page state and
+		 * mlock stats right. However, page is still on
+		 * the noreclaim list. We'll fix that up when
+		 * the page is eventually freed or we scan the
+		 * noreclaim list.
+		 */
+		if (PageUnevictable(page))
+			count_vm_event(UNEVICTABLE_PGSTRANDED);
+		else
+			count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		return;
+	}
+
+	/*
+	 * did try_to_munlock() succeed or punt?
+	 */
+	ret = try_to_munlock(page);
+	if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+	putback_lru_page(page);
+}
+
+/**
+ * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @mlock: 0 indicate munlock, otherwise mlock.
+ *
+ * If @mlock == 0, unlock an mlocked range;
+ * else mlock the range of pages. This also takes care of making the pages
+ * present.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held for at least read.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ int mlock)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start;
+ struct page *pages[16]; /* 16 gives a reasonable batch */
+ int nr_pages = (end - start) / PAGE_SIZE;
+ int ret = 0;
+ int gup_flags = 0;
+
+ /* The range must be page aligned and lie inside the vma. */
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON(start < vma->vm_start);
+ VM_BUG_ON(end > vma->vm_end);
+ VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
+ (atomic_read(&mm->mm_users) != 0));
+
+ /*
+ * mlock: don't populate pages that have PROT_NONE permission.
+ * munlock: always munlock the pages, even though they may have
+ * PROT_NONE permission.
+ */
+ if (!mlock)
+ gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+
+ if (vma->vm_flags & VM_WRITE)
+ gup_flags |= GUP_FLAGS_WRITE;
+
+ /* Work through the range in batches of up to ARRAY_SIZE(pages). */
+ while (nr_pages > 0) {
+ int i;
+
+ cond_resched();
+
+ /*
+ * get_user_pages makes pages present if we are
+ * setting mlock. and this extra reference count will
+ * disable migration of this page. However, page may
+ * still be truncated out from under us.
+ */
+ ret = __get_user_pages(current, mm, addr,
+ min_t(int, nr_pages, ARRAY_SIZE(pages)),
+ gup_flags, pages, NULL);
+ /*
+ * This can happen for, e.g., VM_NONLINEAR regions before
+ * a page has been allocated and mapped at a given offset,
+ * or for addresses that map beyond end of a file.
+ * We'll mlock the pages if/when they get faulted in.
+ */
+ if (ret < 0)
+ break;
+ if (ret == 0) {
+ /*
+ * We know the vma is there, so the only time
+ * we cannot get a single page should be an
+ * error (ret < 0) case.
+ */
+ WARN_ON(1);
+ break;
+ }
+
+ lru_add_drain(); /* push cached pages to LRU */
+
+ /* ret is the number of pages returned in pages[]. */
+ for (i = 0; i < ret; i++) {
+ struct page *page = pages[i];
+
+ lock_page(page);
+ /*
+ * Because we lock page here and migration is blocked
+ * by the elevated reference, we need only check for
+ * page truncation (file-cache only).
+ */
+ if (page->mapping) {
+ if (mlock)
+ mlock_vma_page(page);
+ else
+ munlock_vma_page(page);
+ }
+ unlock_page(page);
+ put_page(page); /* ref from get_user_pages() */
+
+ /*
+ * here we assume that get_user_pages() has given us
+ * a list of virtually contiguous pages.
+ */
+ addr += PAGE_SIZE; /* for next get_user_pages() */
+ nr_pages--;
+ }
+ /* A fully processed batch is not an error. */
+ ret = 0;
+ }
+
+ /* 0 after the last batch, or the negative __get_user_pages() result. */
+ return ret; /* count entire vma as locked_vm */
+}
+
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+	/* -EFAULT becomes -ENOMEM, -ENOMEM becomes -EAGAIN, rest unchanged. */
+	switch (retval) {
+	case -EFAULT:
+		return -ENOMEM;
+	case -ENOMEM:
+		return -EAGAIN;
+	default:
+		return retval;
+	}
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Just make pages present if VM_LOCKED. No-op if unlocking.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end,
+				   int mlock)
+{
+	/* Unlocking, or a vma that is not VM_LOCKED: nothing to do. */
+	if (!mlock || !(vma->vm_flags & VM_LOCKED))
+		return 0;
+
+	return make_pages_present(start, end);
+}
+
+/*
+ * !CONFIG_UNEVICTABLE_LRU variant: @retval is ignored and 0 is always
+ * returned, so no error is passed on to the caller.
+ */
+static inline int __mlock_posix_error_return(long retval)
+{
+ return 0;
+}
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/**
+ * mlock_vma_pages_range() - mlock pages in specified vma range.
+ * @vma - the vma containing the specified address range
+ * @start - starting address in @vma to mlock
+ * @end - end address [+1] in @vma to mlock
+ *
+ * For mmap()/mremap()/expansion of mlocked vma.
+ *
+ * return 0 on success for "normal" vmas.
+ *
+ * return number of pages [> 0] to be removed from locked_vm on success
+ * of "special" vmas.
+ */
+long mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	int nr_pages = (end - start) / PAGE_SIZE;
+
+	BUG_ON(!(vma->vm_flags & VM_LOCKED));
+
+	/*
+	 * VM_IO and VM_PFNMAP vmas are filtered out as unlockable: they
+	 * go straight to clearing VM_LOCKED below.
+	 */
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+		if (!(vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) &&
+		    !is_vm_hugetlb_page(vma) &&
+		    vma != get_gate_vma(current)) {
+
+			__mlock_vma_pages_range(vma, start, end, 1);
+
+			/* Hide errors from mmap() and other callers */
+			return 0;
+		}
+
+		/*
+		 * User mapped kernel pages or huge pages:
+		 * make these pages present to populate the ptes, but
+		 * fall thru' to reset VM_LOCKED--no need to unlock, and
+		 * return nr_pages so these don't get counted against task's
+		 * locked limit.  huge pages are already counted against
+		 * locked vm limit.
+		 */
+		make_pages_present(start, end);
+	}
+
+	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
+	return nr_pages;		/* error or pages NOT mlocked */
+}
+
+
+/*
+ * munlock_vma_pages_range() - munlock all pages in the vma range.
+ * @vma - vma containing range to be munlock()ed.
+ * @start - start address in @vma of the range
+ * @end - end of range in @vma.
+ *
+ * For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared. Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru. In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them. This will result in freeing mlocked pages.
+ */
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ /* Clear VM_LOCKED first; it is not restored afterwards. */
+ vma->vm_flags &= ~VM_LOCKED;
+ /* mlock == 0 selects the munlock path; the return value is ignored. */
+ __mlock_vma_pages_range(vma, start, end, 0);
+}
+
+/*
+ * mlock_fixup - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op. However, for some special vmas, we go ahead and
+ * populate the ptes via make_pages_present().
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
+static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, unsigned int newflags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgoff_t pgoff;
+ int nr_pages;
+ int ret = 0;
+ int lock = newflags & VM_LOCKED;
+
+ /* Nothing to change, or a vma that is never locked. */
+ if (newflags == vma->vm_flags ||
+ (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out; /* don't set VM_LOCKED, don't count */
+
+ /* Special vmas: only populate the ptes when locking. */
+ if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current)) {
+ if (lock)
+ make_pages_present(start, end);
+ goto out; /* don't set VM_LOCKED, don't count */
+ }
+
+ /* Try to merge [start, end) with its neighbours under the new flags. */
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma));
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ /* No merge: split so that vma covers exactly [start, end). */
+ if (start != vma->vm_start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ goto out;
+ }
+
+success:
+ /*
+ * Keep track of amount of locked VM.
+ */
+ nr_pages = (end - start) >> PAGE_SHIFT;
+ if (!lock)
+ nr_pages = -nr_pages;
+ mm->locked_vm += nr_pages;
+
+ /*
+ * vm_flags is protected by the mmap_sem held in write mode.
+ * It's okay if try_to_unmap_one unmaps a page just after we
+ * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
+ */
+ vma->vm_flags = newflags;
+
+ if (lock) {
+ ret = __mlock_vma_pages_range(vma, start, end, 1);
+
+ /* A positive result is subtracted from locked_vm again. */
+ if (ret > 0) {
+ mm->locked_vm -= ret;
+ ret = 0;
+ } else
+ ret = __mlock_posix_error_return(ret); /* translate if needed */
+ } else {
+ __mlock_vma_pages_range(vma, start, end, 0);
+ }
+
+out:
+ /* Tell the caller which vma now precedes the rest of the range. */
+ *prev = vma;
+ return ret;
+}
+
+/*
+ * Apply (@on != 0) or remove (@on == 0) VM_LOCKED on every vma covering
+ * [start, start + PAGE_ALIGN(len)), one vma at a time via mlock_fixup().
+ *
+ * Returns 0 on success or for an empty range, -EINVAL when the end of the
+ * range wraps around, -ENOMEM when part of the range is not covered by a
+ * vma, or the first error reported by mlock_fixup().
+ */
+static int do_mlock(unsigned long start, size_t len, int on)
+{
+ unsigned long nstart, end, tmp;
+ struct vm_area_struct * vma, * prev;
+ int error;
+
+ len = PAGE_ALIGN(len);
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ /* The first vma must already contain start. */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (!vma || vma->vm_start > start)
+ return -ENOMEM;
+
+ if (start > vma->vm_start)
+ prev = vma;
+
+ for (nstart = start ; ; ) {
+ unsigned int newflags;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!on)
+ newflags &= ~VM_LOCKED;
+
+ /* Clamp this step to the end of the vma or of the request. */
+ tmp = vma->vm_end;
+ if (tmp > end)
+ tmp = end;
+ error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+ if (error)
+ break;
+ /* mlock_fixup() updated prev; continue after whatever it covers. */
+ nstart = tmp;
+ if (nstart < prev->vm_end)
+ nstart = prev->vm_end;
+ if (nstart >= end)
+ break;
+
+ /* The next vma has to start exactly at nstart: no holes allowed. */
+ vma = prev->vm_next;
+ if (!vma || vma->vm_start != nstart) {
+ error = -ENOMEM;
+ break;
+ }
+ }
+ return error;
+}
+
+/*
+ * mlock(2): lock the pages covering [start, start + len).
+ *
+ * Returns -EPERM when can_do_mlock() refuses, -ENOMEM when the request
+ * would exceed RLIMIT_MEMLOCK and the caller lacks CAP_IPC_LOCK,
+ * otherwise the result of do_mlock().
+ */
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long wanted;
+	unsigned long limit;
+	int error = -ENOMEM;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	lru_add_drain_all();	/* flush pagevec */
+
+	down_write(&mm->mmap_sem);
+
+	/* Widen the request to whole pages. */
+	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+	start &= PAGE_MASK;
+
+	/* Pages locked after this call, and the limit, both in pages. */
+	wanted = (len >> PAGE_SHIFT) + mm->locked_vm;
+	limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	/* check against resource limits */
+	if (wanted <= limit || capable(CAP_IPC_LOCK))
+		error = do_mlock(start, len, 1);
+
+	up_write(&mm->mmap_sem);
+	return error;
+}
+
+/*
+ * munlock(2): unlock the pages covering [start, start + len).
+ * Returns the result of do_mlock().
+ */
+SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
+{
+	struct mm_struct *mm = current->mm;
+	int ret;
+
+	down_write(&mm->mmap_sem);
+
+	/* Widen the request to whole pages. */
+	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+	start &= PAGE_MASK;
+
+	ret = do_mlock(start, len, 0);
+
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Set mm->def_flags according to MCL_FUTURE and, unless @flags is exactly
+ * MCL_FUTURE, set or clear VM_LOCKED on every vma depending on MCL_CURRENT.
+ * Errors from mlock_fixup() are ignored; always returns 0.
+ */
+static int do_mlockall(int flags)
+{
+	struct vm_area_struct *vma, *prev = NULL;
+
+	current->mm->def_flags = (flags & MCL_FUTURE) ? VM_LOCKED : 0;
+	if (flags == MCL_FUTURE)
+		return 0;
+
+	/* mlock_fixup() updates prev, which drives the iteration. */
+	for (vma = current->mm->mmap; vma; vma = prev->vm_next) {
+		unsigned int newflags = vma->vm_flags;
+
+		if (flags & MCL_CURRENT)
+			newflags |= VM_LOCKED;
+		else
+			newflags &= ~VM_LOCKED;
+
+		/* Ignore errors */
+		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+	}
+
+	return 0;
+}
+
+/*
+ * mlockall(2).
+ *
+ * Returns -EINVAL for empty or unknown flags, -EPERM when can_do_mlock()
+ * refuses, -ENOMEM when MCL_CURRENT is requested, total_vm exceeds
+ * RLIMIT_MEMLOCK and the caller lacks CAP_IPC_LOCK, otherwise the result
+ * of do_mlockall().
+ */
+SYSCALL_DEFINE1(mlockall, int, flags)
+{
+	struct mm_struct *mm;
+	unsigned long limit;
+	int ret = -ENOMEM;
+
+	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+		return -EINVAL;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	lru_add_drain_all();	/* flush pagevec */
+
+	mm = current->mm;
+	down_write(&mm->mmap_sem);
+
+	/* The limit in pages. */
+	limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	if (!(flags & MCL_CURRENT) || mm->total_vm <= limit ||
+	    capable(CAP_IPC_LOCK))
+		ret = do_mlockall(flags);
+
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * munlockall(2): run do_mlockall() with no flags while holding mmap_sem
+ * for writing, and return its result.
+ */
+SYSCALL_DEFINE0(munlockall)
+{
+	struct mm_struct *mm = current->mm;
+	int ret;
+
+	down_write(&mm->mmap_sem);
+	ret = do_mlockall(0);
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+/*
+ * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
+ * shm segments) get accounted against the user_struct instead.
+ */
+static DEFINE_SPINLOCK(shmlock_user_lock);
+
+/*
+ * Charge @size bytes, rounded up to whole pages, to @user's locked_shm.
+ *
+ * The charge is made when RLIMIT_MEMLOCK is RLIM_INFINITY, when the new
+ * total stays within the limit, or when the caller has CAP_IPC_LOCK.
+ * On success a reference on @user is taken and 1 is returned; otherwise
+ * nothing changes and 0 is returned.
+ */
+int user_shm_lock(size_t size, struct user_struct *user)
+{
+	unsigned long limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	int unlimited = (limit == RLIM_INFINITY);
+	int allowed = 0;
+
+	limit >>= PAGE_SHIFT;
+
+	spin_lock(&shmlock_user_lock);
+	if (unlimited ||
+	    pages + user->locked_shm <= limit || capable(CAP_IPC_LOCK)) {
+		get_uid(user);
+		user->locked_shm += pages;
+		allowed = 1;
+	}
+	spin_unlock(&shmlock_user_lock);
+
+	return allowed;
+}
+
+/*
+ * Undo the charge made by user_shm_lock(): subtract @size, rounded up to
+ * whole pages, from @user's locked_shm and drop the reference on @user.
+ */
+void user_shm_unlock(size_t size, struct user_struct *user)
+{
+	unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	spin_lock(&shmlock_user_lock);
+	user->locked_shm -= pages;
+	spin_unlock(&shmlock_user_lock);
+
+	free_uid(user);
+}
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 0000000..4e0e265
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
+/*
+ * mm_init.c - Memory initialisation verification and debugging
+ *
+ * Copyright 2008 IBM Corporation, 2008
+ * Author Mel Gorman <mel@csn.ul.ie>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include "internal.h"
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+int mminit_loglevel;
+
+#ifndef SECTIONS_SHIFT
+#define SECTIONS_SHIFT 0
+#endif
+
+/* The zonelists are simply reported, validation is manual. */
+void mminit_verify_zonelist(void)
+{
+	int nid;
+
+	if (mminit_loglevel < MMINIT_VERIFY)
+		return;
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		int listid;
+
+		BUG_ON(MAX_ZONELISTS > 2);
+
+		/* Every zonelist of the node, and within it every zone. */
+		for (listid = 0; listid < MAX_ZONELISTS; listid++) {
+			struct zonelist *zonelist =
+					&pgdat->node_zonelists[listid];
+			int zoneid;
+
+			for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+				struct zone *zone = &pgdat->node_zones[zoneid];
+				struct zoneref *z;
+
+				if (!populated_zone(zone))
+					continue;
+
+				/* Print information about the zonelist */
+				printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
+					listid > 0 ? "thisnode" : "general", nid,
+					zone->name);
+
+				/* Iterate the zonelist */
+				for_each_zone_zonelist(zone, z, zonelist, zoneid) {
+#ifdef CONFIG_NUMA
+					printk(KERN_CONT "%d:%s ",
+						zone->node, zone->name);
+#else
+					printk(KERN_CONT "0:%s ", zone->name);
+#endif /* CONFIG_NUMA */
+				}
+				printk(KERN_CONT "\n");
+			}
+		}
+	}
+}
+
+/*
+ * Report how the section, node and zone fields are laid out in the page
+ * flags word and verify that the layout is consistent: each field with a
+ * non-zero width must start at its expected shift, and the shifted field
+ * masks must not overlap.  Any inconsistency triggers BUG_ON().
+ */
+void __init mminit_verify_pageflags_layout(void)
+{
+	int shift, width;
+	unsigned long or_mask, add_mask;
+
+	/* shift: bits in the flags word; width: bits left after the fields. */
+	shift = 8 * sizeof(unsigned long);
+	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+		"Section %d Node %d Zone %d Flags %d\n",
+		SECTIONS_WIDTH,
+		NODES_WIDTH,
+		ZONES_WIDTH,
+		NR_PAGEFLAGS);
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+		"Section %d Node %d Zone %d\n",
+		SECTIONS_SHIFT,
+		NODES_SHIFT,
+		ZONES_SHIFT);
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
+		"Section %lu Node %lu Zone %lu\n",
+		(unsigned long)SECTIONS_PGSHIFT,
+		(unsigned long)NODES_PGSHIFT,
+		(unsigned long)ZONES_PGSHIFT);
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
+		"Zone ID: %lu -> %lu\n",
+		(unsigned long)ZONEID_PGOFF,
+		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
+		"location: %d -> %d unused %d -> %d flags %d -> %d\n",
+		shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	/* Terminate the line like every other message in this function. */
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+		"Node not in page flags\n");
+#endif
+
+	/* Walk down from the top bit, checking each field that is present. */
+	if (SECTIONS_WIDTH) {
+		shift -= SECTIONS_WIDTH;
+		BUG_ON(shift != SECTIONS_PGSHIFT);
+	}
+	if (NODES_WIDTH) {
+		shift -= NODES_WIDTH;
+		BUG_ON(shift != NODES_PGSHIFT);
+	}
+	if (ZONES_WIDTH) {
+		shift -= ZONES_WIDTH;
+		BUG_ON(shift != ZONES_PGSHIFT);
+	}
+
+	/*
+	 * Check for bitmask overlaps: OR and + give the same result only
+	 * when no two masks share a bit.
+	 */
+	or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
+			(NODES_MASK << NODES_PGSHIFT) |
+			(SECTIONS_MASK << SECTIONS_PGSHIFT);
+	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
+			(NODES_MASK << NODES_PGSHIFT) +
+			(SECTIONS_MASK << SECTIONS_PGSHIFT);
+	BUG_ON(or_mask != add_mask);
+}
+
+/*
+ * Verify that the node id, zone number and pfn derived from @page match
+ * the expected @nid, @zone and @pfn; any mismatch triggers BUG_ON().
+ */
+void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
+ unsigned long nid, unsigned long pfn)
+{
+ BUG_ON(page_to_nid(page) != nid);
+ BUG_ON(page_zonenum(page) != zone);
+ BUG_ON(page_to_pfn(page) != pfn);
+}
+
+/*
+ * Handler for the "mminit_loglevel" early parameter: hands @str to
+ * get_option() to fill in mminit_loglevel. The result of get_option()
+ * is not checked; the handler always returns 0.
+ */
+static __init int set_mminit_loglevel(char *str)
+{
+ get_option(&str, &mminit_loglevel);
+ return 0;
+}
+early_param("mminit_loglevel", set_mminit_loglevel);
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+struct kobject *mm_kobj;
+EXPORT_SYMBOL_GPL(mm_kobj);
+
+/*
+ * Create the "mm" kobject under kernel_kobj and store it in mm_kobj.
+ * Returns 0 on success, -ENOMEM when the kobject could not be created.
+ */
+static int __init mm_sysfs_init(void)
+{
+	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
+
+	return mm_kobj ? 0 : -ENOMEM;
+}
+
+__initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
new file mode 100644
index 0000000..eb61f47
--- /dev/null
+++ b/mm/mmap.c
@@ -0,0 +1,2482 @@
+/*
+ * mm/mmap.c
+ *
+ * Written by obz.
+ *
+ * Address space accounting code <alan@redhat.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/capability.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/hugetlb.h>
+#include <linux/profile.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mempolicy.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+
+#include "internal.h"
+
+#ifndef arch_mmap_check
+#define arch_mmap_check(addr, len, flags) (0) /* fallback when not defined elsewhere: always 0, which do_mmap_pgoff() treats as "no error" */
+#endif
+
+#ifndef arch_rebalance_pgtables
+#define arch_rebalance_pgtables(addr, len) (addr) /* fallback when not defined elsewhere: yields addr unchanged */
+#endif
+
+static void unmap_region(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end);
+
+/*
+ * WARNING: the debugging will use recursive algorithms so never enable this
+ * unless you know what you are doing.
+ */
+#undef DEBUG_MM_RB
+
+/*
+ * Effects of mapping type and prot in the current implementation; the
+ * limitations come from the x86 page protection hardware.  For each
+ * access kind (r/w/x) the expected behavior is given in parentheses,
+ * followed by the actual behavior:
+ *
+ * map_type	prot
+ *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
+ * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ * vm_get_page_prot() indexes this table with
+ * vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED).
+ */
+pgprot_t protection_map[16] = {
+	/* entries 0-7 */
+	__P000, __P001, __P010, __P011,
+	__P100, __P101, __P110, __P111,
+	/* entries 8-15 */
+	__S000, __S001, __S010, __S011,
+	__S100, __S101, __S110, __S111,
+};
+
+/*
+ * Translate @vm_flags into page protection bits: the protection_map[]
+ * entry selected by the read/write/exec/shared flags, OR-ed with whatever
+ * arch_vm_get_page_prot() yields for the same flags.
+ */
+pgprot_t vm_get_page_prot(unsigned long vm_flags)
+{
+	const unsigned long idx =
+		vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED);
+	pgprot_t base = protection_map[idx];
+	pgprot_t extra = arch_vm_get_page_prot(vm_flags);
+
+	return __pgprot(pgprot_val(base) | pgprot_val(extra));
+}
+EXPORT_SYMBOL(vm_get_page_prot);
+
+int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
+int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; /* do_mmap_pgoff() fails with -ENOMEM once mm->map_count exceeds this */
+atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); /* compared against the computed limit in __vm_enough_memory() */
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ unsigned long free, allowed;
+
+ vm_acct_memory(pages); /* account the request up front; undone by vm_unacct_memory() on the error path */
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { /* heuristic policy: estimate what could be freed */
+ unsigned long n;
+
+ free = global_page_state(NR_FILE_PAGES);
+ free += nr_swap_pages;
+
+ /*
+ * Any slabs which are created with the
+ * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+ * which are reclaimable, under pressure. The dentry
+ * cache and most inode caches should fall into this
+ */
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
+
+ /*
+ * Leave the last 3% for root
+ */
+ if (!cap_sys_admin)
+ free -= free / 32; /* 1/32 is roughly 3% */
+
+ if (free > pages) /* pages (long) is converted to unsigned long for this comparison */
+ return 0;
+
+ /*
+ * nr_free_pages() is very expensive on large systems,
+ * only call if we're about to fail.
+ */
+ n = nr_free_pages();
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (n <= totalreserve_pages)
+ goto error;
+ else
+ n -= totalreserve_pages;
+
+ /*
+ * Leave the last 3% for root
+ */
+ if (!cap_sys_admin)
+ n -= n / 32;
+ free += n;
+
+ if (free > pages)
+ return 0;
+
+ goto error;
+ }
+
+ allowed = (totalram_pages - hugetlb_total_pages()) /* any other policy value: limit is ratio% of non-hugetlb RAM, plus swap (added below) */
+ * sysctl_overcommit_ratio / 100;
+ /*
+ * Leave the last 3% for root
+ */
+ if (!cap_sys_admin)
+ allowed -= allowed / 32;
+ allowed += total_swap_pages;
+
+ /* Don't let a single process grow too big:
+ leave 3% of the size of this process for other processes */
+ if (mm)
+ allowed -= mm->total_vm / 32;
+
+ /*
+ * cast `allowed' as a signed long because vm_committed_space
+ * sometimes has a negative value
+ */
+ if (atomic_long_read(&vm_committed_space) < (long)allowed)
+ return 0;
+error:
+ vm_unacct_memory(pages); /* refuse: take back what vm_acct_memory() accounted at entry */
+
+ return -ENOMEM;
+}
+
+/*
+ * Detach @vma from @mapping's bookkeeping: give back the write denial
+ * taken for VM_DENYWRITE, drop the writable-mapping count for VM_SHARED,
+ * and remove the vma from the nonlinear list or the i_mmap prio tree.
+ *
+ * Requires inode->i_mapping->i_mmap_lock.
+ */
+static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+		struct file *file, struct address_space *mapping)
+{
+	if (vma->vm_flags & VM_DENYWRITE)
+		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
+
+	if (vma->vm_flags & VM_SHARED)
+		mapping->i_mmap_writable--;
+
+	flush_dcache_mmap_lock(mapping);
+	if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+		vma_prio_tree_remove(vma, &mapping->i_mmap);
+	else
+		list_del_init(&vma->shared.vm_set.list);
+	flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Unlink a file-based vm structure from its prio_tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
+ * Does nothing for a vma without a backing file.
+ */
+void unlink_file_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping;
+
+	if (!file)
+		return;
+
+	mapping = file->f_mapping;
+
+	spin_lock(&mapping->i_mmap_lock);
+	__remove_shared_vm_struct(vma, file, mapping);
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ *
+ * Calls the vma's ->close() operation if there is one, drops the file
+ * reference (and the exe-file accounting for VM_EXECUTABLE vmas), puts
+ * the memory policy and frees the vma back to vm_area_cachep.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+	/* Remember the successor now: vma is freed before we return. */
+	struct vm_area_struct *successor = vma->vm_next;
+
+	might_sleep();
+
+	if (vma->vm_ops && vma->vm_ops->close)
+		vma->vm_ops->close(vma);
+
+	if (vma->vm_file) {
+		fput(vma->vm_file);
+		if (vma->vm_flags & VM_EXECUTABLE)
+			removed_exe_file_vma(vma->vm_mm);
+	}
+
+	mpol_put(vma_policy(vma));
+	kmem_cache_free(vm_area_cachep, vma);
+
+	return successor;
+}
+
+SYSCALL_DEFINE1(brk, unsigned long, brk)
+{ /* Move the program break to brk if permitted; every path returns the resulting mm->brk (updated on success, unchanged on failure). */
+ unsigned long rlim, retval;
+ unsigned long newbrk, oldbrk;
+ struct mm_struct *mm = current->mm;
+ unsigned long min_brk;
+
+ down_write(&mm->mmap_sem);
+
+#ifdef CONFIG_COMPAT_BRK
+ min_brk = mm->end_code;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk) /* request lies below the permitted floor: leave mm->brk alone */
+ goto out;
+
+ /*
+ * Check against rlimit here. If this check is done later after the test
+ * of oldbrk with newbrk then it can escape the test and let the data
+ * segment grow beyond its set limit in the case where the limit is
+ * not page aligned -Ram Gupta
+ */
+ rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
+ goto out;
+
+ newbrk = PAGE_ALIGN(brk);
+ oldbrk = PAGE_ALIGN(mm->brk);
+ if (oldbrk == newbrk) /* same page-aligned end: only mm->brk needs updating */
+ goto set_brk;
+
+ /* Always allow shrinking brk. */
+ if (brk <= mm->brk) {
+ if (!do_munmap(mm, newbrk, oldbrk-newbrk)) /* 0 from do_munmap() is taken as success */
+ goto set_brk;
+ goto out;
+ }
+
+ /* Check against existing mmap mappings. */
+ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) /* the range checked extends one page past newbrk */
+ goto out;
+
+ /* Ok, looks good - let it rip. */
+ if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) /* any return other than the requested start address is taken as failure */
+ goto out;
+set_brk:
+ mm->brk = brk; /* the unaligned value requested by the caller is what gets recorded */
+out:
+ retval = mm->brk;
+ up_write(&mm->mmap_sem);
+ return retval;
+}
+
+#ifdef DEBUG_MM_RB
+/*
+ * Walk the vma rbtree forwards, reporting vmas whose vm_start goes
+ * backwards, which start before the previous vma ended, or whose
+ * vm_end precedes vm_start; then walk backwards from the last node and
+ * compare the two counts.  Returns the forward count, adjusted as the
+ * checks dictate (reset when ordering or the backward count disagree).
+ */
+static int browse_rb(struct rb_root *root)
+{
+	int fwd = 0, back = 0;
+	struct rb_node *node, *last = NULL;
+	unsigned long prev_start = 0, prev_end = 0;
+
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		struct vm_area_struct *vma =
+			rb_entry(node, struct vm_area_struct, vm_rb);
+
+		if (vma->vm_start < prev_start) {
+			printk("vm_start %lx prev %lx\n",
+				vma->vm_start, prev_start);
+			fwd = -1;
+		}
+		if (vma->vm_start < prev_end)
+			printk("vm_start %lx pend %lx\n",
+				vma->vm_start, prev_end);
+		if (vma->vm_start > vma->vm_end)
+			printk("vm_end %lx < vm_start %lx\n",
+				vma->vm_end, vma->vm_start);
+
+		fwd++;
+		last = node;
+		prev_start = vma->vm_start;
+		prev_end = vma->vm_end;
+	}
+
+	for (node = last; node; node = rb_prev(node))
+		back++;
+
+	if (fwd != back) {
+		printk("backwards %d, forwards %d\n", back, fwd);
+		fwd = 0;
+	}
+	return fwd;
+}
+
+/*
+ * Cross-check mm->map_count against both the vm_next list length and
+ * the count reported by browse_rb(); BUG if either disagrees.
+ */
+void validate_mm(struct mm_struct *mm)
+{
+	int bug = 0;
+	int count = 0;
+	struct vm_area_struct *vma;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		count++;
+	if (count != mm->map_count) {
+		printk("map_count %d vm_next %d\n", mm->map_count, count);
+		bug = 1;
+	}
+
+	count = browse_rb(&mm->mm_rb);
+	if (count != mm->map_count) {
+		printk("map_count %d rb %d\n", mm->map_count, count);
+		bug = 1;
+	}
+
+	BUG_ON(bug);
+}
+#else
+#define validate_mm(mm) do { } while (0)
+#endif
+
+/*
+ * Descend the mm's vma rbtree looking for @addr.
+ *
+ * Returns the last vma seen on the descent whose vm_end lies above @addr
+ * (NULL if none); the descent stops early when a vma containing @addr is
+ * met.  On return:
+ *   *pprev     - vma of the last node passed whose vm_end <= @addr, or NULL
+ *   *rb_link   - the link slot where the descent stopped
+ *   *rb_parent - the last node visited (NULL for an empty tree)
+ */
+static struct vm_area_struct *
+find_vma_prepare(struct mm_struct *mm, unsigned long addr,
+		struct vm_area_struct **pprev, struct rb_node ***rb_link,
+		struct rb_node **rb_parent)
+{
+	struct rb_node **link = &mm->mm_rb.rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node *below = NULL;
+	struct vm_area_struct *found = NULL;
+
+	while (*link) {
+		struct vm_area_struct *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct vm_area_struct, vm_rb);
+
+		if (cur->vm_end <= addr) {
+			/* Entirely below addr: remember it and go right. */
+			below = parent;
+			link = &parent->rb_right;
+			continue;
+		}
+
+		found = cur;
+		if (cur->vm_start <= addr)
+			break;
+		link = &parent->rb_left;
+	}
+
+	*pprev = below ? rb_entry(below, struct vm_area_struct, vm_rb) : NULL;
+	*rb_link = link;
+	*rb_parent = parent;
+	return found;
+}
+
+/*
+ * Insert @vma into the mm's singly linked vm_next list: right after
+ * @prev when there is one, otherwise at the head, in which case the
+ * successor is the vma embedding @rb_parent (or NULL without a parent).
+ */
+static inline void
+__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev, struct rb_node *rb_parent)
+{
+	if (prev) {
+		vma->vm_next = prev->vm_next;
+		prev->vm_next = vma;
+		return;
+	}
+
+	mm->mmap = vma;
+	vma->vm_next = rb_parent ?
+		rb_entry(rb_parent, struct vm_area_struct, vm_rb) : NULL;
+}
+
+/*
+ * Hook @vma's rb node into the mm's rbtree at the slot (@rb_link under
+ * @rb_parent) and let rb_insert_color() fix up the tree.
+ */
+void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct rb_node **rb_link, struct rb_node *rb_parent)
+{
+	struct rb_node *node = &vma->vm_rb;
+
+	rb_link_node(node, rb_parent, rb_link);
+	rb_insert_color(node, &mm->mm_rb);
+}
+
+/*
+ * Register a file-backed @vma with its address_space: take the write
+ * denial for VM_DENYWRITE, bump the writable-mapping count for VM_SHARED,
+ * and insert the vma into the nonlinear list or the i_mmap prio tree.
+ * A vma without a file is left untouched.
+ */
+static void __vma_link_file(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping;
+
+	if (!file)
+		return;
+
+	mapping = file->f_mapping;
+
+	if (vma->vm_flags & VM_DENYWRITE)
+		atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
+	if (vma->vm_flags & VM_SHARED)
+		mapping->i_mmap_writable++;
+
+	flush_dcache_mmap_lock(mapping);
+	if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+		vma_prio_tree_insert(vma, &mapping->i_mmap);
+	else
+		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+	flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Link @vma into the mm's vma list, its rbtree and its anon_vma, in that
+ * order.  File (prio tree) linkage is not done here.
+ */
+static void
+__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev, struct rb_node **rb_link,
+		struct rb_node *rb_parent)
+{
+	/* vm_next list */
+	__vma_link_list(mm, vma, prev, rb_parent);
+	/* rbtree */
+	__vma_link_rb(mm, vma, rb_link, rb_parent);
+	/* anon_vma */
+	__anon_vma_link(vma);
+}
+
+/*
+ * Fully link a new @vma into @mm: list, rbtree, anon_vma and, for a
+ * file-backed vma, the file's address_space.  The mapping's i_mmap_lock
+ * (when there is a mapping) and the anon_vma lock are held across the
+ * linking; map_count is bumped afterwards.
+ */
+static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev, struct rb_node **rb_link,
+		struct rb_node *rb_parent)
+{
+	struct address_space *mapping =
+		vma->vm_file ? vma->vm_file->f_mapping : NULL;
+
+	if (mapping) {
+		spin_lock(&mapping->i_mmap_lock);
+		vma->vm_truncate_count = mapping->truncate_count;
+	}
+	anon_vma_lock(vma);
+
+	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	__vma_link_file(vma);
+
+	anon_vma_unlock(vma);
+	if (mapping)
+		spin_unlock(&mapping->i_mmap_lock);
+
+	mm->map_count++;
+	validate_mm(mm);
+}
+
+/*
+ * Helper for vma_adjust in the split_vma insert case:
+ * insert vm structure into list and rbtree and anon_vma,
+ * but it has already been inserted into prio_tree earlier.
+ *
+ * It is a BUG if the vma found at or after vma->vm_start begins
+ * before the new vma ends.
+ */
+static void
+__insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *prev;
+	struct vm_area_struct *following;
+	struct rb_node **rb_link;
+	struct rb_node *rb_parent;
+
+	following = find_vma_prepare(mm, vma->vm_start, &prev,
+					&rb_link, &rb_parent);
+	BUG_ON(following && following->vm_start < vma->vm_end);
+
+	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	mm->map_count++;
+}
+
+/*
+ * Take @vma out of the mm's vm_next list (its predecessor is @prev) and
+ * out of the rbtree.  If the mm's mmap_cache pointed at @vma, repoint it
+ * at @prev.
+ */
+static inline void
+__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev)
+{
+	prev->vm_next = vma->vm_next;
+	rb_erase(&vma->vm_rb, &mm->mm_rb);
+
+	if (mm->mmap_cache != vma)
+		return;
+	mm->mmap_cache = prev;
+}
+
+/*
+ * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
+ * is already present in an i_mmap tree without adjusting the tree.
+ * The following helper function should be used when such adjustments
+ * are necessary. The "insert" vma (if any) is to be inserted
+ * before we drop the necessary locks.
+ */
+void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *importer = NULL; /* the vma that grows and so may take over pages from its neighbour */
+ struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL; /* only set for file-backed vmas without VM_NONLINEAR */
+ struct file *file = vma->vm_file;
+ struct anon_vma *anon_vma = NULL;
+ long adjust_next = 0; /* pages by which next->vm_start and vm_pgoff move (negative: downwards) */
+ int remove_next = 0; /* 0: keep next; 1: remove next; 2: remove next and the vma after it */
+
+ if (next && !insert) {
+ if (end >= next->vm_end) {
+ /*
+ * vma expands, overlapping all the next, and
+ * perhaps the one after too (mprotect case 6).
+ */
+again: remove_next = 1 + (end > next->vm_end); /* 2 when end reaches beyond next as well */
+ end = next->vm_end;
+ anon_vma = next->anon_vma;
+ importer = vma;
+ } else if (end > next->vm_start) {
+ /*
+ * vma expands, overlapping part of the next:
+ * mprotect case 5 shifting the boundary up.
+ */
+ adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+ anon_vma = next->anon_vma;
+ importer = vma;
+ } else if (end < vma->vm_end) {
+ /*
+ * vma shrinks, and !insert tells it's not
+ * split_vma inserting another: so it must be
+ * mprotect case 4 shifting the boundary down.
+ */
+ adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
+ anon_vma = next->anon_vma;
+ importer = next;
+ }
+ }
+
+ if (file) {
+ mapping = file->f_mapping;
+ if (!(vma->vm_flags & VM_NONLINEAR))
+ root = &mapping->i_mmap;
+ spin_lock(&mapping->i_mmap_lock);
+ if (importer &&
+ vma->vm_truncate_count != next->vm_truncate_count) {
+ /*
+ * unmap_mapping_range might be in progress:
+ * ensure that the expanding vma is rescanned.
+ */
+ importer->vm_truncate_count = 0;
+ }
+ if (insert) {
+ insert->vm_truncate_count = vma->vm_truncate_count;
+ /*
+ * Put into prio_tree now, so instantiated pages
+ * are visible to arm/parisc __flush_dcache_page
+ * throughout; but we cannot insert into address
+ * space until vma start or end is updated.
+ */
+ __vma_link_file(insert);
+ }
+ }
+
+ /*
+ * When changing only vma->vm_end, we don't really need
+ * anon_vma lock: but is that case worth optimizing out?
+ */
+ if (vma->anon_vma)
+ anon_vma = vma->anon_vma; /* vma's own anon_vma takes precedence over the one picked up from next */
+ if (anon_vma) {
+ spin_lock(&anon_vma->lock);
+ /*
+ * Easily overlooked: when mprotect shifts the boundary,
+ * make sure the expanding vma has anon_vma set if the
+ * shrinking vma had, to cover any anon pages imported.
+ */
+ if (importer && !importer->anon_vma) {
+ importer->anon_vma = anon_vma;
+ __anon_vma_link(importer);
+ }
+ }
+
+ if (root) { /* take the vma(s) out of the prio tree while their ranges change */
+ flush_dcache_mmap_lock(mapping);
+ vma_prio_tree_remove(vma, root);
+ if (adjust_next)
+ vma_prio_tree_remove(next, root);
+ }
+
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ if (adjust_next) {
+ next->vm_start += adjust_next << PAGE_SHIFT;
+ next->vm_pgoff += adjust_next;
+ }
+
+ if (root) { /* re-insert with the updated ranges */
+ if (adjust_next)
+ vma_prio_tree_insert(next, root);
+ vma_prio_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
+ }
+
+ if (remove_next) {
+ /*
+ * vma_merge has merged next into vma, and needs
+ * us to remove next before dropping the locks.
+ */
+ __vma_unlink(mm, next, vma);
+ if (file)
+ __remove_shared_vm_struct(next, file, mapping);
+ if (next->anon_vma)
+ __anon_vma_merge(vma, next);
+ } else if (insert) {
+ /*
+ * split_vma has split insert from vma, and needs
+ * us to insert it before dropping the locks
+ * (it may either follow vma or precede it).
+ */
+ __insert_vm_struct(mm, insert);
+ }
+
+ if (anon_vma)
+ spin_unlock(&anon_vma->lock);
+ if (mapping)
+ spin_unlock(&mapping->i_mmap_lock);
+
+ if (remove_next) { /* with the locks dropped, release what the removed vma held and free it */
+ if (file) {
+ fput(file);
+ if (next->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ kmem_cache_free(vm_area_cachep, next);
+ /*
+ * In mprotect's case 6 (see comments on vma_merge),
+ * we must remove another next too. It would clutter
+ * up the code too much to do both in one go.
+ */
+ if (remove_next == 2) {
+ next = vma->vm_next; /* the old next was unlinked and freed above: pick up the new successor */
+ goto again;
+ }
+ }
+
+ validate_mm(mm);
+}
+
+/*
+ * A vma can only be merged with a request that has identical vm_flags
+ * and the same backing file.
+ *
+ * If the vma has a ->close operation then the driver probably needs to
+ * release per-vma resources, so we don't attempt to merge those.
+ *
+ * Returns 1 if mergeable, 0 otherwise.
+ */
+static inline int is_mergeable_vma(struct vm_area_struct *vma,
+		struct file *file, unsigned long vm_flags)
+{
+	return vma->vm_flags == vm_flags &&
+		vma->vm_file == file &&
+		!(vma->vm_ops && vma->vm_ops->close);
+}
+
+/*
+ * Two anon_vmas are compatible for merging when at least one of them is
+ * unassigned (NULL) or when both are the same anon_vma.
+ */
+static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+		struct anon_vma *anon_vma2)
+{
+	if (!anon_vma1 || !anon_vma2)
+		return 1;
+
+	return anon_vma1 == anon_vma2;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * in front of (at a lower virtual address and file offset than) the vma.
+ * For that, @vm_pgoff must equal the vma's own vm_pgoff.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We don't check here for the merged mmap wrapping around the end of pagecache
+ * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * wrap, nor mmaps which cover the final page at index -1UL.
+ */
+static int
+can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
+		struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+{
+	if (!is_mergeable_vma(vma, file, vm_flags))
+		return 0;
+	if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma))
+		return 0;
+
+	return vma->vm_pgoff == vm_pgoff;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * beyond (at a higher virtual address and file offset than) the vma.
+ * For that, @vm_pgoff must equal the vma's vm_pgoff plus its length
+ * in pages.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ */
+static int
+can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
+		struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+{
+	pgoff_t span_pages;
+
+	if (!is_mergeable_vma(vma, file, vm_flags))
+		return 0;
+	if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma))
+		return 0;
+
+	span_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	return vma->vm_pgoff + span_pages == vm_pgoff;
+}
+
+/*
+ * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
+ * whether that can be merged with its predecessor or its successor.
+ * Or both (it neatly fills a hole).
+ *
+ * In most cases - when called for mmap, brk or mremap - [addr,end) is
+ * certain not to be mapped by the time vma_merge is called; but when
+ * called for mprotect, it is certain to be already mapped (either at
+ * an offset within prev, or at the start of next), and the flags of
+ * this area are about to be changed to vm_flags - and the no-change
+ * case has already been eliminated.
+ *
+ * The following mprotect cases have to be considered, where AAAA is
+ * the area passed down from mprotect_fixup, never extending beyond one
+ * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ *
+ *     AAAA             AAAA                AAAA          AAAA
+ *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
+ *    cannot merge    might become    might become    might become
+ *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
+ *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
+ *    mremap move:                                    PPPPNNNNNNNN 8
+ *        AAAA
+ *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
+ *    might become    case 1 below    case 2 below    case 3 below
+ *
+ * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
+ * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ */
+struct vm_area_struct *vma_merge(struct mm_struct *mm,
+ struct vm_area_struct *prev, unsigned long addr,
+ unsigned long end, unsigned long vm_flags,
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t pgoff, struct mempolicy *policy)
+{ /* Returns the vma that absorbed [addr,end), or NULL when no merge was possible. */
+ pgoff_t pglen = (end - addr) >> PAGE_SHIFT; /* length of [addr,end) in pages */
+ struct vm_area_struct *area, *next;
+
+ /*
+ * We later require that vma->vm_flags == vm_flags,
+ * so this tests vma->vm_flags & VM_SPECIAL, too.
+ */
+ if (vm_flags & VM_SPECIAL)
+ return NULL;
+
+ if (prev)
+ next = prev->vm_next;
+ else
+ next = mm->mmap;
+ area = next; /* the vma right after prev (or the first vma): the one adjusted and returned in cases 3 and 8 */
+ if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ next = next->vm_next;
+
+ /*
+ * Can it merge with the predecessor?
+ */
+ if (prev && prev->vm_end == addr &&
+ mpol_equal(vma_policy(prev), policy) &&
+ can_vma_merge_after(prev, vm_flags,
+ anon_vma, file, pgoff)) {
+ /*
+ * OK, it can. Can we now merge in the successor as well?
+ */
+ if (next && end == next->vm_start &&
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags,
+ anon_vma, file, pgoff+pglen) &&
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma)) {
+ /* cases 1, 6 */
+ vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL);
+ } else /* cases 2, 5, 7 */
+ vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL);
+ return prev; /* prev now covers the requested range */
+ }
+
+ /*
+ * Can this new request be merged in front of next?
+ */
+ if (next && end == next->vm_start &&
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags,
+ anon_vma, file, pgoff+pglen)) {
+ if (prev && addr < prev->vm_end) /* case 4 */
+ vma_adjust(prev, prev->vm_start,
+ addr, prev->vm_pgoff, NULL);
+ else /* cases 3, 8 */
+ vma_adjust(area, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL);
+ return area;
+ }
+
+ return NULL; /* neither neighbour is compatible */
+}
+
+/*
+ * find_mergeable_anon_vma is used by anon_vma_prepare, to check
+ * neighbouring vmas for a suitable anon_vma, before it goes off
+ * to allocate a new anon_vma. It checks because a repetitive
+ * sequence of mprotects and faults may otherwise lead to distinct
+ * anon_vmas being allocated, preventing vma merge in subsequent
+ * mprotect.
+ */
+struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
+{ /* Returns a touching neighbour's anon_vma that vma could share, or NULL if neither neighbour qualifies. */
+ struct vm_area_struct *near;
+ unsigned long vm_flags;
+
+ near = vma->vm_next; /* first candidate: the following vma */
+ if (!near)
+ goto try_prev;
+
+ /*
+ * Since only mprotect tries to remerge vmas, match flags
+ * which might be mprotected into each other later on.
+ * Neither mlock nor madvise tries to remerge at present,
+ * so leave their flags as obstructing a merge.
+ */
+ vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
+ vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); /* borrow near's access bits so only the other flags have to match */
+
+ if (near->anon_vma && vma->vm_end == near->vm_start &&
+ mpol_equal(vma_policy(vma), vma_policy(near)) &&
+ can_vma_merge_before(near, vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff +
+ ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
+ return near->anon_vma;
+try_prev:
+ /*
+ * It is potentially slow to have to call find_vma_prev here.
+ * But it's only on the first write fault on the vma, not
+ * every time, and we could devise a way to avoid it later
+ * (e.g. stash info in next's anon_vma_node when assigning
+ * an anon_vma, or when trying vma_merge). Another time.
+ */
+ BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); /* the lookup must find vma itself; near is then used as vma's predecessor (NULL if none) */
+ if (!near)
+ goto none;
+
+ vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
+ vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
+
+ if (near->anon_vma && near->vm_end == vma->vm_start &&
+ mpol_equal(vma_policy(near), vma_policy(vma)) &&
+ can_vma_merge_after(near, vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff))
+ return near->anon_vma;
+none:
+ /*
+ * There's no absolute need to look only at touching neighbours:
+ * we could search further afield for "compatible" anon_vmas.
+ * But it would probably just be a waste of time searching,
+ * or lead to too many vmas hanging off the same anon_vma.
+ * We're trying to allow mprotect remerging later on,
+ * not trying to minimize memory used for anon_vmas.
+ */
+ return NULL;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Adjust the per-mm statistics by @pages (which may be negative) for a
+ * mapping with the given @flags and backing @file:
+ *  - file-backed:  shared_vm, plus exec_vm when VM_EXEC is set and
+ *                  VM_WRITE is clear;
+ *  - anonymous:    stack_vm when a stack growth flag is set;
+ *  - in addition:  reserved_vm when VM_RESERVED or VM_IO is set.
+ */
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
+		struct file *file, long pages)
+{
+	const unsigned long stack_flags
+		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
+
+	if (!file) {
+		if (flags & stack_flags)
+			mm->stack_vm += pages;
+	} else {
+		mm->shared_vm += pages;
+		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
+			mm->exec_vm += pages;
+	}
+
+	if (flags & (VM_RESERVED|VM_IO))
+		mm->reserved_vm += pages;
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * The caller must hold down_write(current->mm->mmap_sem).
+ */
+
+unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff)
+{ /* Validates the request and derives vm_flags, then hands the actual mapping work to mmap_region(); returns its result or a negative errno. */
+ struct mm_struct * mm = current->mm;
+ struct inode *inode;
+ unsigned int vm_flags;
+ int error;
+ int accountable = 1; /* cleared below for hugetlb files; passed on to mmap_region() */
+ unsigned long reqprot = prot; /* the caller's prot before READ_IMPLIES_EXEC may add PROT_EXEC; handed to security_file_mmap() */
+
+ /*
+ * Does the application expect PROT_READ to imply PROT_EXEC?
+ *
+ * (the exception is when the underlying filesystem is noexec
+ * mounted, in which case we don't add PROT_EXEC.)
+ */
+ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
+ if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
+ prot |= PROT_EXEC;
+
+ if (!len)
+ return -EINVAL;
+
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
+ error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
+ /* Careful about overflows.. */
+ len = PAGE_ALIGN(len);
+ if (!len || len > TASK_SIZE) /* a zero len here means PAGE_ALIGN() wrapped around */
+ return -ENOMEM;
+
+ /* offset overflow? */
+ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+ return -EOVERFLOW;
+
+ /* Too many mappings? */
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ /* Obtain the address to map to. we verify (or select) it and ensure
+ * that it represents a valid section of the address space.
+ */
+ addr = get_unmapped_area(file, addr, len, pgoff, flags);
+ if (addr & ~PAGE_MASK) /* a value that is not page aligned is treated as an error code and returned as is */
+ return addr;
+
+ /* Do simple checking here so the lower-level routines won't have
+ * to. we assume access permissions have been handled by the open
+ * of the memory object, so we don't do any here.
+ */
+ vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (flags & MAP_LOCKED) {
+ if (!can_do_mlock())
+ return -EPERM;
+ vm_flags |= VM_LOCKED;
+ }
+
+ /* mlock MCL_FUTURE? */
+ if (vm_flags & VM_LOCKED) { /* VM_LOCKED may come from MAP_LOCKED above or from mm->def_flags */
+ unsigned long locked, lock_limit;
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT; /* compare in pages, like locked */
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+
+ inode = file ? file->f_path.dentry->d_inode : NULL;
+
+ if (file) {
+ switch (flags & MAP_TYPE) {
+ case MAP_SHARED:
+ if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
+ return -EACCES;
+
+ /*
+ * Make sure we don't allow writing to an append-only
+ * file..
+ */
+ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ /*
+ * Make sure there are no mandatory locks on the file.
+ */
+ if (locks_verify_locked(inode))
+ return -EAGAIN;
+
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ if (!(file->f_mode & FMODE_WRITE)) /* file not opened for writing: drop VM_SHARED and VM_MAYWRITE again, keeping VM_MAYSHARE */
+ vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+
+ /* fall through */
+ case MAP_PRIVATE:
+ if (!(file->f_mode & FMODE_READ))
+ return -EACCES;
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (vm_flags & VM_EXEC)
+ return -EPERM;
+ vm_flags &= ~VM_MAYEXEC;
+ }
+ if (is_file_hugepages(file))
+ accountable = 0;
+
+ if (!file->f_op || !file->f_op->mmap)
+ return -ENODEV;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ } else {
+ switch (flags & MAP_TYPE) {
+ case MAP_SHARED:
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ break;
+ case MAP_PRIVATE:
+ /*
+ * Set pgoff according to addr for anon_vma.
+ */
+ pgoff = addr >> PAGE_SHIFT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+ if (error)
+ return error;
+
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+ accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ *
+ * Returns 1 if @vma wants write notification, 0 otherwise.
+ */
+int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+	const unsigned int flags = vma->vm_flags;
+	const unsigned int shared_writable = VM_WRITE|VM_SHARED;
+	struct address_space *mapping;
+
+	/* Private or non-writable: the write bit is already clear. */
+	if ((flags & shared_writable) != shared_writable)
+		return 0;
+
+	/* The backer asked to be told when pages are first written to. */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		return 1;
+
+	/* The protections no longer match the default for these flags. */
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(vm_get_page_prot(flags)))
+		return 0;
+
+	/* Specialty mappings are left alone. */
+	if (flags & (VM_PFNMAP|VM_INSERTPAGE))
+		return 0;
+
+	/* Otherwise it depends on whether the mapping accounts dirty pages. */
+	if (!vma->vm_file)
+		return 0;
+	mapping = vma->vm_file->f_mapping;
+	return mapping && mapping_cap_account_dirty(mapping);
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ unsigned int vm_flags, unsigned long pgoff,
+ int accountable)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct vm_area_struct *merged_vma;
+ int correct_wcount = 0;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+ struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
+ /* Clear old maps */
+ error = -ENOMEM;
+munmap_back:
+ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (vma && vma->vm_start < addr + len) {
+ if (do_munmap(mm, addr, len))
+ return -ENOMEM;
+ goto munmap_back;
+ }
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ if (flags & MAP_NORESERVE)
+ vm_flags |= VM_NORESERVE;
+
+ if (accountable && (!(flags & MAP_NORESERVE) ||
+ sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+ if (vm_flags & VM_SHARED) {
+ /* Check memory availability in shmem_file_setup? */
+ vm_flags |= VM_ACCOUNT;
+ } else if (vm_flags & VM_WRITE) {
+ /*
+ * Private writable mapping: check memory availability
+ */
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+ }
+
+ /*
+ * Can we just expand an old private anonymous mapping?
+ * The VM_SHARED test is necessary because shmem_zero_setup
+ * will create the file object for a shared anonymous map below.
+ */
+ if (!file && !(vm_flags & VM_SHARED)) {
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, NULL, pgoff, NULL);
+ if (vma)
+ goto out;
+ }
+
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma->vm_mm = mm;
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ error = -EINVAL;
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ goto free_vma;
+ if (vm_flags & VM_DENYWRITE) {
+ error = deny_write_access(file);
+ if (error)
+ goto free_vma;
+ correct_wcount = 1;
+ }
+ vma->vm_file = file;
+ get_file(file);
+ error = file->f_op->mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+ if (vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ }
+
+ /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
+ * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
+ * that memory reservation must be checked; but that reservation
+ * belongs to shared memory object, not to vma: so now clear it.
+ */
+ if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
+ vma->vm_flags &= ~VM_ACCOUNT;
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ */
+ addr = vma->vm_start;
+ pgoff = vma->vm_pgoff;
+ vm_flags = vma->vm_flags;
+
+ if (vma_wants_writenotify(vma))
+ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+
+ merged_vma = NULL;
+ if (file)
+ merged_vma = vma_merge(mm, prev, addr, vma->vm_end,
+ vma->vm_flags, NULL, file, pgoff, vma_policy(vma));
+ if (merged_vma) {
+ mpol_put(vma_policy(vma));
+ kmem_cache_free(vm_area_cachep, vma);
+ fput(file);
+ if (vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ vma = merged_vma;
+ } else {
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ file = vma->vm_file;
+ }
+
+ /* Once vma denies write, undo our temporary denial count */
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+out:
+ mm->total_vm += len >> PAGE_SHIFT;
+ vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ /*
+ * makes pages present; downgrades, drops, reacquires mmap_sem
+ */
+ long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
+ if (nr_pages < 0)
+ return nr_pages; /* vma gone! */
+ mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+ } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+ make_pages_present(addr, addr + len);
+ return addr;
+
+unmap_and_free_vma:
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+ vma->vm_file = NULL;
+ fput(file);
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+ charged = 0;
+free_vma:
+ kmem_cache_free(vm_area_cachep, vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ return error;
+}
+
+/* Get an address range which is currently unmapped.
+ * For shmat() with addr=0.
+ *
+ * This is the generic bottom-up search, built only when the architecture
+ * does not define HAVE_ARCH_UNMAPPED_AREA:
+ *  - a request longer than TASK_SIZE fails with -ENOMEM;
+ *  - a MAP_FIXED request returns addr unchanged;
+ *  - a non-zero addr is page-aligned and returned if the range fits below
+ *    TASK_SIZE and does not run into the vma returned by find_vma();
+ *  - otherwise the vma list is walked upwards until a gap of at least len
+ *    bytes is found, and mm->free_area_cache is left pointing just past
+ *    the range handed out.
+ *
+ * Ugly calling convention alert:
+ * Return value with the low bits set means error value,
+ * ie
+ * if (ret & ~PAGE_MASK)
+ * error = ret;
+ *
+ * This function "knows" that -ENOMEM has the bits set.
+ */
+#ifndef HAVE_ARCH_UNMAPPED_AREA
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long start_addr;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+ if (len > mm->cached_hole_size) { /* no skipped hole on record is big enough: resume at the hint */
+ start_addr = addr = mm->free_area_cache;
+ } else { /* a skipped hole may fit: rescan from the base */
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
+
+full_search:
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+ if (TASK_SIZE - len < addr) {
+ /*
+ * Start a new search - just in case we missed
+ * some holes.
+ */
+ if (start_addr != TASK_UNMAPPED_BASE) {
+ addr = TASK_UNMAPPED_BASE;
+ start_addr = addr;
+ mm->cached_hole_size = 0;
+ goto full_search;
+ }
+ return -ENOMEM; /* this pass already started at the base: give up */
+ }
+ if (!vma || addr + len <= vma->vm_start) {
+ /*
+ * Remember the place where we stopped the search:
+ */
+ mm->free_area_cache = addr + len;
+ return addr;
+ }
+ if (addr + mm->cached_hole_size < vma->vm_start) /* gap too small for len, but the largest skipped so far */
+ mm->cached_hole_size = vma->vm_start - addr;
+ addr = vma->vm_end; /* continue just above this vma */
+ }
+}
+#endif
+
+/*
+ * Unmap hook paired with the bottom-up allocator: when a hole opens at
+ * addr, pull the search hint down to it if that hole is the lowest one
+ * the allocator could use.
+ */
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
+{
+	/* Holes below the search base are never handed out. */
+	if (addr < TASK_UNMAPPED_BASE)
+		return;
+
+	/* Only a hole below the current hint moves the hint. */
+	if (addr >= mm->free_area_cache)
+		return;
+
+	mm->free_area_cache = addr;
+	/* Hole size is unknown here; make the next search restart from the base. */
+	mm->cached_hole_size = ~0UL;
+}
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base):
+ *
+ * A request longer than TASK_SIZE fails with -ENOMEM and a MAP_FIXED
+ * request returns addr0 unchanged.  A non-zero hint is page-aligned and
+ * used if the range fits below TASK_SIZE and below the vma returned by
+ * find_vma().  Otherwise the range directly below mm->free_area_cache is
+ * tried first, then the vmas are probed downwards starting at
+ * mm->mmap_base - len.  If nothing fits, the request is handed to
+ * arch_get_unmapped_area() and the top-down hint is reset afterwards.
+ */
+#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = addr0;
+
+ /* requested length too big for entire address space */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ /* requesting a specific address */
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+ /*
+ * check if free_area_cache is useful for us: if a hole of at least
+ * len bytes was skipped earlier, drop the hint and rescan from the base
+ */
+ if (len <= mm->cached_hole_size) {
+ mm->cached_hole_size = 0;
+ mm->free_area_cache = mm->mmap_base;
+ }
+
+ /* either no address requested or can't fit in requested address hole */
+ addr = mm->free_area_cache;
+
+ /* make sure it can fit in the remaining address space */
+ if (addr > len) {
+ vma = find_vma(mm, addr-len);
+ if (!vma || addr <= vma->vm_start)
+ /* remember the address as a hint for next time */
+ return (mm->free_area_cache = addr-len);
+ }
+
+ if (mm->mmap_base < len) /* no room for len bytes below the base at all */
+ goto bottomup;
+
+ addr = mm->mmap_base-len;
+
+ do {
+ /*
+ * Lookup failure means no vma is above this address,
+ * else if new region fits below vma->vm_start,
+ * return with success:
+ */
+ vma = find_vma(mm, addr);
+ if (!vma || addr+len <= vma->vm_start)
+ /* remember the address as a hint for next time */
+ return (mm->free_area_cache = addr);
+
+ /* remember the largest hole we saw so far */
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
+
+ /* try just below the current vma->vm_start */
+ addr = vma->vm_start-len;
+ } while (len < vma->vm_start); /* stop once len bytes no longer fit below this vma */
+
+bottomup:
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ mm->cached_hole_size = ~0UL;
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+ /*
+ * Restore the topdown base:
+ */
+ mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = ~0UL;
+
+ return addr;
+}
+#endif
+
+/*
+ * Unmap hook paired with the top-down allocator: raise the search hint to
+ * addr when the freed area lies above it, but never past mm->mmap_base.
+ */
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long hint = mm->free_area_cache;
+
+	/* A hole above the current hint becomes the new starting point. */
+	if (addr > hint)
+		hint = addr;
+
+	/* Don't allow allocations above the current base. */
+	if (hint > mm->mmap_base)
+		hint = mm->mmap_base;
+
+	mm->free_area_cache = hint;
+}
+
+/*
+ * Choose the address for a new mapping of len bytes.
+ *
+ * The search is delegated to the file's ->get_unmapped_area() hook when
+ * the file provides one, otherwise to the mm's own hook.  The address that
+ * comes back is then sanity-checked here before it is passed through
+ * arch_rebalance_pgtables().
+ *
+ * Returns the chosen address, or a negative errno cast to unsigned long:
+ * whatever the hook reported, -ENOMEM if the range does not fit below
+ * TASK_SIZE, -EINVAL if the address is not page aligned.
+ */
+unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	unsigned long (*get_area)(struct file *, unsigned long,
+				  unsigned long, unsigned long, unsigned long);
+
+	/*
+	 * Careful about overflows: "TASK_SIZE - len" below wraps around for
+	 * len > TASK_SIZE, which would let any address pass the range check
+	 * if the hook did not validate len itself.
+	 */
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	get_area = current->mm->get_unmapped_area;
+	if (file && file->f_op && file->f_op->get_unmapped_area)
+		get_area = file->f_op->get_unmapped_area;
+	addr = get_area(file, addr, len, pgoff, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
+	if (addr > TASK_SIZE - len)
+		return -ENOMEM;
+	if (addr & ~PAGE_MASK)
+		return -EINVAL;
+
+	return arch_rebalance_pgtables(addr, len);
+}
+
+EXPORT_SYMBOL(get_unmapped_area);
+
+/*
+ * Return the lowest VMA whose vm_end lies above addr (it may or may not
+ * contain addr), or NULL if there is none or mm is NULL.  A successful
+ * tree lookup is remembered in mm->mmap_cache.
+ */
+struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+{
+	struct vm_area_struct *found;
+	struct rb_node *node;
+
+	if (!mm)
+		return NULL;
+
+	/* Fast path: the cached vma contains addr. */
+	/* (Cache hit rate is typically around 35%.) */
+	found = mm->mmap_cache;
+	if (found && found->vm_end > addr && found->vm_start <= addr)
+		return found;
+
+	/* Slow path: descend the rb-tree, keeping the best candidate so far. */
+	found = NULL;
+	for (node = mm->mm_rb.rb_node; node; ) {
+		struct vm_area_struct *cur;
+
+		cur = rb_entry(node, struct vm_area_struct, vm_rb);
+
+		if (cur->vm_end <= addr) {
+			/* Entirely below addr: the answer is to the right. */
+			node = node->rb_right;
+			continue;
+		}
+
+		found = cur;
+		if (cur->vm_start <= addr)
+			break;		/* addr is inside this vma */
+		node = node->rb_left;	/* a lower candidate may exist */
+	}
+
+	if (found)
+		mm->mmap_cache = found;
+	return found;
+}
+
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * Like find_vma(), but additionally stores in *pprev the VMA that precedes
+ * the result (NULL if the result is the first VMA, or if mm is NULL).
+ */
+struct vm_area_struct *
+find_vma_prev(struct mm_struct *mm, unsigned long addr,
+			struct vm_area_struct **pprev)
+{
+	struct vm_area_struct *first = NULL, *below = NULL;
+	struct rb_node *node;
+
+	if (mm) {
+		/* Fallback result when addr is lower than every VMA's end. */
+		first = mm->mmap;
+
+		/* Walk the RB tree looking for the last VMA ending at or below addr. */
+		node = mm->mm_rb.rb_node;
+		while (node) {
+			struct vm_area_struct *cur;
+
+			cur = rb_entry(node, struct vm_area_struct, vm_rb);
+			if (addr < cur->vm_end) {
+				node = node->rb_left;
+				continue;
+			}
+
+			below = cur;
+			/* Done once the successor (if any) ends above addr. */
+			if (!cur->vm_next || addr < cur->vm_next->vm_end)
+				break;
+			node = node->rb_right;
+		}
+	}
+
+	*pprev = below;
+	if (below)
+		return below->vm_next;
+	return first;
+}
+
+/*
+ * Common admission control and accounting for stack growth in either
+ * direction.  size is the vma's size after growing (bytes), grow the
+ * number of pages being added.
+ *
+ * Returns 0 after updating the mm counters, -ENOMEM if a limit or the
+ * overcommit check refuses the growth, -EFAULT if the grown stack would
+ * enter a hugepage-only range.
+ */
+static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct rlimit *limits = current->signal->rlim;
+	unsigned long new_start;
+
+	/* Address space limit. */
+	if (!may_expand_vm(mm, grow))
+		return -ENOMEM;
+
+	/* Stack size limit. */
+	if (size > limits[RLIMIT_STACK].rlim_cur)
+		return -ENOMEM;
+
+	/* Locked memory limit, which CAP_IPC_LOCK overrides. */
+	if (vma->vm_flags & VM_LOCKED) {
+		unsigned long wanted = mm->locked_vm + grow;
+		unsigned long allowed = limits[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+		if (wanted > allowed && !capable(CAP_IPC_LOCK))
+			return -ENOMEM;
+	}
+
+	/* The grown stack must not reach into a hugetlb-only region. */
+	if (vma->vm_flags & VM_GROWSUP)
+		new_start = vma->vm_start;
+	else
+		new_start = vma->vm_end - size;
+	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+		return -EFAULT;
+
+	/*
+	 * The overcommit check has to come last because it updates
+	 * security statistics.
+	 */
+	if (security_vm_enough_memory(grow))
+		return -ENOMEM;
+
+	/* Every check passed: commit the new size to the mm statistics. */
+	mm->total_vm += grow;
+	if (vma->vm_flags & VM_LOCKED)
+		mm->locked_vm += grow;
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+	return 0;
+}
+
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+/*
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ *
+ * Returns 0 on success (also when the vma already reaches the aligned
+ * address), -EFAULT if the vma is not VM_GROWSUP, -ENOMEM if the anon_vma
+ * cannot be prepared or the aligned address wraps around, or the error
+ * reported by acct_stack_growth().
+ */
+#ifndef CONFIG_IA64
+static
+#endif
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+{
+ int error;
+
+ if (!(vma->vm_flags & VM_GROWSUP))
+ return -EFAULT;
+
+ /*
+ * We must make sure the anon_vma is allocated
+ * so that the anon_vma locking is not a noop.
+ */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
+ anon_vma_lock(vma);
+
+ /*
+ * vma->vm_start/vm_end cannot change under us because the caller
+ * is required to hold the mmap_sem in read mode. We need the
+ * anon_vma lock to serialize against concurrent expand_stacks.
+ * Also guard against wrapping around to address 0.
+ */
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4); /* new end of the vma, rounded up to a page boundary */
+ else {
+ anon_vma_unlock(vma);
+ return -ENOMEM;
+ }
+ error = 0;
+
+ /* Somebody else might have raced and expanded it already */
+ if (address > vma->vm_end) {
+ unsigned long size, grow;
+
+ size = address - vma->vm_start; /* bytes: size of the vma after growing */
+ grow = (address - vma->vm_end) >> PAGE_SHIFT; /* pages being added */
+
+ error = acct_stack_growth(vma, size, grow);
+ if (!error)
+ vma->vm_end = address;
+ }
+ anon_vma_unlock(vma);
+ return error;
+}
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+/*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ *
+ * Returns 0 on success (also when the vma already starts at or below the
+ * page containing address), -ENOMEM if the anon_vma cannot be prepared,
+ * or the error reported by security_file_mmap() or acct_stack_growth().
+ */
+static int expand_downwards(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ int error;
+
+ /*
+ * We must make sure the anon_vma is allocated
+ * so that the anon_vma locking is not a noop.
+ */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
+
+ address &= PAGE_MASK; /* new start of the vma, rounded down to a page boundary */
+ error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ if (error)
+ return error;
+
+ anon_vma_lock(vma);
+
+ /*
+ * vma->vm_start/vm_end cannot change under us because the caller
+ * is required to hold the mmap_sem in read mode. We need the
+ * anon_vma lock to serialize against concurrent expand_stacks.
+ */
+
+ /* Somebody else might have raced and expanded it already */
+ if (address < vma->vm_start) {
+ unsigned long size, grow;
+
+ size = vma->vm_end - address; /* bytes: size of the vma after growing */
+ grow = (vma->vm_start - address) >> PAGE_SHIFT; /* pages being added */
+
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow; /* keep vm_pgoff in step with the lowered vm_start */
+ }
+ }
+ anon_vma_unlock(vma);
+ return error;
+}
+
+int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address); /* non-static entry point for the file-local helper */
+}
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address); /* CONFIG_STACK_GROWSUP: the vma is extended towards higher addresses */
+}
+
+/*
+ * CONFIG_STACK_GROWSUP flavour: return the vma containing addr, growing the
+ * vma just below addr upwards if addr falls in the gap above it.  Returns
+ * NULL when no vma can be made to cover addr.
+ */
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *hit, *below;
+
+	addr &= PAGE_MASK;
+	hit = find_vma_prev(mm, addr, &below);
+	if (hit && hit->vm_start <= addr)
+		return hit;
+
+	/* addr is not mapped: try to grow the vma beneath it. */
+	if (!below)
+		return NULL;
+	if (expand_stack(below, addr))
+		return NULL;
+
+	/* A locked vma gets mlock processing for the part above addr. */
+	if ((below->vm_flags & VM_LOCKED) &&
+	    mlock_vma_pages_range(below, addr, below->vm_end) < 0)
+		return NULL;	/* vma gone! */
+
+	return below;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address); /* !CONFIG_STACK_GROWSUP: the vma is extended towards lower addresses */
+}
+
+/*
+ * Grows-down flavour: return the vma containing addr, extending a
+ * VM_GROWSDOWN vma that lies just above addr if necessary.  Returns NULL
+ * when no vma can be made to cover addr.
+ */
+struct vm_area_struct *
+find_extend_vma(struct mm_struct * mm, unsigned long addr)
+{
+	struct vm_area_struct *stack;
+	unsigned long old_start;
+
+	addr &= PAGE_MASK;
+	stack = find_vma(mm, addr);
+
+	/* Nothing above addr, or addr is already mapped: nothing to extend. */
+	if (!stack || stack->vm_start <= addr)
+		return stack;
+
+	if (!(stack->vm_flags & VM_GROWSDOWN))
+		return NULL;
+
+	old_start = stack->vm_start;
+	if (expand_stack(stack, addr))
+		return NULL;
+
+	/* A locked vma gets mlock processing for the newly added part. */
+	if ((stack->vm_flags & VM_LOCKED) &&
+	    mlock_vma_pages_range(stack, addr, old_start) < 0)
+		return NULL;	/* vma gone! */
+
+	return stack;
+}
+#endif
+
+/*
+ * The vmas to free are chained from vma: account each one out of the mm
+ * statistics and release it.  The list must not be empty.
+ *
+ * Called with the mm semaphore held.
+ */
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *doomed = vma;
+
+	/* Record the high watermark before total_vm starts to drop. */
+	update_hiwater_vm(mm);
+
+	do {
+		const long pages = vma_pages(doomed);
+
+		mm->total_vm -= pages;
+		vm_stat_account(mm, doomed->vm_flags, doomed->vm_file, -pages);
+		/* remove_vma() hands back the next vma to process */
+		doomed = remove_vma(doomed);
+	} while (doomed);
+
+	validate_mm(mm);
+}
+
+/*
+ * Get rid of page table information in the indicated region.
+ *
+ * The pages of [start, end) are unmapped beginning at vma, the count that
+ * unmap_vmas() leaves in nr_accounted is handed to vm_unacct_memory(), and
+ * page tables are freed between the end of prev and the start of whatever
+ * currently follows prev in the mm's list (the list head if prev is NULL).
+ *
+ * Called with the mm semaphore held.
+ */
+static void unmap_region(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct mmu_gather *tlb;
+ unsigned long nr_accounted = 0;
+
+ lru_add_drain();
+ tlb = tlb_gather_mmu(mm, 0);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+ vm_unacct_memory(nr_accounted);
+ free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+ next? next->vm_start: 0); /* NOTE(review): a ceiling of 0 presumably means "no upper bound" - confirm in free_pgtables() */
+ tlb_finish_mmu(tlb, start, end);
+}
+
+/*
+ * Create a list of vma's touched by the unmap, removing them from the mm's
+ * vma list as we go..
+ *
+ * vma is the first vma to detach, prev its predecessor (NULL if vma heads
+ * the list) and end the end of the range being unmapped.  On return the
+ * detached vmas form a NULL-terminated chain starting at the original vma,
+ * they are gone from the rb-tree, mm->map_count has been lowered once per
+ * vma, the mm's unmap_area hook has been called and mm->mmap_cache reset.
+ */
+static void
+detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, unsigned long end)
+{
+ struct vm_area_struct **insertion_point;
+ struct vm_area_struct *tail_vma = NULL;
+ unsigned long addr;
+
+ insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ do {
+ rb_erase(&vma->vm_rb, &mm->mm_rb);
+ mm->map_count--;
+ tail_vma = vma;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+ *insertion_point = vma; /* relink the list around the detached run */
+ tail_vma->vm_next = NULL; /* terminate the detached chain */
+ if (mm->unmap_area == arch_unmap_area)
+ addr = prev ? prev->vm_end : mm->mmap_base; /* bottom-up hook: report where the new hole starts */
+ else
+ addr = vma ? vma->vm_start : mm->mmap_base; /* any other hook: report where the new hole ends */
+ mm->unmap_area(mm, addr);
+ mm->mmap_cache = NULL; /* Kill the cache. */
+}
+
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ *
+ * With new_below non-zero the new vma takes the part below addr and the
+ * original vma keeps the part from addr upwards; with new_below zero it is
+ * the other way round.
+ *
+ * Returns 0 on success, -EINVAL if a hugetlb vma would be split at an
+ * address that is not huge-page aligned, -ENOMEM if the mm is already at
+ * sysctl_max_map_count or the allocation fails, or the error carried by
+ * mpol_dup().
+ */
+int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+ unsigned long addr, int new_below)
+{
+ struct mempolicy *pol;
+ struct vm_area_struct *new;
+
+ if (is_vm_hugetlb_page(vma) && (addr &
+ ~(huge_page_mask(hstate_vma(vma)))))
+ return -EINVAL;
+
+ if (mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* most fields are the same, copy all, and then fixup */
+ *new = *vma;
+
+ if (new_below)
+ new->vm_end = addr;
+ else {
+ new->vm_start = addr;
+ new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ }
+
+ pol = mpol_dup(vma_policy(vma));
+ if (IS_ERR(pol)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return PTR_ERR(pol);
+ }
+ vma_set_policy(new, pol);
+
+ if (new->vm_file) {
+ get_file(new->vm_file); /* the new vma holds its own reference on the file */
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+
+ if (new_below) /* shrink the original to [addr, vm_end) and insert the new vma */
+ vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+ ((addr - new->vm_start) >> PAGE_SHIFT), new);
+ else /* shrink the original to [vm_start, addr) and insert the new vma */
+ vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
+ return 0;
+}
+
+/* Munmap is split into 2 main parts -- this part which finds
+ * what needs doing, and the areas themselves, which do the
+ * work. This now handles partial unmappings.
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ *
+ * Unmaps [start, start + len), with len rounded up to whole pages.
+ * Returns 0 on success, including when nothing is mapped in the range,
+ * -EINVAL if start is not page aligned, the range does not fit below
+ * TASK_SIZE or len rounds to 0, or the error reported by split_vma().
+ */
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+ unsigned long end;
+ struct vm_area_struct *vma, *prev, *last;
+
+ if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ if ((len = PAGE_ALIGN(len)) == 0)
+ return -EINVAL;
+
+ /* Find the first overlapping VMA */
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma)
+ return 0;
+ /* we have start < vma->vm_end */
+
+ /* if it doesn't overlap, we have nothing.. */
+ end = start + len;
+ if (vma->vm_start >= end)
+ return 0;
+
+ /*
+ * If we need to split any vma, do it now to save pain later.
+ *
+ * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
+ * unmapped vm_area_struct will remain in use: so lower split_vma
+ * places tmp vma above, and higher split_vma places tmp vma below.
+ */
+ if (start > vma->vm_start) {
+ int error = split_vma(mm, vma, start, 0);
+ if (error)
+ return error;
+ prev = vma; /* the original vma now ends at start and precedes the range */
+ }
+
+ /* Does it split the last one? */
+ last = find_vma(mm, end);
+ if (last && end > last->vm_start) {
+ int error = split_vma(mm, last, end, 1);
+ if (error)
+ return error;
+ }
+ vma = prev? prev->vm_next: mm->mmap; /* first vma that lies inside the range */
+
+ /*
+ * unlock any mlock()ed ranges before detaching vmas
+ */
+ if (mm->locked_vm) {
+ struct vm_area_struct *tmp = vma;
+ while (tmp && tmp->vm_start < end) {
+ if (tmp->vm_flags & VM_LOCKED) {
+ mm->locked_vm -= vma_pages(tmp);
+ munlock_vma_pages_all(tmp);
+ }
+ tmp = tmp->vm_next;
+ }
+ }
+
+ /*
+ * Remove the vma's, and unmap the actual pages
+ */
+ detach_vmas_to_be_unmapped(mm, vma, prev, end);
+ unmap_region(mm, vma, prev, start, end);
+
+ /* Fix up all other VM information */
+ remove_vma_list(mm, vma);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(do_munmap);
+
+/*
+ * munmap(2) entry point: run do_munmap() on the calling task's mm with
+ * mmap_sem held for writing and pass its result back.
+ */
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+	struct mm_struct *mm = current->mm;
+	int err;
+
+	profile_munmap(addr);
+
+	down_write(&mm->mmap_sem);
+	err = do_munmap(mm, addr, len);
+	up_write(&mm->mmap_sem);
+
+	return err;
+}
+
+/*
+ * Debug aid (CONFIG_DEBUG_VM only): warn if mmap_sem can be taken for
+ * reading, which would mean the caller does not hold it for writing.
+ */
+static inline void verify_mm_writelocked(struct mm_struct *mm)
+{
+#ifdef CONFIG_DEBUG_VM
+	/* The expected outcome is that the trylock fails. */
+	if (!unlikely(down_read_trylock(&mm->mmap_sem)))
+		return;
+
+	WARN_ON(1);
+	/* Give back the read lock the probe just acquired. */
+	up_read(&mm->mmap_sem);
+#endif
+}
+
+/*
+ *  this is really a simplified "do_mmap".  it only handles
+ *  anonymous maps.  eventually we may be able to do some
+ *  brk-specific accounting here.
+ *
+ * Maps len bytes (rounded up to whole pages) of anonymous memory at addr,
+ * first unmapping anything that is already there.  The caller must hold
+ * mm->mmap_sem for writing; verify_mm_writelocked() checks that on
+ * CONFIG_DEBUG_VM kernels.
+ *
+ * Returns addr on success, including when len is 0, or a negative errno
+ * cast to unsigned long: -EINVAL for a range that does not fit in the
+ * address space or is hugepage-only, -EAGAIN when the mlock limit would be
+ * exceeded, -ENOMEM for the remaining resource failures, or whatever
+ * security_file_mmap() or arch_mmap_check() report.
+ */
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+	struct mm_struct * mm = current->mm;
+	struct vm_area_struct * vma, * prev;
+	unsigned long flags;
+	struct rb_node ** rb_link, * rb_parent;
+	pgoff_t pgoff = addr >> PAGE_SHIFT;
+	unsigned long request = len;
+	int error;
+
+	len = PAGE_ALIGN(request);
+	/*
+	 * A request within a page of ULONG_MAX wraps to 0 when rounded up.
+	 * Refuse it here, with the same error as any other range that does
+	 * not fit, instead of letting the "nothing to do" shortcut below
+	 * report success for it.
+	 */
+	if (len < request)
+		return -EINVAL;
+	if (!len)
+		return addr;
+
+	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
+		return -EINVAL;
+
+	if (is_hugepage_only_range(mm, addr, len))
+		return -EINVAL;
+
+	error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
+	if (error)
+		return error;
+
+	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
+	error = arch_mmap_check(addr, len, flags);
+	if (error)
+		return error;
+
+	/*
+	 * mlock MCL_FUTURE?
+	 */
+	if (mm->def_flags & VM_LOCKED) {
+		unsigned long locked, lock_limit;
+		locked = len >> PAGE_SHIFT;
+		locked += mm->locked_vm;
+		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		lock_limit >>= PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+
+	/*
+	 * mm->mmap_sem is required to protect against another thread
+	 * changing the mappings in case we sleep.
+	 */
+	verify_mm_writelocked(mm);
+
+	/*
+	 * Clear old maps.  this also does some error checking for us
+	 */
+ munmap_back:
+	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	if (vma && vma->vm_start < addr + len) {
+		if (do_munmap(mm, addr, len))
+			return -ENOMEM;
+		goto munmap_back;
+	}
+
+	/* Check against address space limits *after* clearing old maps... */
+	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+		return -ENOMEM;
+
+	if (mm->map_count > sysctl_max_map_count)
+		return -ENOMEM;
+
+	if (security_vm_enough_memory(len >> PAGE_SHIFT))
+		return -ENOMEM;
+
+	/* Can we just expand an old private anonymous mapping? */
+	vma = vma_merge(mm, prev, addr, addr + len, flags,
+					NULL, NULL, pgoff, NULL);
+	if (vma)
+		goto out;
+
+	/*
+	 * create a vma struct for an anonymous mapping
+	 */
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (!vma) {
+		/* give back what security_vm_enough_memory() charged above */
+		vm_unacct_memory(len >> PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_pgoff = pgoff;
+	vma->vm_flags = flags;
+	vma->vm_page_prot = vm_get_page_prot(flags);
+	vma_link(mm, vma, prev, rb_link, rb_parent);
+out:
+	mm->total_vm += len >> PAGE_SHIFT;
+	if (flags & VM_LOCKED) {
+		if (!mlock_vma_pages_range(vma, addr, addr + len))
+			mm->locked_vm += (len >> PAGE_SHIFT);
+	}
+	return addr;
+}
+
+EXPORT_SYMBOL(do_brk);
+
+/* Release all mmaps.
+ *
+ * Munlocks every VM_LOCKED vma, unmaps all pages of the mm, frees its
+ * page tables and then releases the vma list itself.
+ */
+void exit_mmap(struct mm_struct *mm)
+{
+ struct mmu_gather *tlb;
+ struct vm_area_struct *vma;
+ unsigned long nr_accounted = 0;
+ unsigned long end;
+
+ /* mm's last user has gone, and it's about to be pulled down */
+ mmu_notifier_release(mm);
+
+ if (mm->locked_vm) {
+ vma = mm->mmap;
+ while (vma) {
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_pages_all(vma);
+ vma = vma->vm_next;
+ }
+ }
+
+ arch_exit_mmap(mm);
+
+ vma = mm->mmap;
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
+
+ lru_add_drain();
+ flush_cache_mm(mm);
+ tlb = tlb_gather_mmu(mm, 1);
+ /* Don't update_hiwater_rss(mm) here, do_exit already did */
+ /* Use -1 here to ensure all VMAs in the mm are unmapped */
+ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ vm_unacct_memory(nr_accounted);
+ free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
+ tlb_finish_mmu(tlb, 0, end);
+
+ /*
+ * Walk the list again, actually closing and freeing it,
+ * with preemption enabled, without holding any MM locks.
+ */
+ while (vma)
+ vma = remove_vma(vma);
+
+ BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); /* more page tables left than the range below FIRST_USER_ADDRESS accounts for */
+}
+
+/*
+ * Link a caller-built vma into the mm: the address-sorted list, the
+ * rb-tree and, for a file mapping, the inode's i_mmap tree (i_mmap_lock is
+ * taken here in that case).
+ *
+ * Returns 0 on success, or -ENOMEM if the range overlaps an existing vma
+ * or a VM_ACCOUNT vma fails the memory commitment check.
+ */
+int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+{
+	struct vm_area_struct *next, *prev;
+	struct rb_node **link, *parent;
+
+	/*
+	 * For a purely anonymous vma the vm_pgoff should not matter until
+	 * the first write fault sets the page's anon_vma and index.  Set it
+	 * now to the value it will almost certainly end up with (unless
+	 * mremap moves the vma before that fault), so /proc/pid/maps tells
+	 * a consistent story.
+	 *
+	 * Deriving it from the virtual start address lets merges and splits
+	 * work seamlessly with the existing file pgoff checks and
+	 * manipulations.  do_mmap_pgoff and do_brk do the same.
+	 */
+	if (!vma->vm_file) {
+		BUG_ON(vma->anon_vma);
+		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+	}
+
+	next = find_vma_prepare(mm, vma->vm_start, &prev, &link, &parent);
+	/* The vma found at or above our start must begin at or after our end. */
+	if (next && next->vm_start < vma->vm_end)
+		return -ENOMEM;
+
+	if (vma->vm_flags & VM_ACCOUNT) {
+		if (security_vm_enough_memory_mm(mm, vma_pages(vma)))
+			return -ENOMEM;
+	}
+
+	vma_link(mm, vma, prev, link, parent);
+	return 0;
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ *
+ * Returns the vma that now covers [addr, addr + len): either an existing
+ * vma extended by vma_merge() or a freshly allocated copy of *vmap.
+ * Returns NULL if the allocation or the mempolicy duplication fails.
+ * *vmap is redirected to the merged vma when the source's start address
+ * lies inside it.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ unsigned long addr, unsigned long len, pgoff_t pgoff)
+{
+ struct vm_area_struct *vma = *vmap;
+ unsigned long vma_start = vma->vm_start;
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *new_vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
+ struct mempolicy *pol;
+
+ /*
+ * If anonymous vma has not yet been faulted, update new pgoff
+ * to match new location, to increase its chance of merging.
+ */
+ if (!vma->vm_file && !vma->anon_vma)
+ pgoff = addr >> PAGE_SHIFT;
+
+ find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ if (new_vma) {
+ /*
+ * Source vma may have been merged into new_vma
+ */
+ if (vma_start >= new_vma->vm_start &&
+ vma_start < new_vma->vm_end)
+ *vmap = new_vma;
+ } else {
+ new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (new_vma) {
+ *new_vma = *vma; /* start from a full copy, then fix up the fields that differ */
+ pol = mpol_dup(vma_policy(vma));
+ if (IS_ERR(pol)) {
+ kmem_cache_free(vm_area_cachep, new_vma);
+ return NULL;
+ }
+ vma_set_policy(new_vma, pol);
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
+ if (new_vma->vm_file) {
+ get_file(new_vma->vm_file); /* the copy holds its own reference on the file */
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ }
+ }
+ return new_vma;
+}
+
+/*
+ * Check the RLIMIT_AS address space limit: returns 1 if mm can grow by
+ * npages pages without exceeding the calling task's current limit, and 0
+ * otherwise.
+ */
+int may_expand_vm(struct mm_struct *mm, unsigned long npages)
+{
+	unsigned long in_use = mm->total_vm;	/* pages */
+	unsigned long limit_pages;
+
+	limit_pages = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+
+	return (in_use + npages > limit_pages) ? 0 : 1;
+}
+
+
+/*
+ * Fault handler for special mappings: look up the faulting page in the
+ * NULL-terminated page array kept in vm_private_data, take a reference on
+ * it and return it in vmf->page.  A fault beyond the end of the array
+ * yields VM_FAULT_SIGBUS.
+ */
+static int special_mapping_fault(struct vm_area_struct *vma,
+				struct vm_fault *vmf)
+{
+	struct page **slot = vma->vm_private_data;
+	pgoff_t skip;
+
+	/*
+	 * A special mapping has no vm_file, and for such vmas the mm uses
+	 * vm_pgoff internally, so it has to be subtracted to get the index
+	 * into the array.  Only the mm itself may do this; do not copy
+	 * this code into drivers!
+	 */
+	skip = vmf->pgoff - vma->vm_pgoff;
+
+	/* Advance to the requested entry, stopping at the terminator. */
+	while (skip && *slot) {
+		skip--;
+		slot++;
+	}
+
+	if (!*slot)
+		return VM_FAULT_SIGBUS;
+
+	get_page(*slot);
+	vmf->page = *slot;
+	return 0;
+}
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ * The body is intentionally empty: the hook is installed only for that
+ * effect.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct special_mapping_vmops = { /* vm_ops set on every vma built by install_special_mapping() */
+ .close = special_mapping_close, /* empty hook */
+ .fault = special_mapping_fault, /* serves pages from the vm_private_data array */
+};
+
+/*
+ * Insert a new vma covering [addr, addr + len) with the given flags, whose
+ * pages come from the supplied array of struct page pointers.
+ *
+ * Must be called with mm->mmap_sem held for writing.
+ * The array may be shorter than len >> PAGE_SHIFT if it is
+ * null-terminated; any access past the last supplied page raises SIGBUS.
+ * Both the array and the pages it points to are assumed to stay alive for
+ * as long as the mapping might exist.
+ *
+ * Returns 0 on success or -ENOMEM if the vma cannot be allocated or
+ * inserted.
+ */
+int install_special_mapping(struct mm_struct *mm,
+			    unsigned long addr, unsigned long len,
+			    unsigned long vm_flags, struct page **pages)
+{
+	struct vm_area_struct *special;
+
+	special = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (unlikely(special == NULL))
+		return -ENOMEM;
+
+	special->vm_mm = mm;
+	special->vm_start = addr;
+	special->vm_end = addr + len;
+
+	special->vm_ops = &special_mapping_vmops;
+	special->vm_private_data = pages;
+
+	/* The protection bits are derived from the final flag set. */
+	special->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+	special->vm_page_prot = vm_get_page_prot(special->vm_flags);
+
+	if (unlikely(insert_vm_struct(mm, special)))
+		goto free_special;
+
+	mm->total_vm += len >> PAGE_SHIFT;
+	return 0;
+
+free_special:
+	kmem_cache_free(vm_area_cachep, special);
+	return -ENOMEM;
+}
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * Bit 0 of head.next marks an anon_vma whose lock this
+ * function has already taken; a set bit means there is
+ * nothing left to do.
+ *
+ * The LSB of head.next can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ /*
+ * We can safely modify head.next after taking the
+ * anon_vma->lock. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us thanks to the
+ * anon_vma->lock.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS marks a mapping whose i_mmap_lock this
+ * function has already taken; a set flag means there is
+ * nothing left to do.
+ *
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to prevent new
+ * anon_vmas from being associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid taking the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousands of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ *
+ * Returns 0 with all the locks and the mm_all_locks_mutex held, or
+ * -EINTR after everything taken so far has been dropped again.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int ret = -EINTR;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) { /* first pass: the i_mmap_lock of every file mapping */
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) { /* second pass: the lock of every anon_vma */
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ vm_lock_anon_vma(mm, vma->anon_vma);
+ }
+
+ ret = 0;
+
+out_unlock:
+ if (ret)
+ mm_drop_all_locks(mm); /* also releases mm_all_locks_mutex */
+
+ return ret;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * A set LSB means the lock is held and marked; an anon_vma
+ * without the mark is left untouched.
+ *
+ * The LSB of head.next can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the vma so the users using the anon_vma->head will
+ * never see our bitflag.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us until we release the
+ * anon_vma->lock.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ spin_unlock(&anon_vma->lock);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * A set AS_MM_ALL_LOCKS flag means the i_mmap_lock is held
+ * and marked; a mapping without the flag is left untouched.
+ *
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_unlock(&mapping->i_mmap_lock);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ *
+ * Releases every anon_vma lock and i_mmap_lock that is currently marked
+ * as taken, then drops the mm_all_locks_mutex.  Unmarked locks are
+ * skipped, so this also cleans up after a partially completed
+ * mm_take_all_locks().
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma)
+ vm_unlock_anon_vma(vma->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 0000000..5f4ef02
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI
+ * Christoph Lameter <clameter@sgi.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+/*
+ * Detach every notifier still registered on @mm and call its ->release
+ * method (if it has one), then wait for an RCU grace period.
+ *
+ * This function can't run concurrently against mmu_notifier_register
+ * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
+ * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
+ * in parallel despite there being no task using this mm any more,
+ * through the vmas outside of the exit_mmap context, such as with
+ * vmtruncate. This serializes against mmu_notifier_unregister with
+ * the mmu_notifier_mm->lock in addition to RCU and it serializes
+ * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * can't go away from under us as exit_mmap holds an mm_count pin
+ * itself.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ /* Pop notifiers one at a time; the lock is dropped around ->release. */
+ while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
+ mn = hlist_entry(mm->mmu_notifier_mm->list.first,
+ struct mmu_notifier,
+ hlist);
+ /*
+ * We arrived before mmu_notifier_unregister, so
+ * mmu_notifier_unregister will do nothing other than
+ * wait for ->release to finish and then return.
+ */
+ hlist_del_init_rcu(&mn->hlist);
+ /*
+ * RCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ rcu_read_lock();
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ /*
+ * If ->release runs before mmu_notifier_unregister it
+ * must be handled, as it's the only way for the driver
+ * to flush all existing sptes and stop the driver
+ * from establishing any more sptes before all the
+ * pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ }
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ /*
+ * synchronize_rcu here prevents mmu_notifier_release from
+ * returning to exit_mmap (which would proceed to free all pages
+ * in the mm) until the ->release method returns, if it was
+ * invoked by mmu_notifier_unregister.
+ *
+ * The mmu_notifier_mm can't go away from under us because one
+ * mm_count is held by exit_mmap.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * Call ->clear_flush_young on every notifier of @mm that implements it and
+ * OR the results together, so the return value is non-zero if any notifier
+ * reported @address as young.
+ *
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct mmu_notifier *mn;
+ int was_young = 0;
+
+ rcu_read_lock();
+ head = &mm->mmu_notifier_mm->list;
+ hlist_for_each_entry_rcu(mn, pos, head, hlist) {
+ /* The callback is optional: skip notifiers without it. */
+ if (!mn->ops->clear_flush_young)
+ continue;
+ was_young |= mn->ops->clear_flush_young(mn, mm, address);
+ }
+ rcu_read_unlock();
+
+ return was_young;
+}
+
+/*
+ * Call ->invalidate_page for @address on every notifier of @mm that
+ * implements it. The list is walked under rcu_read_lock().
+ */
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct mmu_notifier *mn;
+
+ rcu_read_lock();
+ head = &mm->mmu_notifier_mm->list;
+ hlist_for_each_entry_rcu(mn, pos, head, hlist) {
+ /* The callback is optional: skip notifiers without it. */
+ if (!mn->ops->invalidate_page)
+ continue;
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Call ->invalidate_range_start for [@start, @end) on every notifier of @mm
+ * that implements it. The list is walked under rcu_read_lock().
+ */
+void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct mmu_notifier *mn;
+
+ rcu_read_lock();
+ head = &mm->mmu_notifier_mm->list;
+ hlist_for_each_entry_rcu(mn, pos, head, hlist) {
+ /* The callback is optional: skip notifiers without it. */
+ if (!mn->ops->invalidate_range_start)
+ continue;
+ mn->ops->invalidate_range_start(mn, mm, start, end);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Call ->invalidate_range_end for [@start, @end) on every notifier of @mm
+ * that implements it. The list is walked under rcu_read_lock().
+ */
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct mmu_notifier *mn;
+
+ rcu_read_lock();
+ head = &mm->mmu_notifier_mm->list;
+ hlist_for_each_entry_rcu(mn, pos, head, hlist) {
+ /* The callback is optional: skip notifiers without it. */
+ if (!mn->ops->invalidate_range_end)
+ continue;
+ mn->ops->invalidate_range_end(mn, mm, start, end);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Common implementation of mmu_notifier_register() and
+ * __mmu_notifier_register().
+ *
+ * @mn: notifier to add to the head of mm's notifier list
+ * @mm: target mm; mm_users must be > 0 (checked with BUG_ON)
+ * @take_mmap_sem: non-zero to take and release mm->mmap_sem for write here;
+ * zero if the caller already holds it
+ *
+ * Returns 0 on success, -ENOMEM if the mmu_notifier_mm allocation fails, or
+ * the error returned by mm_take_all_locks(). On success one mm_count
+ * reference is taken.
+ */
+static int do_mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ int take_mmap_sem)
+{
+ struct mmu_notifier_mm *mmu_notifier_mm;
+ int ret;
+
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+
+ ret = -ENOMEM;
+ /*
+ * Allocate up front, before any lock is taken; the structure is only
+ * installed if this mm has no notifier list yet, otherwise it is freed
+ * at out_cleanup.
+ */
+ mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ if (unlikely(!mmu_notifier_mm))
+ goto out;
+
+ if (take_mmap_sem)
+ down_write(&mm->mmap_sem);
+ ret = mm_take_all_locks(mm);
+ if (unlikely(ret))
+ goto out_cleanup;
+
+ if (!mm_has_notifiers(mm)) {
+ INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+ spin_lock_init(&mmu_notifier_mm->lock);
+ mm->mmu_notifier_mm = mmu_notifier_mm;
+ /* Ownership moved to the mm: make the kfree() below a no-op. */
+ mmu_notifier_mm = NULL;
+ }
+ atomic_inc(&mm->mm_count);
+
+ /*
+ * Serialize the update against mmu_notifier_unregister. A
+ * side note: mmu_notifier_release can't run concurrently with
+ * us because we hold the mm_users pin (either implicitly as
+ * current->mm or explicitly with get_task_mm() or similar).
+ * We can't race against any other mmu notifier method either
+ * thanks to mm_take_all_locks().
+ */
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ mm_drop_all_locks(mm);
+out_cleanup:
+ if (take_mmap_sem)
+ up_write(&mm->mmap_sem);
+ /* kfree() does nothing if mmu_notifier_mm is NULL */
+ kfree(mmu_notifier_mm);
+out:
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ return ret;
+}
+
+/*
+ * Register @mn on @mm, taking mmap_sem for write internally.
+ *
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function. Must also ensure mm_users can't go down
+ * to zero while this runs to avoid races with mmu_notifier_release,
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
+ * returns. mmu_notifier_unregister must be always called to
+ * unregister the notifier. mm_count is automatically pinned to allow
+ * mmu_notifier_unregister to safely run at any time later, before or
+ * after exit_mmap. ->release will always be called before exit_mmap
+ * frees the pages.
+ *
+ * Returns 0 on success or a negative errno from
+ * do_mmu_notifier_register().
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ return do_mmu_notifier_register(mn, mm, 1);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode; this variant neither takes nor releases it.
+ *
+ * Returns 0 on success or a negative errno from
+ * do_mmu_notifier_register().
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ return do_mmu_notifier_register(mn, mm, 0);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+
+/*
+ * Free the per-mm notifier bookkeeping structure.
+ *
+ * This is called after the last mmu_notifier_unregister() returned, so the
+ * notifier list must already be empty (enforced with BUG_ON).
+ */
+void __mmu_notifier_mm_destroy(struct mm_struct *mm)
+{
+ BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
+ kfree(mm->mmu_notifier_mm);
+ /* Leave a poison value behind rather than a dangling pointer. */
+ mm->mmu_notifier_mm = LIST_POISON1; /* debug */
+}
+
+/*
+ * Remove @mn from @mm's notifier list and drop the mm_count reference.
+ *
+ * This releases the mm_count pin automatically and frees the mm
+ * structure if it was the last user of it. It serializes against
+ * running mmu notifiers with RCU and against mmu_notifier_unregister
+ * with the unregister lock + RCU. All sptes must be dropped before
+ * calling mmu_notifier_unregister. ->release or any other notifier
+ * method may be invoked concurrently with mmu_notifier_unregister,
+ * and only after mmu_notifier_unregister returned we're guaranteed
+ * that ->release or any other method can't run anymore.
+ */
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ /*
+ * Still hashed means __mmu_notifier_release has not removed this
+ * notifier yet, so ->release is called from here instead.
+ */
+ if (!hlist_unhashed(&mn->hlist)) {
+ hlist_del_rcu(&mn->hlist);
+
+ /*
+ * RCU here will force exit_mmap to wait ->release to finish
+ * before freeing the pages.
+ */
+ rcu_read_lock();
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ /*
+ * exit_mmap will block in mmu_notifier_release to
+ * guarantee ->release is called before freeing the
+ * pages.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
+ } else
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ /*
+ * Wait for any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_release instead of us.
+ */
+ synchronize_rcu();
+
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 0000000..16ce8b9
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,74 @@
+/*
+ * linux/mm/mmzone.c
+ *
+ * management codes for pgdats and zones.
+ */
+
+
+#include <linux/stddef.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+
+/* Return the pglist_data of the first online node. */
+struct pglist_data *first_online_pgdat(void)
+{
+ return NODE_DATA(first_online_node);
+}
+
+/*
+ * Return the pglist_data of the online node that follows @pgdat's node, or
+ * NULL when next_online_node() reports MAX_NUMNODES (no further node).
+ */
+struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+ int next_nid = next_online_node(pgdat->node_id);
+
+ if (next_nid != MAX_NUMNODES)
+ return NODE_DATA(next_nid);
+
+ return NULL;
+}
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ *
+ * Returns the zone slot after @zone within the same node, or the first zone
+ * slot of the next online node once the current node's slots are exhausted,
+ * or NULL when there is no further online node.
+ */
+struct zone *next_zone(struct zone *zone)
+{
+ pg_data_t *node = zone->zone_pgdat;
+
+ /* Not yet at the node's last zone slot: step to the following slot. */
+ if (zone < node->node_zones + MAX_NR_ZONES - 1)
+ return zone + 1;
+
+ /* Last slot reached: continue with the next online node, if any. */
+ node = next_online_pgdat(node);
+ if (!node)
+ return NULL;
+
+ return node->node_zones;
+}
+
+/*
+ * Return non-zero if the node of @zref is set in @nodes. Without CONFIG_NUMA
+ * no filtering is done and 1 is always returned.
+ */
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+ return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+ return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/*
+ * Returns the next zone at or below highest_zoneidx in a zonelist
+ *
+ * @z: zoneref to start scanning from (inclusive)
+ * @highest_zoneidx: highest zone index that is acceptable
+ * @nodes: optional nodemask; when non-NULL, zones on nodes outside the mask
+ * are skipped as well
+ * @zone: out parameter, set to zonelist_zone() of the returned zoneref
+ *
+ * Returns the zoneref the scan stopped at. Neither loop has an explicit
+ * bound, so the zonelist presumably ends with a terminating entry
+ * (z->zone == NULL) that fails the loop conditions - TODO confirm against
+ * the zonelist construction code.
+ */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+ enum zone_type highest_zoneidx,
+ nodemask_t *nodes,
+ struct zone **zone)
+{
+ /*
+ * Find the next suitable zone to use for the allocation.
+ * Only filter based on nodemask if it's set
+ */
+ if (likely(nodes == NULL))
+ while (zonelist_zone_idx(z) > highest_zoneidx)
+ z++;
+ else
+ /* The z->zone test keeps zref_in_nodemask() away from an empty entry. */
+ while (zonelist_zone_idx(z) > highest_zoneidx ||
+ (z->zone && !zref_in_nodemask(z, nodes)))
+ z++;
+
+ *zone = zonelist_zone(z);
+ return z;
+}
diff --git a/mm/mprotect.c b/mm/mprotect.c
new file mode 100644
index 0000000..2623e29
--- /dev/null
+++ b/mm/mprotect.c
@@ -0,0 +1,319 @@
+/*
+ * mm/mprotect.c
+ *
+ * (C) Copyright 1994 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
+ *
+ * Address space accounting code <alan@redhat.com>
+ * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/mempolicy.h>
+#include <linux/personality.h>
+#include <linux/syscalls.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Fallback used when pgprot_modify is not already defined as a macro: the
+ * old protection is ignored and the new one is returned unchanged.
+ */
+#ifndef pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ return newprot;
+}
+#endif
+
+/*
+ * Apply @newprot to every present pte covering [addr, end) under @pmd.
+ *
+ * The pte page is mapped and locked with pte_offset_map_lock() for the whole
+ * walk. When @dirty_accountable is non-zero, ptes that are already dirty are
+ * made writable immediately. With CONFIG_MIGRATION, non-present, non-file
+ * entries that are write migration entries are downgraded to read.
+ */
+static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ int dirty_accountable)
+{
+ pte_t *pte, oldpte;
+ spinlock_t *ptl;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ do {
+ oldpte = *pte;
+ if (pte_present(oldpte)) {
+ pte_t ptent;
+
+ /* start/commit pair brackets the modification of the live pte */
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ ptent = pte_modify(ptent, newprot);
+
+ /*
+ * Avoid taking write faults for pages we know to be
+ * dirty.
+ */
+ if (dirty_accountable && pte_dirty(ptent))
+ ptent = pte_mkwrite(ptent);
+
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
+#ifdef CONFIG_MIGRATION
+ } else if (!pte_file(oldpte)) {
+ swp_entry_t entry = pte_to_swp_entry(oldpte);
+
+ if (is_write_migration_entry(entry)) {
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ make_migration_entry_read(&entry);
+ set_pte_at(mm, addr, pte,
+ swp_entry_to_pte(entry));
+ }
+#endif
+ }
+
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
+ /* pte now points one past the last entry visited, hence pte - 1. */
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under @pud and apply @newprot to
+ * each populated one via change_pte_range(). Entries for which
+ * pmd_none_or_clear_bad() returns true are skipped.
+ */
+static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ int dirty_accountable)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+ } while (pmd++, addr = next, addr != end);
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd and apply @newprot to
+ * each populated one via change_pmd_range(). Entries for which
+ * pud_none_or_clear_bad() returns true are skipped.
+ */
+static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ int dirty_accountable)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+ } while (pud++, addr = next, addr != end);
+}
+
+/*
+ * Apply @newprot to all page table entries mapping [addr, end) of @vma.
+ *
+ * The cache is flushed for the range before the walk and the TLB is flushed
+ * for the whole original range afterwards. @addr must be below @end
+ * (enforced with BUG_ON).
+ */
+static void change_protection(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ int dirty_accountable)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ unsigned long next;
+ /* addr is advanced by the walk; keep the start for the TLB flush */
+ unsigned long start = addr;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(mm, addr);
+ flush_cache_range(vma, addr, end);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+ } while (pgd++, addr = next, addr != end);
+ flush_tlb_range(vma, start, end);
+}
+
+/*
+ * Change the flags of the part [start, end) of @vma to @newflags and update
+ * the page tables to match.
+ *
+ * @vma: vma containing [start, end)
+ * @pprev: in/out; passed to vma_merge() as the previous vma and set on
+ * return to the vma that now covers the range
+ * @newflags: complete new vm_flags value for the range
+ *
+ * Returns 0 on success, -ENOMEM if the extra commit charge is refused, or
+ * the error from split_vma() (in which case any charge taken here is
+ * undone). vm_flags and vm_page_prot are written, so mmap_sem must be held
+ * in write mode.
+ */
+int
+mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ unsigned long start, unsigned long end, unsigned long newflags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long oldflags = vma->vm_flags;
+ long nrpages = (end - start) >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ pgoff_t pgoff;
+ int error;
+ int dirty_accountable = 0;
+
+ /* Nothing to change: just report this vma as the new "previous". */
+ if (newflags == oldflags) {
+ *pprev = vma;
+ return 0;
+ }
+
+ /*
+ * If we make a private mapping writable we increase our commit;
+ * but (without finer accounting) cannot reduce our commit if we
+ * make it unwritable again.
+ */
+ if (newflags & VM_WRITE) {
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+ VM_SHARED|VM_NORESERVE))) {
+ charged = nrpages;
+ if (security_vm_enough_memory(charged))
+ return -ENOMEM;
+ newflags |= VM_ACCOUNT;
+ }
+ }
+
+ /*
+ * First try to merge with previous and/or next vma.
+ */
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *pprev = vma_merge(mm, *pprev, start, end, newflags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ if (*pprev) {
+ vma = *pprev;
+ goto success;
+ }
+
+ *pprev = vma;
+
+ /* No merge possible: split so that vma covers exactly [start, end). */
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto fail;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto fail;
+ }
+
+success:
+ /*
+ * vm_flags and vm_page_prot are protected by the mmap_sem
+ * held in write mode.
+ */
+ vma->vm_flags = newflags;
+ vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
+ vm_get_page_prot(newflags));
+
+ /*
+ * For write-notify vmas the page protection is derived with VM_SHARED
+ * masked out, and change_protection() is told it may re-enable write
+ * on ptes that are already dirty.
+ */
+ if (vma_wants_writenotify(vma)) {
+ vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
+ dirty_accountable = 1;
+ }
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
+ else
+ change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ /* Move the pages from the old flags' statistics to the new flags'. */
+ vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ return 0;
+
+fail:
+ vm_unacct_memory(charged);
+ return error;
+}
+
+/*
+ * mprotect() system call: change the protection of [start, start + len).
+ *
+ * @start must be page aligned; @len is rounded up to a page multiple.
+ * PROT_GROWSDOWN / PROT_GROWSUP (mutually exclusive) extend the range to the
+ * start / end of the containing vma, which must then be a VM_GROWSDOWN /
+ * VM_GROWSUP vma.
+ *
+ * Returns 0 on success (also for len == 0), -EINVAL for bad arguments,
+ * -ENOMEM if the range wraps or is not fully mapped, -EACCES if a requested
+ * permission is not allowed by the vma's VM_MAY* bits, or an error from the
+ * security hook or mprotect_fixup().
+ */
+SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+ unsigned long, prot)
+{
+ unsigned long vm_flags, nstart, end, tmp, reqprot;
+ struct vm_area_struct *vma, *prev;
+ int error = -EINVAL;
+ const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+ prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
+ if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
+ return -EINVAL;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return 0;
+ len = PAGE_ALIGN(len);
+ end = start + len;
+ /* Catches both address wrap-around and len wrapping to 0 in PAGE_ALIGN. */
+ if (end <= start)
+ return -ENOMEM;
+ if (!arch_validate_prot(prot))
+ return -EINVAL;
+
+ /* Keep the caller's original request for the security hook. */
+ reqprot = prot;
+ /*
+ * Does the application expect PROT_READ to imply PROT_EXEC:
+ */
+ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
+ prot |= PROT_EXEC;
+
+ vm_flags = calc_vm_prot_bits(prot);
+
+ down_write(&current->mm->mmap_sem);
+
+ vma = find_vma_prev(current->mm, start, &prev);
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+ if (unlikely(grows & PROT_GROWSDOWN)) {
+ if (vma->vm_start >= end)
+ goto out;
+ start = vma->vm_start;
+ error = -EINVAL;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out;
+ }
+ else {
+ /* The first vma must actually contain start. */
+ if (vma->vm_start > start)
+ goto out;
+ if (unlikely(grows & PROT_GROWSUP)) {
+ end = vma->vm_end;
+ error = -EINVAL;
+ if (!(vma->vm_flags & VM_GROWSUP))
+ goto out;
+ }
+ }
+ /* Range starts inside vma: the vma itself precedes the changed part. */
+ if (start > vma->vm_start)
+ prev = vma;
+
+ for (nstart = start ; ; ) {
+ unsigned long newflags;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+
+ newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+
+ /* newflags >> 4 shift VM_MAY% in place of VM_% */
+ /* Any requested R/W/X bit without its VM_MAY% counterpart is refused. */
+ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
+ error = -EACCES;
+ goto out;
+ }
+
+ error = security_file_mprotect(vma, reqprot, prot);
+ if (error)
+ goto out;
+
+ /* Process at most up to the end of this vma in one step. */
+ tmp = vma->vm_end;
+ if (tmp > end)
+ tmp = end;
+ error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+ if (error)
+ goto out;
+ nstart = tmp;
+
+ /* mprotect_fixup() may have merged vmas; resume after prev. */
+ if (nstart < prev->vm_end)
+ nstart = prev->vm_end;
+ if (nstart >= end)
+ goto out;
+
+ /* A gap before the next vma means the range is not fully mapped. */
+ vma = prev->vm_next;
+ if (!vma || vma->vm_start != nstart) {
+ error = -ENOMEM;
+ goto out;
+ }
+ }
+out:
+ up_write(&current->mm->mmap_sem);
+ return error;
+}
diff --git a/mm/mremap.c b/mm/mremap.c
new file mode 100644
index 0000000..4207055
--- /dev/null
+++ b/mm/mremap.c
@@ -0,0 +1,433 @@
+/*
+ * mm/mremap.c
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ *
+ * Address space accounting code <alan@redhat.com>
+ * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+/*
+ * Look up the pmd entry that maps @addr in @mm without allocating anything.
+ * Returns NULL as soon as one level (pgd, pud or pmd) is reported none/bad by
+ * the corresponding *_none_or_clear_bad() helper.
+ */
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgdp = pgd_offset(mm, addr);
+ pud_t *pudp;
+ pmd_t *pmdp;
+
+ if (pgd_none_or_clear_bad(pgdp))
+ return NULL;
+
+ pudp = pud_offset(pgdp, addr);
+ if (pud_none_or_clear_bad(pudp))
+ return NULL;
+
+ pmdp = pmd_offset(pudp, addr);
+ if (pmd_none_or_clear_bad(pmdp))
+ return NULL;
+
+ return pmdp;
+}
+
+/*
+ * Return the pmd entry that maps @addr in @mm, allocating the pud, pmd and
+ * pte levels as needed. Returns NULL if any of the allocations fails.
+ */
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgdp = pgd_offset(mm, addr);
+ pud_t *pudp;
+ pmd_t *pmdp;
+
+ pudp = pud_alloc(mm, pgdp, addr);
+ if (!pudp)
+ return NULL;
+
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return NULL;
+
+ /* A pte page is only allocated when the pmd is not populated yet. */
+ if (pmd_present(*pmdp))
+ return pmdp;
+
+ if (__pte_alloc(mm, pmdp, addr))
+ return NULL;
+
+ return pmdp;
+}
+
+/*
+ * Move the ptes mapping [old_addr, old_end) of @vma (under @old_pmd) to
+ * @new_addr in @new_vma (under @new_pmd).
+ *
+ * Each non-empty source pte is cleared and flushed, then installed at the
+ * destination. The move is bracketed by mmu notifier range start/end calls
+ * for the old range, and for file-backed vmas by the mapping's i_mmap_lock.
+ * The caller is expected to pass a range that stays within one pte page on
+ * both sides (move_page_tables() clamps the extent accordingly).
+ */
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+ unsigned long old_addr, unsigned long old_end,
+ struct vm_area_struct *new_vma, pmd_t *new_pmd,
+ unsigned long new_addr)
+{
+ struct address_space *mapping = NULL;
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *old_pte, *new_pte, pte;
+ spinlock_t *old_ptl, *new_ptl;
+ unsigned long old_start;
+
+ /* old_addr is advanced by the loop; keep the start for the notifier */
+ old_start = old_addr;
+ mmu_notifier_invalidate_range_start(vma->vm_mm,
+ old_start, old_end);
+ if (vma->vm_file) {
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ mapping = vma->vm_file->f_mapping;
+ spin_lock(&mapping->i_mmap_lock);
+ if (new_vma->vm_truncate_count &&
+ new_vma->vm_truncate_count != vma->vm_truncate_count)
+ new_vma->vm_truncate_count = 0;
+ }
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
+ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ /* Source and destination may share one pte lock; take it only once. */
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ arch_enter_lazy_mmu_mode();
+
+ for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+ new_pte++, new_addr += PAGE_SIZE) {
+ if (pte_none(*old_pte))
+ continue;
+ pte = ptep_clear_flush(vma, old_addr, old_pte);
+ pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ /* Both pointers are one past the last entry visited, hence the - 1. */
+ pte_unmap_nested(new_pte - 1);
+ pte_unmap_unlock(old_pte - 1, old_ptl);
+ if (mapping)
+ spin_unlock(&mapping->i_mmap_lock);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
+}
+
+/* Upper bound on the number of bytes handed to a single move_ptes() call. */
+#define LATENCY_LIMIT (64 * PAGE_SIZE)
+
+/*
+ * Move the page table entries for @len bytes at @old_addr in @vma to
+ * @new_addr in @new_vma.
+ *
+ * Returns the number of bytes processed ("how much done"). This is less than
+ * @len only when allocating a destination page table fails, in which case
+ * the walk stops early.
+ */
+unsigned long move_page_tables(struct vm_area_struct *vma,
+ unsigned long old_addr, struct vm_area_struct *new_vma,
+ unsigned long new_addr, unsigned long len)
+{
+ unsigned long extent, next, old_end;
+ pmd_t *old_pmd, *new_pmd;
+
+ old_end = old_addr + len;
+ flush_cache_range(vma, old_addr, old_end);
+
+ for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+ cond_resched();
+ /*
+ * Clamp the extent to the end of the source pmd or to old_end.
+ * NOTE(review): comparing next - 1 presumably copes with next
+ * wrapping to 0 at the top of the address space - confirm.
+ */
+ next = (old_addr + PMD_SIZE) & PMD_MASK;
+ if (next - 1 > old_end)
+ next = old_end;
+ extent = next - old_addr;
+ /* Nothing mapped under this source pmd: skip the extent. */
+ old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ if (!old_pmd)
+ continue;
+ new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ if (!new_pmd)
+ break;
+ /* Also stay within the destination pmd and the latency limit. */
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ if (extent > LATENCY_LIMIT)
+ extent = LATENCY_LIMIT;
+ move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr);
+ }
+
+ return len + old_addr - old_end; /* how much done */
+}
+
+/*
+ * Relocate @old_len bytes at @old_addr of @vma to a @new_len byte mapping at
+ * @new_addr: copy the vma, move the page tables, then unmap the old range.
+ *
+ * Returns @new_addr on success. Returns -ENOMEM (as an unsigned long) if the
+ * map count is too close to its limit, if copy_vma() fails, or if the page
+ * tables could not be moved completely; in the last case the entries already
+ * moved are moved back and the new area is unmapped instead of the old one.
+ */
+static unsigned long move_vma(struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long old_len,
+ unsigned long new_len, unsigned long new_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *new_vma;
+ /* Snapshot: vma->vm_flags is modified below and vma may be replaced. */
+ unsigned long vm_flags = vma->vm_flags;
+ unsigned long new_pgoff;
+ unsigned long moved_len;
+ unsigned long excess = 0;
+ unsigned long hiwater_vm;
+ int split = 0;
+
+ /*
+ * We'd prefer to avoid failure later on in do_munmap:
+ * which may split one vma into three before unmapping.
+ */
+ if (mm->map_count >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
+ new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+ /* vma is passed by reference: copy_vma() may update it. */
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+ if (!new_vma)
+ return -ENOMEM;
+
+ moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+ if (moved_len < old_len) {
+ /*
+ * On error, move entries back from new area to old,
+ * which will succeed since page tables still there,
+ * and then proceed to unmap new area instead of old.
+ */
+ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+ vma = new_vma;
+ old_len = new_len;
+ old_addr = new_addr;
+ new_addr = -ENOMEM;
+ }
+
+ /* Conceal VM_ACCOUNT so old reservation is not undone */
+ if (vm_flags & VM_ACCOUNT) {
+ vma->vm_flags &= ~VM_ACCOUNT;
+ excess = vma->vm_end - vma->vm_start - old_len;
+ /* Range strictly inside the vma: unmapping leaves two pieces. */
+ if (old_addr > vma->vm_start &&
+ old_addr + old_len < vma->vm_end)
+ split = 1;
+ }
+
+ /*
+ * If we failed to move page tables we still do total_vm increment
+ * since do_munmap() will decrement it by old_len == new_len.
+ *
+ * Since total_vm is about to be raised artificially high for a
+ * moment, we need to restore high watermark afterwards: if stats
+ * are taken meanwhile, total_vm and hiwater_vm appear too high.
+ * If this were a serious issue, we'd add a flag to do_munmap().
+ */
+ hiwater_vm = mm->hiwater_vm;
+ mm->total_vm += new_len >> PAGE_SHIFT;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+
+ if (do_munmap(mm, old_addr, old_len) < 0) {
+ /* OOM: unable to split vma, just get accounts right */
+ vm_unacct_memory(excess >> PAGE_SHIFT);
+ excess = 0;
+ }
+ mm->hiwater_vm = hiwater_vm;
+
+ /* Restore VM_ACCOUNT if one or two pieces of vma left */
+ if (excess) {
+ vma->vm_flags |= VM_ACCOUNT;
+ if (split)
+ vma->vm_next->vm_flags |= VM_ACCOUNT;
+ }
+
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += new_len >> PAGE_SHIFT;
+ /* Only the newly added tail of the mapping is passed on here. */
+ if (new_len > old_len)
+ mlock_vma_pages_range(new_vma, new_addr + old_len,
+ new_addr + new_len);
+ }
+
+ return new_addr;
+}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ *
+ * @addr: page-aligned start of the existing mapping
+ * @old_len: current length (rounded up to a page multiple; may be 0)
+ * @new_len: requested length (rounded up to a page multiple; must not be 0)
+ * @flags: MREMAP_MAYMOVE and/or MREMAP_FIXED
+ * @new_addr: destination address, only used with MREMAP_FIXED
+ *
+ * Returns the (page-aligned) address of the resulting mapping, or a negative
+ * errno encoded in the unsigned long return value: -EINVAL for bad
+ * arguments or hugetlb vmas, -EFAULT if the old range is not inside a single
+ * vma (or the vma may not be expanded), -EAGAIN if the locked-memory limit
+ * would be exceeded, and -ENOMEM if the mapping cannot be grown or moved,
+ * including when the extra commit charge is refused.
+ *
+ * The sys_mremap() wrapper calls this with current->mm->mmap_sem held for
+ * write.
+ */
+unsigned long do_mremap(unsigned long addr,
+ unsigned long old_len, unsigned long new_len,
+ unsigned long flags, unsigned long new_addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret = -EINVAL;
+ unsigned long charged = 0;
+
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ goto out;
+
+ if (addr & ~PAGE_MASK)
+ goto out;
+
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
+
+ /*
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
+ */
+ if (!new_len)
+ goto out;
+
+ /* new_addr is only valid if MREMAP_FIXED is specified */
+ if (flags & MREMAP_FIXED) {
+ if (new_addr & ~PAGE_MASK)
+ goto out;
+ if (!(flags & MREMAP_MAYMOVE))
+ goto out;
+
+ if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+ goto out;
+
+ /* Check if the location we're moving into overlaps the
+ * old location at all, and fail if it does.
+ */
+ if ((new_addr <= addr) && (new_addr+new_len) > addr)
+ goto out;
+
+ if ((addr <= new_addr) && (addr+old_len) > new_addr)
+ goto out;
+
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
+
+ /* Clear whatever is currently mapped at the destination. */
+ ret = do_munmap(mm, new_addr, new_len);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * Always allow a shrinking remap: that just unmaps
+ * the unnecessary pages..
+ * do_munmap does all the needed commit accounting
+ */
+ if (old_len >= new_len) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ if (ret && old_len != new_len)
+ goto out;
+ ret = addr;
+ if (!(flags & MREMAP_FIXED) || (new_addr == addr))
+ goto out;
+ /* Shrunk in place; what remains is a same-size relocation. */
+ old_len = new_len;
+ }
+
+ /*
+ * Ok, we need to grow.. or relocate.
+ */
+ ret = -EFAULT;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ goto out;
+ if (is_vm_hugetlb_page(vma)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* We can't remap across vm area boundaries */
+ if (old_len > vma->vm_end - addr)
+ goto out;
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
+ if (new_len > old_len)
+ goto out;
+ }
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = mm->locked_vm << PAGE_SHIFT;
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ locked += new_len - old_len;
+ ret = -EAGAIN;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ goto out;
+ }
+ if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ charged = (new_len - old_len) >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged)) {
+ /*
+ * A refused commit charge is a lack of memory. Set the
+ * error explicitly: ret still holds -EFAULT (or -EAGAIN
+ * for a VM_LOCKED vma) from the checks above, and
+ * returning either would misreport the failure.
+ * out_nc skips the unaccounting since nothing was
+ * charged.
+ */
+ ret = -ENOMEM;
+ goto out_nc;
+ }
+ }
+
+ /* old_len exactly to the end of the area..
+ * And we're not relocating the area.
+ */
+ if (old_len == vma->vm_end - addr &&
+ !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
+ (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
+ unsigned long max_addr = TASK_SIZE;
+ if (vma->vm_next)
+ max_addr = vma->vm_next->vm_start;
+ /* can we just expand the current mapping? */
+ if (max_addr - addr >= new_len) {
+ /*
+ * long rather than int: the page count of a very large
+ * growth does not fit in an int on 64-bit.
+ */
+ long pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ vma_adjust(vma, vma->vm_start,
+ addr + new_len, vma->vm_pgoff, NULL);
+
+ mm->total_vm += pages;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
+ if (vma->vm_flags & VM_LOCKED) {
+ mm->locked_vm += pages;
+ mlock_vma_pages_range(vma, addr + old_len,
+ addr + new_len);
+ }
+ ret = addr;
+ goto out;
+ }
+ }
+
+ /*
+ * We weren't able to just expand or shrink the area,
+ * we need to create a new one and move it..
+ */
+ ret = -ENOMEM;
+ if (flags & MREMAP_MAYMOVE) {
+ if (!(flags & MREMAP_FIXED)) {
+ unsigned long map_flags = 0;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+ vma->vm_pgoff, map_flags);
+ /* A non-page-aligned result is an error code, not an address. */
+ if (new_addr & ~PAGE_MASK) {
+ ret = new_addr;
+ goto out;
+ }
+
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
+ }
+ ret = move_vma(vma, addr, old_len, new_len, new_addr);
+ }
+out:
+ /* Error returns are not page aligned: give back any commit charge. */
+ if (ret & ~PAGE_MASK)
+ vm_unacct_memory(charged);
+out_nc:
+ return ret;
+}
+
+/*
+ * mremap() system call: run do_mremap() on the current process with
+ * current->mm->mmap_sem held for write, and return its result (a new
+ * address or a negative errno).
+ */
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
+{
+ unsigned long ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+ up_write(&current->mm->mmap_sem);
+ return ret;
+}
diff --git a/mm/msync.c b/mm/msync.c
new file mode 100644
index 0000000..9e06d2e
--- /dev/null
+++ b/mm/msync.c
@@ -0,0 +1,103 @@
+/*
+ * linux/mm/msync.c
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+/*
+ * The msync() system call.
+ */
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+
+/*
+ * MS_SYNC syncs the entire file - including mappings.
+ *
+ * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
+ * Nor does it mark the relevant pages dirty (it used to up to 2.6.17).
+ * Now it doesn't do anything, since dirty pages are properly tracked.
+ *
+ * The application may now run fsync() to
+ * write out the dirty pages and wait on the writeout and check the result.
+ * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
+ * async writeout immediately.
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * applications.
+ *
+ * Returns 0 on success, -EINVAL for unknown flags, an unaligned start or
+ * MS_ASYNC combined with MS_SYNC, -ENOMEM if the range wraps or touches
+ * unmapped addresses, -EBUSY for MS_INVALIDATE on a VM_LOCKED vma, or the
+ * error from do_fsync().
+ */
+SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
+{
+ unsigned long end;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+ goto out;
+ if (start & ~PAGE_MASK)
+ goto out;
+ if ((flags & MS_ASYNC) && (flags & MS_SYNC))
+ goto out;
+ error = -ENOMEM;
+ /* Round len up to a whole number of pages. */
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+ error = 0;
+ if (end == start)
+ goto out;
+ /*
+ * If the interval [start,end) covers some unmapped address ranges,
+ * just ignore them, but return -ENOMEM at the end.
+ */
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ for (;;) {
+ struct file *file;
+
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ /* Hole before this vma: skip it but remember the error. */
+ start = vma->vm_start;
+ if (start >= end)
+ goto out_unlock;
+ unmapped_error = -ENOMEM;
+ }
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if ((flags & MS_INVALIDATE) &&
+ (vma->vm_flags & VM_LOCKED)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+ file = vma->vm_file;
+ start = vma->vm_end;
+ if ((flags & MS_SYNC) && file &&
+ (vma->vm_flags & VM_SHARED)) {
+ /*
+ * Pin the file and drop mmap_sem around do_fsync(); the
+ * vma is looked up again afterwards instead of following
+ * vm_next, since the vma list may have changed meanwhile.
+ */
+ get_file(file);
+ up_read(&mm->mmap_sem);
+ error = do_fsync(file, 0);
+ fput(file);
+ if (error || start >= end)
+ goto out;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ } else {
+ if (start >= end) {
+ error = 0;
+ goto out_unlock;
+ }
+ vma = vma->vm_next;
+ }
+ }
+out_unlock:
+ up_read(&mm->mmap_sem);
+out:
+ /* GNU "?:" extension: a real error wins over the unmapped-hole error. */
+ return error ? : unmapped_error;
+}
diff --git a/mm/nommu.c b/mm/nommu.c
new file mode 100644
index 0000000..b8fade4
--- /dev/null
+++ b/mm/nommu.c
@@ -0,0 +1,1523 @@
+/*
+ * linux/mm/nommu.c
+ *
+ * Replacement code for mm functions to support CPU's that don't
+ * have any form of memory management unit (thus no virtual memory).
+ *
+ * See Documentation/nommu-mmap.txt
+ *
+ * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com>
+ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
+ * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
+ * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
+ * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org>
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+void *high_memory;
+struct page *mem_map;
+unsigned long max_mapnr;
+unsigned long num_physpages;
+/* running byte totals kept by the mmap/munmap paths in this file:
+ * askedalloc counts what was requested, realalloc what kobjsize() reports */
+unsigned long askedalloc, realalloc;
+atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
+int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int heap_stack_gap = 0;
+
+EXPORT_SYMBOL(mem_map);
+EXPORT_SYMBOL(num_physpages);
+
+/* list of shareable VMAs
+ * - an rbtree ordered by vm_start (see add_nommu_vma())
+ * - nommu_vma_sem is taken for writing around updates in do_mmap_pgoff()
+ * and put_vma() */
+struct rb_root nommu_vma_tree = RB_ROOT;
+DECLARE_RWSEM(nommu_vma_sem);
+
+/* deliberately empty operations table: no handlers are provided here */
+struct vm_operations_struct generic_file_vm_ops = {
+};
+
+/*
+ * Handle all mappings that got truncated by a "truncate()"
+ * system call.
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page. Ugly, but necessary.
+ */
+int vmtruncate(struct inode *inode, loff_t offset)
+{
+	unsigned long fsize_limit;
+
+	if (inode->i_size >= offset) {
+		/* shrinking (or same size): record the new size, then drop
+		 * the cached pages that now lie beyond it */
+		i_size_write(inode, offset);
+		truncate_inode_pages(inode->i_mapping, offset);
+	} else {
+		/* growing: RLIMIT_FSIZE is checked first (and raises
+		 * SIGXFSZ), then the filesystem's maximum file size */
+		fsize_limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (fsize_limit != RLIM_INFINITY && offset > fsize_limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+		if (offset > inode->i_sb->s_maxbytes)
+			return -EFBIG;
+		i_size_write(inode, offset);
+	}
+
+	/* give the filesystem its say if it supplies a truncate operation */
+	if (inode->i_op && inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+	return 0;
+}
+
+EXPORT_SYMBOL(vmtruncate);
+
+/*
+ * Report how much memory actually backs the object at @objp, which may be
+ * more than the caller originally asked for.
+ *
+ * Returns 0 for a NULL pointer or an address that virt_addr_valid() rejects.
+ * The answer is advisory only: no locking is done, so it may race.
+ */
+unsigned int kobjsize(const void *objp)
+{
+	struct page *head;
+
+	/* nothing sensible can be said about these pointers */
+	if (objp == NULL)
+		return 0;
+	if (!virt_addr_valid(objp))
+		return 0;
+
+	head = virt_to_head_page(objp);
+
+	/*
+	 * A slab page means the pointer came from kmalloc(), so ksize() is
+	 * usable.  ksize() is not guaranteed to work on anything else, so any
+	 * other pointer is sized by the (compound) page it lives in.
+	 */
+	return PageSlab(head) ? ksize(objp) : PAGE_SIZE << compound_order(head);
+}
+
+/*
+ * Collect up to @len pages starting at @start, one per PAGE_SIZE step.
+ * - @flags is a mask of GUP_FLAGS_* bits
+ * - @pages and @vmas are optional output arrays (either may be NULL)
+ * - returns the number of pages handled; if the very first page cannot be
+ *   handled, returns -EFAULT instead
+ * - @tsk is not used by this implementation
+ */
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		     struct page **pages, struct vm_area_struct **vmas)
+{
+	const int ignore_perms = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	unsigned long need;
+	int nr;
+
+	/* work out which permission bits the VMA must carry; with FORCE
+	 * only the "MAY" variants are demanded */
+	if (flags & GUP_FLAGS_WRITE)
+		need = VM_WRITE | VM_MAYWRITE;
+	else
+		need = VM_READ | VM_MAYREAD;
+
+	if (flags & GUP_FLAGS_FORCE)
+		need &= VM_MAYREAD | VM_MAYWRITE;
+	else
+		need &= VM_READ | VM_WRITE;
+
+	for (nr = 0; nr < len; nr++, start += PAGE_SIZE) {
+		struct vm_area_struct *vma = find_vma(mm, start);
+
+		if (!vma)
+			break;
+
+		/* I/O and PFN mappings are never handed out */
+		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+			break;
+		if (!ignore_perms && !(need & vma->vm_flags))
+			break;
+
+		if (pages) {
+			struct page *pg = virt_to_page(start);
+
+			pages[nr] = pg;
+			if (pg)
+				page_cache_get(pg);
+		}
+		if (vmas)
+			vmas[nr] = vma;
+	}
+
+	/* stopped early: partial progress counts, none at all is a fault */
+	if (nr < len)
+		return nr ? nr : -EFAULT;
+
+	return nr;
+}
+
+
+/*
+ * Fetch the pages covering an address range of the given process and report
+ * which VMA each one belongs to.
+ * - @write and @force are translated into GUP_FLAGS_WRITE / GUP_FLAGS_FORCE
+ *   and the work is done by __get_user_pages()
+ * - this is potentially dodgy: the page whose count gets incremented may be
+ *   a slab page or a secondary page of a compound page
+ * - VMAs that don't support access, such as I/O mappings, are refused
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		   unsigned long start, int len, int write, int force,
+		   struct page **pages, struct vm_area_struct **vmas)
+{
+	int gup_flags = (write ? GUP_FLAGS_WRITE : 0) |
+			(force ? GUP_FLAGS_FORCE : 0);
+
+	return __get_user_pages(tsk, mm, start, len, gup_flags, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages);
+
+/* vmalloc bookkeeping symbols: defined here but not referenced by the
+ * NOMMU allocation routines in this file */
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+/*
+ * NOMMU vfree(): the vmalloc family in this file allocates with kmalloc()
+ * (see __vmalloc()), so the memory is released with kfree().
+ */
+void vfree(const void *addr)
+{
+ kfree(addr);
+}
+EXPORT_SYMBOL(vfree);
+
+/*
+ * NOMMU __vmalloc(): a plain kmalloc() of @size bytes.
+ * - __GFP_COMP is always added to @gfp_mask and __GFP_HIGHMEM always removed
+ * - @prot is not used
+ * - returns whatever kmalloc() returns
+ */
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+{
+ /*
+ * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
+ * returns only a logical address.
+ */
+ return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(__vmalloc);
+
+/*
+ * Allocate zeroed memory intended to be mapped into userspace.
+ * - the memory comes from __vmalloc() with __GFP_ZERO
+ * - if the current process has a VMA covering the returned address, that
+ *   VMA is flagged VM_USERMAP (done under mmap_sem held for writing)
+ * - returns NULL if the allocation fails
+ */
+void *vmalloc_user(unsigned long size)
+{
+	struct vm_area_struct *vma;
+	void *mem;
+
+	mem = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	if (!mem)
+		return NULL;
+
+	down_write(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, (unsigned long)mem);
+	if (vma)
+		vma->vm_flags |= VM_USERMAP;
+	up_write(&current->mm->mmap_sem);
+
+	return mem;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+/*
+ * NOMMU vmalloc_to_page(): @addr is converted directly with virt_to_page();
+ * no page-table walk is involved.
+ */
+struct page *vmalloc_to_page(const void *addr)
+{
+ return virt_to_page(addr);
+}
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * NOMMU vmalloc_to_pfn(): page frame number of the page that
+ * virt_to_page() yields for @addr.
+ */
+unsigned long vmalloc_to_pfn(const void *addr)
+{
+ return page_to_pfn(virt_to_page(addr));
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
+/*
+ * Copy @count bytes from @addr into @buf.
+ *
+ * @count is clamped first so that neither the source range starting at @addr
+ * nor the destination range starting at @buf can wrap past the top of the
+ * address space (the same guard vwrite() applies to its destination).
+ *
+ * Returns the number of bytes actually copied.
+ */
+long vread(char *buf, char *addr, unsigned long count)
+{
+	/* Don't allow overflow */
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
+	if ((unsigned long) buf + count < count)
+		count = -(unsigned long) buf;
+
+	memcpy(buf, addr, count);
+	return count;
+}
+
+/*
+ * Copy @count bytes from @buf to @addr.
+ * - @count is clamped so that the destination range cannot wrap past the
+ *   top of the address space
+ * - returns the number of bytes copied
+ */
+long vwrite(char *buf, char *addr, unsigned long count)
+{
+	const unsigned long dst = (unsigned long) addr;
+
+	/* dst + count wrapped: only copy up to the end of the address space */
+	if (dst + count < count)
+		count = -dst;
+
+	memcpy(addr, buf, count);
+	return count;
+}
+
+/*
+ * vmalloc - allocate virtually contiguous memory
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * In this NOMMU implementation the request is simply passed to __vmalloc()
+ * with GFP_KERNEL | __GFP_HIGHMEM and PAGE_KERNEL.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vmalloc);
+
+/*
+ * NOMMU vmalloc_node(): @node is ignored; this is just vmalloc(@size).
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+ return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+/**
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
+ *
+ * Kernel-internal function to allocate enough pages to cover @size from
+ * the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * Here this is a call to __vmalloc() with PAGE_KERNEL_EXEC, which falls
+ * back to PAGE_KERNEL when the architecture does not define it.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+
+void *vmalloc_exec(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+}
+
+/**
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ *
+ * Here this is a call to __vmalloc() with plain GFP_KERNEL.
+ */
+void *vmalloc_32(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vmalloc_32);
+
+/**
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
+ * @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
+ *
+ * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
+ * remap_vmalloc_range() are permissible.
+ *
+ * This implementation forwards to vmalloc_user() and returns its result
+ * unchanged.
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+ /*
+ * We'll have to sort out the ZONE_DMA bits for 64-bit,
+ * but for now this can simply use vmalloc_user() directly.
+ */
+ return vmalloc_user(size);
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
+/*
+ * vmap() is not implemented here: any call hits BUG().  The return
+ * statement follows BUG() only to give the function a return value.
+ */
+void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL(vmap);
+
+/*
+ * vunmap() is not implemented here: any call hits BUG().
+ */
+void vunmap(const void *addr)
+{
+ BUG();
+}
+EXPORT_SYMBOL(vunmap);
+
+/*
+ * Implement a stub for vmalloc_sync_all() if the architecture chose not to
+ * have one.
+ * - declared weak so that an architecture's own definition takes precedence
+ * - the stub does nothing
+ */
+void __attribute__((weak)) vmalloc_sync_all(void)
+{
+}
+
+/*
+ * vm_insert_page() is not supported here: all arguments are ignored and
+ * -EINVAL is always returned.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_page);
+
+/*
+ * sys_brk() for the most part doesn't need the global kernel
+ * lock, except when an application is doing something nasty
+ * like trying to un-brk an area that has already been mapped
+ * to a regular file. in this case, the unmapping will need
+ * to invoke file system routines that need the global lock.
+ */
+SYSCALL_DEFINE1(brk, unsigned long, brk)
+{
+	struct mm_struct *mm = current->mm;
+
+	/* a request outside [start_brk, context.end_brk] is refused by
+	 * handing back the current break unchanged */
+	if (brk < mm->start_brk || brk > mm->context.end_brk)
+		return mm->brk;
+
+	/* anything inside the window is accepted, whether it shrinks or
+	 * grows the break; an unchanged value needs no store */
+	if (brk != mm->brk)
+		mm->brk = brk;
+
+	return mm->brk;
+}
+
+#ifdef DEBUG
+/*
+ * Debugging aid: print every entry on the current process's VMA list.
+ * For each entry the list node and its VMA pointer are shown and, when a VMA
+ * is attached, the size kobjsize() reports for its start address, the start
+ * address itself and the VMA's usage count.
+ */
+static void show_process_blocks(void)
+{
+	struct vm_list_struct *vml;
+
+	printk("Process blocks %d:", current->pid);
+
+	/* context.vmlist is already a pointer to the first list entry, so the
+	 * walk starts from its value rather than from its address */
+	for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
+		printk(" %p: %p", vml, vml->vma);
+		if (vml->vma)
+			/* kobjsize() returns unsigned int, hence %u */
+			printk(" (%u @%lx #%d)",
+			       kobjsize((void *) vml->vma->vm_start),
+			       vml->vma->vm_start,
+			       atomic_read(&vml->vma->vm_usage));
+		printk(vml->next ? " ->" : ".\n");
+	}
+}
+#endif /* DEBUG */
+
+/*
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * - @mm: the address space whose list is updated
+ * - @vml: the list entry to insert; vml->vma must already be set
+ * - the list is kept sorted by vm_start; the entry goes in front of the first
+ *   existing entry whose VMA starts at a higher address
+ * - should be called with mm->mmap_sem held writelocked
+ */
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
+{
+	struct vm_list_struct **ppv;
+
+	/* walk the list of the mm that was passed in, not current->mm */
+	for (ppv = &mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
+		if ((*ppv)->vma->vm_start > vml->vma->vm_start)
+			break;
+
+	vml->next = *ppv;
+	*ppv = vml;
+}
+
+/*
+ * Find the VMA of @mm that contains @addr.
+ * - the candidate is the last list entry whose vm_start is <= @addr; it is
+ *   returned only if its vm_end lies above @addr, otherwise NULL
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_list_struct *ent, *last = NULL;
+
+	/* the list is ordered by vm_start, so stop at the first entry that
+	 * begins beyond addr */
+	for (ent = mm->context.vmlist;
+	     ent && ent->vma->vm_start <= addr;
+	     ent = ent->next)
+		last = ent;
+
+	if (!last || last->vma->vm_end <= addr)
+		return NULL;
+
+	return last->vma;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ * - so this is exactly find_vma(@mm, @addr)
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+/*
+ * Stack expansion is not done here: both arguments are ignored and
+ * -ENOMEM is always returned.
+ */
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return -ENOMEM;
+}
+
+/*
+ * Find the VMA of @mm whose start address is exactly @addr.
+ * - returns NULL if no VMA starts at @addr
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+						    unsigned long addr)
+{
+	struct vm_list_struct *ent = mm->context.vmlist;
+
+	/* the list is sorted by vm_start: give up once we are past addr */
+	while (ent && ent->vma->vm_start <= addr) {
+		if (ent->vma->vm_start == addr)
+			return ent->vma;
+		ent = ent->next;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up a VMA in the global tree by its exact start address.
+ * - returns a VMA whose vm_start equals @start, or NULL if the descent
+ *   reaches a leaf without finding one
+ */
+static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
+{
+	struct rb_node *node = nommu_vma_tree.rb_node;
+
+	while (node) {
+		struct vm_area_struct *cur =
+			rb_entry(node, struct vm_area_struct, vm_rb);
+
+		if (cur->vm_start == start)
+			return cur;
+
+		/* smaller keys live to the left, larger ones to the right */
+		node = start < cur->vm_start ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Register a VMA globally.
+ * - a file-backed VMA is first inserted into its file mapping's i_mmap
+ *   prio tree
+ * - the VMA is then linked into nommu_vma_tree, ordered by vm_start and,
+ *   for equal start addresses, by the address of the VMA structure
+ */
+static void add_nommu_vma(struct vm_area_struct *vma)
+{
+	struct rb_node **link = &nommu_vma_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct vm_area_struct *other;
+
+	/* hook the VMA onto the backing file's mapping */
+	if (vma->vm_file) {
+		struct address_space *mapping = vma->vm_file->f_mapping;
+
+		flush_dcache_mmap_lock(mapping);
+		vma_prio_tree_insert(vma, &mapping->i_mmap);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	/* descend to the leaf position in the master tree */
+	while (*link) {
+		parent = *link;
+		other = rb_entry(parent, struct vm_area_struct, vm_rb);
+
+		if (vma->vm_start != other->vm_start) {
+			link = vma->vm_start < other->vm_start ?
+				&parent->rb_left : &parent->rb_right;
+			continue;
+		}
+
+		/* two mappings at one address - only legitimate for
+		 * shared-mem chardevs and shared file mappings backed by
+		 * ramfs/tmpfs; the same VMA must never be added twice */
+		BUG_ON(!(other->vm_flags & VM_SHARED));
+		BUG_ON(vma == other);
+
+		link = vma < other ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&vma->vm_rb, parent, link);
+	rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
+}
+
+/*
+ * delete a VMA from the global list
+ * - undoes add_nommu_vma(): the VMA is taken off its file mapping's i_mmap
+ * prio tree (if it has a file) and erased from nommu_vma_tree
+ * - the VMA itself is not freed here
+ */
+static void delete_nommu_vma(struct vm_area_struct *vma)
+{
+ struct address_space *mapping;
+
+ /* remove the VMA from the mapping */
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+
+ flush_dcache_mmap_lock(mapping);
+ vma_prio_tree_remove(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ }
+
+ /* remove from the master list */
+ rb_erase(&vma->vm_rb, &nommu_vma_tree);
+}
+
+/*
+ * determine whether a mapping should be permitted and, if so, what sort of
+ * mapping we're capable of supporting
+ * - on success returns 0 and stores the usable BDI_CAP_* bits in
+ * *_capabilities
+ * - on failure returns a negative errno and leaves *_capabilities untouched
+ * - len and prot are adjusted on local copies only; the caller's values are
+ * not modified
+ * - pgoff is only used for the offset-overflow check
+ */
+static int validate_mmap_request(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long pgoff,
+ unsigned long *_capabilities)
+{
+ unsigned long capabilities;
+ unsigned long reqprot = prot;
+ int ret;
+
+ /* do the simple checks first */
+ /* any non-zero address hint is rejected just like MAP_FIXED */
+ if (flags & MAP_FIXED || addr) {
+ printk(KERN_DEBUG
+ "%d: Can't do fixed-address/overlay mmap of RAM\n",
+ current->pid);
+ return -EINVAL;
+ }
+
+ if ((flags & MAP_TYPE) != MAP_PRIVATE &&
+ (flags & MAP_TYPE) != MAP_SHARED)
+ return -EINVAL;
+
+ if (!len)
+ return -EINVAL;
+
+ /* Careful about overflows.. */
+ len = PAGE_ALIGN(len);
+ if (!len || len > TASK_SIZE)
+ return -ENOMEM;
+
+ /* offset overflow? */
+ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+ return -EOVERFLOW;
+
+ if (file) {
+ /* validate file mapping requests */
+ struct address_space *mapping;
+
+ /* files must support mmap */
+ if (!file->f_op || !file->f_op->mmap)
+ return -ENODEV;
+
+ /* work out if what we've got could possibly be shared
+ * - we support chardevs that provide their own "memory"
+ * - we support files/blockdevs that are memory backed
+ */
+ mapping = file->f_mapping;
+ if (!mapping)
+ mapping = file->f_path.dentry->d_inode->i_mapping;
+
+ capabilities = 0;
+ if (mapping && mapping->backing_dev_info)
+ capabilities = mapping->backing_dev_info->capabilities;
+
+ if (!capabilities) {
+ /* no explicit capabilities set, so assume some
+ * defaults */
+ switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFBLK:
+ capabilities = BDI_CAP_MAP_COPY;
+ break;
+
+ case S_IFCHR:
+ capabilities =
+ BDI_CAP_MAP_DIRECT |
+ BDI_CAP_READ_MAP |
+ BDI_CAP_WRITE_MAP;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* eliminate any capabilities that we can't support on this
+ * device */
+ if (!file->f_op->get_unmapped_area)
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ if (!file->f_op->read)
+ capabilities &= ~BDI_CAP_MAP_COPY;
+
+ if (flags & MAP_SHARED) {
+ /* do checks for writing, appending and locking */
+ if ((prot & PROT_WRITE) &&
+ !(file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ if (IS_APPEND(file->f_path.dentry->d_inode) &&
+ (file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ if (locks_verify_locked(file->f_path.dentry->d_inode))
+ return -EAGAIN;
+
+ /* a shared file mapping must be mapped directly */
+ if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ return -ENODEV;
+
+ if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
+ ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+ ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ ) {
+ printk("MAP_SHARED not completely supported on !MMU\n");
+ return -EINVAL;
+ }
+
+ /* we mustn't privatise shared mappings */
+ capabilities &= ~BDI_CAP_MAP_COPY;
+ }
+ else {
+ /* we're going to read the file into private memory we
+ * allocate */
+ if (!(capabilities & BDI_CAP_MAP_COPY))
+ return -ENODEV;
+
+ /* we don't permit a private writable mapping to be
+ * shared with the backing device */
+ if (prot & PROT_WRITE)
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ }
+
+ /* handle executable mappings and implied executable
+ * mappings */
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (prot & PROT_EXEC)
+ return -EPERM;
+ }
+ else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ /* handle implication of PROT_EXEC by PROT_READ */
+ if (current->personality & READ_IMPLIES_EXEC) {
+ if (capabilities & BDI_CAP_EXEC_MAP)
+ prot |= PROT_EXEC;
+ }
+ }
+ else if ((prot & PROT_READ) &&
+ (prot & PROT_EXEC) &&
+ !(capabilities & BDI_CAP_EXEC_MAP)
+ ) {
+ /* backing file is not executable, try to copy */
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ }
+ }
+ else {
+ /* anonymous mappings are always memory backed and can be
+ * privately mapped
+ */
+ capabilities = BDI_CAP_MAP_COPY;
+
+ /* handle PROT_EXEC implication by PROT_READ */
+ if ((prot & PROT_READ) &&
+ (current->personality & READ_IMPLIES_EXEC))
+ prot |= PROT_EXEC;
+ }
+
+ /* allow the security API to have its say */
+ /* reqprot is the protection as requested; prot may have gained
+ * PROT_EXEC above */
+ ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+ if (ret < 0)
+ return ret;
+
+ /* looks okay */
+ *_capabilities = capabilities;
+ return 0;
+}
+
+/*
+ * Translate an already-validated mmap request into VMA flags.
+ * - the protection and flag bits are converted, and all of VM_MAYREAD,
+ *   VM_MAYWRITE and VM_MAYEXEC are granted
+ * - VM_MAYSHARE (and VM_SHARED for MAP_SHARED) is added depending on whether
+ *   the backing device can be mapped directly (BDI_CAP_MAP_DIRECT)
+ * - returns the resulting flag word
+ */
+static unsigned long determine_vm_flags(struct file *file,
+					unsigned long prot,
+					unsigned long flags,
+					unsigned long capabilities)
+{
+	unsigned long vm_flags;
+
+	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+	vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	if (capabilities & BDI_CAP_MAP_DIRECT) {
+		/* overlay a shareable mapping on the backing device or inode
+		 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
+		 * romfs/cramfs */
+		if (flags & MAP_SHARED)
+			vm_flags |= VM_MAYSHARE | VM_SHARED;
+		else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
+			vm_flags |= VM_MAYSHARE;
+	} else if (file && !(prot & PROT_WRITE)) {
+		/* read-only copies of mapped file chunks may be shared */
+		vm_flags |= VM_MAYSHARE;
+	}
+
+	/* a traced process must not share its private mappings: breakpoints
+	 * planted in them could otherwise hit an untraced process as well */
+	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+		vm_flags &= ~VM_MAYSHARE;
+
+	return vm_flags;
+}
+
+/*
+ * Set up a shared mapping on a file by calling the file's mmap operation.
+ * - returns that operation's result, except that -ENOSYS ("direct mmap is
+ *   not possible", as opposed to "tried and failed") is reported to the
+ *   caller as -ENODEV
+ * - @len is not used
+ */
+static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
+{
+	int ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+
+	return ret == -ENOSYS ? -ENODEV : ret;
+}
+
+/*
+ * set up a private mapping or an anonymous shared mapping
+ * - @vma: the VMA being set up; on the copy path vm_start/vm_end are
+ *   rewritten to describe the kmalloc()'d buffer and VM_MAPPED_COPY is set
+ * - @len: size of the mapping in bytes
+ * - returns 0 on success (or whatever the file's mmap operation returned if
+ *   that operation handled the request), a negative errno on failure
+ */
+static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
+{
+	void *base;
+	int ret;
+
+	/* invoke the file's mapping function so that it can keep track of
+	 * shared mappings on devices or memory
+	 * - VM_MAYSHARE will be set if it may attempt to share
+	 */
+	if (vma->vm_file) {
+		ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+		if (ret != -ENOSYS) {
+			/* shouldn't return success if we're not sharing */
+			BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
+			return ret; /* success or a real error */
+		}
+
+		/* getting an ENOSYS error indicates that direct mmap isn't
+		 * possible (as opposed to tried but failed) so we'll try to
+		 * make a private copy of the data and map that instead */
+	}
+
+	/* allocate some memory to hold the mapping
+	 * - note that this may not return a page-aligned address if the object
+	 *   we're allocating is smaller than a page
+	 */
+	base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
+	if (!base)
+		goto enomem;
+
+	vma->vm_start = (unsigned long) base;
+	vma->vm_end = vma->vm_start + len;
+	vma->vm_flags |= VM_MAPPED_COPY;
+
+#ifdef WARN_ON_SLACK
+	/* the buffer just allocated is "base"; measure the slack on that */
+	if (len + WARN_ON_SLACK <= kobjsize(base))
+		printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
+		       len, current->pid, kobjsize(base) - len);
+#endif
+
+	if (vma->vm_file) {
+		/* read the contents of a file into the copy */
+		mm_segment_t old_fs;
+		loff_t fpos;
+
+		fpos = vma->vm_pgoff;
+		fpos <<= PAGE_SHIFT;
+
+		/* the read operation is handed a kernel buffer, so the
+		 * address limit is switched to KERNEL_DS around the call */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+		set_fs(old_fs);
+
+		if (ret < 0)
+			goto error_free;
+
+		/* clear the last little bit */
+		if (ret < len)
+			memset(base + ret, 0, len - ret);
+
+	} else {
+		/* if it's an anonymous mapping, then just clear it */
+		memset(base, 0, len);
+	}
+
+	return 0;
+
+error_free:
+	kfree(base);
+	vma->vm_start = 0;
+	return ret;
+
+enomem:
+	printk("Allocation of length %lu from process %d failed\n",
+	       len, current->pid);
+	show_free_areas();
+	return -ENOMEM;
+}
+
+/*
+ * handle mapping creation for uClinux
+ * - the request is checked by validate_mmap_request() and turned into VMA
+ * flags by determine_vm_flags()
+ * - an existing VMA from nommu_vma_tree is reused when an exactly matching
+ * shareable mapping of the same file is found; otherwise a new VMA is
+ * created and set up by do_mmap_shared_file() or do_mmap_private()
+ * - nommu_vma_sem is held for writing from the sharing search until the
+ * mapping has been added to current->mm
+ * - returns the start address of the mapping, or a negative errno converted
+ * to unsigned long
+ * - NOTE(review): presumably the caller holds current->mm->mmap_sem for
+ * writing, as add_vma_to_mm() asks for that - confirm against callers
+ */
+unsigned long do_mmap_pgoff(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long pgoff)
+{
+ struct vm_list_struct *vml = NULL;
+ struct vm_area_struct *vma = NULL;
+ struct rb_node *rb;
+ unsigned long capabilities, vm_flags;
+ void *result;
+ int ret;
+
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
+ /* decide whether we should attempt the mapping, and if so what sort of
+ * mapping */
+ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
+ &capabilities);
+ if (ret < 0)
+ return ret;
+
+ /* we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags */
+ vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
+ /* we're going to need to record the mapping if it works */
+ vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+ if (!vml)
+ goto error_getting_vml;
+
+ down_write(&nommu_vma_sem);
+
+ /* if we want to share, we need to check for VMAs created by other
+ * mmap() calls that overlap with our proposed mapping
+ * - we can only share with an exact match on most regular files
+ * - shared mappings on character devices and memory backed files are
+ * permitted to overlap inexactly as far as we are concerned for in
+ * these cases, sharing is handled in the driver or filesystem rather
+ * than here
+ */
+ if (vm_flags & VM_MAYSHARE) {
+ unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned long vmpglen;
+
+ /* suppress VMA sharing for shared regions */
+ if (vm_flags & VM_SHARED &&
+ capabilities & BDI_CAP_MAP_DIRECT)
+ goto dont_share_VMAs;
+
+ for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
+ vma = rb_entry(rb, struct vm_area_struct, vm_rb);
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ continue;
+
+ /* search for overlapping mappings on the same file */
+ if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
+ continue;
+
+ /* skip VMAs whose page range lies wholly above or below ours */
+ if (vma->vm_pgoff >= pgoff + pglen)
+ continue;
+
+ vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
+ vmpglen >>= PAGE_SHIFT;
+ if (pgoff >= vma->vm_pgoff + vmpglen)
+ continue;
+
+ /* handle inexactly overlapping matches between mappings */
+ if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
+ if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ goto sharing_violation;
+ continue;
+ }
+
+ /* we've found a VMA we can share */
+ atomic_inc(&vma->vm_usage);
+
+ vml->vma = vma;
+ result = (void *) vma->vm_start;
+ goto shared;
+ }
+
+ dont_share_VMAs:
+ /* forget the loop cursor so that the error path frees nothing */
+ vma = NULL;
+
+ /* obtain the address at which to make a shared mapping
+ * - this is the hook for quasi-memory character devices to
+ * tell us the location of a shared mapping
+ */
+ if (file && file->f_op->get_unmapped_area) {
+ addr = file->f_op->get_unmapped_area(file, addr, len,
+ pgoff, flags);
+ if (IS_ERR((void *) addr)) {
+ /* ret is an int: the errno encoded in addr is
+ * narrowed here */
+ ret = addr;
+ if (ret != (unsigned long) -ENOSYS)
+ goto error;
+
+ /* the driver refused to tell us where to site
+ * the mapping so we'll have to attempt to copy
+ * it */
+ ret = (unsigned long) -ENODEV;
+ if (!(capabilities & BDI_CAP_MAP_COPY))
+ goto error;
+
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ }
+ }
+ }
+
+ /* we're going to need a VMA struct as well */
+ vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!vma)
+ goto error_getting_vma;
+
+ INIT_LIST_HEAD(&vma->anon_vma_node);
+ atomic_set(&vma->vm_usage, 1);
+ if (file) {
+ /* this reference is dropped again by the error path below
+ * or by put_vma() */
+ get_file(file);
+ if (vm_flags & VM_EXECUTABLE) {
+ added_exe_file_vma(current->mm);
+ vma->vm_mm = current->mm;
+ }
+ }
+ vma->vm_file = file;
+ vma->vm_flags = vm_flags;
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+ vma->vm_pgoff = pgoff;
+
+ vml->vma = vma;
+
+ /* set up the mapping */
+ if (file && vma->vm_flags & VM_SHARED)
+ ret = do_mmap_shared_file(vma, len);
+ else
+ ret = do_mmap_private(vma, len);
+ if (ret < 0)
+ goto error;
+
+ /* okay... we have a mapping; now we have to register it */
+ result = (void *) vma->vm_start;
+
+ if (vma->vm_flags & VM_MAPPED_COPY) {
+ realalloc += kobjsize(result);
+ askedalloc += len;
+ }
+
+ realalloc += kobjsize(vma);
+ askedalloc += sizeof(*vma);
+
+ current->mm->total_vm += len >> PAGE_SHIFT;
+
+ add_nommu_vma(vma);
+
+ shared:
+ /* reached both for a freshly created VMA and for a reused one */
+ realalloc += kobjsize(vml);
+ askedalloc += sizeof(*vml);
+
+ add_vma_to_mm(current->mm, vml);
+
+ up_write(&nommu_vma_sem);
+
+ if (prot & PROT_EXEC)
+ flush_icache_range((unsigned long) result,
+ (unsigned long) result + len);
+
+#ifdef DEBUG
+ printk("do_mmap:\n");
+ show_process_blocks();
+#endif
+
+ return (unsigned long) result;
+
+ error:
+ /* vma is NULL here unless a new VMA had already been allocated */
+ up_write(&nommu_vma_sem);
+ kfree(vml);
+ if (vma) {
+ if (vma->vm_file) {
+ fput(vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
+ kfree(vma);
+ }
+ return ret;
+
+ sharing_violation:
+ up_write(&nommu_vma_sem);
+ printk("Attempt to share mismatched mappings\n");
+ kfree(vml);
+ return -EINVAL;
+
+ error_getting_vma:
+ up_write(&nommu_vma_sem);
+ kfree(vml);
+ printk("Allocation of vma for %lu byte allocation from process %d failed\n",
+ len, current->pid);
+ show_free_areas();
+ return -ENOMEM;
+
+ error_getting_vml:
+ /* nommu_vma_sem has not been taken yet on this path */
+ printk("Allocation of vml for %lu byte allocation from process %d failed\n",
+ len, current->pid);
+ show_free_areas();
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+/*
+ * Drop one reference to a VMA and dispose of it when that was the last one.
+ * - a NULL @vma is accepted and ignored
+ * - the whole operation runs with nommu_vma_sem held for writing
+ * - on the final put the VMA leaves the global tree, its close operation
+ *   (if any) is called, a private copy (VM_MAPPED_COPY) is freed, the file
+ *   reference is released and the VMA structure itself is freed
+ */
+static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	if (!vma)
+		return;
+
+	down_write(&nommu_vma_sem);
+
+	if (!atomic_dec_and_test(&vma->vm_usage))
+		goto out_unlock;
+
+	delete_nommu_vma(vma);
+
+	if (vma->vm_ops && vma->vm_ops->close)
+		vma->vm_ops->close(vma);
+
+	/* only memory we copied into ourselves is ours to free; IO memory
+	 * and memory shared directly out of the pagecache from ramfs/tmpfs
+	 * mustn't be released here */
+	if (vma->vm_flags & VM_MAPPED_COPY) {
+		realalloc -= kobjsize((void *) vma->vm_start);
+		askedalloc -= vma->vm_end - vma->vm_start;
+		kfree((void *) vma->vm_start);
+	}
+
+	realalloc -= kobjsize(vma);
+	askedalloc -= sizeof(*vma);
+
+	if (vma->vm_file) {
+		fput(vma->vm_file);
+		if (vma->vm_flags & VM_EXECUTABLE)
+			removed_exe_file_vma(mm);
+	}
+	kfree(vma);
+
+out_unlock:
+	up_write(&nommu_vma_sem);
+}
+
+/*
+ * Release one mapping of @mm.
+ * - under NOMMU conditions the request has to name a mapping exactly: @addr
+ *   must be the start of a VMA and @addr + @len its end (a @len of 0 matches
+ *   any VMA starting at @addr)
+ * - returns 0 on success, -EINVAL if no such mapping is on the list
+ */
+int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
+{
+	struct vm_list_struct **link, *hit = NULL;
+	const unsigned long end = addr + len;
+
+#ifdef DEBUG
+	printk("do_munmap:\n");
+#endif
+
+	/* the list is sorted by start address, so the search can stop at the
+	 * first VMA that begins beyond addr */
+	for (link = &mm->context.vmlist; *link; link = &(*link)->next) {
+		struct vm_area_struct *vma = (*link)->vma;
+
+		if (vma->vm_start > addr)
+			break;
+		if (vma->vm_start != addr)
+			continue;
+		if (len == 0 || vma->vm_end == end) {
+			hit = *link;
+			break;
+		}
+	}
+
+	if (!hit) {
+		printk("munmap of non-mmaped memory by process %d (%s): %p\n",
+		       current->pid, current->comm, (void *) addr);
+		return -EINVAL;
+	}
+
+	put_vma(mm, hit->vma);
+
+	/* unlink the list entry and account for its disappearance */
+	*link = hit->next;
+	realalloc -= kobjsize(hit);
+	askedalloc -= sizeof(*hit);
+	kfree(hit);
+
+	update_hiwater_vm(mm);
+	mm->total_vm -= len >> PAGE_SHIFT;
+
+#ifdef DEBUG
+	show_process_blocks();
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL(do_munmap);
+
+/*
+ * munmap() system call: do_munmap() on the current process's mm with
+ * mmap_sem held for writing; returns do_munmap()'s result.
+ */
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+
+ down_write(&mm->mmap_sem);
+ ret = do_munmap(mm, addr, len);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Tear down every mapping of @mm.
+ * - a NULL @mm is accepted and ignored
+ * - total_vm is reset and each list entry is unlinked, its VMA reference
+ *   dropped with put_vma(), and the entry itself freed
+ */
+void exit_mmap(struct mm_struct * mm)
+{
+	struct vm_list_struct *vml;
+
+	if (!mm)
+		return;
+
+#ifdef DEBUG
+	printk("Exit_mmap:\n");
+#endif
+
+	mm->total_vm = 0;
+
+	/* always pop from the head so the list is consistent at each step */
+	for (;;) {
+		vml = mm->context.vmlist;
+		if (!vml)
+			break;
+
+		mm->context.vmlist = vml->next;
+		put_vma(mm, vml->vma);
+
+		realalloc -= kobjsize(vml);
+		askedalloc -= sizeof(*vml);
+		kfree(vml);
+	}
+
+#ifdef DEBUG
+	show_process_blocks();
+#endif
+}
+
+/*
+ * do_brk() is not supported here: both arguments are ignored and -ENOMEM
+ * (converted to unsigned long) is always returned.
+ */
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+ return -ENOMEM;
+}
+
+/*
+ * Resize an existing mapping.
+ *
+ * Under NOMMU conditions a mapping can only change size in place: the new
+ * length must still fit in the object that kobjsize() reports for the
+ * mapping's address, and the mapping must not be shareable.  The mapping is
+ * never moved, so MREMAP_FIXED is only tolerated when @new_addr equals @addr.
+ *
+ * Returns the (unchanged) start address on success, otherwise a negative
+ * errno converted to unsigned long:
+ *   -EINVAL  @new_len is 0, MREMAP_FIXED asks for a move, or no VMA starts
+ *            at @addr
+ *   -EFAULT  @old_len does not match the VMA's current length
+ *   -EPERM   the VMA is marked VM_MAYSHARE
+ *   -ENOMEM  @new_len does not fit in the underlying allocation
+ */
+unsigned long do_mremap(unsigned long addr,
+			unsigned long old_len, unsigned long new_len,
+			unsigned long flags, unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+
+	/* reject nonsensical requests before looking at the VMA list */
+	if (!new_len)
+		return (unsigned long) -EINVAL;
+
+	if ((flags & MREMAP_FIXED) && addr != new_addr)
+		return (unsigned long) -EINVAL;
+
+	vma = find_vma_exact(current->mm, addr);
+	if (vma == NULL)
+		return (unsigned long) -EINVAL;
+
+	if (vma->vm_start + old_len != vma->vm_end)
+		return (unsigned long) -EFAULT;
+
+	if (vma->vm_flags & VM_MAYSHARE)
+		return (unsigned long) -EPERM;
+
+	if (kobjsize((void *) addr) < new_len)
+		return (unsigned long) -ENOMEM;
+
+	/* everything checks out: move the end and fix up the accounting */
+	vma->vm_end = vma->vm_start + new_len;
+	askedalloc += new_len - old_len;
+
+	return vma->vm_start;
+}
+EXPORT_SYMBOL(do_mremap);
+
+/*
+ * mremap() system call: do_mremap() with the current process's mmap_sem
+ * held for writing; returns do_mremap()'s result.
+ */
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
+{
+ unsigned long ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+ up_write(&current->mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * follow_page() stub: all arguments are ignored and NULL is always returned.
+ */
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
+{
+ return NULL;
+}
+
+/*
+ * NOMMU remap_pfn_range(): the only effect is to set the VMA's start
+ * address to vm_pgoff << PAGE_SHIFT.
+ * - @from, @to, @size and @prot are ignored
+ * - always returns 0
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
+ unsigned long to, unsigned long size, pgprot_t prot)
+{
+ vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+ return 0;
+}
+EXPORT_SYMBOL(remap_pfn_range);
+
+/*
+ * remap_vmalloc_range - point a VMA directly at a buffer
+ * @vma: mapping to redirect; must have VM_USERMAP set
+ * @addr: base address of the buffer
+ * @pgoff: offset into the buffer, in pages
+ *
+ * The VMA keeps its length but is moved so that it starts at
+ * @addr + (@pgoff << PAGE_SHIFT).
+ *
+ * Returns 0 on success, -EINVAL if the VMA is not marked VM_USERMAP.
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+			unsigned long pgoff)
+{
+	/*
+	 * vm_start and vm_end are unsigned long, so the length must be held
+	 * in the same width; an unsigned int would truncate mappings of
+	 * 4GiB or more where long is 64 bits.
+	 */
+	unsigned long size = vma->vm_end - vma->vm_start;
+
+	if (!(vma->vm_flags & VM_USERMAP))
+		return -EINVAL;
+
+	vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
+	vma->vm_end = vma->vm_start + size;
+
+	return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
+/* swap_unplug_io_fn - intentionally empty in this file; both arguments are ignored */
+void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+}
+
+/*
+ * arch_get_unmapped_area - stub in this file: never finds an area and always
+ * returns -ENOMEM (converted to unsigned long). All arguments are ignored.
+ */
+unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ return -ENOMEM;
+}
+
+/* arch_unmap_area - intentionally empty in this file; both arguments are ignored */
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
+{
+}
+
+/* unmap_mapping_range - intentionally empty in this file; all arguments are ignored */
+void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen,
+ int even_cows)
+{
+}
+EXPORT_SYMBOL(unmap_mapping_range);
+
+/*
+ * find somewhere to place a new mapping
+ *
+ * the search is delegated to a callback: the one supplied by the file's
+ * f_op if it has one, otherwise the one recorded in the current process's
+ * mm.  If neither exists, -ENOSYS is returned (as an unsigned long);
+ * otherwise the callback's result is passed through unchanged.
+ */
+unsigned long get_unmapped_area(struct file *file, unsigned long addr,
+				unsigned long len, unsigned long pgoff,
+				unsigned long flags)
+{
+	unsigned long (*finder)(struct file *, unsigned long, unsigned long,
+				unsigned long, unsigned long);
+
+	/* start from the per-mm default ... */
+	finder = current->mm->get_unmapped_area;
+
+	/* ... and let the file override it */
+	if (file != NULL && file->f_op != NULL &&
+	    file->f_op->get_unmapped_area != NULL)
+		finder = file->f_op->get_unmapped_area;
+
+	if (finder == NULL)
+		return -ENOSYS;
+
+	return finder(file, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL(get_unmapped_area);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ unsigned long free, allowed;
+
+ /*
+ * Charge the request up front. Every "return 0" below leaves the
+ * charge in place; the error path undoes it with vm_unacct_memory().
+ */
+ vm_acct_memory(pages);
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ /*
+ * Heuristic mode: estimate what could be made available (file pages,
+ * swap, reclaimable slab and, only if needed, free pages) and compare
+ * the request against that estimate.
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+ unsigned long n;
+
+ free = global_page_state(NR_FILE_PAGES);
+ free += nr_swap_pages;
+
+ /*
+ * Any slabs which are created with the
+ * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+ * which are reclaimable, under pressure. The dentry
+ * cache and most inode caches should fall into this
+ */
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
+
+ /*
+ * Leave the last 3% for root
+ */
+ if (!cap_sys_admin)
+ free -= free / 32;
+
+ /* note: `free' is unsigned long while `pages' is a signed long */
+ if (free > pages)
+ return 0;
+
+ /*
+ * nr_free_pages() is very expensive on large systems,
+ * only call if we're about to fail.
+ */
+ n = nr_free_pages();
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (n <= totalreserve_pages)
+ goto error;
+ else
+ n -= totalreserve_pages;
+
+ /*
+ * Leave the last 3% for root
+ */
+ if (!cap_sys_admin)
+ n -= n / 32;
+ free += n;
+
+ if (free > pages)
+ return 0;
+
+ goto error;
+ }
+
+ /*
+ * Remaining mode: compare total committed space against a fixed
+ * allowance of sysctl_overcommit_ratio percent of RAM plus all swap.
+ */
+ allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+ /*
+ * Leave the last 3% for root
+ */
+ if (!cap_sys_admin)
+ allowed -= allowed / 32;
+ allowed += total_swap_pages;
+
+ /* Don't let a single process grow too big:
+ leave 3% of the size of this process for other processes */
+ if (mm)
+ allowed -= mm->total_vm / 32;
+
+ /*
+ * cast `allowed' as a signed long because vm_committed_space
+ * sometimes has a negative value
+ */
+ if (atomic_long_read(&vm_committed_space) < (long)allowed)
+ return 0;
+error:
+ /* refuse the request: drop the charge taken at the top */
+ vm_unacct_memory(pages);
+
+ return -ENOMEM;
+}
+
+/* in_gate_area_no_task - stub in this file: always returns 0 regardless of addr */
+int in_gate_area_no_task(unsigned long addr)
+{
+ return 0;
+}
+
+/*
+ * filemap_fault - must never be reached in this file: calling it triggers
+ * BUG(). The "return 0" only satisfies the function's return type.
+ */
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ BUG();
+ return 0;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+/*
+ * Read from or write to another process's address space.
+ * - buf must point to kernel memory
+ * - the access has to begin inside one of the target's mappings and is
+ *   clipped at the end of that mapping
+ * - writes need VM_MAYWRITE on the mapping, reads need VM_MAYREAD
+ *
+ * Returns the number of bytes actually transferred; 0 if the range wraps,
+ * the task has no mm, no mapping covers addr, or the access is not
+ * permitted.
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+
+	/* refuse ranges that wrap around the end of the address space */
+	if (addr + len < addr)
+		return 0;
+
+	mm = get_task_mm(tsk);
+	if (mm == NULL)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_vma(mm, addr);
+	if (vma == NULL) {
+		len = 0;
+		goto out;
+	}
+
+	/* clip the transfer to the end of this mapping */
+	if (addr + len >= vma->vm_end)
+		len = vma->vm_end - addr;
+
+	/* copy_*_user() return the bytes NOT copied; subtract them */
+	if (write) {
+		if (vma->vm_flags & VM_MAYWRITE)
+			len -= copy_to_user((void *) addr, buf, len);
+		else
+			len = 0;
+	} else {
+		if (vma->vm_flags & VM_MAYREAD)
+			len -= copy_from_user(buf, (void *) addr, len);
+		else
+			len = 0;
+	}
+
+out:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return len;
+}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 0000000..a0a0190
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,593 @@
+/*
+ * linux/mm/oom_kill.c
+ *
+ * Copyright (C) 1998,2000 Rik van Riel
+ * Thanks go out to Claus Fischer for some serious inspiration and
+ * for goading me into coding this file...
+ *
+ * The routines in this file are used to kill a process when
+ * we're seriously out of memory. This gets called from __alloc_pages()
+ * in mm/page_alloc.c when we really run out of memory.
+ *
+ * Since we won't call these routines often (on a well-configured
+ * machine) this file will double as a 'coding guide' and a signpost
+ * for newbie kernel hackers. It features several pointers to major
+ * kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/oom.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/cpuset.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/memcontrol.h>
+#include <linux/security.h>
+
+int sysctl_panic_on_oom;
+int sysctl_oom_kill_allocating_task;
+int sysctl_oom_dump_tasks;
+static DEFINE_SPINLOCK(zone_scan_mutex);
+/* #define DEBUG */
+
+/**
+ * badness - calculate a numeric value for how bad this task has been
+ * @p: task struct of which task we should calculate
+ * @uptime: current uptime in seconds
+ *
+ * The formula used is relatively simple and documented inline in the
+ * function. The main rationale is that we want to select a good task
+ * to kill when we run out of memory.
+ *
+ * Good in this context means that:
+ * 1) we lose the minimum amount of work done
+ * 2) we recover a large amount of memory
+ * 3) we don't kill anything innocent of eating tons of memory
+ * 4) we want to kill the minimum amount of processes (one)
+ * 5) we try to kill the process the user expects us to kill, this
+ * algorithm has been meticulously tuned to meet the principle
+ * of least surprise ... (be careful when you change it)
+ */
+
+/*
+ * Returns the score for @p: 0 if the task has no mm, ULONG_MAX if it has
+ * PF_SWAPOFF set, otherwise the value built up step by step below.
+ */
+unsigned long badness(struct task_struct *p, unsigned long uptime)
+{
+ unsigned long points, cpu_time, run_time, s;
+ struct mm_struct *mm;
+ struct task_struct *child;
+
+ /* p->mm is only sampled while holding task_lock(p) */
+ task_lock(p);
+ mm = p->mm;
+ if (!mm) {
+ task_unlock(p);
+ return 0;
+ }
+
+ /*
+ * The memory size of the process is the basis for the badness.
+ */
+ points = mm->total_vm;
+
+ /*
+ * After this unlock we can no longer dereference local variable `mm'
+ */
+ task_unlock(p);
+
+ /*
+ * swapoff can easily use up all memory, so kill those first.
+ */
+ if (p->flags & PF_SWAPOFF)
+ return ULONG_MAX;
+
+ /*
+ * Processes which fork a lot of child processes are likely
+ * a good choice. We add half the vmsize of the children if they
+ * have an own mm. This prevents forking servers to flood the
+ * machine with an endless amount of children. In case a single
+ * child is eating the vast majority of memory, adding only half
+ * to the parents will make the child our kill candidate of choice.
+ */
+ list_for_each_entry(child, &p->children, sibling) {
+ task_lock(child);
+ /* `mm' is used here for pointer comparison only, never dereferenced */
+ if (child->mm != mm && child->mm)
+ points += child->mm->total_vm/2 + 1;
+ task_unlock(child);
+ }
+
+ /*
+ * CPU time is in tens of seconds and run time is in thousands
+ * of seconds. There is no particular reason for this other than
+ * that it turned out to work very well in practice.
+ */
+ cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
+ >> (SHIFT_HZ + 3);
+
+ if (uptime >= p->start_time.tv_sec)
+ run_time = (uptime - p->start_time.tv_sec) >> 10;
+ else
+ run_time = 0;
+
+ /*
+ * Divide by sqrt(cpu_time) and by the fourth root of run_time; a
+ * zero root is skipped so we never divide by zero.
+ */
+ s = int_sqrt(cpu_time);
+ if (s)
+ points /= s;
+ s = int_sqrt(int_sqrt(run_time));
+ if (s)
+ points /= s;
+
+ /*
+ * Niced processes are most likely less important, so double
+ * their badness points.
+ */
+ if (task_nice(p) > 0)
+ points *= 2;
+
+ /*
+ * Superuser processes are usually more important, so we make it
+ * less likely that we kill those.
+ */
+ if (has_capability(p, CAP_SYS_ADMIN) ||
+ has_capability(p, CAP_SYS_RESOURCE))
+ points /= 4;
+
+ /*
+ * We don't want to kill a process with direct hardware access.
+ * Not only could that mess up the hardware, but usually users
+ * tend to only have this flag set on applications they think
+ * of as important.
+ */
+ if (has_capability(p, CAP_SYS_RAWIO))
+ points /= 4;
+
+ /*
+ * If p's nodes don't overlap ours, it may still help to kill p
+ * because p may have allocated or otherwise mapped memory on
+ * this node before. However it will be less likely.
+ */
+ if (!cpuset_mems_allowed_intersects(current, p))
+ points /= 8;
+
+ /*
+ * Adjust the score by oomkilladj.
+ */
+ if (p->oomkilladj) {
+ if (p->oomkilladj > 0) {
+ /* a zero score would stay zero when shifted, so bump it to 1 first */
+ if (!points)
+ points = 1;
+ points <<= p->oomkilladj;
+ } else
+ points >>= -(p->oomkilladj);
+ }
+
+#ifdef DEBUG
+ printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
+ p->pid, p->comm, points);
+#endif
+ return points;
+}
+
+/*
+ * Classify what restricted the failing allocation.
+ *
+ * Without CONFIG_NUMA the answer is always CONSTRAINT_NONE.  With it, the
+ * zonelist is walked up to the highest zone permitted by gfp_mask:
+ *  - a zone that the cpuset check rejects gives CONSTRAINT_CPUSET;
+ *  - if, after the walk, some node in N_HIGH_MEMORY was never visited,
+ *    the result is CONSTRAINT_MEMORY_POLICY;
+ *  - otherwise CONSTRAINT_NONE.
+ */
+static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+						    gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+	struct zone *zn;
+	struct zoneref *zref;
+	enum zone_type max_idx = gfp_zone(gfp_mask);
+	nodemask_t unvisited = node_states[N_HIGH_MEMORY];
+
+	for_each_zone_zonelist(zn, zref, zonelist, max_idx) {
+		if (!cpuset_zone_allowed_softwall(zn, gfp_mask))
+			return CONSTRAINT_CPUSET;
+
+		/* this zone's node is reachable: tick it off */
+		node_clear(zone_to_nid(zn), unvisited);
+	}
+
+	if (!nodes_empty(unvisited))
+		return CONSTRAINT_MEMORY_POLICY;
+#endif
+
+	return CONSTRAINT_NONE;
+}
+
+/*
+ * Simple selection loop. We choose the process with the highest
+ * number of 'points'. We expect the caller will lock the tasklist.
+ *
+ * (not docbooked, we don't want this one cluttering up the manual)
+ */
+/*
+ * Return value:
+ *   ERR_PTR(-1UL) - some eligible task already has TIF_MEMDIE set, or a task
+ *                   other than current is exiting; the caller should not
+ *                   kill anything
+ *   NULL          - no candidate was found
+ *   otherwise     - the chosen task, with its score stored in *ppoints
+ *
+ * If @mem is non-NULL only tasks belonging to that mem_cgroup are considered.
+ */
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+ struct mem_cgroup *mem)
+{
+ struct task_struct *g, *p;
+ struct task_struct *chosen = NULL;
+ struct timespec uptime;
+ *ppoints = 0;
+
+ /* badness() wants the uptime in seconds; sample it once for the whole scan */
+ do_posix_clock_monotonic_gettime(&uptime);
+ do_each_thread(g, p) {
+ unsigned long points;
+
+ /*
+ * skip kernel threads and tasks which have already released
+ * their mm.
+ */
+ if (!p->mm)
+ continue;
+ /* skip the init task */
+ if (is_global_init(p))
+ continue;
+ if (mem && !task_in_mem_cgroup(p, mem))
+ continue;
+
+ /*
+ * This task already has access to memory reserves and is
+ * being killed. Don't allow any other task access to the
+ * memory reserve.
+ *
+ * Note: this may have a chance of deadlock if it gets
+ * blocked waiting for another task which itself is waiting
+ * for memory. Is there a better alternative?
+ */
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
+ return ERR_PTR(-1UL);
+
+ /*
+ * This is in the process of releasing memory so wait for it
+ * to finish before killing some other task by mistake.
+ *
+ * However, if p is the current task, we allow the 'kill' to
+ * go ahead if it is exiting: this will simply set TIF_MEMDIE,
+ * which will allow it to gain access to memory reserves in
+ * the process of exiting and releasing its resources.
+ * Otherwise we could get an easy OOM deadlock.
+ */
+ if (p->flags & PF_EXITING) {
+ if (p != current)
+ return ERR_PTR(-1UL);
+
+ /*
+ * ULONG_MAX cannot be exceeded by any later score, so
+ * current stays chosen for the rest of the scan.
+ */
+ chosen = p;
+ *ppoints = ULONG_MAX;
+ }
+
+ if (p->oomkilladj == OOM_DISABLE)
+ continue;
+
+ /* take the first candidate unconditionally, then only strictly higher scores */
+ points = badness(p, uptime.tv_sec);
+ if (points > *ppoints || !chosen) {
+ chosen = p;
+ *ppoints = points;
+ }
+ } while_each_thread(g, p);
+
+ return chosen;
+}
+
+/**
+ * dump_tasks - dump current memory state of all system tasks
+ * @mem: target memory controller
+ *
+ * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
+ * score, and name.
+ *
+ * If @mem is non-NULL, only tasks that are a member of the mem_cgroup are
+ * shown.
+ *
+ * Only thread group leaders are printed, one line each.
+ *
+ * Call with tasklist_lock read-locked.
+ */
+static void dump_tasks(const struct mem_cgroup *mem)
+{
+	struct task_struct *g, *p;
+
+	printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
+		"name\n");
+	do_each_thread(g, p) {
+		struct mm_struct *mm;
+
+		/*
+		 * total_vm and rss sizes do not exist for tasks with a
+		 * detached mm so there's no need to report them.  This
+		 * unlocked test is only a cheap early filter.
+		 */
+		if (!p->mm)
+			continue;
+		if (mem && !task_in_mem_cgroup(p, mem))
+			continue;
+		if (!thread_group_leader(p))
+			continue;
+
+		/*
+		 * p->mm may be cleared at any moment while task_lock(p) is
+		 * not held, so it has to be sampled again under the lock
+		 * before it is dereferenced.
+		 */
+		task_lock(p);
+		mm = p->mm;
+		if (!mm) {
+			task_unlock(p);
+			continue;
+		}
+		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
+		       p->pid, p->uid, p->tgid, mm->total_vm,
+		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+		       p->comm);
+		task_unlock(p);
+	} while_each_thread(g, p);
+}
+
+/*
+ * Send SIGKILL to the selected process irrespective of CAP_SYS_RAWIO
+ * flag though it's unlikely that we select a process with CAP_SYS_RAWIO
+ * set.
+ *
+ * Refuses (with a warning) to touch the global init task or a task that
+ * has no mm. When @verbose is non-zero the kill is logged.
+ */
+static void __oom_kill_task(struct task_struct *p, int verbose)
+{
+ if (is_global_init(p)) {
+ WARN_ON(1);
+ printk(KERN_WARNING "tried to kill init!\n");
+ return;
+ }
+
+ if (!p->mm) {
+ WARN_ON(1);
+ printk(KERN_WARNING "tried to kill an mm-less task!\n");
+ return;
+ }
+
+ if (verbose)
+ printk(KERN_ERR "Killed process %d (%s)\n",
+ task_pid_nr(p), p->comm);
+
+ /*
+ * We give our sacrificial lamb high priority and access to
+ * all the memory it needs. That way it should be able to
+ * exit() and clear out its resources quickly...
+ */
+ p->rt.time_slice = HZ;
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+
+ force_sig(SIGKILL, p);
+}
+
+/*
+ * Kill @p together with every other task that shares its mm.
+ *
+ * Returns 1 without killing anything if @p has no mm or if any thread using
+ * the same mm is marked OOM_DISABLE; returns 0 once the signals are sent.
+ */
+static int oom_kill_task(struct task_struct *p)
+{
+	struct task_struct *leader, *t;
+	struct mm_struct *victim_mm = p->mm;
+
+	/*
+	 * WARNING: victim_mm was not obtained through get_task_mm(p), so it
+	 * must never be dereferenced - it only serves as a value to compare
+	 * other tasks' ->mm against.
+	 *
+	 * p->mm itself may also turn NULL at any time, as task_lock(p) is
+	 * not held; that does not matter for what is done here.
+	 */
+	if (!victim_mm)
+		return 1;
+
+	/* a single OOM_DISABLE user of this mm protects all of them */
+	do_each_thread(leader, t) {
+		if (t->mm == victim_mm && t->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(leader, t);
+
+	__oom_kill_task(p, 1);
+
+	/*
+	 * Tasks outside p's thread group that still share the mm get a
+	 * plain SIGKILL: they are not given access to the memory reserves,
+	 * otherwise all memory might be depleted.
+	 */
+	do_each_thread(leader, t) {
+		if (t->mm != victim_mm || same_thread_group(t, p))
+			continue;
+		force_sig(SIGKILL, t);
+	} while_each_thread(leader, t);
+
+	return 0;
+}
+
+/*
+ * oom_kill_process - report the OOM condition and kill @p or one of its
+ * children
+ * @p: the selected victim
+ * @gfp_mask: allocation flags of the failing request (logged only)
+ * @order: order of the failing request (logged only)
+ * @points: badness score of @p (logged only)
+ * @mem: mem_cgroup to restrict the optional task dump to, or NULL
+ * @message: prefix for the "kill process" log line
+ *
+ * Returns 0 if something was killed (or @p was already exiting and was just
+ * given TIF_MEMDIE), non-zero if neither @p nor any child could be killed.
+ */
+static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+			    unsigned long points, struct mem_cgroup *mem,
+			    const char *message)
+{
+	struct task_struct *c;
+
+	if (printk_ratelimit()) {
+		printk(KERN_WARNING "%s invoked oom-killer: "
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
+		dump_stack();
+		show_mem();
+		if (sysctl_oom_dump_tasks)
+			dump_tasks(mem);
+	}
+
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 */
+	if (p->flags & PF_EXITING) {
+		__oom_kill_task(p, 0);
+		return 0;
+	}
+
+	/*
+	 * points is an unsigned long and badness() can return ULONG_MAX,
+	 * so it must be printed with %lu; %li would show such a score as -1.
+	 */
+	printk(KERN_ERR "%s: kill process %d (%s) score %lu or a child\n",
+					message, task_pid_nr(p), p->comm, points);
+
+	/* Try to kill a child first; children sharing p's mm are skipped */
+	list_for_each_entry(c, &p->children, sibling) {
+		if (c->mm == p->mm)
+			continue;
+		if (!oom_kill_task(c))
+			return 0;
+	}
+	return oom_kill_task(p);
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/*
+ * OOM handling for a memory cgroup: keep selecting a victim from @mem and
+ * trying to kill it until a kill succeeds, or until select_bad_process()
+ * signals (via ERR_PTR(-1UL)) that nothing should be killed.  If no victim
+ * is found the current task is used.  Runs under cgroup_lock() with
+ * tasklist_lock read-held.
+ */
+void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
+{
+	struct task_struct *victim;
+	unsigned long points = 0;
+
+	cgroup_lock();
+	read_lock(&tasklist_lock);
+
+	for (;;) {
+		victim = select_bad_process(&points, mem);
+		if (PTR_ERR(victim) == -1UL)
+			break;
+
+		if (victim == NULL)
+			victim = current;
+
+		/* zero means the kill went through; otherwise pick again */
+		if (!oom_kill_process(victim, gfp_mask, 0, points, mem,
+				      "Memory cgroup out of memory"))
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+	cgroup_unlock();
+}
+#endif
+
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
+
+/*
+ * register_oom_notifier - add @nb to oom_notify_list, the chain that
+ * out_of_memory() calls before it considers killing anything.
+ * Returns the result of blocking_notifier_chain_register().
+ */
+int register_oom_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+/*
+ * unregister_oom_notifier - remove @nb from oom_notify_list.
+ * Returns the result of blocking_notifier_chain_unregister().
+ */
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+
+/*
+ * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
+ * if a parallel OOM killing is already taking place that includes a zone in
+ * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
+ *
+ * Only zones up to gfp_zone(gfp_mask) are examined. Note that despite its
+ * name, zone_scan_mutex is a spinlock.
+ */
+int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ int ret = 1;
+
+ spin_lock(&zone_scan_mutex);
+ /* first pass: bail out, changing nothing, if any zone is already locked */
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ if (zone_is_oom_locked(zone)) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /* second pass: none were locked, so mark them all */
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ /*
+ * Lock each zone in the zonelist under zone_scan_mutex so a
+ * parallel invocation of try_set_zone_oom() doesn't succeed
+ * when it shouldn't.
+ */
+ zone_set_flag(zone, ZONE_OOM_LOCKED);
+ }
+
+out:
+ spin_unlock(&zone_scan_mutex);
+ return ret;
+}
+
+/*
+ * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
+ * allocation attempts with zonelists containing them may now recall the OOM
+ * killer, if necessary.
+ *
+ * Only zones up to gfp_zone(gfp_mask) are touched; the flags are cleared
+ * while holding zone_scan_mutex.
+ */
+void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ spin_lock(&zone_scan_mutex);
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ }
+ spin_unlock(&zone_scan_mutex);
+}
+
+/**
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ */
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+{
+ struct task_struct *p;
+ unsigned long points = 0;
+ unsigned long freed = 0;
+ enum oom_constraint constraint;
+
+ /* give registered OOM notifiers a chance to report freed memory first */
+ blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ if (freed > 0)
+ /* Got some memory back in the last second. */
+ return;
+
+ /* panic_on_oom == 2 panics regardless of the constraint type */
+ if (sysctl_panic_on_oom == 2)
+ panic("out of memory. Compulsory panic_on_oom is selected.\n");
+
+ /*
+ * Check if there were limitations on the allocation (only relevant for
+ * NUMA) that may require different handling.
+ */
+ constraint = constrained_alloc(zonelist, gfp_mask);
+ read_lock(&tasklist_lock);
+
+ switch (constraint) {
+ case CONSTRAINT_MEMORY_POLICY:
+ /* the allocating task itself is the victim; points is still 0 here */
+ oom_kill_process(current, gfp_mask, order, points, NULL,
+ "No available memory (MPOL_BIND)");
+ break;
+
+ case CONSTRAINT_NONE:
+ /* any non-zero panic_on_oom panics for an unconstrained allocation */
+ if (sysctl_panic_on_oom)
+ panic("out of memory. panic_on_oom is selected\n");
+ /* Fall-through */
+ case CONSTRAINT_CPUSET:
+ if (sysctl_oom_kill_allocating_task) {
+ oom_kill_process(current, gfp_mask, order, points, NULL,
+ "Out of memory (oom_kill_allocating_task)");
+ break;
+ }
+retry:
+ /*
+ * Rambo mode: Shoot down a process and hope it solves whatever
+ * issues we may have.
+ */
+ p = select_bad_process(&points, NULL);
+
+ /* ERR_PTR(-1UL): a task is already dying, so kill nothing more */
+ if (PTR_ERR(p) == -1UL)
+ goto out;
+
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ panic("Out of memory and no killable processes...\n");
+ }
+
+ /* non-zero means nothing could be killed: select again */
+ if (oom_kill_process(p, gfp_mask, order, points, NULL,
+ "Out of memory"))
+ goto retry;
+
+ break;
+ }
+
+out:
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Give "p" a good chance of killing itself before we
+ * retry to allocate memory unless "p" is current
+ */
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
+}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
new file mode 100644
index 0000000..5270591
--- /dev/null
+++ b/mm/page-writeback.c
@@ -0,0 +1,1389 @@
+/*
+ * mm/page-writeback.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002 Andrew Morton
+ * Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/init.h>
+#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/blkdev.h>
+#include <linux/mpage.h>
+#include <linux/rmap.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+
+/*
+ * The maximum number of pages to writeout in a single bdflush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES 1024
+
+/*
+ * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
+ * will look to see if it needs to force writeback or throttling.
+ */
+static long ratelimit_pages = 32;
+
+/*
+ * Number of pages balance_dirty_pages tries to write when it decides the
+ * caller has to do some non-background writeback itself.  It is set to one
+ * and a half times ratelimit_pages so that a reasonably large amount of
+ * I/O gets submitted.
+ */
+static inline long sync_writeback_pages(void)
+{
+	long half = ratelimit_pages / 2;
+
+	return ratelimit_pages + half;
+}
+
+/* The following parameters are exported via /proc/sys/vm */
+
+/*
+ * Start background writeback (via pdflush) at this percentage
+ */
+int dirty_background_ratio = 5;
+
+/*
+ * free highmem will not be subtracted from the total free memory
+ * for calculating free ratios if vm_highmem_is_dirtyable is true
+ */
+int vm_highmem_is_dirtyable;
+
+/*
+ * The generator of dirty data starts writeback at this percentage
+ */
+int vm_dirty_ratio = 10;
+
+/*
+ * The interval between `kupdate'-style writebacks, in jiffies
+ */
+int dirty_writeback_interval = 5 * HZ;
+
+/*
+ * The longest number of jiffies for which data is allowed to remain dirty
+ */
+int dirty_expire_interval = 30 * HZ;
+
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
+ */
+int laptop_mode;
+
+EXPORT_SYMBOL(laptop_mode);
+
+/* End of sysctl-exported parameters */
+
+
+static void background_writeout(unsigned long _min_pages);
+
+/*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ * period/2 ~ roundup_pow_of_two(dirty limit)
+ *
+ * The dirty limit used here is vm_dirty_ratio percent of
+ * determine_dirtyable_memory(); the returned shift is 2 + ilog2(limit - 1).
+ *
+ * NOTE(review): if the computed limit is 0 the subtraction wraps, since
+ * dirty_total is unsigned - confirm callers tolerate the resulting large shift.
+ */
+static int calc_period_shift(void)
+{
+ unsigned long dirty_total;
+
+ dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+ return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * sysctl handler for the dirty ratio.
+ *
+ * The value itself is parsed by proc_dointvec_minmax().  When a write
+ * succeeded and actually changed vm_dirty_ratio, the period shift of both
+ * proportion descriptors is recomputed to match.  Returns whatever
+ * proc_dointvec_minmax() returned.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int prev_ratio = vm_dirty_ratio;
+	int err = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int shift;
+
+	/* nothing to do on error, on a read, or if the value is unchanged */
+	if (err != 0 || !write || vm_dirty_ratio == prev_ratio)
+		return err;
+
+	shift = calc_period_shift();
+	prop_change_shift(&vm_completions, shift);
+	prop_change_shift(&vm_dirties, shift);
+
+	return err;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ *
+ * bdi->max_prop_frac is passed along as the cap for this BDI's share.
+ * bdi_writeout_inc() is the wrapper that calls this with local interrupts
+ * disabled.
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ __prop_inc_percpu_max(&vm_completions, &bdi->completions,
+ bdi->max_prop_frac);
+}
+
+/*
+ * bdi_writeout_inc - record one writeout completion for @bdi.
+ * Wraps __bdi_writeout_inc() with local interrupts disabled; the previous
+ * interrupt state is restored afterwards.
+ */
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bdi_writeout_inc(bdi);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(bdi_writeout_inc);
+
+/* Count one dirtying event for @tsk in the vm_dirties proportion. */
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+ prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Report this BDI's share of recent writeout completions as the fraction
+ * *numerator / *denominator.  A BDI that does not do dirty writeback gets
+ * the fixed fraction 0/1.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (!bdi_cap_writeback_dirty(bdi)) {
+		*numerator = 0;
+		*denominator = 1;
+		return;
+	}
+
+	prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+}
+
+/*
+ * Cap a BDI's earned dirty limit at what is really still available, so the
+ * global dirty limit is not exceeded when the floating averages move too
+ * quickly.
+ *
+ * "Available" is the global limit @dirty minus everything that is globally
+ * dirty, under writeback, unstable or in temporary writeback (never less
+ * than zero), plus the pages this BDI already holds as reclaimable or under
+ * writeback.  *pbdi_dirty is lowered to that value if it is larger.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	unsigned long in_use;
+	long headroom;
+
+	in_use = global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS) +
+		 global_page_state(NR_WRITEBACK_TEMP);
+
+	headroom = dirty - in_use;
+	if (headroom < 0)
+		headroom = 0;
+
+	headroom += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		    bdi_stat(bdi, BDI_WRITEBACK);
+
+	if (headroom < *pbdi_dirty)
+		*pbdi_dirty = headroom;
+}
+
+/*
+ * Report @tsk's share of the vm_dirties proportion as the fraction
+ * *numerator / *denominator.
+ */
+static inline void task_dirties_fraction(struct task_struct *tsk,
+ long *numerator, long *denominator)
+{
+ prop_fraction_single(&vm_dirties, &tsk->dirties,
+ numerator, denominator);
+}
+
+/*
+ * Scale a dirty limit for one task.
+ *
+ * The limit is reduced by an eighth of its value multiplied by the task's
+ * share p_{t} of recent dirtying:
+ *
+ *   dirty -= (dirty/8) * p_{t}
+ *
+ * and is never allowed to drop below half of the value passed in.
+ */
+static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+	long num, den;
+	long limit = *pdirty;
+	u64 reduction = limit >> 3;
+
+	task_dirties_fraction(tsk, &num, &den);
+
+	/* reduction = (limit / 8) * num / den, done in 64 bits */
+	reduction *= num;
+	do_div(reduction, den);
+
+	limit -= reduction;
+	if (limit < *pdirty/2)
+		limit = *pdirty/2;
+
+	*pdirty = limit;
+}
+
+/*
+ *
+ */
+static DEFINE_SPINLOCK(bdi_lock);
+static unsigned int bdi_min_ratio;
+
+/*
+ * Set the minimum dirty ratio for @bdi.
+ *
+ * The request fails with -EINVAL if it exceeds the BDI's max_ratio or if
+ * it would bring the sum of all minimum ratios (bdi_min_ratio) to 100 or
+ * more.  On success both the per-BDI value and the global sum are adjusted
+ * by the difference, under bdi_lock.  Returns 0 on success.
+ */
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+	unsigned long flags;
+	unsigned int delta;
+	int ret = 0;
+
+	spin_lock_irqsave(&bdi_lock, flags);
+
+	if (min_ratio > bdi->max_ratio) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/*
+	 * unsigned difference: it wraps when the ratio is lowered, and the
+	 * additions below wrap back to the intended values
+	 */
+	delta = min_ratio - bdi->min_ratio;
+	if (bdi_min_ratio + delta >= 100) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	bdi_min_ratio += delta;
+	bdi->min_ratio += delta;
+
+unlock:
+	spin_unlock_irqrestore(&bdi_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Set the maximum dirty ratio for @bdi.
+ *
+ * Values above 100, or below the BDI's current min_ratio, are rejected
+ * with -EINVAL.  On success max_ratio is stored and max_prop_frac is
+ * recomputed as the same percentage of PROP_FRAC_BASE, under bdi_lock.
+ * Returns 0 on success.
+ */
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+{
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	if (max_ratio > 100)
+		return -EINVAL;
+
+	spin_lock_irqsave(&bdi_lock, flags);
+	if (bdi->min_ratio <= max_ratio) {
+		bdi->max_ratio = max_ratio;
+		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&bdi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_ratio);
+
+/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around. To avoid stressing page reclaim with lots of unreclaimable
+ * pages. It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+
+/*
+ * highmem_dirtyable_memory - free plus LRU pages found in ZONE_HIGHMEM
+ * @total: upper bound for the result
+ *
+ * With CONFIG_HIGHMEM, sums the free and LRU pages of the ZONE_HIGHMEM zone
+ * of every node in the N_HIGH_MEMORY state and returns that sum, capped at
+ * @total. Without CONFIG_HIGHMEM, returns 0.
+ */
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+ struct zone *z =
+ &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+ x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
+ }
+ /*
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
+ */
+ return min(x, total);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ *
+ * The count is free pages plus LRU pages; the highmem portion is subtracted
+ * unless vm_highmem_is_dirtyable is set. The result is always at least 1.
+ */
+unsigned long determine_dirtyable_memory(void)
+{
+ unsigned long x;
+
+ x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+
+ return x + 1; /* Ensure that we never return 0 */
+}
+
+/*
+ * get_dirty_limits - compute the current dirty thresholds, in pages
+ * @pbackground: out: level at which background writeout starts
+ * @pdirty: out: global dirty limit
+ * @pbdi_dirty: out: dirty limit for @bdi; only written when @bdi is non-NULL
+ * @bdi: backing device to compute a share for, or NULL
+ *
+ * The dirty ratio is never taken below 5%, and a background ratio that is
+ * not below the dirty ratio is replaced by half the dirty ratio.  Tasks
+ * with PF_LESS_THROTTLE and real-time tasks get both thresholds raised by
+ * a quarter.
+ */
+void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi)
+{
+	unsigned long available_memory = determine_dirtyable_memory();
+	struct task_struct *tsk = current;
+	int dirty_pct = vm_dirty_ratio;			/* percentages */
+	int background_pct = dirty_background_ratio;
+	long numerator, denominator;
+	long background, dirty;
+	u64 bdi_dirty;
+
+	if (dirty_pct < 5)
+		dirty_pct = 5;
+
+	if (background_pct >= dirty_pct)
+		background_pct = dirty_pct / 2;
+
+	background = (background_pct * available_memory) / 100;
+	dirty = (dirty_pct * available_memory) / 100;
+
+	if ((tsk->flags & PF_LESS_THROTTLE) || rt_task(tsk)) {
+		background += background / 4;
+		dirty += dirty / 4;
+	}
+
+	*pbackground = background;
+	*pdirty = dirty;
+
+	if (!bdi)
+		return;
+
+	/*
+	 * This BDI's share of the dirty limit: its fraction of recent
+	 * writeout applied to the part of the limit not reserved by
+	 * minimum ratios, plus its own minimum, capped at its maximum.
+	 */
+	bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+	bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+	bdi_dirty *= numerator;
+	do_div(bdi_dirty, denominator);
+	bdi_dirty += (dirty * bdi->min_ratio) / 100;
+	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+		bdi_dirty = dirty * bdi->max_ratio / 100;
+
+	*pbdi_dirty = bdi_dirty;
+	clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+	task_dirty_limit(tsk, pbdi_dirty);
+}
+
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data. It looks at the number of dirty pages in the machine and will force
+ * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * If we're over `background_thresh' then pdflush is woken to perform some
+ * writeout.
+ *
+ * @mapping: address_space being dirtied; its backing_dev_info selects the
+ * device whose per-BDI threshold and counters are checked.
+ *
+ * Each pass of the loop recomputes the limits with get_dirty_limits() and
+ * leaves the loop when the BDI's reclaimable + writeback pages are within
+ * bdi_thresh, when the global reclaimable + writeback total is below the
+ * midpoint of the background and dirty thresholds, or once at least
+ * write_chunk pages have been written by this call.  Otherwise it calls
+ * congestion_wait() and goes around again.
+ */
+static void balance_dirty_pages(struct address_space *mapping)
+{
+ long nr_reclaimable, bdi_nr_reclaimable;
+ long nr_writeback, bdi_nr_writeback;
+ long background_thresh;
+ long dirty_thresh;
+ long bdi_thresh;
+ unsigned long pages_written = 0; /* running total for this call */
+ unsigned long write_chunk = sync_writeback_pages(); /* nr_to_write for each pass */
+
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+ for (;;) {
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .nr_to_write = write_chunk,
+ .range_cyclic = 1,
+ };
+
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ nr_writeback = global_page_state(NR_WRITEBACK);
+
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ break;
+
+ /*
+ * Throttle it only when the background writeback cannot
+ * catch-up. This avoids (excessively) small writeouts
+ * when the bdi limits are ramping up.
+ */
+ if (nr_reclaimable + nr_writeback <
+ (background_thresh + dirty_thresh) / 2)
+ break;
+
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ */
+ if (bdi_nr_reclaimable) {
+ writeback_inodes(&wbc);
+ pages_written += write_chunk - wbc.nr_to_write; /* nr_to_write started at write_chunk */
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+ }
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else if (bdi_nr_reclaimable) {
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ }
+
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ break;
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+
+ congestion_wait(WRITE, HZ/10);
+ }
+
+ if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+ bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 0;
+
+ if (writeback_in_progress(bdi))
+ return; /* pdflush is already working this queue */
+
+ /*
+ * In laptop mode, we wait until hitting the higher threshold before
+ * starting background writeout, and then write out all the way down
+ * to the lower threshold. So slow writers cause minimal disk activity.
+ *
+ * In normal mode, we start background writeout at the lower
+ * background_thresh, to keep the amount of dirty memory low.
+ */
+ if ((laptop_mode && pages_written) ||
+ (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS)
+ > background_thresh)))
+ pdflush_operation(background_writeout, 0);
+}
+
+/*
+ * Dirty @page.  When set_page_dirty() returns non-zero, or @page_mkwrite is
+ * non-zero, run the rate-limited dirty balancing against the page's mapping,
+ * provided it still has one.
+ */
+void set_page_dirty_balance(struct page *page, int page_mkwrite)
+{
+	struct address_space *mapping;
+
+	if (!set_page_dirty(page) && !page_mkwrite)
+		return;
+
+	mapping = page_mapping(page);
+	if (mapping)
+		balance_dirty_pages_ratelimited(mapping);
+}
+
+/**
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ * @nr_pages_dirtied: number of pages which the caller has just dirtied
+ *
+ * Callers report newly dirtied pages here.  The count is accumulated in a
+ * per-CPU counter, and only when that counter reaches the current rate limit
+ * is it reset and balance_dirty_pages() invoked.
+ *
+ * The rate limit is normally ratelimit_pages.  While the mapping's backing
+ * device has dirty_exceeded set it drops to 8 pages, so that individual
+ * processes cannot overshoot the limit by ratelimit_pages each.
+ */
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+					unsigned long nr_pages_dirtied)
+{
+	static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+	unsigned long ratelimit;
+	unsigned long *counter;
+	int limit_hit = 0;
+
+	ratelimit = mapping->backing_dev_info->dirty_exceeded ?
+			8 : ratelimit_pages;
+
+	/*
+	 * The per-CPU counter is only touched with preemption disabled;
+	 * the balancing itself runs after preemption is enabled again.
+	 */
+	preempt_disable();
+	counter = &__get_cpu_var(ratelimits);
+	*counter += nr_pages_dirtied;
+	if (unlikely(*counter >= ratelimit)) {
+		*counter = 0;
+		limit_hit = 1;
+	}
+	preempt_enable();
+
+	if (limit_hit)
+		balance_dirty_pages(mapping);
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+
+/*
+ * Wait, in HZ/10 congestion_wait() steps, until unstable-NFS plus writeback
+ * pages are no more than the dirty threshold boosted by one tenth.
+ *
+ * @gfp_mask: allocation context of the caller.  Unless both __GFP_FS and
+ * __GFP_IO are set, the function waits at most once and then returns.
+ */
+void throttle_vm_writeout(gfp_t gfp_mask)
+{
+	const gfp_t fs_and_io = __GFP_FS|__GFP_IO;
+	long background_thresh;
+	long dirty_thresh;
+
+	do {
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+		/*
+		 * Boost the allowable dirty threshold a bit for page
+		 * allocators so they don't get DoS'ed by heavy writers
+		 */
+		dirty_thresh += dirty_thresh / 10;	/* wheeee... */
+
+		if (global_page_state(NR_UNSTABLE_NFS) +
+		    global_page_state(NR_WRITEBACK) <= dirty_thresh)
+			break;
+		congestion_wait(WRITE, HZ/10);
+
+		/*
+		 * The caller might hold locks which can prevent IO completion
+		 * or progress in the filesystem. So we cannot just sit here
+		 * waiting for IO to complete: only callers that allow both
+		 * FS and IO activity loop again.
+		 */
+	} while ((gfp_mask & fs_and_io) == fs_and_io);
+}
+
+/*
+ * Write back at least _min_pages pages, then keep writing until dirty plus
+ * unstable-NFS memory is below the background threshold.  The loop also ends
+ * when a pass wrote less than requested (or skipped pages) without
+ * congestion or more_io being reported, i.e. there is nothing left to do.
+ */
+static void background_writeout(unsigned long _min_pages)
+{
+	struct writeback_control wbc = {
+		.bdi		= NULL,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 0,
+		.nonblocking	= 1,
+		.range_cyclic	= 1,
+	};
+	long pages_owed = _min_pages;
+	long background_thresh;
+	long dirty_thresh;
+
+	for ( ; ; ) {
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+		if (pages_owed <= 0 &&
+		    global_page_state(NR_FILE_DIRTY) +
+		    global_page_state(NR_UNSTABLE_NFS) < background_thresh)
+			break;
+
+		/* Fresh per-pass state, one MAX_WRITEBACK_PAGES batch. */
+		wbc.more_io = 0;
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes(&wbc);
+		pages_owed -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		/* Whole batch consumed and nothing skipped: next pass. */
+		if (wbc.nr_to_write <= 0 && wbc.pages_skipped <= 0)
+			continue;
+
+		/* Wrote less than expected */
+		if (!wbc.encountered_congestion && !wbc.more_io)
+			break;
+		congestion_wait(WRITE, HZ/10);
+	}
+}
+
+/*
+ * Ask pdflush to run background_writeout() for `nr_pages' pages.  A value of
+ * zero means "the whole world": it is replaced by the current number of
+ * dirty plus unstable-NFS pages.
+ *
+ * Returns the result of pdflush_operation(): 0 if a pdflush thread was
+ * dispatched, -1 if all pdflush threads were busy.
+ */
+int wakeup_pdflush(long nr_pages)
+{
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY);
+		nr_pages += global_page_state(NR_UNSTABLE_NFS);
+	}
+
+	return pdflush_operation(background_writeout, nr_pages);
+}
+
+static void wb_timer_fn(unsigned long unused); /* wb_timer handler, defined further down */
+static void laptop_timer_fn(unsigned long unused); /* laptop_mode_wb_timer handler, defined further down */
+
+static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); /* periodic writeback timer, (re)armed with mod_timer() in this file */
+static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* armed by laptop_io_completion(), cancelled by laptop_sync_completion() */
+
+/*
+ * Periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ *
+ * @arg is not used by this function.
+ *
+ * The writeback budget is the number of dirty plus unstable-NFS pages plus
+ * the number of in-use inodes (nr_inodes - nr_unused); it is consumed in
+ * MAX_WRITEBACK_PAGES batches.  wb_timer is re-armed at the end only while
+ * dirty_writeback_interval is non-zero.
+ */
+static void wb_kupdate(unsigned long arg)
+{
+ unsigned long oldest_jif;
+ unsigned long start_jif;
+ unsigned long next_jif;
+ long nr_to_write;
+ struct writeback_control wbc = {
+ .bdi = NULL,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = &oldest_jif,
+ .nr_to_write = 0,
+ .nonblocking = 1,
+ .for_kupdate = 1,
+ .range_cyclic = 1,
+ };
+
+ sync_supers();
+
+ oldest_jif = jiffies - dirty_expire_interval; /* cut-off read through wbc.older_than_this */
+ start_jif = jiffies;
+ next_jif = start_jif + dirty_writeback_interval;
+ nr_to_write = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ while (nr_to_write > 0) {
+ wbc.more_io = 0;
+ wbc.encountered_congestion = 0;
+ wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ writeback_inodes(&wbc);
+ if (wbc.nr_to_write > 0) {
+ if (wbc.encountered_congestion || wbc.more_io)
+ congestion_wait(WRITE, HZ/10);
+ else
+ break; /* All the old data is written */
+ }
+ nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ }
+ if (time_before(next_jif, jiffies + HZ)) /* keep at least HZ jiffies before the next run */
+ next_jif = jiffies + HZ;
+ if (dirty_writeback_interval)
+ mod_timer(&wb_timer, next_jif);
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
+ *
+ * Lets proc_dointvec_userhz_jiffies() perform the actual read or write of
+ * the table entry.  If that fails, its error code is returned and wb_timer
+ * is left untouched, so a failed access is no longer reported as success.
+ *
+ * On success the periodic writeback timer is brought in line with
+ * dirty_writeback_interval: re-armed that many jiffies from now when the
+ * interval is non-zero, deleted when it is zero.
+ *
+ * Returns 0 on success or the non-zero result of
+ * proc_dointvec_userhz_jiffies().
+ */
+int dirty_writeback_centisecs_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_userhz_jiffies(table, write, file, buffer,
+					   length, ppos);
+	if (ret)
+		return ret;
+
+	if (dirty_writeback_interval)
+		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	else
+		del_timer(&wb_timer);
+	return 0;
+}
+
+/*
+ * wb_timer expiry: hand wb_kupdate() to pdflush.  When pdflush_operation()
+ * reports failure (negative return), try again HZ jiffies from now.
+ */
+static void wb_timer_fn(unsigned long unused)
+{
+	if (pdflush_operation(wb_kupdate, 0) >= 0)
+		return;
+
+	/* delay 1 second */
+	mod_timer(&wb_timer, jiffies + HZ);
+}
+
+/*
+ * Work function handed to pdflush by laptop_timer_fn(): sync everything.
+ * The argument is ignored.
+ */
+static void laptop_flush(unsigned long unused)
+{
+	sys_sync();
+}
+
+/*
+ * laptop_mode_wb_timer expiry: queue laptop_flush() on pdflush.  The result
+ * of pdflush_operation() is not checked.
+ */
+static void laptop_timer_fn(unsigned long unused)
+{
+	pdflush_operation(laptop_flush, 0);
+}
+
+/*
+ * Laptop mode, and the disk has just been spun up: (re)arm
+ * laptop_mode_wb_timer to fire `laptop_mode' jiffies from now.  If a flush
+ * was already pending this pushes it back, since the disk is still in use.
+ */
+void laptop_io_completion(void)
+{
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+}
+
+/*
+ * Laptop mode, and a sync has just finished.  The writes issued by that sync
+ * will have armed laptop_mode_wb_timer through laptop_io_completion(), but
+ * nothing is left to write back, so cancel the pending flush.
+ */
+void laptop_sync_completion(void)
+{
+	del_timer(&laptop_mode_wb_timer);
+}
+
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive)
+ * get_writeback_state too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds before writeback cuts in.
+ *
+ * But the limit should not be set too high. Because it also controls the
+ * amount of memory which the balance_dirty_pages() caller has to write back.
+ * If this is too large then the caller will block on the IO queue all the
+ * time. So limit it to four megabytes - the balance_dirty_pages() caller
+ * will write six megabyte chunks, max.
+ *
+ * Concretely: ratelimit_pages = vm_total_pages / (32 * online CPUs), raised
+ * to 16 pages if smaller, and lowered to 4096 * 1024 bytes worth of
+ * PAGE_CACHE_SIZE pages if larger.
+ */
+
+void writeback_set_ratelimit(void)
+{
+ ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+ if (ratelimit_pages < 16) /* lower bound, in pages */
+ ratelimit_pages = 16;
+ if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) /* upper bound, compared in bytes */
+ ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
+}
+
+/*
+ * Notifier callback installed through ratelimit_nb: recompute
+ * ratelimit_pages, whatever the event.  None of the arguments are examined.
+ */
+static int __cpuinit
+ratelimit_handler(struct notifier_block *nb, unsigned long action, void *data)
+{
+	writeback_set_ratelimit();
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block __cpuinitdata ratelimit_nb = { /* passed to register_cpu_notifier() in page_writeback_init() */
+ .notifier_call = ratelimit_handler, /* recomputes ratelimit_pages */
+ .next = NULL,
+};
+
+/*
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages).
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
+ *
+ * What the function does today: arm wb_timer one
+ * dirty_writeback_interval from now, compute the initial
+ * ratelimit_pages, register ratelimit_nb as a CPU notifier, and
+ * initialise vm_completions and vm_dirties via
+ * prop_descriptor_init() with the shift from calc_period_shift().
+ */
+void __init page_writeback_init(void)
+{
+ int shift;
+
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+ writeback_set_ratelimit();
+ register_cpu_notifier(&ratelimit_nb);
+
+ shift = calc_period_shift();
+ prop_descriptor_init(&vm_completions, shift);
+ prop_descriptor_init(&vm_dirties, shift);
+}
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * With @wbc->range_cyclic set, the walk starts at mapping->writeback_index
+ * and, if it reaches the end without being stopped, wraps around once to
+ * cover the pages before the starting index. Otherwise the walk covers
+ * @wbc->range_start to @wbc->range_end.
+ *
+ * Unless @wbc->no_nrwrite_index_update is set, @wbc->nr_to_write is updated
+ * on return, and mapping->writeback_index is advanced for range_cyclic walks
+ * and for whole-file walks that still had write budget left.
+ *
+ * Returns 0, or the non-zero value from @writepage that stopped the walk
+ * (AOP_WRITEPAGE_ACTIVATE is not reported: the page is unlocked and the walk
+ * continues). Also returns 0 immediately, with @wbc->encountered_congestion
+ * set, when @wbc->nonblocking is set and the backing device is
+ * write-congested.
+ */
+int write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, writepage_t writepage,
+ void *data)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ long nr_to_write = wbc->nr_to_write;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+retry:
+ done_index = index;
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ /*
+ * can't be range_cyclic (1st pass) because
+ * end == -1 in that case.
+ */
+ done = 1;
+ break;
+ }
+
+ done_index = page->index + 1;
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data interity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = (*writepage)(page, wbc, data);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done = 1;
+ break;
+ }
+ }
+
+ if (nr_to_write > 0) {
+ nr_to_write--;
+ if (nr_to_write == 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ /*
+ * We stop writing back only if we are
+ * not doing integrity sync. In case of
+ * integrity sync we have to keep going
+ * because someone may be concurrently
+ * dirtying pages, and we might have
+ * synced a lot of newly appeared dirty
+ * pages, but have not synced all of the
+ * old dirty pages.
+ */
+ done = 1;
+ break;
+ }
+ }
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!cycled && !done) {
+ /*
+ * range_cyclic:
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ cycled = 1;
+ index = 0;
+ end = writeback_index - 1; /* second pass stops just before where the first one started */
+ goto retry;
+ }
+ if (!wbc->no_nrwrite_index_update) {
+ if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+ mapping->writeback_index = done_index;
+ wbc->nr_to_write = nr_to_write;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(write_cache_pages);
+
+/*
+ * write_cache_pages() callback used by generic_writepages(): @data is the
+ * address_space.  Calls its ->writepage(), records the result on the mapping
+ * with mapping_set_error(), and returns that result.
+ */
+static int __writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int err;
+
+	err = mapping->a_ops->writepage(page, wbc);
+	mapping_set_error(mapping, err);
+
+	return err;
+}
+
+/**
+ * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * Library implementation of the writepages() address_space_operation, built
+ * on write_cache_pages().  Returns 0 without doing anything for mappings
+ * that have no ->writepage; otherwise returns what write_cache_pages()
+ * returns.
+ */
+int generic_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	/* chardevs and other special files have no writepage: nothing to do */
+	if (mapping->a_ops->writepage)
+		return write_cache_pages(mapping, wbc, __writepage, mapping);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(generic_writepages);
+
+/*
+ * Write back dirty pages of @mapping through its ->writepages method, or
+ * through generic_writepages() when it has none.  wbc->for_writepages is set
+ * for the duration of the call.  Returns 0 right away when
+ * wbc->nr_to_write is not positive, otherwise the method's result.
+ */
+int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	int err;
+
+	if (wbc->nr_to_write <= 0)
+		return 0;
+
+	wbc->for_writepages = 1;
+	if (!mapping->a_ops->writepages)
+		err = generic_writepages(mapping, wbc);
+	else
+		err = mapping->a_ops->writepages(mapping, wbc);
+	wbc->for_writepages = 0;
+
+	return err;
+}
+
+/**
+ * write_one_page - write out a single page and optionally wait on I/O
+ * @page: the page to write
+ * @wait: if true, wait on writeout
+ *
+ * The caller must hold the page lock (checked with BUG_ON).  If the page is
+ * not dirty it is simply unlocked here and 0 is returned.
+ *
+ * Returns the result of ->writepage(), or -EIO if that succeeded, @wait was
+ * set and the page has its error flag set once writeback finished.
+ */
+int write_one_page(struct page *page, int wait)
+{
+	struct address_space *mapping = page->mapping;
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_to_write	= 1,
+	};
+	int err;
+
+	BUG_ON(!PageLocked(page));
+
+	if (wait)
+		wait_on_page_writeback(page);
+
+	/* Nothing to write: just drop the lock. */
+	if (!clear_page_dirty_for_io(page)) {
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Hold our own reference across ->writepage() and the wait. */
+	page_cache_get(page);
+	err = mapping->a_ops->writepage(page, &wbc);
+	if (!err && wait) {
+		wait_on_page_writeback(page);
+		if (PageError(page))
+			err = -EIO;
+	}
+	page_cache_release(page);
+
+	return err;
+}
+EXPORT_SYMBOL(write_one_page);
+
+/*
+ * set_page_dirty implementation for address_spaces which use neither
+ * buffers nor writeback: set the page's dirty flag if it is not set yet.
+ * Always returns 0.
+ */
+int __set_page_dirty_no_writeback(struct page *page)
+{
+	if (PageDirty(page))
+		return 0;
+
+	SetPageDirty(page);
+	return 0;
+}
+
+/*
+ * For address_spaces which do not use buffers.  Just tag the page as dirty in
+ * its radix tree.
+ *
+ * This is also used when a single buffer is being dirtied: we want to set the
+ * page dirty in that case, but not all the buffers.  This is a "bottom-up"
+ * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ *
+ * Most callers have locked the page, which pins the address_space in memory.
+ * But zap_pte_range() does not lock the page, however in that case the
+ * mapping is pinned by the vma's ->vm_file reference.
+ *
+ * A concurrent truncate is handled by looking up page_mapping() a second
+ * time once tree_lock is held.
+ *
+ * Returns 1 if this call set the dirty flag, 0 if it was already set.
+ */
+int __set_page_dirty_nobuffers(struct page *page)
+{
+	struct address_space *mapping;
+	struct address_space *locked_mapping;
+
+	if (TestSetPageDirty(page))
+		return 0;
+
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 1;
+
+	spin_lock_irq(&mapping->tree_lock);
+	locked_mapping = page_mapping(page);
+	if (locked_mapping) {	/* Race with truncate? */
+		BUG_ON(locked_mapping != mapping);
+		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+		if (mapping_cap_account_dirty(mapping)) {
+			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			__inc_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
+			task_io_account_write(PAGE_CACHE_SIZE);
+		}
+		radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+	}
+	spin_unlock_irq(&mapping->tree_lock);
+
+	/* !PageAnon && !swapper_space */
+	if (mapping->host)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	return 1;
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+/*
+ * For ->writepage implementations that decide not to write a page after
+ * all: they redirty the locked page through this helper, then unlock the
+ * page and return 0.
+ *
+ * Counts the page in wbc->pages_skipped and returns the result of
+ * __set_page_dirty_nobuffers().
+ */
+int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+{
+	wbc->pages_skipped += 1;
+
+	return __set_page_dirty_nobuffers(page);
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+/*
+ * Dirty @page through its mapping's ->set_page_dirty method.  If the mapping
+ * doesn't provide one, assume (with CONFIG_BLOCK) that it wants
+ * buffer_heads and use __set_page_dirty_buffers().
+ *
+ * A page without a mapping just gets its dirty flag set; the return value
+ * is then 1 if this call set the flag and 0 if it was already set.
+ */
+static int __set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int (*spd)(struct page *);
+
+	if (unlikely(!mapping)) {
+		if (PageDirty(page))
+			return 0;
+		return TestSetPageDirty(page) ? 0 : 1;
+	}
+
+	spd = mapping->a_ops->set_page_dirty;
+#ifdef CONFIG_BLOCK
+	if (!spd)
+		spd = __set_page_dirty_buffers;
+#endif
+	return (*spd)(page);
+}
+
+/*
+ * Dirty @page via __set_page_dirty() and, when that returns non-zero,
+ * account the event to the current task with task_dirty_inc().  Returns the
+ * value from __set_page_dirty().
+ */
+int set_page_dirty(struct page *page)
+{
+	int dirtied = __set_page_dirty(page);
+
+	if (!dirtied)
+		return dirtied;
+
+	task_dirty_inc(current);
+	return dirtied;
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+/*
+ * set_page_dirty() is racy if the caller has no reference against
+ * page->mapping->host, and if the page is unlocked: another CPU could
+ * truncate the page off the mapping and then free the mapping.
+ *
+ * Usually, the page _is_ locked, or the caller is a user-space process which
+ * holds a reference on the inode by having an open file.
+ *
+ * For the remaining cases this wrapper takes the page lock (with
+ * lock_page_nosync()) around set_page_dirty() and returns its result.
+ */
+int set_page_dirty_lock(struct page *page)
+{
+	int dirtied;
+
+	lock_page_nosync(page);
+	dirtied = set_page_dirty(page);
+	unlock_page(page);
+
+	return dirtied;
+}
+EXPORT_SYMBOL(set_page_dirty_lock);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout. We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ *
+ * The page must be locked (BUG otherwise). The page's Reclaim flag is
+ * cleared unconditionally. For mappings that account dirty pages,
+ * NR_FILE_DIRTY and the backing device's BDI_RECLAIMABLE counter are
+ * decremented when the dirty flag was found set; for other pages the flag
+ * is simply test-and-cleared.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ BUG_ON(!PageLocked(page));
+
+ ClearPageReclaim(page);
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ /*
+ * Yes, Virginia, this is indeed insane.
+ *
+ * We use this sequence to make sure that
+ * (a) we account for dirty stats properly
+ * (b) we tell the low-level filesystem to
+ * mark the whole page dirty if it was
+ * dirty in a pagetable. Only to then
+ * (c) clean the page again and return 1 to
+ * cause the writeback.
+ *
+ * This way we avoid all nasty races with the
+ * dirty bit in multiple places and clearing
+ * them concurrently from different threads.
+ *
+ * Note! Normally the "set_page_dirty(page)"
+ * has no effect on the actual dirty bit - since
+ * that will already usually be set. But we
+ * need the side effects, and it can help us
+ * avoid races.
+ *
+ * We basically use the page "master dirty bit"
+ * as a serialization point for all the different
+ * threads doing their things.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ /*
+ * We carefully synchronise fault handlers against
+ * installing a dirty pte and marking the page dirty
+ * at this point. We do this by having them hold the
+ * page lock at some point after installing their
+ * pte, but before marking the page dirty.
+ * Pages are always locked coming in here, so we get
+ * the desired exclusion. See mm/memory.c:do_wp_page()
+ * for more comments.
+ */
+ if (TestClearPageDirty(page)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ return 1;
+ }
+ return 0;
+ }
+ return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+/*
+ * Test-and-clear the page's writeback flag, keeping the bookkeeping in step.
+ *
+ * For a page with a mapping this happens under tree_lock (IRQ-safe); if the
+ * flag was set, the radix-tree PAGECACHE_TAG_WRITEBACK tag is cleared and,
+ * for devices that account writeback, BDI_WRITEBACK is decremented and the
+ * writeout completion is counted.  NR_WRITEBACK is decremented whenever the
+ * flag was set.
+ *
+ * Returns the previous state of the writeback flag.
+ */
+int test_clear_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct backing_dev_info *bdi;
+	unsigned long flags;
+	int was_set;
+
+	if (!mapping) {
+		was_set = TestClearPageWriteback(page);
+	} else {
+		bdi = mapping->backing_dev_info;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		was_set = TestClearPageWriteback(page);
+		if (was_set) {
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_account_writeback(bdi)) {
+				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	}
+
+	if (was_set)
+		dec_zone_page_state(page, NR_WRITEBACK);
+	return was_set;
+}
+
+/*
+ * Test-and-set the page's writeback flag, keeping the bookkeeping in step.
+ *
+ * For a page with a mapping this happens under tree_lock (IRQ-safe); if the
+ * flag was newly set, the radix-tree PAGECACHE_TAG_WRITEBACK tag is set and,
+ * for devices that account writeback, BDI_WRITEBACK is incremented.  In the
+ * same critical section the PAGECACHE_TAG_DIRTY tag is cleared when the page
+ * is no longer dirty.  NR_WRITEBACK is incremented whenever the flag was
+ * newly set.
+ *
+ * Returns the previous state of the writeback flag.
+ */
+int test_set_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct backing_dev_info *bdi;
+	unsigned long flags;
+	int was_set;
+
+	if (!mapping) {
+		was_set = TestSetPageWriteback(page);
+	} else {
+		bdi = mapping->backing_dev_info;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		was_set = TestSetPageWriteback(page);
+		if (!was_set) {
+			radix_tree_tag_set(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_account_writeback(bdi))
+				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+		}
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	}
+
+	if (!was_set)
+		inc_zone_page_state(page, NR_WRITEBACK);
+	return was_set;
+}
+EXPORT_SYMBOL(test_set_page_writeback);
+
+/*
+ * Report whether any page in @mapping carries the radix-tree tag @tag.
+ * The tree is queried inside an RCU read-side section; the result of
+ * radix_tree_tagged() is returned unchanged.
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	int tagged;
+
+	rcu_read_lock();
+	tagged = radix_tree_tagged(&mapping->page_tree, tag);
+	rcu_read_unlock();
+
+	return tagged;
+}
+EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
new file mode 100644
index 0000000..566ee21
--- /dev/null
+++ b/mm/page_alloc.c
@@ -0,0 +1,4779 @@
+/*
+ * linux/mm/page_alloc.c
+ *
+ * Manages the free list, the system allocates free pages here.
+ * Note that kmalloc() lives in slab.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/jiffies.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/oom.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nodemask.h>
+#include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
+#include <linux/stop_machine.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
+#include <linux/page-isolation.h>
+#include <linux/page_cgroup.h>
+#include <linux/debugobjects.h>
+
+#include <asm/tlbflush.h>
+#include <asm/div64.h>
+#include "internal.h"
+
+/*
+ * Array of node states.
+ *
+ * Initial values: every node is marked possible and node 0 is marked
+ * online.  Without CONFIG_NUMA, node 0 is additionally marked as having
+ * normal memory, high memory (only with CONFIG_HIGHMEM) and a CPU.
+ */
+nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
+ [N_POSSIBLE] = NODE_MASK_ALL,
+ [N_ONLINE] = { { [0] = 1UL } },
+#ifndef CONFIG_NUMA
+ [N_NORMAL_MEMORY] = { { [0] = 1UL } },
+#ifdef CONFIG_HIGHMEM
+ [N_HIGH_MEMORY] = { { [0] = 1UL } },
+#endif
+ [N_CPU] = { { [0] = 1UL } },
+#endif /* NUMA */
+};
+EXPORT_SYMBOL(node_states);
+
+unsigned long totalram_pages __read_mostly;
+unsigned long totalreserve_pages __read_mostly;
+long nr_swap_pages;
+int percpu_pagelist_fraction;
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+int pageblock_order __read_mostly;
+#endif
+
+static void __free_pages_ok(struct page *page, unsigned int order);
+
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ * 1G machine -> (16M dma, 784M normal, 224M high)
+ * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
+ *
+ * The array has MAX_NR_ZONES-1 entries; the entries for DMA, DMA32 and
+ * HIGHMEM are only present when the corresponding zone is configured.
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+#ifdef CONFIG_ZONE_DMA
+ 256,
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ 256,
+#endif
+#ifdef CONFIG_HIGHMEM
+ 32,
+#endif
+ 32,
+};
+
+EXPORT_SYMBOL(totalram_pages);
+
+/*
+ * Printable zone names.  "DMA", "DMA32" and "HighMem" are only present when
+ * the matching zone is configured; "Normal" and "Movable" always are.
+ * NOTE(review): presumably indexed by zone type - confirm against the
+ * zone_type enum.
+ */
+static char * const zone_names[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
+ "DMA",
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ "DMA32",
+#endif
+ "Normal",
+#ifdef CONFIG_HIGHMEM
+ "HighMem",
+#endif
+ "Movable",
+};
+
+int min_free_kbytes = 1024;
+
+unsigned long __meminitdata nr_kernel_pages;
+unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata dma_reserve;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * MAX_ACTIVE_REGIONS determines the maximum number of distinct
+ * ranges of memory (RAM) that may be registered with add_active_range().
+ * Ranges passed to add_active_range() will be merged if possible
+ * so the number of times add_active_range() can be called is
+ * related to the number of nodes and the number of holes
+ */
+ #ifdef CONFIG_MAX_ACTIVE_REGIONS
+ /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
+ #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+ #else
+ #if MAX_NUMNODES >= 32
+ /* If there can be many nodes, allow up to 50 holes per node */
+ #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
+ #else
+ /* By default, allow up to 256 distinct regions */
+ #define MAX_ACTIVE_REGIONS 256
+ #endif
+ #endif
+
+ static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+ static int __meminitdata nr_nodemap_entries;
+ static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+ static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
+ static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+ static unsigned long __initdata required_kernelcore;
+ static unsigned long __initdata required_movablecore;
+ static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+ /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+ int movable_zone;
+ EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+#if MAX_NUMNODES > 1
+int nr_node_ids __read_mostly = MAX_NUMNODES;
+EXPORT_SYMBOL(nr_node_ids);
+#endif
+
+int page_group_by_mobility_disabled __read_mostly;
+
+/*
+ * Record @migratetype for the pageblock containing @page by writing it into
+ * the pageblock flag bits PB_migrate..PB_migrate_end.
+ */
+static void set_pageblock_migratetype(struct page *page, int migratetype)
+{
+ set_pageblock_flags_group(page, (unsigned long)migratetype,
+ PB_migrate, PB_migrate_end);
+}
+
+#ifdef CONFIG_DEBUG_VM
+/*
+ * Return 1 if @page's pfn lies outside the span
+ * [zone_start_pfn, zone_start_pfn + spanned_pages) of @zone, 0 otherwise.
+ *
+ * The span is sampled under the zone span seqlock.  The verdict is reset at
+ * the start of every pass: a result computed from a snapshot that the
+ * seqlock subsequently rejects must not survive into the retry, otherwise a
+ * concurrent span update could produce a spurious "outside" answer.
+ */
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+ int ret;
+ unsigned seq;
+ unsigned long pfn = page_to_pfn(page);
+
+ do {
+ seq = zone_span_seqbegin(zone);
+ /* start from a clean verdict on every (re)read of the span */
+ ret = 0;
+ if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+ ret = 1;
+ else if (pfn < zone->zone_start_pfn)
+ ret = 1;
+ } while (zone_span_seqretry(zone, seq));
+
+ return ret;
+}
+
+/*
+ * A page is consistent with @zone when its pfn passes pfn_valid_within()
+ * and the page itself reports @zone as its zone.  Returns 1 if both hold,
+ * 0 otherwise.
+ */
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+ return pfn_valid_within(page_to_pfn(page)) && page_zone(page) == zone;
+}
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ * Returns 1 when @page is outside @zone's span or is not consistent with
+ * @zone, 0 when the page looks fine.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+ int outside = page_outside_zone_boundaries(zone, page);
+
+ return outside || !page_is_consistent(zone, page);
+}
+#else
+/* Without CONFIG_DEBUG_VM no range checking is done: always reports "ok". */
+static inline int bad_range(struct zone *zone, struct page *page)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Report a page found in an unexpected state and neutralise it.
+ *
+ * Logs the current task name and the page's flags, mapping, mapcount and
+ * reference count at KERN_EMERG, dumps a stack trace, then clears the
+ * PAGE_FLAGS_CLEAR_WHEN_BAD flag bits, zeroes the reference count, resets
+ * the mapcount, drops ->mapping and taints the kernel with TAINT_BAD_PAGE.
+ */
+static void bad_page(struct page *page)
+{
+ printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
+ "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
+ current->comm, page, (int)(2*sizeof(unsigned long)),
+ (unsigned long)page->flags, page->mapping,
+ page_mapcount(page), page_count(page));
+
+ printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+ KERN_EMERG "Backtrace:\n");
+ dump_stack();
+ page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
+ set_page_count(page, 0);
+ reset_page_mapcount(page);
+ page->mapping = NULL;
+ add_taint(TAINT_BAD_PAGE);
+}
+
+/*
+ * Higher-order pages are called "compound pages". They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set. All pages have their ->private pointing at
+ * the head page (even the head page has this).
+ *
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function. Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
+ */
+
+/*
+ * Destructor installed on compound pages by prep_compound_page(): frees the
+ * whole compound page at the order recorded in it.
+ */
+static void free_compound_page(struct page *page)
+{
+ __free_pages_ok(page, compound_order(page));
+}
+
+/*
+ * Turn the (1 << order) contiguous pages starting at @page into a compound
+ * page: install free_compound_page() as the destructor, record @order, mark
+ * @page as the head, and mark every following page as a tail whose
+ * ->first_page points back at @page.
+ */
+void prep_compound_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ set_compound_page_dtor(page, free_compound_page);
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ __SetPageTail(p);
+ p->first_page = page;
+ }
+}
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * Same set-up as prep_compound_page(), but the tail pages are reached with
+ * mem_map_next() instead of plain pointer arithmetic.
+ * NOTE(review): presumably because the struct pages of a gigantic page need
+ * not be contiguous - confirm against mem_map_next().
+ */
+void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ set_compound_page_dtor(page, free_compound_page);
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __SetPageTail(p);
+ p->first_page = page;
+ }
+}
+#endif
+
+/*
+ * Undo prep_compound_page() for a compound page being freed at @order.
+ *
+ * bad_page(@page) is reported if the recorded compound order differs from
+ * @order, if @page is not marked as head, or if any following page is not a
+ * tail pointing back at @page.  The head and tail marks are cleared whether
+ * or not a problem was reported.
+ */
+static void destroy_compound_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ if (unlikely(compound_order(page) != order))
+ bad_page(page);
+
+ if (unlikely(!PageHead(page)))
+ bad_page(page);
+ __ClearPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ if (unlikely(!PageTail(p) |
+ (p->first_page != page)))
+ bad_page(page);
+ __ClearPageTail(p);
+ }
+}
+
+/*
+ * Zero all (1 << order) pages starting at @page with clear_highpage().
+ * @gfp_flags is only inspected for the debug check below.
+ */
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+ int i;
+
+ /*
+ * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
+ * and __GFP_HIGHMEM from hard or soft interrupt context.
+ */
+ VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+ for (i = 0; i < (1 << order); i++)
+ clear_highpage(page + i);
+}
+
+/*
+ * Mark @page as the first page of a free block: store @order in
+ * page_private() and set PG_buddy.
+ */
+static inline void set_page_order(struct page *page, int order)
+{
+ set_page_private(page, order);
+ __SetPageBuddy(page);
+}
+
+/*
+ * Reverse of set_page_order(): clear PG_buddy and reset page_private()
+ * to 0.
+ */
+static inline void rmv_page_order(struct page *page)
+{
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+}
+
+/*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy1) and the combined O(n+1) page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ * B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (buddy2) is #8 its order
+ * 1 buddy is #10:
+ * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ * P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ *
+ * __page_find_buddy() implements (1): given @page and its index @page_idx,
+ * it returns the struct page of the order-@order buddy, reached by pointer
+ * arithmetic relative to @page.
+ */
+static inline struct page *
+__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
+{
+ unsigned long buddy_idx = page_idx ^ (1 << order);
+
+ return page + (buddy_idx - page_idx);
+}
+
+/*
+ * Return the index of the order-(@order + 1) block that contains the
+ * order-@order block at @page_idx, i.e. @page_idx with bit @order cleared.
+ */
+static inline unsigned long
+__find_combined_index(unsigned long page_idx, unsigned int order)
+{
+ return (page_idx & ~(1 << order));
+}
+
+/*
+ * This function checks whether @buddy is free and is a mergeable buddy of
+ * @page.  We can coalesce a page and its buddy if
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is in the buddy system &&
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
+ *
+ * For recording whether a page is in the buddy system, we use PG_buddy.
+ * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ *
+ * For recording page's order, we use page_private(page).
+ *
+ * Returns 1 if all of the conditions hold, 0 otherwise.  A buddy that
+ * passes the checks but has a non-zero reference count triggers BUG_ON().
+ */
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+ int order)
+{
+ if (!pfn_valid_within(page_to_pfn(buddy)))
+ return 0;
+
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
+ if (PageBuddy(buddy) && page_order(buddy) == order) {
+ BUG_ON(page_count(buddy) != 0);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep a list of pages, which are heads of continuous
+ * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * order is recorded in page_private(page) field.
+ * So when we are allocating or freeing one, we can derive the state of the
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.
+ *
+ * -- wli
+ */
+
+/*
+ * Put the block of (1 << order) pages starting at @page back on @zone's
+ * free lists, merging it with free buddies for as long as possible.
+ *
+ * NR_FREE_PAGES grows by the size of the block being freed.  While the
+ * order is below MAX_ORDER-1 and the current buddy passes page_is_buddy(),
+ * the buddy is taken off its free list and the two are combined into a block
+ * one order higher.  The resulting block is added to the free list selected
+ * by the migratetype of the pageblock that contained the original @page.
+ *
+ * The callers in this file (free_one_page(), free_pages_bulk()) hold
+ * zone->lock around the call.
+ */
+static inline void __free_one_page(struct page *page,
+ struct zone *zone, unsigned int order)
+{
+ unsigned long page_idx;
+ int order_size = 1 << order;
+ int migratetype = get_pageblock_migratetype(page);
+
+ if (unlikely(PageCompound(page)))
+ destroy_compound_page(page, order);
+
+ /* index of the page within its (1 << MAX_ORDER)-aligned pfn window */
+ page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+
+ VM_BUG_ON(page_idx & (order_size - 1));
+ VM_BUG_ON(bad_range(zone, page));
+
+ __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
+ while (order < MAX_ORDER-1) {
+ unsigned long combined_idx;
+ struct page *buddy;
+
+ buddy = __page_find_buddy(page, page_idx, order);
+ if (!page_is_buddy(page, buddy, order))
+ break;
+
+ /* Our buddy is free, merge with it and move up one order. */
+ list_del(&buddy->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(buddy);
+ combined_idx = __find_combined_index(page_idx, order);
+ page = page + (combined_idx - page_idx);
+ page_idx = combined_idx;
+ order++;
+ }
+ set_page_order(page, order);
+ list_add(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ zone->free_area[order].nr_free++;
+}
+
+/*
+ * Validate a page that is about to be freed.
+ *
+ * Calls free_page_mlock(), then reports bad_page() if the page is still
+ * mapped, still has a ->mapping, has a non-zero reference count or has any
+ * PAGE_FLAGS_CHECK_AT_FREE bit set.  PG_dirty and PG_swapbacked are cleared
+ * if set.
+ *
+ * Returns PageReserved(page); the callers in this file do not free the page
+ * when this is non-zero.
+ */
+static inline int free_pages_check(struct page *page)
+{
+ free_page_mlock(page);
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
+ bad_page(page);
+ if (PageDirty(page))
+ __ClearPageDirty(page);
+ if (PageSwapBacked(page))
+ __ClearPageSwapBacked(page);
+ /*
+ * For now, we report if PG_reserved was found set, but do not
+ * clear it, and do not free the page. But we shall soon need
+ * to do more, for when the ZERO_PAGE count wraps negative.
+ */
+ return PageReserved(page);
+}
+
+/*
+ * Frees a list of pages.
+ * Assumes all pages on list are in same zone, and of same order.
+ * count is the number of pages to free.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
+ *
+ * Pages are taken from the tail of @list (list->prev) and the whole batch
+ * is freed under a single hold of zone->lock.  The lock is taken with plain
+ * spin_lock(); the callers in this file disable interrupts beforehand.
+ */
+static void free_pages_bulk(struct zone *zone, int count,
+ struct list_head *list, int order)
+{
+ spin_lock(&zone->lock);
+ zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->pages_scanned = 0;
+ while (count--) {
+ struct page *page;
+
+ VM_BUG_ON(list_empty(list));
+ page = list_entry(list->prev, struct page, lru);
+ /* have to delete it as __free_one_page list manipulates */
+ list_del(&page->lru);
+ __free_one_page(page, zone, order);
+ }
+ spin_unlock(&zone->lock);
+}
+
+/*
+ * Free a single block of (1 << order) pages to @zone under zone->lock,
+ * clearing ZONE_ALL_UNRECLAIMABLE and the zone's pages_scanned counter
+ * first.  The lock is taken with plain spin_lock(); the caller in this file
+ * (__free_pages_ok()) disables interrupts beforehand.
+ */
+static void free_one_page(struct zone *zone, struct page *page, int order)
+{
+ spin_lock(&zone->lock);
+ zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->pages_scanned = 0;
+ __free_one_page(page, zone, order);
+ spin_unlock(&zone->lock);
+}
+
+/*
+ * Free a block of (1 << order) pages straight to the buddy lists.
+ *
+ * Every page in the block is run through free_pages_check(); if any of them
+ * is reported as reserved, the function returns without freeing anything.
+ * Otherwise the debug and arch hooks are called, PGFREE is accounted and the
+ * block is handed to free_one_page() with interrupts disabled.
+ */
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ int i;
+ int reserved = 0;
+
+ for (i = 0 ; i < (1 << order) ; ++i)
+ reserved += free_pages_check(page + i);
+ if (reserved)
+ return;
+
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
+ arch_free_page(page, order);
+ kernel_map_pages(page, 1 << order, 0);
+
+ local_irq_save(flags);
+ __count_vm_events(PGFREE, 1 << order);
+ free_one_page(page_zone(page), page, order);
+ local_irq_restore(flags);
+}
+
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ *
+ * Hands the (1 << order) pages starting at @page over to the page allocator.
+ * Every page in the block has PG_reserved cleared and its reference count
+ * set to 0; the first page is then passed to set_page_refcounted() and the
+ * block is released with __free_page()/__free_pages().
+ *
+ * The number of pages prepared is derived from @order, so the function is
+ * correct for any order and not only for blocks of exactly BITS_PER_LONG
+ * pages.
+ */
+void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
+{
+ if (order == 0) {
+ __ClearPageReserved(page);
+ set_page_count(page, 0);
+ set_page_refcounted(page);
+ __free_page(page);
+ } else {
+ unsigned int nr_pages = 1U << order;
+ unsigned int loop;
+
+ prefetchw(page);
+ for (loop = 0; loop < nr_pages; loop++) {
+ struct page *p = &page[loop];
+
+ /* warm the next struct page, but never one past the block */
+ if (loop + 1 < nr_pages)
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
+
+ set_page_refcounted(page);
+ __free_pages(page, order);
+ }
+}
+
+
+/*
+ * Split a free block of order @high, starting at @page, down to order @low.
+ * At each step the upper half of the current block is added to the
+ * @migratetype free list one order lower and tagged with that order; the
+ * lower half, which still starts at @page, is kept for the caller.  @area
+ * is stepped down together with @high; the callers in this file pass the
+ * free_area of order @high.
+ *
+ * The order of subdivision here is critical for the IO subsystem.
+ * Please do not alter this order without good reasons and regression
+ * testing. Specifically, as large blocks of memory are subdivided,
+ * the order in which smaller blocks are delivered depends on the order
+ * they're subdivided in this function. This is the primary factor
+ * influencing the order in which pages are delivered to the IO
+ * subsystem according to empirical testing, and this is also justified
+ * by considering the behavior of a buddy system containing a single
+ * large block of memory acted on by a series of small allocations.
+ * This behavior is a critical factor in sglist merging's success.
+ *
+ * -- wli
+ */
+static inline void expand(struct zone *zone, struct page *page,
+ int low, int high, struct free_area *area,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+
+ while (high > low) {
+ area--;
+ high--;
+ size >>= 1;
+ VM_BUG_ON(bad_range(zone, &page[size]));
+ list_add(&page[size].lru, &area->free_list[migratetype]);
+ area->nr_free++;
+ set_page_order(&page[size], high);
+ }
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ *
+ * bad_page() is reported if the page is mapped, has a ->mapping, has a
+ * non-zero reference count or has any PAGE_FLAGS_CHECK_AT_PREP bit set.
+ * Unless the page is reserved, a set of stale state flags is then cleared,
+ * page_private() is reset, set_page_refcounted() is called, the arch and
+ * debug hooks are run, and __GFP_ZERO / __GFP_COMP are honoured (__GFP_COMP
+ * only for order > 0).
+ *
+ * Returns 0 on success, or 1 if the page is PG_reserved and must not be
+ * handed out.
+ */
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
+ bad_page(page);
+
+ /*
+ * For now, we report if PG_reserved was found set, but do not
+ * clear it, and do not allocate the page: as a safety net.
+ */
+ if (PageReserved(page))
+ return 1;
+
+ page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
+ 1 << PG_referenced | 1 << PG_arch_1 |
+ 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
+#ifdef CONFIG_UNEVICTABLE_LRU
+ | 1 << PG_mlocked
+#endif
+ );
+ set_page_private(page, 0);
+ set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
+ kernel_map_pages(page, 1 << order, 1);
+
+ if (gfp_flags & __GFP_ZERO)
+ prep_zero_page(page, order, gfp_flags);
+
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
+
+ return 0;
+}
+
+/*
+ * Go through the free lists for the given migratetype and remove
+ * the smallest available page from the freelists
+ *
+ * Orders are scanned from @order upward.  The first block found is taken
+ * off its list, NR_FREE_PAGES is reduced by (1 << order), and expand() puts
+ * the unused remainder of a larger block back on the lower-order lists.
+ * Returns the first page of the block, or NULL if no list of this
+ * migratetype holds a block of sufficient order.
+ */
+static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+ int migratetype)
+{
+ unsigned int current_order;
+ struct free_area * area;
+ struct page *page;
+
+ /* Find a page of the appropriate size in the preferred list */
+ for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ area = &(zone->free_area[current_order]);
+ if (list_empty(&area->free_list[migratetype]))
+ continue;
+
+ page = list_entry(area->free_list[migratetype].next,
+ struct page, lru);
+ list_del(&page->lru);
+ rmv_page_order(page);
+ area->nr_free--;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
+ expand(zone, page, order, current_order, area, migratetype);
+ return page;
+ }
+
+ return NULL;
+}
+
+
+/*
+ * This array describes the order in which lists are fallen back to when
+ * the free lists for the desired migrate type are depleted.
+ *
+ * The row is the migrate type that was requested; the entries in the row
+ * are the migrate types to try instead, in order of preference.
+ */
+static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
+};
+
+/*
+ * Move the free pages in a range to the free lists of the requested type.
+ * Note that start_page and end_pages are not aligned on a pageblock
+ * boundary. If alignment is required, use move_freepages_block()
+ *
+ * Walks [start_page, end_page] inclusive.  Pages whose pfn is not valid, or
+ * which are not PageBuddy, are stepped over one at a time; each free block
+ * is relinked onto the @migratetype list of its own order and stepped over
+ * as a whole.  Returns the number of pages moved.
+ */
+static int move_freepages(struct zone *zone,
+ struct page *start_page, struct page *end_page,
+ int migratetype)
+{
+ struct page *page;
+ unsigned long order;
+ int pages_moved = 0;
+
+#ifndef CONFIG_HOLES_IN_ZONE
+ /*
+ * page_zone is not safe to call in this context when
+ * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
+ * anyway as we check zone boundaries in move_freepages_block().
+ * Remove at a later date when no bug reports exist related to
+ * grouping pages by mobility
+ */
+ BUG_ON(page_zone(start_page) != page_zone(end_page));
+#endif
+
+ for (page = start_page; page <= end_page;) {
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+
+ if (!pfn_valid_within(page_to_pfn(page))) {
+ page++;
+ continue;
+ }
+
+ if (!PageBuddy(page)) {
+ page++;
+ continue;
+ }
+
+ order = page_order(page);
+ list_del(&page->lru);
+ list_add(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ page += 1 << order;
+ pages_moved += 1 << order;
+ }
+
+ return pages_moved;
+}
+
+/*
+ * Move the free pages of the pageblock containing @page to the @migratetype
+ * free lists.
+ *
+ * The block is clamped against the zone span: if the pageblock begins
+ * before the zone does, the walk starts at @page instead of the block
+ * start; if it ends at or beyond the end of the span, nothing is moved and
+ * 0 is returned.  Otherwise returns whatever move_freepages() reports.
+ */
+static int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *start_page, *end_page;
+
+ start_pfn = page_to_pfn(page);
+ start_pfn = start_pfn & ~(pageblock_nr_pages-1);
+ start_page = pfn_to_page(start_pfn);
+ end_page = start_page + pageblock_nr_pages - 1;
+ end_pfn = start_pfn + pageblock_nr_pages - 1;
+
+ /* Do not cross zone boundaries */
+ if (start_pfn < zone->zone_start_pfn)
+ start_page = page;
+ if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
+ return 0;
+
+ return move_freepages(zone, start_page, end_page, migratetype);
+}
+
+/*
+ * Remove an element from the buddy allocator from the fallback list
+ *
+ * Used when the lists of @start_migratetype cannot satisfy the request.
+ * Orders are scanned from MAX_ORDER-1 down to @order and, for each order,
+ * the alternative migrate types listed in fallbacks[@start_migratetype] are
+ * tried in turn (MIGRATE_RESERVE is skipped here).  The first block found is
+ * removed, optionally stealing the rest of its pageblock for
+ * @start_migratetype, and split with expand().  If nothing is found,
+ * MIGRATE_RESERVE is tried through __rmqueue_smallest(), whose result
+ * (possibly NULL) is returned.
+ */
+static struct page *__rmqueue_fallback(struct zone *zone, int order,
+ int start_migratetype)
+{
+ struct free_area * area;
+ int current_order;
+ struct page *page;
+ int migratetype, i;
+
+ /* Find the largest possible block of pages in the other list */
+ for (current_order = MAX_ORDER-1; current_order >= order;
+ --current_order) {
+ for (i = 0; i < MIGRATE_TYPES - 1; i++) {
+ migratetype = fallbacks[start_migratetype][i];
+
+ /* MIGRATE_RESERVE handled later if necessary */
+ if (migratetype == MIGRATE_RESERVE)
+ continue;
+
+ area = &(zone->free_area[current_order]);
+ if (list_empty(&area->free_list[migratetype]))
+ continue;
+
+ page = list_entry(area->free_list[migratetype].next,
+ struct page, lru);
+ area->nr_free--;
+
+ /*
+ * If breaking a large block of pages, move all free
+ * pages to the preferred allocation list. If falling
+ * back for a reclaimable kernel allocation, be more
+ * aggressive about taking ownership of free pages
+ */
+ if (unlikely(current_order >= (pageblock_order >> 1)) ||
+ start_migratetype == MIGRATE_RECLAIMABLE) {
+ unsigned long pages;
+ pages = move_freepages_block(zone, page,
+ start_migratetype);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)))
+ set_pageblock_migratetype(page,
+ start_migratetype);
+
+ migratetype = start_migratetype;
+ }
+
+ /* Remove the page from the freelists */
+ list_del(&page->lru);
+ rmv_page_order(page);
+ __mod_zone_page_state(zone, NR_FREE_PAGES,
+ -(1UL << order));
+
+ if (current_order == pageblock_order)
+ set_pageblock_migratetype(page,
+ start_migratetype);
+
+ expand(zone, page, order, current_order, area, migratetype);
+ return page;
+ }
+ }
+
+ /* Use MIGRATE_RESERVE rather than fail an allocation */
+ return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
+}
+
+/*
+ * Take a block of (1 << order) pages off @zone's free lists: first from the
+ * lists of @migratetype, then, if those are empty, via the fallback path.
+ * The caller must already hold zone->lock.  Returns NULL if both fail.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order,
+ int migratetype)
+{
+ struct page *found = __rmqueue_smallest(zone, order, migratetype);
+
+ if (likely(found))
+ return found;
+
+ return __rmqueue_fallback(zone, order, migratetype);
+}
+
+/*
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency. Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ *
+ * Stops early if __rmqueue() fails, so the return value may be smaller than
+ * @count.  Each page has @migratetype stored in page_private().
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ unsigned long count, struct list_head *list,
+ int migratetype)
+{
+ int i;
+
+ spin_lock(&zone->lock);
+ for (i = 0; i < count; ++i) {
+ struct page *page = __rmqueue(zone, order, migratetype);
+ if (unlikely(page == NULL))
+ break;
+
+ /*
+ * Split buddy pages returned by expand() are received here
+ * in physical page order. The page is added to the caller's
+ * list and the list head then moves forward. From the caller's
+ * perspective, the linked list is ordered by page number in
+ * some conditions. This is useful for IO devices that can
+ * merge IO requests if the physical pages are ordered
+ * properly.
+ */
+ list_add(&page->lru, list);
+ set_page_private(page, migratetype);
+ list = &page->lru;
+ }
+ spin_unlock(&zone->lock);
+ return i;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
+ * At most one batch (pcp->batch pages) is returned to the buddy allocator
+ * per call; fewer if the per-cpu list holds less than that.
+ *
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
+ */
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
+{
+ unsigned long irq_state;
+ int nr;
+
+ local_irq_save(irq_state);
+ nr = (pcp->count >= pcp->batch) ? pcp->batch : pcp->count;
+ free_pages_bulk(zone, nr, &pcp->list, 0);
+ pcp->count -= nr;
+ local_irq_restore(irq_state);
+}
+#endif
+
+/*
+ * Drain pages of the indicated processor.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ *
+ * For every populated zone, all pages on @cpu's per-cpu list are returned
+ * to the buddy allocator with interrupts disabled and the list's count is
+ * reset to 0.
+ */
+static void drain_pages(unsigned int cpu)
+{
+ unsigned long flags;
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ if (!populated_zone(zone))
+ continue;
+
+ pset = zone_pcp(zone, cpu);
+
+ pcp = &pset->pcp;
+ local_irq_save(flags);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
+ local_irq_restore(flags);
+ }
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ *
+ * @arg is unused; the signature allows the function to be passed to
+ * on_each_cpu() (see drain_all_pages()).
+ */
+void drain_local_pages(void *arg)
+{
+ drain_pages(smp_processor_id());
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * by running drain_local_pages() on each CPU via on_each_cpu().
+ */
+void drain_all_pages(void)
+{
+ on_each_cpu(drain_local_pages, NULL, 1);
+}
+
+#ifdef CONFIG_HIBERNATION
+
+/*
+ * Refresh the swsusp "free" marks for @zone.
+ *
+ * Under zone->lock with interrupts disabled: first the mark is removed from
+ * every valid page in the zone's span that is not swsusp-forbidden, then it
+ * is set on every page of every block currently on the zone's free lists
+ * (all orders, all migrate types).  Zones with no spanned pages are skipped.
+ */
+void mark_free_pages(struct zone *zone)
+{
+ unsigned long pfn, max_zone_pfn;
+ unsigned long flags;
+ int order, t;
+ struct list_head *curr;
+
+ if (!zone->spanned_pages)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each(curr, &zone->free_area[order].free_list[t]) {
+ unsigned long i;
+
+ pfn = page_to_pfn(list_entry(curr, struct page, lru));
+ for (i = 0; i < (1UL << order); i++)
+ swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif /* CONFIG_HIBERNATION */
+
+/*
+ * Free a 0-order page
+ *
+ * The page is validated with free_pages_check() (and silently not freed if
+ * that reports it as reserved), then placed on the current CPU's per-cpu
+ * list for its zone: at the head when @cold is 0, at the tail otherwise.
+ * The migratetype of the page's pageblock is stored in page_private().
+ * Once the list holds pcp->high pages or more, one batch (pcp->batch pages)
+ * is returned to the buddy allocator.
+ */
+static void free_hot_cold_page(struct page *page, int cold)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ unsigned long flags;
+
+ if (PageAnon(page))
+ page->mapping = NULL;
+ if (free_pages_check(page))
+ return;
+
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
+ }
+ arch_free_page(page, 0);
+ kernel_map_pages(page, 1, 0);
+
+ pcp = &zone_pcp(zone, get_cpu())->pcp;
+ local_irq_save(flags);
+ __count_vm_event(PGFREE);
+ if (cold)
+ list_add_tail(&page->lru, &pcp->list);
+ else
+ list_add(&page->lru, &pcp->list);
+ set_page_private(page, get_pageblock_migratetype(page));
+ pcp->count++;
+ if (pcp->count >= pcp->high) {
+ free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ pcp->count -= pcp->batch;
+ }
+ local_irq_restore(flags);
+ put_cpu();
+}
+
+/* Free a 0-order page as "hot": it goes to the head of the per-cpu list. */
+void free_hot_page(struct page *page)
+{
+ free_hot_cold_page(page, 0);
+}
+
+/* Free a 0-order page as "cold": it goes to the tail of the per-cpu list. */
+void free_cold_page(struct page *page)
+{
+ free_hot_cold_page(page, 1);
+}
+
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n-1]
+ * Each sub-page must be freed individually.
+ *
+ * The first sub-page must already hold a reference (checked under
+ * CONFIG_DEBUG_VM); set_page_refcounted() is called on each of the others.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
+{
+ int i;
+
+ VM_BUG_ON(PageCompound(page));
+ VM_BUG_ON(!page_count(page));
+ for (i = 1; i < (1 << order); i++)
+ set_page_refcounted(page + i);
+}
+
+/*
+ * Allocate a block of (1 << order) pages from @zone.
+ *
+ * Order-0 requests are served from the current CPU's per-cpu list with
+ * interrupts disabled: the list is refilled with rmqueue_bulk() when empty,
+ * then searched for a page whose page_private() equals the requested
+ * migratetype - from the tail for __GFP_COLD requests, from the head
+ * otherwise.  If no page matches, another batch is pulled in and the page at
+ * the head of the list is taken.  Higher orders go straight to __rmqueue()
+ * under zone->lock.
+ *
+ * The page is then passed to prep_new_page(); if that rejects it (non-zero
+ * return), the whole allocation is retried.  Returns NULL if no page could
+ * be obtained.  @preferred_zone is only used for zone_statistics().
+ *
+ * NOTE(review): the original comment here said prep_compound_page() is
+ * called from this function and referred to "__rmqueue_bulk()"; in this file
+ * prep_compound_page() is invoked by prep_new_page() and the bulk helper is
+ * named rmqueue_bulk().
+ */
+static struct page *buffered_rmqueue(struct zone *preferred_zone,
+ struct zone *zone, int order, gfp_t gfp_flags)
+{
+ unsigned long flags;
+ struct page *page;
+ int cold = !!(gfp_flags & __GFP_COLD);
+ int cpu;
+ int migratetype = allocflags_to_migratetype(gfp_flags);
+
+again:
+ cpu = get_cpu();
+ if (likely(order == 0)) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_pcp(zone, cpu)->pcp;
+ local_irq_save(flags);
+ if (!pcp->count) {
+ pcp->count = rmqueue_bulk(zone, 0,
+ pcp->batch, &pcp->list, migratetype);
+ if (unlikely(!pcp->count))
+ goto failed;
+ }
+
+ /* Find a page of the appropriate migrate type */
+ if (cold) {
+ list_for_each_entry_reverse(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
+ } else {
+ list_for_each_entry(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
+ }
+
+ /* Allocate more to the pcp list if necessary */
+ if (unlikely(&page->lru == &pcp->list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, &pcp->list, migratetype);
+ page = list_entry(pcp->list.next, struct page, lru);
+ }
+
+ list_del(&page->lru);
+ pcp->count--;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ page = __rmqueue(zone, order, migratetype);
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
+ }
+
+ __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ zone_statistics(preferred_zone, zone);
+ local_irq_restore(flags);
+ put_cpu();
+
+ VM_BUG_ON(bad_range(zone, page));
+ if (prep_new_page(page, order, gfp_flags))
+ goto again;
+ return page;
+
+failed:
+ local_irq_restore(flags);
+ put_cpu();
+ return NULL;
+}
+
+#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
+#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+/*
+ * Fault-injection settings for page allocation.
+ *
+ * @attr is the generic fault attribute.  The remaining fields narrow which
+ * allocations may be failed (see should_fail_alloc_page()): requests with
+ * __GFP_HIGHMEM or __GFP_WAIT are exempt while the corresponding ignore_*
+ * field is non-zero, and requests below @min_order are always exempt.  All
+ * three default to 1.  With CONFIG_FAULT_INJECTION_DEBUG_FS the *_file
+ * members hold the debugfs entries that expose these fields.
+ */
+static struct fail_page_alloc_attr {
+ struct fault_attr attr;
+
+ u32 ignore_gfp_highmem;
+ u32 ignore_gfp_wait;
+ u32 min_order;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+ struct dentry *ignore_gfp_highmem_file;
+ struct dentry *ignore_gfp_wait_file;
+ struct dentry *min_order_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .ignore_gfp_highmem = 1,
+ .min_order = 1,
+};
+
+/*
+ * Handler for the "fail_page_alloc=" boot parameter: passes the option
+ * string to setup_fault_attr() for fail_page_alloc.attr and returns its
+ * result.
+ */
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+/*
+ * Decide whether fault injection should make this allocation fail.
+ *
+ * Requests below fail_page_alloc.min_order, __GFP_NOFAIL requests, and
+ * (when the matching ignore_* knob is set) __GFP_HIGHMEM / __GFP_WAIT
+ * requests are never failed. Everything else is left to should_fail(),
+ * which is told the request size in pages.
+ */
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	const int exempt =
+		order < fail_page_alloc.min_order ||
+		(gfp_mask & __GFP_NOFAIL) ||
+		(fail_page_alloc.ignore_gfp_highmem &&
+		 (gfp_mask & __GFP_HIGHMEM)) ||
+		(fail_page_alloc.ignore_gfp_wait &&
+		 (gfp_mask & __GFP_WAIT));
+
+	if (exempt)
+		return 0;
+
+	return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+/*
+ * Create the debugfs directory for fail_page_alloc and the three files
+ * exposing its knobs. Returns 0 on success, the error from
+ * init_fault_attr_dentries() if the directory cannot be set up, or -ENOMEM
+ * (after removing whatever was created) if any file is missing.
+ */
+static int __init fail_page_alloc_debugfs(void)
+{
+	const mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *parent;
+	int ret;
+
+	ret = init_fault_attr_dentries(&fail_page_alloc.attr,
+				       "fail_page_alloc");
+	if (ret)
+		return ret;
+	parent = fail_page_alloc.attr.dentries.dir;
+
+	fail_page_alloc.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, parent,
+				    &fail_page_alloc.ignore_gfp_wait);
+
+	fail_page_alloc.ignore_gfp_highmem_file =
+		debugfs_create_bool("ignore-gfp-highmem", mode, parent,
+				    &fail_page_alloc.ignore_gfp_highmem);
+
+	fail_page_alloc.min_order_file =
+		debugfs_create_u32("min-order", mode, parent,
+				   &fail_page_alloc.min_order);
+
+	/* All three files exist: done */
+	if (fail_page_alloc.ignore_gfp_wait_file &&
+	    fail_page_alloc.ignore_gfp_highmem_file &&
+	    fail_page_alloc.min_order_file)
+		return 0;
+
+	/* At least one file is missing: tear everything down again */
+	debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+	debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+	debugfs_remove(fail_page_alloc.min_order_file);
+	cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+
+	return -ENOMEM;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return 0; /* CONFIG_FAIL_PAGE_ALLOC is off: never force a failure */
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
+/*
+ * Return 1 if free pages are above 'mark'. This takes into account the order
+ * of the allocation.
+ *
+ * @z: zone to check
+ * @order: order of the allocation being attempted
+ * @mark: watermark, in pages, to test against
+ * @classzone_idx: index into z->lowmem_reserve[] added on top of the mark
+ * @alloc_flags: ALLOC_HIGH and ALLOC_HARDER each lower the effective mark
+ *
+ * Returns 0 as soon as any of the checks below fails, 1 otherwise.
+ */
+int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ /* free_pages may go negative - that's OK */
+ long min = mark;
+ long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+ int o;
+
+ if (alloc_flags & ALLOC_HIGH)
+ min -= min / 2;
+ if (alloc_flags & ALLOC_HARDER)
+ min -= min / 4;
+
+ if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ return 0;
+ for (o = 0; o < order; o++) {
+ /* At the next order, this order's pages become unavailable */
+ free_pages -= z->free_area[o].nr_free << o;
+
+ /* Require fewer higher order pages to be free */
+ min >>= 1;
+
+ if (free_pages <= min)
+ return 0;
+ }
+ return 1;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full. See further
+ * comments in mmzone.h. Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *cache = zonelist->zlcache_ptr;
+
+	/* No zonelist cache attached to this zonelist: nothing to set up */
+	if (!cache)
+		return NULL;
+
+	/* Drop the "full" marks once they are more than a second old */
+	if (time_after(jiffies, cache->last_full_zap + HZ)) {
+		bitmap_zero(cache->fullzones, MAX_ZONES_PER_ZONELIST);
+		cache->last_full_zap = jiffies;
+	}
+
+	/* Cpuset restrictions only apply outside interrupt context */
+	if (!in_interrupt() && (alloc_flags & ALLOC_CPUSET))
+		return &cpuset_current_mems_allowed;
+
+	return &node_states[N_HIGH_MEMORY];
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ * bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zones node (obtained from the zonelist_cache
+ * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *cache = zonelist->zlcache_ptr;
+	int idx;	/* position of *z within the zonelist */
+	int nid;	/* node the zone lives on, per the cache */
+
+	/* Without a cache there is nothing to rule the zone out */
+	if (!cache)
+		return 1;
+
+	idx = z - zonelist->_zonerefs;
+	nid = cache->z_to_n[idx];
+
+	/* Zones on nodes that are not allowed are never worth trying */
+	if (!node_isset(nid, *allowednodes))
+		return 0;
+
+	/* Allowed node: worth trying unless the zone is marked full */
+	return !test_bit(idx, cache->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
+{
+	struct zonelist_cache *cache = zonelist->zlcache_ptr;
+	int idx;	/* position of *z within the zonelist */
+
+	if (cache) {
+		idx = z - zonelist->_zonerefs;
+		set_bit(idx, cache->fullzones);
+	}
+}
+
+#else /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ return NULL; /* !CONFIG_NUMA stub: no allowed-nodes mask to hand out */
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
+ nodemask_t *allowednodes)
+{
+ return 1; /* !CONFIG_NUMA stub: every zone is worth trying */
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
+{ /* !CONFIG_NUMA stub: nothing to record */
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+ *
+ * Zones are visited in zonelist order, limited by high_zoneidx and nodemask.
+ * A zone is passed over when the zonelist cache says it is not worth trying,
+ * when ALLOC_CPUSET is set and the cpuset does not allow it, or when it fails
+ * the watermark selected by alloc_flags and zone reclaim is disabled or
+ * returns 0. The first zone from which buffered_rmqueue() yields a page wins.
+ *
+ * On NUMA builds, if the scan that used the zonelist cache found nothing,
+ * the zonelist is scanned once more with the cache disabled.
+ *
+ * Returns the page, or NULL if there is no preferred zone or no zone
+ * produced a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+ struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
+{
+ struct zoneref *z;
+ struct page *page = NULL;
+ int classzone_idx;
+ struct zone *zone, *preferred_zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
+
+ (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+ &preferred_zone);
+ if (!preferred_zone)
+ return NULL;
+
+ classzone_idx = zone_idx(preferred_zone);
+
+zonelist_scan:
+ /*
+ * Scan zonelist, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask) {
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+ if ((alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ goto try_next_zone;
+
+ if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+ unsigned long mark;
+ if (alloc_flags & ALLOC_WMARK_MIN)
+ mark = zone->pages_min;
+ else if (alloc_flags & ALLOC_WMARK_LOW)
+ mark = zone->pages_low;
+ else
+ mark = zone->pages_high; /* neither MIN nor LOW was requested */
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
+ if (!zone_reclaim_mode ||
+ !zone_reclaim(zone, gfp_mask, order))
+ goto this_zone_full; /* zone reclaim is off or returned 0 */
+ }
+ }
+
+ page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
+ if (page)
+ break;
+this_zone_full:
+ if (NUMA_BUILD)
+ zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+ if (NUMA_BUILD && !did_zlc_setup) {
+ /* we do zlc_setup after the first zone is tried */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
+ }
+
+ if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ goto zonelist_scan;
+ }
+ return page;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ *
+ * Tries to allocate 2^order pages from 'zonelist', escalating step by step:
+ *  1. a scan against the pages_low watermark with cpuset checks;
+ *  2. (NUMA builds: GFP_THISNODE requests give up here) wake kswapd on
+ *     every eligible zone, then scan against pages_min, relaxed by
+ *     ALLOC_HARDER / ALLOC_HIGH where the caller qualifies;
+ *  3. for PF_MEMALLOC or TIF_MEMDIE tasks outside interrupt context, a scan
+ *     that ignores watermarks (unless __GFP_NOMEMALLOC), repeated forever
+ *     for __GFP_NOFAIL;
+ *  4. for callers that may wait, direct reclaim via try_to_free_pages(),
+ *     possibly the OOM killer, and retries governed by __GFP_NORETRY,
+ *     __GFP_REPEAT, __GFP_NOFAIL and the allocation order.
+ *
+ * Returns the allocated page or NULL. A failure that reaches the nopage
+ * label is reported with a rate-limited warning unless __GFP_NOWARN is set;
+ * a failure injected by should_fail_alloc_page() returns NULL silently.
+ */
+struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
+{
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ struct reclaim_state reclaim_state;
+ struct task_struct *p = current;
+ int do_retry;
+ int alloc_flags;
+ unsigned long did_some_progress;
+ unsigned long pages_reclaimed = 0;
+
+ might_sleep_if(wait);
+
+ if (should_fail_alloc_page(gfp_mask, order))
+ return NULL;
+
+restart:
+ z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
+
+ if (unlikely(!z->zone)) {
+ /*
+ * Happens if we have an empty zonelist as a result of
+ * GFP_THISNODE being used on a memoryless node
+ */
+ return NULL;
+ }
+
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
+
+ /*
+ * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+ * __GFP_NOWARN set) should not cause reclaim since the subsystem
+ * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+ * using a larger set of nodes after it has established that the
+ * allowed per node queues are empty and that nodes are
+ * over allocated.
+ */
+ if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ goto nopage;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order);
+
+ /*
+ * OK, we're below the kswapd watermark and have kicked background
+ * reclaim. Now things get more complex, so set up alloc_flags according
+ * to how we want to proceed.
+ *
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or if the caller has realtime scheduling
+ * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+ * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ */
+ alloc_flags = ALLOC_WMARK_MIN;
+ if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_HIGH)
+ alloc_flags |= ALLOC_HIGH;
+ if (wait)
+ alloc_flags |= ALLOC_CPUSET;
+
+ /*
+ * Go through the zonelist again. Let __GFP_HIGH and allocations
+ * coming from realtime tasks go deeper into reserves.
+ *
+ * This is the last chance, in general, before the goto nopage.
+ * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+ high_zoneidx, alloc_flags);
+ if (page)
+ goto got_pg;
+
+ /* This allocation should allow future memory freeing. */
+
+rebalance:
+ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+ && !in_interrupt()) {
+ if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
+ /* go through the zonelist yet again, ignoring mins */
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+ if (page)
+ goto got_pg;
+ if (gfp_mask & __GFP_NOFAIL) {
+ congestion_wait(WRITE, HZ/50);
+ goto nofail_alloc;
+ }
+ }
+ goto nopage;
+ }
+
+ /* Atomic allocations - we can't balance anything */
+ if (!wait)
+ goto nopage;
+
+ cond_resched();
+
+ /* We now go into synchronous reclaim */
+ cpuset_memory_pressure_bump();
+ /*
+ * The task's cpuset might have expanded its set of allowable nodes
+ */
+ cpuset_update_task_memory_state();
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+
+ p->reclaim_state = NULL;
+ p->flags &= ~PF_MEMALLOC;
+
+ cond_resched();
+
+ if (order != 0)
+ drain_all_pages();
+
+ if (likely(did_some_progress)) {
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, alloc_flags);
+ if (page)
+ goto got_pg;
+ } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ if (!try_set_zone_oom(zonelist, gfp_mask)) {
+ schedule_timeout_uninterruptible(1);
+ goto restart;
+ }
+
+ /*
+ * Go through the zonelist yet one more time, keep
+ * very high watermark here, this is only to catch
+ * a parallel oom killing, we must fail if we're still
+ * under heavy pressure.
+ */
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ order, zonelist, high_zoneidx,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ if (page) {
+ clear_zonelist_oom(zonelist, gfp_mask);
+ goto got_pg;
+ }
+
+ /* The OOM killer will not help higher order allocs so fail */
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ clear_zonelist_oom(zonelist, gfp_mask);
+ goto nopage;
+ }
+
+ out_of_memory(zonelist, gfp_mask, order);
+ clear_zonelist_oom(zonelist, gfp_mask);
+ goto restart;
+ }
+
+ /*
+ * Don't let big-order allocations loop unless the caller explicitly
+ * requests that. Wait for some write requests to complete then retry.
+ *
+ * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+ * means __GFP_NOFAIL, but that may not be true in other
+ * implementations.
+ *
+ * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+ * specified, then we retry until we no longer reclaim any pages
+ * (above), or we've reclaimed an order of pages at least as
+ * large as the allocation's order. In both cases, if the
+ * allocation still fails, we stop retrying.
+ */
+ pages_reclaimed += did_some_progress;
+ do_retry = 0;
+ if (!(gfp_mask & __GFP_NORETRY)) {
+ if (order <= PAGE_ALLOC_COSTLY_ORDER) {
+ do_retry = 1;
+ } else {
+ if (gfp_mask & __GFP_REPEAT &&
+ pages_reclaimed < (1 << order))
+ do_retry = 1;
+ }
+ if (gfp_mask & __GFP_NOFAIL)
+ do_retry = 1;
+ }
+ if (do_retry) {
+ congestion_wait(WRITE, HZ/50);
+ goto rebalance;
+ }
+
+nopage:
+ if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+ printk(KERN_WARNING "%s: page allocation failure."
+ " order:%d, mode:0x%x\n",
+ p->comm, order, gfp_mask);
+ dump_stack();
+ show_mem();
+ }
+got_pg:
+ return page;
+}
+EXPORT_SYMBOL(__alloc_pages_internal);
+
+/*
+ * Common helper functions.
+ */
+/*
+ * Allocate 2^order pages and return the address page_address() reports for
+ * the first one, or 0 if the allocation fails.
+ */
+unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page *first = alloc_pages(gfp_mask, order);
+
+	return first ? (unsigned long) page_address(first) : 0;
+}
+
+EXPORT_SYMBOL(__get_free_pages);
+
+/*
+ * Allocate a single page with __GFP_ZERO added to the mask and return its
+ * address, or 0 if the allocation fails.
+ */
+unsigned long get_zeroed_page(gfp_t gfp_mask)
+{
+	struct page *zeroed;
+
+	/*
+	 * The return value is an address, and the original comment notes
+	 * that it cannot represent a highmem page, so such requests are
+	 * rejected up front.
+	 */
+	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+
+	zeroed = alloc_pages(gfp_mask | __GFP_ZERO, 0);
+	if (!zeroed)
+		return 0;
+
+	return (unsigned long) page_address(zeroed);
+}
+
+EXPORT_SYMBOL(get_zeroed_page);
+
+/*
+ * Free every page held in the pagevec, last entry first, passing the
+ * pagevec's cold flag along with each page.
+ */
+void __pagevec_free(struct pagevec *pvec)
+{
+	int remaining = pagevec_count(pvec);
+
+	while (remaining-- > 0)
+		free_hot_cold_page(pvec->pages[remaining], pvec->cold);
+}
+
+/*
+ * Drop one reference on the page; when put_page_testzero() reports that it
+ * was the last one, free the 2^order block. Order-0 pages take the
+ * free_hot_page() path, larger blocks go to __free_pages_ok().
+ */
+void __free_pages(struct page *page, unsigned int order)
+{
+	/* Other references remain: nothing to free yet */
+	if (!put_page_testzero(page))
+		return;
+
+	if (order)
+		__free_pages_ok(page, order);
+	else
+		free_hot_page(page);
+}
+
+EXPORT_SYMBOL(__free_pages);
+
+/*
+ * Address-based counterpart of __free_pages(). A zero address is ignored.
+ */
+void free_pages(unsigned long addr, unsigned int order)
+{
+	void *vaddr = (void *)addr;
+
+	if (addr == 0)
+		return;
+
+	VM_BUG_ON(!virt_addr_valid(vaddr));
+	__free_pages(virt_to_page(vaddr), order);
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/**
+ * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * This function is similar to alloc_pages(), except that it allocates the
+ * minimum number of pages to satisfy the request. alloc_pages() can only
+ * allocate memory in power-of-two pages.
+ *
+ * This function is also limited by MAX_ORDER.
+ *
+ * Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Returns the address of the allocation, or NULL if the underlying
+ * __get_free_pages() call fails.
+ */
+void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
+{
+	unsigned int order = get_order(size);
+	unsigned long addr;
+
+	addr = __get_free_pages(gfp_mask, order);
+	if (addr) {
+		unsigned long alloc_end = addr + (PAGE_SIZE << order);
+		unsigned long used = addr + PAGE_ALIGN(size);
+
+		/*
+		 * Split the high-order block into single pages so that the
+		 * unused tail can be handed back one page at a time.
+		 * virt_to_page() is given a pointer, as free_pages() does.
+		 */
+		split_page(virt_to_page((void *)addr), order);
+		while (used < alloc_end) {
+			free_page(used);
+			used += PAGE_SIZE;
+		}
+	}
+
+	return (void *)addr;
+}
+EXPORT_SYMBOL(alloc_pages_exact);
+
+/**
+ * free_pages_exact - release memory allocated via alloc_pages_exact()
+ * @virt: the value returned by alloc_pages_exact.
+ * @size: size of allocation, same value as passed to alloc_pages_exact().
+ *
+ * Release the memory allocated by a previous call to alloc_pages_exact.
+ */
+void free_pages_exact(void *virt, size_t size)
+{
+	unsigned long cur = (unsigned long)virt;
+	const unsigned long stop = cur + PAGE_ALIGN(size);
+
+	/* Release the page-aligned range one page at a time */
+	for (; cur < stop; cur += PAGE_SIZE)
+		free_page(cur);
+}
+EXPORT_SYMBOL(free_pages_exact);
+
+/*
+ * Sum, over the zones of the current node's GFP_KERNEL zonelist up to zone
+ * index 'offset', the number of present pages above each zone's pages_high
+ * watermark. Zones with present_pages <= pages_high contribute nothing.
+ */
+static unsigned int nr_free_zone_pages(int offset)
+{
+	/* Just pick one node, since fallback list is circular */
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
+	struct zoneref *ref;
+	struct zone *zone;
+	unsigned int total = 0;
+
+	for_each_zone_zonelist(zone, ref, zonelist, offset) {
+		unsigned long present = zone->present_pages;
+		unsigned long reserve = zone->pages_high;
+
+		if (present <= reserve)
+			continue;
+		total += present - reserve;
+	}
+
+	return total;
+}
+
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_free_buffer_pages(void)
+{
+	/* Count zones up to the highest one a GFP_USER request may use */
+	enum zone_type highest = gfp_zone(GFP_USER);
+
+	return nr_free_zone_pages(highest);
+}
+EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
+
+/*
+ * Amount of free RAM allocatable within all zones
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+	/* Count zones up to the highest one GFP_HIGHUSER_MOVABLE may use */
+	enum zone_type highest = gfp_zone(GFP_HIGHUSER_MOVABLE);
+
+	return nr_free_zone_pages(highest);
+}
+
+/* On NUMA builds, print a "Node <id> " prefix for the zone's node. */
+static inline void show_node(struct zone *zone)
+{
+	if (!NUMA_BUILD)
+		return;
+
+	printk("Node %d ", zone_to_nid(zone));
+}
+
+/*
+ * Fill in the system-wide memory fields of *val. All page counts are
+ * reported in units of PAGE_SIZE; sharedram is always reported as 0.
+ */
+void si_meminfo(struct sysinfo *val)
+{
+	/* Totals */
+	val->totalram = totalram_pages;
+	val->totalhigh = totalhigh_pages;
+
+	/* Currently free */
+	val->freeram = global_page_state(NR_FREE_PAGES);
+	val->freehigh = nr_free_highpages();
+
+	/* Everything else */
+	val->bufferram = nr_blockdev_pages();
+	val->sharedram = 0;
+	val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+/*
+ * Per-node variant of si_meminfo(): fills in the total/free RAM and highmem
+ * fields of *val for node 'nid', in units of PAGE_SIZE. Without
+ * CONFIG_HIGHMEM both highmem fields are reported as 0.
+ */
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+	pg_data_t *node = NODE_DATA(nid);
+#ifdef CONFIG_HIGHMEM
+	struct zone *highmem = &node->node_zones[ZONE_HIGHMEM];
+#endif
+
+	val->totalram = node->node_present_pages;
+	val->freeram = node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+	val->totalhigh = highmem->present_pages;
+	val->freehigh = zone_page_state(highmem, NR_FREE_PAGES);
+#else
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#endif
+	val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10)) /* page count -> kB (used for the "kB" fields below) */
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ *
+ * Prints, in order: the per-cpu pageset settings (high, batch, count) of each
+ * populated zone for every online CPU; the global page-state counters; for
+ * each populated zone its counters, watermarks and lowmem_reserve[]; for
+ * each populated zone the number of free blocks of every order (sampled
+ * under zone->lock, printed after the lock is dropped); and finally the
+ * page cache total and the swap cache info.
+ *
+ * NOTE(review): this comment used to say that the percentage fragmentation
+ * is calculated as well; no such calculation is visible in this function.
+ */
+void show_free_areas(void)
+{
+ int cpu;
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ if (!populated_zone(zone))
+ continue;
+
+ show_node(zone);
+ printk("%s per-cpu:\n", zone->name);
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *pageset;
+
+ pageset = zone_pcp(zone, cpu);
+
+ printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp.high,
+ pageset->pcp.batch, pageset->pcp.count);
+ }
+ }
+
+ printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
+ " inactive_file:%lu"
+/* TODO: check/adjust line lengths */
+#ifdef CONFIG_UNEVICTABLE_LRU
+ " unevictable:%lu"
+#endif
+ " dirty:%lu writeback:%lu unstable:%lu\n"
+ " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
+ global_page_state(NR_ACTIVE_ANON),
+ global_page_state(NR_ACTIVE_FILE),
+ global_page_state(NR_INACTIVE_ANON),
+ global_page_state(NR_INACTIVE_FILE),
+#ifdef CONFIG_UNEVICTABLE_LRU
+ global_page_state(NR_UNEVICTABLE),
+#endif
+ global_page_state(NR_FILE_DIRTY),
+ global_page_state(NR_WRITEBACK),
+ global_page_state(NR_UNSTABLE_NFS),
+ global_page_state(NR_FREE_PAGES),
+ global_page_state(NR_SLAB_RECLAIMABLE) +
+ global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_page_state(NR_FILE_MAPPED),
+ global_page_state(NR_PAGETABLE),
+ global_page_state(NR_BOUNCE));
+
+ for_each_zone(zone) {
+ int i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ show_node(zone);
+ printk("%s"
+ " free:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+#ifdef CONFIG_UNEVICTABLE_LRU
+ " unevictable:%lukB"
+#endif
+ " present:%lukB"
+ " pages_scanned:%lu"
+ " all_unreclaimable? %s"
+ "\n",
+ zone->name,
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->pages_min),
+ K(zone->pages_low),
+ K(zone->pages_high),
+ K(zone_page_state(zone, NR_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_INACTIVE_FILE)),
+#ifdef CONFIG_UNEVICTABLE_LRU
+ K(zone_page_state(zone, NR_UNEVICTABLE)),
+#endif
+ K(zone->present_pages),
+ zone->pages_scanned,
+ (zone_is_all_unreclaimable(zone) ? "yes" : "no")
+ );
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(" %lu", zone->lowmem_reserve[i]);
+ printk("\n");
+ }
+
+ for_each_zone(zone) {
+ unsigned long nr[MAX_ORDER], flags, order, total = 0;
+
+ if (!populated_zone(zone))
+ continue;
+
+ show_node(zone);
+ printk("%s: ", zone->name);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ nr[order] = zone->free_area[order].nr_free;
+ total += nr[order] << order;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++)
+ printk("%lu*%lukB ", nr[order], K(1UL) << order);
+ printk("= %lukB\n", K(total));
+ }
+
+ printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+
+ show_swap_cache_info();
+}
+
+/* Point a zoneref at 'zone' and cache that zone's index alongside it. */
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone_idx = zone_idx(zone);
+	zoneref->zone = zone;
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+				int nr_zones, enum zone_type zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type >= MAX_NR_ZONES);
+
+	/*
+	 * Walk from 'zone_type' down to zone 0 inclusive. The exit test sits
+	 * before the decrement so the counter is never decremented below 0.
+	 */
+	for (;;) {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+			zoneref_set_zone(zone,
+				&zonelist->_zonerefs[nr_zones++]);
+			check_highest_zone(zone_type);
+		}
+
+		if (!zone_type)
+			break;
+		zone_type--;
+	}
+
+	/* New number of entries in the zonelist */
+	return nr_zones;
+}
+
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT 0
+#define ZONELIST_ORDER_NODE 1
+#define ZONELIST_ORDER_ZONE 2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; /* presumably indexed by ZONELIST_ORDER_* - confirm at the use site */
+
+
+#ifdef CONFIG_NUMA
+/* The ordering requested by the user (boot parameter or sysctl) */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/*
+ * String for sysctl. It is sized with NUMA_ZONELIST_ORDER_LEN so that it
+ * cannot drift from the length the sysctl handler uses when it saves and
+ * restores the string.
+ */
+#define NUMA_ZONELIST_ORDER_LEN 16
+char numa_zonelist_order[NUMA_ZONELIST_ORDER_LEN] = "default";
+
+/*
+ * Interface for configuring zonelist ordering.
+ * Command line option "numa_zonelist_order":
+ * = "[dD]efault" - default, automatic configuration.
+ * = "[nN]ode" - order by node locality, then by zone within node
+ * = "[zZ]one" - order by zone, then by locality within zone
+ */
+
+/*
+ * Set user_zonelist_order from the first character of 's' (case
+ * insensitive: d, n or z). Returns 0 on success; for any other character a
+ * warning is printed, user_zonelist_order is left alone and -EINVAL is
+ * returned.
+ */
+static int __parse_numa_zonelist_order(char *s)
+{
+	switch (*s) {
+	case 'd':
+	case 'D':
+		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+		break;
+	case 'n':
+	case 'N':
+		user_zonelist_order = ZONELIST_ORDER_NODE;
+		break;
+	case 'z':
+	case 'Z':
+		user_zonelist_order = ZONELIST_ORDER_ZONE;
+		break;
+	default:
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value: "
+			"%s\n", s);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Early-parameter hook for "numa_zonelist_order". A missing value is
+ * accepted and changes nothing.
+ */
+static __init int setup_numa_zonelist_order(char *s)
+{
+	if (!s)
+		return 0;
+
+	return __parse_numa_zonelist_order(s);
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	char *current_string = (char*)table->data;
+	int previous_order;
+	int ret;
+
+	/* Remember the current string so a bogus write can be undone */
+	if (write)
+		strncpy(saved_string, current_string,
+			NUMA_ZONELIST_ORDER_LEN);
+
+	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+	/* Reads need no further processing */
+	if (!write)
+		return 0;
+
+	previous_order = user_zonelist_order;
+	if (__parse_numa_zonelist_order(current_string)) {
+		/*
+		 * bogus value. restore saved string
+		 */
+		strncpy(current_string, saved_string,
+			NUMA_ZONELIST_ORDER_LEN);
+		user_zonelist_order = previous_order;
+	} else if (previous_order != user_zonelist_order) {
+		/* The requested ordering changed: rebuild the zonelists */
+		build_all_zonelists();
+	}
+
+	return 0;
+}
+
+
+#define MAX_NODE_LOAD (num_online_nodes()) /* scaling factor used by find_next_best_node() */
+static int node_load[MAX_NUMNODES]; /* per-node penalty added to a node's score in find_next_best_node() */
+
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list. The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ *
+ * The node that is returned is also set in @used_node_mask, so repeated calls
+ * with the same mask walk through the candidates one by one. Only nodes in
+ * the N_HIGH_MEMORY state are considered as candidates.
+ *
+ * It returns -1 if no node is found.
+ */
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+ node_to_cpumask_ptr(tmp, 0);
+
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
+
+ for_each_node_state(n, N_HIGH_MEMORY) {
+
+ /* Don't want a node to appear more than once */
+ if (node_isset(n, *used_node_mask))
+ continue;
+
+ /* Use the distance array to find the distance */
+ val = node_distance(node, n);
+
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
+ /* Give preference to headless and unused nodes */
+ node_to_cpumask_ptr_next(tmp, n);
+ if (!cpus_empty(*tmp))
+ val += PENALTY_FOR_NODE_WITH_CPUS;
+
+ /* Slight preference for less loaded node */
+ val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val += node_load[n];
+
+ /* Lowest score wins; on a tie the first candidate seen is kept */
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ if (best_node >= 0)
+ node_set(best_node, *used_node_mask);
+
+ return best_node;
+}
+
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+{
+	struct zonelist *zl = &pgdat->node_zonelists[0];
+	int end = 0;
+
+	/* Find the current terminator of pgdat's first zonelist */
+	while (zl->_zonerefs[end].zone != NULL)
+		end++;
+
+	/* Append the populated zones of 'node', then terminate again */
+	end = build_zonelists_node(NODE_DATA(node), zl, end,
+						MAX_NR_ZONES - 1);
+	zl->_zonerefs[end].zone = NULL;
+	zl->_zonerefs[end].zone_idx = 0;
+}
+
+/*
+ * Build gfp_thisnode zonelists
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+	struct zonelist *zl = &pgdat->node_zonelists[1];
+	struct zoneref *terminator;
+	int count;
+
+	/* The second zonelist holds only this node's own populated zones */
+	count = build_zonelists_node(pgdat, zl, 0, MAX_NR_ZONES - 1);
+
+	terminator = &zl->_zonerefs[count];
+	terminator->zone = NULL;
+	terminator->zone_idx = 0;
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+/* Node visiting order recorded by build_zonelists() for zone ordering */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	struct zonelist *zl = &pgdat->node_zonelists[0];
+	struct zone *candidate;
+	int filled = 0;
+	int idx;
+	int zone_type;		/* needs to be signed */
+
+	/* Highest zone type first; within a type, nodes in node_order[] */
+	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+		for (idx = 0; idx < nr_nodes; idx++) {
+			int nid = node_order[idx];
+
+			candidate = &NODE_DATA(nid)->node_zones[zone_type];
+			if (!populated_zone(candidate))
+				continue;
+
+			zoneref_set_zone(candidate,
+					 &zl->_zonerefs[filled++]);
+			check_highest_zone(zone_type);
+		}
+	}
+
+	/* Terminate the list */
+	zl->_zonerefs[filled].zone = NULL;
+	zl->_zonerefs[filled].zone_idx = 0;
+}
+
+/*
+ * Pick the zonelist ordering to use when the user has not chosen one.
+ *
+ * Returns ZONELIST_ORDER_NODE when there is no memory below ZONE_NORMAL,
+ * when such memory is more than half of all memory, or when some
+ * larger-than-average node has more than 70% of its memory below
+ * ZONE_NORMAL. Otherwise returns ZONELIST_ORDER_ZONE.
+ */
+static int default_zonelist_order(void)
+{
+	int nid, zone_type;
+	unsigned long low_kmem_size, total_size;
+	/* A page count: kept unsigned long so it cannot be truncated */
+	unsigned long average_size;
+	struct zone *z;
+
+	/*
+	 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
+	 * If they are really small and used heavily, the system can fall
+	 * into OOM very easily.
+	 * This function detects the ZONE_DMA/DMA32 size and configures the
+	 * zone order.
+	 */
+	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+	low_kmem_size = 0;
+	total_size = 0;
+	for_each_online_node(nid) {
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+	}
+	if (!low_kmem_size ||  /* there is no DMA area. */
+	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
+		return ZONELIST_ORDER_NODE;
+	/*
+	 * Look into each node's config.
+	 * If there is a node whose DMA/DMA32 memory is a very big area of
+	 * its local memory, NODE_ORDER may be suitable.
+	 */
+	average_size = total_size /
+				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+	for_each_online_node(nid) {
+		low_kmem_size = 0;
+		total_size = 0;
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+		if (low_kmem_size &&
+		    total_size > average_size && /* ignore small node */
+		    low_kmem_size > total_size * 70/100)
+			return ZONELIST_ORDER_NODE;
+	}
+	return ZONELIST_ORDER_ZONE;
+}
+
+/*
+ * Settle current_zonelist_order: an explicit user choice is taken as is,
+ * otherwise default_zonelist_order() decides.
+ */
+static void set_zonelist_order(void)
+{
+	current_zonelist_order =
+		(user_zonelist_order == ZONELIST_ORDER_DEFAULT) ?
+			default_zonelist_order() : user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int j, node, load;
+ enum zone_type i;
+ nodemask_t used_mask;
+ int local_node, prev_node;
+ struct zonelist *zonelist;
+ int order = current_zonelist_order;
+
+ /*
+ * Build the zonelists of node 'pgdat' (NUMA version). Nodes are taken
+ * in the order find_next_best_node() hands them out. With node
+ * ordering each node's zones are appended as soon as the node is
+ * picked; otherwise the node sequence is recorded in node_order[] and
+ * the list is built afterwards when zone ordering is selected. The
+ * this-node-only zonelist is built last.
+ *
+ * initialize zonelists
+ */
+ for (i = 0; i < MAX_ZONELISTS; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->_zonerefs[0].zone = NULL;
+ zonelist->_zonerefs[0].zone_idx = 0;
+ }
+
+ /* NUMA-aware ordering of nodes */
+ local_node = pgdat->node_id;
+ load = num_online_nodes();
+ prev_node = local_node;
+ nodes_clear(used_mask);
+
+ memset(node_load, 0, sizeof(node_load));
+ memset(node_order, 0, sizeof(node_order));
+ j = 0;
+
+ while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+
+ /*
+ * If another node is sufficiently far away then it is better
+ * to reclaim pages in a zone before going off node.
+ */
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+
+ /*
+ * We don't want to pressure a particular node.
+ * So adding penalty to the first node in same
+ * distance group to make it round-robin.
+ */
+ if (distance != node_distance(local_node, prev_node))
+ node_load[node] = load;
+
+ prev_node = node;
+ load--;
+ if (order == ZONELIST_ORDER_NODE)
+ build_zonelists_in_node_order(pgdat, node);
+ else
+ node_order[j++] = node; /* remember order */
+ }
+
+ if (order == ZONELIST_ORDER_ZONE) {
+ /* calculate node order -- i.e., DMA last! */
+ build_zonelists_in_zone_order(pgdat, j);
+ }
+
+ build_thisnode_zonelists(pgdat);
+}
+
+/*
+ * Construct the zonelist performance cache - see further mmzone.h
+ *
+ * Only node_zonelists[0] of @pgdat gets a cache: its fullzones bitmap is
+ * cleared and z_to_n[] is filled with the node index of every zoneref up
+ * to the NULL terminator.
+ */
+static void build_zonelist_cache(pg_data_t *pgdat)
+{
+	struct zonelist *zl = &pgdat->node_zonelists[0];
+	struct zonelist_cache *cache = &zl->zlcache;
+	int idx;
+
+	zl->zlcache_ptr = cache;
+	bitmap_zero(cache->fullzones, MAX_ZONES_PER_ZONELIST);
+
+	for (idx = 0; zl->_zonerefs[idx].zone; idx++)
+		cache->z_to_n[idx] = zonelist_node_idx(&zl->_zonerefs[idx]);
+}
+
+
+#else /* CONFIG_NUMA */
+
+/* !CONFIG_NUMA: the zonelist order is unconditionally ZONELIST_ORDER_ZONE. */
+static void set_zonelist_order(void)
+{
+ current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+/*
+ * Build the zonelist of node @pgdat (!CONFIG_NUMA variant).
+ *
+ * Only node_zonelists[0] is filled in. build_zonelists_node() is called
+ * for the local node first, then for every online node with a higher id,
+ * then for every online node with a lower id; its return value is carried
+ * along as the next slot to use, and the slot after the last entry is
+ * NULL-terminated.
+ */
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int node, local_node;
+ enum zone_type j;
+ struct zonelist *zonelist;
+
+ local_node = pgdat->node_id;
+
+ zonelist = &pgdat->node_zonelists[0];
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (wrapping around to node 0 after the last node)
+ */
+ for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
+ }
+ for (node = 0; node < local_node; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
+ }
+
+ /* terminate the list */
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/*
+ * non-NUMA variant of zonelist performance cache - there is no cache, so
+ * just set zlcache_ptr of node_zonelists[0] to NULL.
+ */
+static void build_zonelist_cache(pg_data_t *pgdat)
+{
+ pgdat->node_zonelists[0].zlcache_ptr = NULL;
+}
+
+#endif /* CONFIG_NUMA */
+
+/*
+ * Rebuild the zonelists and the zonelist cache of every online node.
+ * The int return value (always 0) and the unused @dummy argument exist
+ * only so that this function can be handed to stop_machine().
+ */
+static int __build_all_zonelists(void *dummy)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
+
+	return 0;
+}
+
+/*
+ * Select the zonelist order and (re)build the zonelists of all online
+ * nodes.
+ *
+ * While system_state is SYSTEM_BOOTING the lists are built directly;
+ * at any later time the rebuild is run through stop_machine(). Afterwards
+ * vm_total_pages is refreshed from nr_free_pagecache_pages(), grouping by
+ * mobility is switched on or off depending on that value, and a summary
+ * line is printed.
+ */
+void build_all_zonelists(void)
+{
+ set_zonelist_order();
+
+ if (system_state == SYSTEM_BOOTING) {
+ __build_all_zonelists(NULL);
+ mminit_verify_zonelist();
+ cpuset_init_current_mems_allowed();
+ } else {
+ /* we have to stop all cpus to guarantee there is no user
+ of zonelist */
+ stop_machine(__build_all_zonelists, NULL, NULL);
+ /* cpuset refresh routine should be here */
+ }
+ vm_total_pages = nr_free_pagecache_pages();
+ /*
+ * Disable grouping by mobility if the number of pages in the
+ * system is too low to allow the mechanism to work. It would be
+ * more accurate, but expensive to check per-zone. This check is
+ * made on memory-hotadd so a system can start with mobility
+ * disabled and enable it later
+ */
+ if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
+ page_group_by_mobility_disabled = 1;
+ else
+ page_group_by_mobility_disabled = 0;
+
+ printk("Built %i zonelists in %s order, mobility grouping %s. "
+ "Total pages: %ld\n",
+ num_online_nodes(),
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
+#ifdef CONFIG_NUMA
+ printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE 256
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+/*
+ * Number of wait queue hash entries for a zone of @pages pages: the
+ * smallest power of two that is not below pages / PAGES_PER_WAITQUEUE,
+ * clamped to the range [4, 4096].
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
+	unsigned long entries;
+
+	for (entries = 1; entries < wanted; entries <<= 1)
+		;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	if (entries > 4096UL)
+		entries = 4096UL;
+
+	/* ... but never go below four entries */
+	if (entries < 4UL)
+		entries = 4UL;
+
+	return entries;
+}
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table. So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
+ *
+ * i386 (preemption config) : 4096 x 16 = 64Kbyte.
+ * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above). It equals:
+ *
+ * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
+ * ia64(16K page size) : = ( 8G + 4M)byte.
+ * powerpc (64K page size) : = (32G +16M)byte.
+ *
+ * @pages is ignored by this variant; 4096 is always returned.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+ return 4096UL;
+}
+#endif
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ *
+ * Implemented as ffz(~size); presumably ffz() yields the index of the
+ * first zero bit, which for the power-of-two sizes produced by
+ * wait_table_hash_nr_entries() would be log2(size) - confirm against
+ * the ffz() definition.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+ return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+/*
+ * Mark a number of pageblocks as MIGRATE_RESERVE. The number
+ * of blocks reserved is based on zone->pages_min. The memory within the
+ * reserve will tend to store contiguous free pages. Setting min_free_kbytes
+ * higher will lead to a bigger reserve which will get freed as contiguous
+ * blocks as reclaim kicks in
+ *
+ * The zone is walked one pageblock at a time. Blocks already marked
+ * MIGRATE_RESERVE are counted towards the target first, MIGRATE_MOVABLE
+ * blocks are converted until the target is met, and any MIGRATE_RESERVE
+ * block found after that is turned back into MIGRATE_MOVABLE.
+ */
+static void setup_zone_migrate_reserve(struct zone *zone)
+{
+ unsigned long start_pfn, pfn, end_pfn;
+ struct page *page;
+ unsigned long reserve, block_migratetype;
+
+ /* Get the start pfn, end pfn and the number of blocks to reserve */
+ start_pfn = zone->zone_start_pfn;
+ end_pfn = start_pfn + zone->spanned_pages;
+ /* pages_min rounded up to whole pageblocks, expressed in blocks */
+ reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
+ pageblock_order;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+
+ /* Watch out for overlapping nodes */
+ if (page_to_nid(page) != zone_to_nid(zone))
+ continue;
+
+ /* Blocks with reserved pages will never free, skip them. */
+ if (PageReserved(page))
+ continue;
+
+ block_migratetype = get_pageblock_migratetype(page);
+
+ /* If this block is reserved, account for it */
+ if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page, MIGRATE_RESERVE);
+ move_freepages_block(zone, page, MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
+
+ /*
+ * If the reserve is met and this is a previous reserved block,
+ * take it back
+ */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ move_freepages_block(zone, page, MIGRATE_MOVABLE);
+ }
+ }
+}
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ *
+ * @size: number of page frames to initialise
+ * @nid: node the pages are linked to
+ * @zone: index of the zone within the node's node_zones[]
+ * @start_pfn: first page frame to initialise
+ * @context: with MEMMAP_EARLY, pfns that fail early_pfn_valid() or
+ * early_pfn_in_nid() are skipped; otherwise every pfn is set up
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+ unsigned long start_pfn, enum memmap_context context)
+{
+ struct page *page;
+ unsigned long end_pfn = start_pfn + size;
+ unsigned long pfn;
+ struct zone *z;
+
+ z = &NODE_DATA(nid)->node_zones[zone];
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ /*
+ * There can be holes in boot-time mem_map[]s
+ * handed to this function. They do not
+ * exist on hotplugged memory.
+ */
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ }
+ page = pfn_to_page(pfn);
+ set_page_links(page, zone, nid, pfn);
+ mminit_verify_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ reset_page_mapcount(page);
+ SetPageReserved(page);
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made. Later some blocks near
+ * the start are marked MIGRATE_RESERVE by
+ * setup_zone_migrate_reserve()
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ */
+ /* only the first pfn of each pageblock inside the zone qualifies */
+ if ((z->zone_start_pfn <= pfn)
+ && (pfn < z->zone_start_pfn + z->spanned_pages)
+ && !(pfn & (pageblock_nr_pages - 1)))
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+ }
+}
+
+/*
+ * Reset the buddy free lists of @zone: for every (order, migratetype)
+ * pair visited by for_each_migratetype_order() the list head is
+ * initialised and the order's nr_free counter is set to 0.
+ */
+static void __meminit zone_init_free_lists(struct zone *zone)
+{
+ int order, t;
+ for_each_migratetype_order(order, t) {
+ INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+ zone->free_area[order].nr_free = 0;
+ }
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
+#endif
+
+/*
+ * Work out the per-cpu pageset batch size for @zone.
+ *
+ * The starting point is present_pages / 1024, capped at half a megabyte
+ * worth of pages (we do not know how big the cache is, so this is a
+ * guess). The result is then quartered, kept at 1 or above, and finally
+ * clamped to a 2^n - 1 value.
+ */
+static int zone_batchsize(struct zone *zone)
+{
+	int batch = zone->present_pages / 1024;
+
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+
+	/* We effectively *= 4 below */
+	batch /= 4;
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * A power-of-2 batch was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases: if 2 tasks
+	 * are alternately allocating batches of pages, one task can end
+	 * up with a lot of pages of one half of the possible page colors
+	 * and the other with pages of the other colors. Hence 2^n - 1.
+	 */
+	return (1 << (fls(batch + batch/2)-1)) - 1;
+}
+
+/*
+ * Zero the pageset @p and initialise its per-cpu page list: empty list,
+ * high mark of 6 * @batch and a batch of @batch (but at least 1).
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp = &p->pcp;
+
+	/* this also leaves pcp->count at 0 */
+	memset(p, 0, sizeof(*p));
+
+	INIT_LIST_HEAD(&pcp->list);
+	pcp->batch = batch ? batch : 1UL;
+	pcp->high = 6 * batch;
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark of the per-cpu page
+ * list in pageset @p to @high. The batch becomes a quarter of @high, kept
+ * within the range [1, PAGE_SHIFT * 8].
+ */
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp = &p->pcp;
+	unsigned long quarter = high / 4;
+
+	pcp->high = high;
+
+	if (quarter > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+	else
+		pcp->batch = max(1UL, quarter);
+}
+
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ *
+ * For every populated zone a per_cpu_pageset for @cpu is allocated with
+ * kmalloc_node() on the cpu's node and set up with setup_pageset(); if
+ * percpu_pagelist_fraction is set, the high mark is then overridden via
+ * setup_pagelist_highmark(). The cpu's node is also marked N_CPU.
+ *
+ * A new pageset is published in the zone only after it has been fully
+ * set up, so a failed allocation leaves that zone's pointer untouched.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation failed. On failure the
+ * pagesets allocated by this call are freed and their zones are pointed
+ * back at boot_pageset[cpu] instead of NULL: the pageset pointer may be
+ * followed by code that does not check whether the cpu is online.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+	struct per_cpu_pageset *pset;
+	int node = cpu_to_node(cpu);
+
+	node_set_state(node, N_CPU);	/* this node has a cpu */
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		pset = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, node);
+		if (!pset)
+			goto bad;
+
+		setup_pageset(pset, zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(pset,
+			 	(zone->present_pages / percpu_pagelist_fraction));
+
+		zone_pcp(zone, cpu) = pset;
+	}
+
+	return 0;
+bad:
+	/* undo the allocations made for the zones before the failing one */
+	for_each_zone(dzone) {
+		if (!populated_zone(dzone))
+			continue;
+		if (dzone == zone)
+			break;
+		pset = zone_pcp(dzone, cpu);
+		zone_pcp(dzone, cpu) = &boot_pageset[cpu];
+		kfree(pset);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Release the pagesets of @cpu in every zone.
+ *
+ * Each zone's pointer is redirected to boot_pageset[cpu] rather than set
+ * to NULL, because the pageset pointer may be followed by code that does
+ * not check whether the cpu is online. The old pageset is freed unless it
+ * already was the boot pageset, which is static and must not be kfree()d.
+ */
+static inline void free_zone_pagesets(int cpu)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		/* repoint first so the zone never refers to freed memory */
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
+
+		/* Free per_cpu_pageset if it is slab allocated */
+		if (pset != &boot_pageset[cpu])
+			kfree(pset);
+	}
+}
+
+/*
+ * CPU hotplug callback for the per-cpu pagesets.
+ *
+ * CPU_UP_PREPARE (and its _FROZEN twin) allocates the pagesets of the cpu
+ * and answers NOTIFY_BAD if that fails. CPU_UP_CANCELED and CPU_DEAD (and
+ * their _FROZEN twins) release them again. Every other action is ignored.
+ * @hcpu carries the cpu number.
+ */
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		return process_zones(cpu) ? NOTIFY_BAD : NOTIFY_OK;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		free_zone_pagesets(cpu);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Notifier block registered by setup_per_cpu_pageset(). The initialiser is
+ * positional: the first member is pageset_cpuup_callback, the other two
+ * are NULL and 0.
+ */
+static struct notifier_block __cpuinitdata pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+/*
+ * Allocate the per-cpu pagesets of the cpu this runs on and register the
+ * hotplug notifier that handles every other cpu. An allocation failure
+ * here triggers BUG_ON().
+ */
+void __init setup_per_cpu_pageset(void)
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
+/*
+ * Size, allocate and initialise the hashed wait queue table of @zone.
+ *
+ * The per-page waitqueue mechanism uses hashed waitqueues per zone; the
+ * number of entries is derived from @zone_size_pages. Returns 0 on
+ * success and -ENOMEM if the table could not be allocated.
+ */
+static noinline __init_refok
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long nr_entries = wait_table_hash_nr_entries(zone_size_pages);
+	size_t alloc_size = nr_entries * sizeof(wait_queue_head_t);
+	int i;
+
+	zone->wait_table_hash_nr_entries = nr_entries;
+	zone->wait_table_bits = wait_table_bits(nr_entries);
+
+	if (slab_is_available()) {
+		/*
+		 * This case means that a zone whose size was 0 gets new memory
+		 * via memory hot-add.
+		 * But it may be the case that a new node was hot-added. In
+		 * this case vmalloc() will not be able to use this new node's
+		 * memory - this wait_table must be initialized to use this new
+		 * node itself as well.
+		 * To use this new node's memory, further consideration will be
+		 * necessary.
+		 */
+		zone->wait_table = vmalloc(alloc_size);
+	} else {
+		zone->wait_table = (wait_queue_head_t *)
+				alloc_bootmem_node(pgdat, alloc_size);
+	}
+
+	if (!zone->wait_table)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_entries; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+
+	return 0;
+}
+
+/*
+ * Initialise the per-cpu pagesets of @zone for cpu indexes 0..NR_CPUS-1.
+ *
+ * With CONFIG_NUMA every cpu's pointer is aimed at boot_pageset[cpu],
+ * which is (re)initialised with a batch of 0. Without it the pageset
+ * returned by zone_pcp() is set up with zone_batchsize(). The debug line
+ * prints the computed batch value in both configurations.
+ */
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+ int cpu;
+ unsigned long batch = zone_batchsize(zone);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone_pcp(zone, cpu) = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+ }
+ if (zone->present_pages)
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone->name, zone->present_pages, batch);
+}
+
+/*
+ * Prepare @zone for use: allocate its wait table, record it in the node's
+ * nr_zones, store @zone_start_pfn and reset the free lists.
+ *
+ * @size is the zone size in pages handed to zone_wait_table_init().
+ * @context is not used by this function body.
+ *
+ * Returns 0 on success or the error from zone_wait_table_init().
+ */
+__meminit int init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long size,
+ enum memmap_context context)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int ret;
+ ret = zone_wait_table_init(zone, size);
+ if (ret)
+ return ret;
+ pgdat->nr_zones = zone_idx(zone) + 1;
+
+ zone->zone_start_pfn = zone_start_pfn;
+
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "Initialising map node %d zone %lu pfns %lu -> %lu\n",
+ pgdat->node_id,
+ (unsigned long)zone_idx(zone),
+ zone_start_pfn, (zone_start_pfn + size));
+
+ zone_init_free_lists(zone);
+
+ return 0;
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * Basic iterator support. Return the index of the first early_node_map[]
+ * entry that belongs to @nid, or -1 if there is none.
+ * Note: nid == MAX_NUMNODES returns first region regardless of node
+ */
+static int __meminit first_active_region_index_in_nid(int nid)
+{
+	int idx = 0;
+
+	while (idx < nr_nodemap_entries) {
+		if (nid == MAX_NUMNODES || early_node_map[idx].nid == nid)
+			return idx;
+		idx++;
+	}
+
+	return -1;
+}
+
+/*
+ * Basic iterator support. Return the index of the first early_node_map[]
+ * entry after @index that belongs to @nid, or -1 if there is none.
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __meminit next_active_region_index_in_nid(int index, int nid)
+{
+	while (++index < nr_nodemap_entries) {
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return -1;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ * Architectures may implement their own version but if add_active_range()
+ * was used and there are no special requirements, this is a convenient
+ * alternative
+ *
+ * Returns the nid of the first early_node_map[] range containing @pfn,
+ * or -1 if @pfn lies in a memory hole.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; i < nr_nodemap_entries; i++) {
+		if (pfn < early_node_map[i].start_pfn)
+			continue;
+		if (pfn >= early_node_map[i].end_pfn)
+			continue;
+
+		return early_node_map[i].nid;
+	}
+
+	/* This is a memory hole */
+	return -1;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+/*
+ * Like __early_pfn_to_nid(), but a pfn that cannot be attributed to any
+ * node is reported as belonging to node 0.
+ */
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+	int nid = __early_pfn_to_nid(pfn);
+
+	/* just returns 0 */
+	return nid >= 0 ? nid : 0;
+}
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+/*
+ * Return false only if @pfn is known to belong to a node other than
+ * @node; a pfn that __early_pfn_to_nid() cannot place counts as a match.
+ */
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+	int nid = __early_pfn_to_nid(pfn);
+
+	return nid < 0 || nid == node;
+}
+#endif
+
+/* Basic iterator support to walk early_node_map[] */
+#define for_each_active_range_index_in_nid(i, nid) \
+ for (i = first_active_region_index_in_nid(nid); i != -1; \
+ i = next_active_region_index_in_nid(i, nid))
+
+/**
+ * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_ranges() contain no holes and may be freed, this
+ * function may be used instead of calling free_bootmem() manually.
+ * Ranges starting at or above @max_low_pfn are skipped and ranges
+ * crossing it are truncated there.
+ */
+void __init free_bootmem_with_active_regions(int nid,
+					unsigned long max_low_pfn)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if (start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(start_pfn),
+				(end_pfn - start_pfn) << PAGE_SHIFT);
+	}
+}
+
+/*
+ * Call @work_fn(start_pfn, end_pfn, @data) for each active range of @nid,
+ * stopping at the first call that returns non-zero.
+ */
+void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		if (work_fn(early_node_map[i].start_pfn,
+			    early_node_map[i].end_pfn, data))
+			break;
+	}
+}
+/**
+ * sparse_memory_present_with_active_regions - Call memory_present for each active range
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_ranges() contain no holes and may be freed, this
+ * function may be used instead of calling memory_present() manually.
+ * Each range is reported with the nid recorded in early_node_map[].
+ */
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+ int i;
+
+ for_each_active_range_index_in_nid(i, nid)
+ memory_present(early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
+}
+
+/**
+ * push_node_boundaries - Push node boundaries to at least the requested boundary
+ * @nid: The nid of the node to push the boundary for
+ * @start_pfn: The start pfn of the node
+ * @end_pfn: The end pfn of the node
+ *
+ * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
+ * time. Specifically, on x86_64, SRAT will report ranges that can potentially
+ * be hotplugged even though no physical memory exists. This function allows
+ * an arch to push out the node boundaries so mem_map is allocated that can
+ * be used later.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+/*
+ * Widen the recorded boundary of node @nid so that it covers at least
+ * [@start_pfn, @end_pfn). An end boundary of 0 means nothing has been
+ * recorded for the node yet.
+ */
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ mminit_dprintk(MMINIT_TRACE, "zoneboundary",
+ "Entering push_node_boundaries(%u, %lu, %lu)\n",
+ nid, start_pfn, end_pfn);
+
+ /* Initialise the boundary for this node if necessary */
+ if (node_boundary_end_pfn[nid] == 0)
+ node_boundary_start_pfn[nid] = -1UL;
+
+ /* Update the boundaries */
+ if (node_boundary_start_pfn[nid] > start_pfn)
+ node_boundary_start_pfn[nid] = start_pfn;
+ if (node_boundary_end_pfn[nid] < end_pfn)
+ node_boundary_end_pfn[nid] = end_pfn;
+}
+
+/*
+ * If necessary, push the node boundary out for reserve hotadd
+ *
+ * @start_pfn and @end_pfn are in/out: they are only ever widened, to the
+ * boundary recorded by push_node_boundaries(), and are left untouched if
+ * no boundary was recorded for @nid.
+ */
+static void __meminit account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ mminit_dprintk(MMINIT_TRACE, "zoneboundary",
+ "Entering account_node_boundary(%u, %lu, %lu)\n",
+ nid, *start_pfn, *end_pfn);
+
+ /* Return if boundary information has not been provided */
+ if (node_boundary_end_pfn[nid] == 0)
+ return;
+
+ /* Check the boundaries and update if necessary */
+ if (node_boundary_start_pfn[nid] < *start_pfn)
+ *start_pfn = node_boundary_start_pfn[nid];
+ if (node_boundary_end_pfn[nid] > *end_pfn)
+ *end_pfn = node_boundary_end_pfn[nid];
+}
+#else
+/* Without CONFIG_MEMORY_HOTPLUG_RESERVE both boundary helpers are no-ops. */
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn) {}
+
+static void __meminit account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn) {}
+#endif
+
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by an arch calling add_active_range(). If called for a node
+ * with no active ranges, the start and end PFNs will be 0 before
+ * account_node_boundary() is given the chance to push them out.
+ */
+void __meminit get_pfn_range_for_nid(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ int i;
+ /* -1UL doubles as the "no range seen yet" marker */
+ *start_pfn = -1UL;
+ *end_pfn = 0;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+ *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ }
+
+ if (*start_pfn == -1UL)
+ *start_pfn = 0;
+
+ /* Push the node boundaries out if requested */
+ account_node_boundary(nid, start_pfn, end_pfn);
+}
+
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonic
+ * increasing memory addresses so that the "highest" populated zone is used
+ *
+ * The zones are scanned from the highest index down, skipping ZONE_MOVABLE
+ * itself, until one with a non-empty architectural PFN range is found. Its
+ * index is stored in movable_zone.
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+ int zone_index;
+ for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+ if (zone_index == ZONE_MOVABLE)
+ continue;
+
+ if (arch_zone_highest_possible_pfn[zone_index] >
+ arch_zone_lowest_possible_pfn[zone_index])
+ break;
+ }
+
+ /* the loop ran off the bottom: no usable zone at all */
+ VM_BUG_ON(zone_index == -1);
+ movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonically increasing memory
+ * addresses
+ *
+ * @zone_start_pfn and @zone_end_pfn are in/out. Nothing is changed when
+ * zone_movable_pfn[@nid] is 0. @node_start_pfn is not used.
+ */
+static void __meminit adjust_zone_range_for_zone_movable(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ /* Only adjust if ZONE_MOVABLE is on this node */
+ if (zone_movable_pfn[nid]) {
+ /* Size ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE) {
+ *zone_start_pfn = zone_movable_pfn[nid];
+ *zone_end_pfn = min(node_end_pfn,
+ arch_zone_highest_possible_pfn[movable_zone]);
+
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
+ /* Check if this whole range is within ZONE_MOVABLE */
+ } else if (*zone_start_pfn >= zone_movable_pfn[nid])
+ *zone_start_pfn = *zone_end_pfn;
+ }
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ *
+ * The zone's architectural range is adjusted for ZONE_MOVABLE and then
+ * clipped to the node's PFN range. 0 is returned when the zone lies
+ * entirely outside the node. The third argument is not used by this
+ * variant.
+ */
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ /* Get the start and end of the node and zone */
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
+
+ /* Check that this node has pages within the zone's required range */
+ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+ zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return zone_end_pfn - zone_start_pfn;
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for.
+ *
+ * The range is [@range_start_pfn, @range_end_pfn). 0 is returned when the
+ * node has no active region at all. The walk relies on prev_end_pfn never
+ * exceeding the start of the next region (see the BUG_ON below), which
+ * presumably means early_node_map[] is expected to be sorted - confirm.
+ */
+static unsigned long __meminit __absent_pages_in_range(int nid,
+ unsigned long range_start_pfn,
+ unsigned long range_end_pfn)
+{
+ int i = 0;
+ unsigned long prev_end_pfn = 0, hole_pages = 0;
+ unsigned long start_pfn;
+
+ /* Find the end_pfn of the first active range of pfns in the node */
+ i = first_active_region_index_in_nid(nid);
+ if (i == -1)
+ return 0;
+
+ prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+
+ /* Account for ranges before physical memory on this node */
+ if (early_node_map[i].start_pfn > range_start_pfn)
+ hole_pages = prev_end_pfn - range_start_pfn;
+
+ /* Find all holes for the zone within the node */
+ for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
+
+ /* No need to continue if prev_end_pfn is outside the zone */
+ if (prev_end_pfn >= range_end_pfn)
+ break;
+
+ /* Make sure the end of the zone is not within the hole */
+ start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+ prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+ /* Update the hole size count and move on */
+ if (start_pfn > range_start_pfn) {
+ BUG_ON(prev_end_pfn > start_pfn);
+ hole_pages += start_pfn - prev_end_pfn;
+ }
+ prev_end_pfn = early_node_map[i].end_pfn;
+ }
+
+ /* Account for ranges past physical memory on this node */
+ if (range_end_pfn > prev_end_pfn)
+ hole_pages += range_end_pfn -
+ max(range_start_pfn, prev_end_pfn);
+
+ return hole_pages;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * It returns the number of page frames in memory holes within a range,
+ * taking the active regions of all nodes into account.
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/*
+ * Return the number of page frames in holes in a zone on a node
+ *
+ * The zone's architectural range is clipped to the node, adjusted for
+ * ZONE_MOVABLE and then handed to __absent_pages_in_range(). The third
+ * argument is not used by this variant.
+ */
+static unsigned long __meminit zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
+ node_start_pfn);
+ zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
+ node_end_pfn);
+
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
+ return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+}
+
+#else
+/*
+ * !CONFIG_ARCH_POPULATES_NODE_MAP: the spanned size of a zone is simply
+ * the caller-supplied @zones_size entry for @zone_type; @nid is unused.
+ */
+static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zones_size)
+{
+ return zones_size[zone_type];
+}
+
+/*
+ * !CONFIG_ARCH_POPULATES_NODE_MAP: the hole size of a zone is the
+ * caller-supplied @zholes_size entry for @zone_type, or 0 when no hole
+ * table was passed; @nid is unused.
+ */
+static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	return zholes_size ? zholes_size[zone_type] : 0;
+}
+
+#endif
+
+/*
+ * Fill in node_spanned_pages (sum of the spanned sizes of all zones) and
+ * node_present_pages (spanned minus the holes of all zones) of @pgdat,
+ * and log the latter.
+ */
+static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long spanned = 0, absent = 0;
+	enum zone_type zt;
+
+	for (zt = 0; zt < MAX_NR_ZONES; zt++)
+		spanned += zone_spanned_pages_in_node(pgdat->node_id, zt,
+							zones_size);
+	pgdat->node_spanned_pages = spanned;
+
+	for (zt = 0; zt < MAX_NR_ZONES; zt++)
+		absent += zone_absent_pages_in_node(pgdat->node_id, zt,
+							zholes_size);
+	pgdat->node_present_pages = spanned - absent;
+
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							spanned - absent);
+}
+
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size in bytes of the pageblock flags bitmap for a zone of
+ * @zonesize pages: round the zone up to whole pageblocks, allow
+ * NR_PAGEBLOCK_BITS bits per pageblock, round the bit count up to a
+ * multiple of the bits in an unsigned long and convert to bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zonesize)
+{
+	unsigned long nr_blocks, nr_bits;
+
+	nr_blocks = roundup(zonesize, pageblock_nr_pages) >> pageblock_order;
+	nr_bits = roundup(nr_blocks * NR_PAGEBLOCK_BITS,
+			  8 * sizeof(unsigned long));
+
+	return nr_bits / 8;
+}
+
+/*
+ * Allocate and clear the pageblock flags bitmap of @zone from @pgdat's
+ * bootmem. A zone whose bitmap size works out to zero keeps a NULL
+ * pageblock_flags pointer.
+ */
+static void __init setup_usemap(struct pglist_data *pgdat,
+				struct zone *zone, unsigned long zonesize)
+{
+	unsigned long nr_bytes = usemap_size(zonesize);
+
+	zone->pageblock_flags = NULL;
+	if (!nr_bytes)
+		return;
+
+	zone->pageblock_flags = alloc_bootmem_node(pgdat, nr_bytes);
+	memset(zone->pageblock_flags, 0, nr_bytes);
+}
+#else
+/* CONFIG_SPARSEMEM variant: nothing is set up here. */
+static inline void setup_usemap(struct pglist_data *pgdat,
+				struct zone *zone, unsigned long zonesize)
+{
+}
+#endif /* CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+
+/*
+ * Default pageblock order: the huge page order when huge pages are larger
+ * than base pages, otherwise the largest buddy order (MAX_ORDER-1).
+ */
+static inline int pageblock_default_order(void)
+{
+	return (HPAGE_SHIFT > PAGE_SHIFT) ? HUGETLB_PAGE_ORDER : MAX_ORDER-1;
+}
+
+/*
+ * Set pageblock_order to @order, but only the first time: once
+ * pageblock_order is non-zero later calls leave it untouched.
+ *
+ * The largest contiguous order of interest is assumed to be a huge page;
+ * that value may depend on boot parameters on IA64.
+ */
+static inline void __init set_pageblock_order(unsigned int order)
+{
+	if (!pageblock_order)
+		pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
+ * and pageblock_default_order() are unused as pageblock_order is set
+ * at compile-time. See include/linux/pageblock-flags.h for the values of
+ * pageblock_order based on the kernel config
+ *
+ * set_pageblock_order() expands to an empty statement here, so its argument
+ * (the pageblock_default_order() call in free_area_init_core()) is discarded
+ * by the preprocessor and never compiled.
+ *
+ * NOTE(review): this stub takes an 'order' parameter, while the
+ * CONFIG_HUGETLB_PAGE_SIZE_VARIABLE variant and the call site take none.
+ * The mismatch is only harmless because the call is never compiled in this
+ * configuration; the parameter looks unintended - confirm before reuse.
+ */
+static inline int pageblock_default_order(unsigned int order)
+{
+ return MAX_ORDER-1;
+}
+#define set_pageblock_order(x) do {} while (0)
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * For every zone index of @pgdat this computes the spanned size and the
+ * present size (span minus holes, minus the pages needed for the zone's
+ * memmap and, for zone 0, minus dma_reserve), adds the present size to the
+ * global nr_kernel_pages (non-highmem zones only) and nr_all_pages counters
+ * and initialises the zone's locks, LRU lists and counters. Zones with a
+ * non-zero span are then handed to setup_usemap(),
+ * init_currently_empty_zone() and memmap_init().
+ *
+ * @zones_size and @zholes_size are passed through unchanged to
+ * zone_spanned_pages_in_node() and zone_absent_pages_in_node().
+ */
+static void __paginginit free_area_init_core(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+ /* Running start pfn: advanced by each non-empty zone's span below */
+ unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ int ret;
+
+ pgdat_resize_init(pgdat);
+ pgdat->nr_zones = 0;
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ pgdat->kswapd_max_order = 0;
+ pgdat_page_cgroup_init(pgdat);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ unsigned long size, realsize, memmap_pages;
+ enum lru_list l;
+
+ /* size: pages spanned; realsize: pages spanned minus holes */
+ size = zone_spanned_pages_in_node(nid, j, zones_size);
+ realsize = size - zone_absent_pages_in_node(nid, j,
+ zholes_size);
+
+ /*
+ * Adjust realsize so that it accounts for how much memory
+ * is used by this zone for memmap. This affects the watermark
+ * and per-cpu initialisations
+ */
+ memmap_pages =
+ PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
+ if (realsize >= memmap_pages) {
+ realsize -= memmap_pages;
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ printk(KERN_WARNING
+ " %s zone: %lu pages exceeds realsize %lu\n",
+ zone_names[j], memmap_pages, realsize);
+
+ /* Account for reserved pages */
+ if (j == 0 && realsize > dma_reserve) {
+ realsize -= dma_reserve;
+ printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
+ zone_names[0], dma_reserve);
+ }
+
+ /* Highmem zones count towards nr_all_pages only */
+ if (!is_highmem_idx(j))
+ nr_kernel_pages += realsize;
+ nr_all_pages += realsize;
+
+ zone->spanned_pages = size;
+ zone->present_pages = realsize;
+#ifdef CONFIG_NUMA
+ zone->node = nid;
+ zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+ / 100;
+ zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+#endif
+ zone->name = zone_names[j];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone_seqlock_init(zone);
+ zone->zone_pgdat = pgdat;
+
+ zone->prev_priority = DEF_PRIORITY;
+
+ zone_pcp_init(zone);
+ for_each_lru(l) {
+ INIT_LIST_HEAD(&zone->lru[l].list);
+ zone->lru[l].nr_scan = 0;
+ }
+ zone->recent_rotated[0] = 0;
+ zone->recent_rotated[1] = 0;
+ zone->recent_scanned[0] = 0;
+ zone->recent_scanned[1] = 0;
+ zap_zone_vm_stats(zone);
+ zone->flags = 0;
+ /* Empty zones stop here: no bitmap or memmap is set up for them */
+ if (!size)
+ continue;
+
+ set_pageblock_order(pageblock_default_order());
+ setup_usemap(pgdat, zone, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn,
+ size, MEMMAP_EARLY);
+ BUG_ON(ret);
+ memmap_init(size, nid, j, zone_start_pfn);
+ zone_start_pfn += size;
+ }
+}
+
+/*
+ * Allocate the struct page array (node_mem_map) for @pgdat when
+ * CONFIG_FLAT_NODE_MEM_MAP is set and the node does not already have one.
+ * Nodes that span no pages are skipped. Without CONFIG_NEED_MULTIPLE_NODES
+ * the global mem_map is additionally pointed at node 0's map.
+ */
+static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
+{
+ /* Skip empty nodes */
+ if (!pgdat->node_spanned_pages)
+ return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ /* ia64 gets its own node_mem_map, before this, without bootmem */
+ if (!pgdat->node_mem_map) {
+ unsigned long size, start, end;
+ struct page *map;
+
+ /*
+ * The zone's endpoints aren't required to be MAX_ORDER
+ * aligned but the node_mem_map endpoints must be in order
+ * for the buddy allocator to function correctly.
+ */
+ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+ end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ end = ALIGN(end, MAX_ORDER_NR_PAGES);
+ size = (end - start) * sizeof(struct page);
+ /* Try alloc_remap() first, fall back to bootmem if it returns NULL */
+ map = alloc_remap(pgdat->node_id, size);
+ if (!map)
+ map = alloc_bootmem_node(pgdat, size);
+ /* The map starts at the aligned 'start'; skip to the node's first pfn */
+ pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+ }
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ /*
+ * With no DISCONTIG, the global mem_map is just set as node 0's
+ */
+ if (pgdat == NODE_DATA(0)) {
+ mem_map = NODE_DATA(0)->node_mem_map;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * If mem_map does not already translate to the node's start pfn,
+ * rebase it by the distance between node_start_pfn and
+ * ARCH_PFN_OFFSET.
+ */
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ }
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+/*
+ * Initialise the pg_data_t of node @nid: record its id and first pfn,
+ * compute its spanned/present page totals, allocate its node_mem_map and
+ * finally initialise all of its zones.
+ *
+ * @zones_size and @zholes_size are forwarded unchanged to
+ * calculate_node_totalpages() and free_area_init_core().
+ */
+void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
+		unsigned long node_start_pfn, unsigned long *zholes_size)
+{
+	pg_data_t *node = NODE_DATA(nid);
+
+	node->node_id = nid;
+	node->node_start_pfn = node_start_pfn;
+
+	calculate_node_totalpages(node, zones_size, zholes_size);
+	alloc_node_mem_map(node);
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
+		nid, (unsigned long)node,
+		(unsigned long)node->node_mem_map);
+#endif
+
+	free_area_init_core(node, zones_size, zholes_size);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+
+#if MAX_NUMNODES > 1
+/*
+ * Set nr_node_ids to one more than the last node id visited while walking
+ * node_possible_map (the walk is presumably in ascending order, which makes
+ * that the highest possible node id).
+ */
+static void __init setup_nr_node_ids(void)
+{
+	unsigned int nid;
+	unsigned int last = 0;
+
+	for_each_node_mask(nid, node_possible_map)
+		last = nid;
+
+	nr_node_ids = last + 1;
+}
+#else
+/* MAX_NUMNODES is 1 or less: nothing to compute. */
+static inline void setup_nr_node_ids(void)
+{
+}
+#endif
+
+/**
+ * add_active_range - Register a range of PFNs backed by physical memory
+ * @nid: The node ID the range resides on
+ * @start_pfn: The start PFN of the available physical memory
+ * @end_pfn: The end PFN of the available physical memory
+ *
+ * These ranges are stored in an early_node_map[] and later used by
+ * free_area_init_nodes() to calculate zone sizes and holes. If the
+ * range spans a memory hole, it is up to the architecture to ensure
+ * the memory is not freed by the bootmem allocator. If possible
+ * the range being registered will be merged with existing ranges.
+ *
+ * A range that cannot be merged is appended as a new entry; when the map
+ * already holds MAX_ACTIVE_REGIONS entries the range is dropped with a
+ * KERN_CRIT message.
+ */
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	int i;
+
+	mminit_dprintk(MMINIT_TRACE, "memory_register",
+			"Entering add_active_range(%d, %#lx, %#lx) "
+			"%d entries of %d used\n",
+			nid, start_pfn, end_pfn,
+			nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+
+	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; i < nr_nodemap_entries; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			/*
+			 * The new range may also begin before the existing
+			 * region (i.e. it spans it completely). Extend the
+			 * start as well so the leading part is not lost.
+			 */
+			if (start_pfn < early_node_map[i].start_pfn)
+				early_node_map[i].start_pfn = start_pfn;
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/*
+		 * Merge backward if suitable. Given the two checks above,
+		 * this is reached only when the new range starts before the
+		 * region and does not extend past its end.
+		 */
+		if (start_pfn < early_node_map[i].start_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Check that early_node_map is large enough */
+	if (i >= MAX_ACTIVE_REGIONS) {
+		printk(KERN_CRIT "More than %d memory regions, truncating\n",
+							MAX_ACTIVE_REGIONS);
+		return;
+	}
+
+	/* No merge possible: i == nr_nodemap_entries, append a new entry */
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+	nr_nodemap_entries = i + 1;
+}
+
+/**
+ * remove_active_range - Shrink an existing registered range of PFNs
+ * @nid: The node id the range is on that should be shrunk
+ * @start_pfn: The first PFN of the span to remove
+ * @end_pfn: The PFN just past the end of the span to remove
+ *
+ * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
+ * The map is kept near the end physical page range that has already been
+ * registered. This function allows an arch to shrink an existing registered
+ * range.
+ *
+ * Entries of @nid that lie entirely inside [@start_pfn, @end_pfn) are
+ * removed from early_node_map[]; entries that only overlap it are trimmed,
+ * and an entry that contains the whole span is split in two.
+ */
+void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
+				unsigned long end_pfn)
+{
+	int i, j;
+	int removed = 0;
+
+	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
+			  nid, start_pfn, end_pfn);
+
+	/* Find the old active region end and shrink */
+	for_each_active_range_index_in_nid(i, nid) {
+		/* Entry fully inside the span: blank it, compact below */
+		if (early_node_map[i].start_pfn >= start_pfn &&
+		    early_node_map[i].end_pfn <= end_pfn) {
+			/* clear it */
+			early_node_map[i].start_pfn = 0;
+			early_node_map[i].end_pfn = 0;
+			removed = 1;
+			continue;
+		}
+		/* Entry starts before the span: cut its tail, re-add any rest */
+		if (early_node_map[i].start_pfn < start_pfn &&
+		    early_node_map[i].end_pfn > start_pfn) {
+			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
+			early_node_map[i].end_pfn = start_pfn;
+			if (temp_end_pfn > end_pfn)
+				add_active_range(nid, end_pfn, temp_end_pfn);
+			continue;
+		}
+		/* Entry starts inside the span and ends after it: cut its head */
+		if (early_node_map[i].start_pfn >= start_pfn &&
+		    early_node_map[i].end_pfn > end_pfn &&
+		    early_node_map[i].start_pfn < end_pfn) {
+			early_node_map[i].start_pfn = end_pfn;
+			continue;
+		}
+	}
+
+	if (!removed)
+		return;
+
+	/*
+	 * remove the blank ones
+	 *
+	 * Walk down to and including index 0: a blanked entry in the first
+	 * slot must be compacted away like any other, otherwise it stays in
+	 * the map with start_pfn == end_pfn == 0.
+	 */
+	for (i = nr_nodemap_entries - 1; i >= 0; i--) {
+		if (early_node_map[i].nid != nid)
+			continue;
+		if (early_node_map[i].end_pfn)
+			continue;
+		/* we found it, get rid of it */
+		for (j = i; j < nr_nodemap_entries - 1; j++)
+			memcpy(&early_node_map[j], &early_node_map[j+1],
+				sizeof(early_node_map[j]));
+		j = nr_nodemap_entries - 1;
+		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
+		nr_nodemap_entries--;
+	}
+}
+
+/**
+ * remove_all_active_ranges - Remove all currently registered regions
+ *
+ * Discovery may find that a table such as SRAT is invalid, in which case an
+ * alternative discovery method has to start from scratch. This wipes
+ * early_node_map[] and resets the entry count; with
+ * CONFIG_MEMORY_HOTPLUG_RESERVE the per-node boundary arrays are cleared
+ * as well.
+ */
+void __init remove_all_active_ranges(void)
+{
+	nr_nodemap_entries = 0;
+	memset(early_node_map, 0, sizeof(early_node_map));
+
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
+	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+}
+
+/*
+ * sort() comparison callback: orders two node_active_region entries by
+ * start_pfn. Returns 1, -1 or 0; explicit comparisons are used instead of
+ * a subtraction so that large PFN differences cannot overflow the int
+ * return value.
+ */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *lhs = (struct node_active_region *)a;
+	struct node_active_region *rhs = (struct node_active_region *)b;
+
+	if (lhs->start_pfn == rhs->start_pfn)
+		return 0;
+
+	return (lhs->start_pfn > rhs->start_pfn) ? 1 : -1;
+}
+
+/*
+ * Sort the first nr_nodemap_entries entries of early_node_map[] by
+ * ascending start_pfn, using cmp_node_active_region() as the comparison
+ * and no custom swap function (NULL).
+ */
+static void __init sort_node_map(void)
+{
+ sort(early_node_map, (size_t)nr_nodemap_entries,
+ sizeof(struct node_active_region),
+ cmp_node_active_region, NULL);
+}
+
+/*
+ * Return the lowest start_pfn among the active ranges selected by @nid.
+ * If no range is found, a warning is printed and 0 is returned.
+ */
+static unsigned long __init find_min_pfn_for_node(int nid)
+{
+	unsigned long lowest = ULONG_MAX;
+	int idx;
+
+	/* Scan every matching range and keep the smallest start pfn seen */
+	for_each_active_range_index_in_nid(idx, nid) {
+		if (early_node_map[idx].start_pfn < lowest)
+			lowest = early_node_map[idx].start_pfn;
+	}
+
+	if (lowest != ULONG_MAX)
+		return lowest;
+
+	printk(KERN_WARNING
+		"Could not find start_pfn for node %d\n", nid);
+	return 0;
+}
+
+/**
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * It returns the minimum PFN based on information provided via
+ * add_active_range().
+ *
+ * Implemented as find_min_pfn_for_node(MAX_NUMNODES); presumably
+ * MAX_NUMNODES makes the range iterator match every node - confirm against
+ * for_each_active_range_index_in_nid(). Returns 0 (after a warning) when no
+ * range is found.
+ */
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+ return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+/*
+ * early_calculate_totalpages()
+ * Add up the pages of every registered active region (used for sizing the
+ * movable zone). As a side effect, each node owning a non-empty region is
+ * marked N_HIGH_MEMORY so that usable_nodes can be calculated.
+ */
+static unsigned long __init early_calculate_totalpages(void)
+{
+	unsigned long total = 0;
+	int idx;
+
+	for (idx = 0; idx < nr_nodemap_entries; idx++) {
+		unsigned long span = early_node_map[idx].end_pfn -
+					early_node_map[idx].start_pfn;
+
+		total += span;
+		if (!span)
+			continue;
+		node_set_state(early_node_map[idx].nid, N_HIGH_MEMORY);
+	}
+
+	return total;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ *
+ * Results are written to the global zone_movable_pfn[] array; the
+ * @movable_pfn argument is not referenced by this function.
+ * required_kernelcore and required_movablecore are consumed (modified)
+ * in the process.
+ */
+static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+{
+	int i, nid;
+	unsigned long usable_startpfn;
+	unsigned long kernelcore_node, kernelcore_remaining;
+	unsigned long totalpages = early_calculate_totalpages();
+	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+
+	/*
+	 * If movablecore was specified, calculate what size of
+	 * kernelcore that corresponds so that memory usable for
+	 * any allocation type is evenly spread. If both kernelcore
+	 * and movablecore are specified, then the value of kernelcore
+	 * will be used for required_kernelcore if it's greater than
+	 * what movablecore would have allowed.
+	 */
+	if (required_movablecore) {
+		unsigned long corepages;
+
+		/*
+		 * Round-up so that ZONE_MOVABLE is at least as large as what
+		 * was requested by the user
+		 */
+		required_movablecore =
+			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+
+		/*
+		 * Never ask for more movable memory than exists, otherwise
+		 * the unsigned subtraction below wraps around and yields a
+		 * huge bogus kernelcore requirement.
+		 */
+		if (required_movablecore > totalpages)
+			required_movablecore = totalpages;
+		corepages = totalpages - required_movablecore;
+
+		required_kernelcore = max(required_kernelcore, corepages);
+	}
+
+	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
+	if (!required_kernelcore)
+		return;
+
+	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+	find_usable_zone_for_movable();
+	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+	/* Spread kernelcore memory as evenly as possible throughout nodes */
+	kernelcore_node = required_kernelcore / usable_nodes;
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		/*
+		 * Recalculate kernelcore_node if the division per node
+		 * now exceeds what is necessary to satisfy the requested
+		 * amount of memory for the kernel
+		 */
+		if (required_kernelcore < kernelcore_node)
+			kernelcore_node = required_kernelcore / usable_nodes;
+
+		/*
+		 * As the map is walked, we track how much memory is usable
+		 * by the kernel using kernelcore_remaining. When it is
+		 * 0, the rest of the node is usable by ZONE_MOVABLE
+		 */
+		kernelcore_remaining = kernelcore_node;
+
+		/* Go through each range of PFNs within this node */
+		for_each_active_range_index_in_nid(i, nid) {
+			unsigned long start_pfn, end_pfn;
+			unsigned long size_pages;
+
+			start_pfn = max(early_node_map[i].start_pfn,
+						zone_movable_pfn[nid]);
+			end_pfn = early_node_map[i].end_pfn;
+			if (start_pfn >= end_pfn)
+				continue;
+
+			/* Account for what is only usable for kernelcore */
+			if (start_pfn < usable_startpfn) {
+				unsigned long kernel_pages;
+				kernel_pages = min(end_pfn, usable_startpfn)
+								- start_pfn;
+
+				kernelcore_remaining -= min(kernel_pages,
+							kernelcore_remaining);
+				required_kernelcore -= min(kernel_pages,
+							required_kernelcore);
+
+				/* Continue if range is now fully accounted */
+				if (end_pfn <= usable_startpfn) {
+
+					/*
+					 * Push zone_movable_pfn to the end so
+					 * that if we have to rebalance
+					 * kernelcore across nodes, we will
+					 * not double account here
+					 */
+					zone_movable_pfn[nid] = end_pfn;
+					continue;
+				}
+				start_pfn = usable_startpfn;
+			}
+
+			/*
+			 * The usable PFN range for ZONE_MOVABLE is from
+			 * start_pfn->end_pfn. Calculate size_pages as the
+			 * number of pages used as kernelcore
+			 */
+			size_pages = end_pfn - start_pfn;
+			if (size_pages > kernelcore_remaining)
+				size_pages = kernelcore_remaining;
+			zone_movable_pfn[nid] = start_pfn + size_pages;
+
+			/*
+			 * Some kernelcore has been met, update counts and
+			 * break if the kernelcore for this node has been
+			 * satisfied
+			 */
+			required_kernelcore -= min(required_kernelcore,
+								size_pages);
+			kernelcore_remaining -= size_pages;
+			if (!kernelcore_remaining)
+				break;
+		}
+	}
+
+	/*
+	 * If there is still required_kernelcore, we do another pass with one
+	 * less node in the count. This will push zone_movable_pfn[nid] further
+	 * along on the nodes that still have memory until kernelcore is
+	 * satisfied
+	 */
+	usable_nodes--;
+	if (usable_nodes && required_kernelcore > usable_nodes)
+		goto restart;
+
+	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		zone_movable_pfn[nid] =
+			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+}
+
+/*
+ * With CONFIG_HIGHMEM, mark the node N_NORMAL_MEMORY if any zone up to and
+ * including ZONE_NORMAL has present pages. Without CONFIG_HIGHMEM this
+ * function does nothing.
+ */
+static void check_for_regular_memory(pg_data_t *pgdat)
+{
+#ifdef CONFIG_HIGHMEM
+	enum zone_type zt;
+
+	for (zt = 0; zt <= ZONE_NORMAL; zt++) {
+		struct zone *z = pgdat->node_zones + zt;
+
+		if (!z->present_pages)
+			continue;
+		node_set_state(zone_to_nid(z), N_NORMAL_MEMORY);
+	}
+#endif
+}
+
+/**
+ * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by add_active_range(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFN
+ * between two adjacent zones match, it is assumed that the zone is empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ *
+ * Along the way the zone PFN ranges, the per-node ZONE_MOVABLE start PFNs
+ * and the contents of early_node_map[] are printed.
+ */
+void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+{
+ unsigned long nid;
+ int i;
+
+ /* Sort early_node_map as initialisation assumes it is sorted */
+ sort_node_map();
+
+ /* Record where the zone boundaries are */
+ memset(arch_zone_lowest_possible_pfn, 0,
+ sizeof(arch_zone_lowest_possible_pfn));
+ memset(arch_zone_highest_possible_pfn, 0,
+ sizeof(arch_zone_highest_possible_pfn));
+ /* Zone 0 starts at the lowest registered PFN */
+ arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
+ arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
+ /*
+ * Every later zone starts where the previous one ended; the max()
+ * keeps an empty zone's end from falling below its start.
+ */
+ for (i = 1; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
+ arch_zone_lowest_possible_pfn[i] =
+ arch_zone_highest_possible_pfn[i-1];
+ arch_zone_highest_possible_pfn[i] =
+ max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+ }
+ /* ZONE_MOVABLE has no arch-defined boundaries of its own */
+ arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
+ arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
+
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+
+ /* Print out the zone ranges */
+ printk("Zone PFN ranges:\n");
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
+ printk(" %-8s %0#10lx -> %0#10lx\n",
+ zone_names[i],
+ arch_zone_lowest_possible_pfn[i],
+ arch_zone_highest_possible_pfn[i]);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ printk("Movable zone start PFN for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
+ }
+
+ /* Print out the early_node_map[] */
+ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
+ for (i = 0; i < nr_nodemap_entries; i++)
+ printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
+
+ /* Initialise every node */
+ mminit_verify_pageflags_layout();
+ setup_nr_node_ids();
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ /*
+ * NULL size/hole arrays: the zone helpers called underneath
+ * presumably derive sizes from early_node_map[] in this
+ * configuration - confirm against their definitions.
+ */
+ free_area_init_node(nid, NULL,
+ find_min_pfn_for_node(nid), NULL);
+
+ /* Any memory on that node */
+ if (pgdat->node_present_pages)
+ node_set_state(nid, N_HIGH_MEMORY);
+ check_for_regular_memory(pgdat);
+ }
+}
+
+/*
+ * Parse a memory size given on the kernel command line (via memparse())
+ * and store it in *@core as a number of pages.
+ *
+ * Returns -EINVAL when @p is NULL, 0 otherwise. A warning is raised if the
+ * page count does not fit in an unsigned long.
+ */
+static int __init cmdline_parse_core(char *p, unsigned long *core)
+{
+	unsigned long long bytes;
+	unsigned long long pages;
+
+	if (!p)
+		return -EINVAL;
+
+	bytes = memparse(p, &p);
+	pages = bytes >> PAGE_SHIFT;
+	*core = pages;
+
+	/* Paranoid check that UL is enough for the coremem value */
+	WARN_ON(pages > ULONG_MAX);
+
+	return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ *
+ * The parsed size is stored, in pages, in required_kernelcore. Returns the
+ * result of cmdline_parse_core() (-EINVAL for a NULL argument, else 0).
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+ return cmdline_parse_core(p, &required_kernelcore);
+}
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ *
+ * The parsed size is stored, in pages, in required_movablecore. Returns the
+ * result of cmdline_parse_core() (-EINVAL for a NULL argument, else 0).
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore);
+}
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+early_param("movablecore", cmdline_parse_movablecore);
+
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ *
+ * The value is only stored here; free_area_init_core() subtracts it from
+ * the present page count of zone index 0 when that zone is larger than the
+ * reserve.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+ dma_reserve = new_dma_reserve;
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+/*
+ * The pglist_data instance defined when CONFIG_NEED_MULTIPLE_NODES is not
+ * set; its bootmem data pointer refers to bootmem_node_data[0]. Exported
+ * for use by modules.
+ */
+struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+/*
+ * Initialise node 0 from a caller-supplied array of per-zone sizes.
+ *
+ * No hole information is passed (zholes_size is NULL), and the node's first
+ * PFN is the page frame of the physical address that PAGE_OFFSET maps to.
+ */
+void __init free_area_init(unsigned long *zones_size)
+{
+ free_area_init_node(0, zones_size,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+}
+
+/*
+ * CPU hotplug notifier callback. When a CPU has died (CPU_DEAD or
+ * CPU_DEAD_FROZEN) its per-cpu pages are drained and its VM counters are
+ * folded away; every other action is ignored. Always returns NOTIFY_OK.
+ */
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int dead_cpu = (unsigned long)hcpu;
+
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+		return NOTIFY_OK;
+
+	drain_pages(dead_cpu);
+
+	/*
+	 * Move the dead processor's event counters over to the current
+	 * processor. The current processor's counts end up artificially
+	 * elevated as a result.
+	 */
+	vm_events_fold_cpu(dead_cpu);
+
+	/*
+	 * Clear the dead processor's differential counters to keep the vm
+	 * statistics consistent. This is safe only because the processor
+	 * is dead and therefore cannot race with us.
+	 */
+	refresh_cpu_vm_stats(dead_cpu);
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Register page_alloc_cpu_notify() as a CPU hotplug notifier with
+ * priority 0.
+ */
+void __init page_alloc_init(void)
+{
+ hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+/*
+ * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+ *	or min_free_kbytes changes.
+ *
+ * For each zone of each online node, takes the largest lowmem_reserve[]
+ * entry at or above the zone's own index, adds pages_high, caps the result
+ * at the zone's present_pages, and stores the grand total in
+ * totalreserve_pages.
+ */
+static void calculate_totalreserve_pages(void)
+{
+	struct pglist_data *pgdat;
+	unsigned long total = 0;
+	enum zone_type zi, zj;
+
+	for_each_online_pgdat(pgdat) {
+		for (zi = 0; zi < MAX_NR_ZONES; zi++) {
+			struct zone *zone = &pgdat->node_zones[zi];
+			unsigned long reserve = 0;
+
+			/* Largest lowmem_reserve for this zone or any above */
+			for (zj = zi; zj < MAX_NR_ZONES; zj++) {
+				if (reserve < zone->lowmem_reserve[zj])
+					reserve = zone->lowmem_reserve[zj];
+			}
+
+			/* pages_high counts as reserved too */
+			reserve += zone->pages_high;
+
+			/* A zone cannot reserve more than it has */
+			total += (reserve > zone->present_pages) ?
+					zone->present_pages : reserve;
+		}
+	}
+
+	totalreserve_pages = total;
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ * For every zone j, each lower zone's lowmem_reserve[j] becomes the number
+ * of pages present in the zones above it up to j, divided by that lower
+ * zone's ratio. Ratios below 1 are raised to 1 in the sysctl array itself.
+ */
+static void setup_per_zone_lowmem_reserve(void)
+{
+	struct pglist_data *pgdat;
+	enum zone_type j, idx;
+
+	for_each_online_pgdat(pgdat) {
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone *upper = &pgdat->node_zones[j];
+			unsigned long accumulated = upper->present_pages;
+
+			/* A zone never reserves against itself */
+			upper->lowmem_reserve[j] = 0;
+
+			/* Walk downwards through the zones below j */
+			for (idx = j; idx > 0; idx--) {
+				enum zone_type below = idx - 1;
+				struct zone *lower = &pgdat->node_zones[below];
+
+				if (sysctl_lowmem_reserve_ratio[below] < 1)
+					sysctl_lowmem_reserve_ratio[below] = 1;
+
+				lower->lowmem_reserve[j] = accumulated /
+					sysctl_lowmem_reserve_ratio[below];
+				accumulated += lower->present_pages;
+			}
+		}
+	}
+
+	/* update totalreserve_pages */
+	calculate_totalreserve_pages();
+}
+
+/**
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.
+ *
+ * Ensures that the pages_{min,low,high} values for each zone are set correctly
+ * with respect to min_free_kbytes.
+ *
+ * Each zone's share is min_free_kbytes (in pages) scaled by the zone's
+ * present pages over the total of all non-highmem pages. The update of
+ * each zone is done under zone->lock with interrupts disabled.
+ */
+void setup_per_zone_pages_min(void)
+{
+	unsigned long target_pages = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_total = 0;
+	unsigned long irqflags;
+	struct zone *zone;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			continue;
+		lowmem_total += zone->present_pages;
+	}
+
+	for_each_zone(zone) {
+		u64 share;
+
+		spin_lock_irqsave(&zone->lock, irqflags);
+
+		share = (u64)target_pages * zone->present_pages;
+		do_div(share, lowmem_total);
+
+		if (!is_highmem(zone)) {
+			/*
+			 * Lowmem zones reserve pages in proportion to
+			 * their size.
+			 */
+			zone->pages_min = share;
+		} else {
+			/*
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually
+			 * don't need highmem pages, so pages_min is capped
+			 * to a small value for highmem zones.
+			 *
+			 * The (pages_high-pages_low) and
+			 * (pages_low-pages_min) deltas control asynchronous
+			 * page reclaim and are therefore left uncapped.
+			 */
+			int capped = zone->present_pages / 1024;
+
+			if (capped < SWAP_CLUSTER_MAX)
+				capped = SWAP_CLUSTER_MAX;
+			if (capped > 128)
+				capped = 128;
+			zone->pages_min = capped;
+		}
+
+		zone->pages_low = zone->pages_min + (share >> 2);
+		zone->pages_high = zone->pages_min + (share >> 1);
+		setup_zone_migrate_reserve(zone);
+
+		spin_unlock_irqrestore(&zone->lock, irqflags);
+	}
+
+	/* update totalreserve_pages */
+	calculate_totalreserve_pages();
+}
+
+/**
+ * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
+ *
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * The ratio is int_sqrt(10 * zone size in GB), with a minimum of 1:
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
+ */
+void setup_per_zone_inactive_ratio(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		/* Zone size in gigabytes */
+		unsigned int zone_gb = zone->present_pages >> (30 - PAGE_SHIFT);
+		unsigned int ratio = int_sqrt(10 * zone_gb);
+
+		zone->inactive_ratio = ratio ? ratio : 1;
+	}
+}
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * Small machines want a small value (128k min), large machines a large
+ * one (64MB max). The growth is deliberately not linear, because network
+ * bandwidth does not increase linearly with machine size. We use
+ *
+ *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
+ *
+ * which yields
+ *
+ * 16MB:	512k
+ * 32MB:	724k
+ * 64MB:	1024k
+ * 128MB:	1448k
+ * 256MB:	2048k
+ * 512MB:	2896k
+ * 1024MB:	4096k
+ * 2048MB:	5792k
+ * 4096MB:	8192k
+ * 8192MB:	11584k
+ * 16384MB:	16384k
+ *
+ * Afterwards the per-zone watermarks, lowmem reserves and inactive ratios
+ * are derived from it. Always returns 0.
+ */
+static int __init init_per_zone_pages_min(void)
+{
+	unsigned long lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+
+	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+	/* Clamp to [128, 65536] kbytes */
+	if (min_free_kbytes < 128)
+		min_free_kbytes = 128;
+	else if (min_free_kbytes > 65536)
+		min_free_kbytes = 65536;
+
+	setup_per_zone_pages_min();
+	setup_per_zone_lowmem_reserve();
+	setup_per_zone_inactive_ratio();
+
+	return 0;
+}
+module_init(init_per_zone_pages_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ *	that we can call two helper functions whenever min_free_kbytes
+ *	changes.
+ *
+ * Returns the error from proc_dointvec() if it fails (in which case the
+ * watermarks are left alone), 0 otherwise. The per-zone watermarks are
+ * only recomputed after a successful write.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write)
+		setup_per_zone_pages_min();
+	return 0;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * sysctl handler for sysctl_min_unmapped_ratio: after a successful
+ * proc_dointvec_minmax(), recompute every zone's min_unmapped_pages as the
+ * given percentage of its present pages. Returns the proc_dointvec_minmax()
+ * error code on failure, 0 on success.
+ */
+int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int err = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	struct zone *zone;
+
+	if (err)
+		return err;
+
+	for_each_zone(zone) {
+		zone->min_unmapped_pages = (zone->present_pages *
+				sysctl_min_unmapped_ratio) / 100;
+	}
+
+	return 0;
+}
+
+/*
+ * sysctl handler for sysctl_min_slab_ratio: after a successful
+ * proc_dointvec_minmax(), recompute every zone's min_slab_pages as the
+ * given percentage of its present pages. Returns the proc_dointvec_minmax()
+ * error code on failure, 0 on success.
+ */
+int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int err = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	struct zone *zone;
+
+	if (err)
+		return err;
+
+	for_each_zone(zone) {
+		zone->min_slab_pages = (zone->present_pages *
+				sysctl_min_slab_ratio) / 100;
+	}
+
+	return 0;
+}
+#endif
+
+/*
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * pages_min watermarks. The lowmem reserve ratio can only make sense
+ * if in function of the boot time zone sizes.
+ *
+ * The reserves are recomputed on every call, as before, so that they stay
+ * consistent with whatever is currently stored in the ratio array. The
+ * result of proc_dointvec_minmax() is returned so that a failed read or
+ * write is reported to the caller instead of being masked as success.
+ */
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	setup_per_zone_lowmem_reserve();
+	return rc;
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu
+ * pagelist can have before it gets flushed back to buddy allocator.
+ *
+ * Nothing is changed on a read or when proc_dointvec_minmax() reports any
+ * error; in both cases its return value is passed back to the caller.
+ *
+ * NOTE(review): the division below relies on percpu_pagelist_fraction being
+ * non-zero; presumably the sysctl table's minimum enforces that - confirm.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || ret < 0)
+		return ret;
+
+	for_each_zone(zone) {
+		/* Same limit for every cpu of this zone: compute it once */
+		unsigned long high = zone->present_pages /
+						percpu_pagelist_fraction;
+
+		for_each_online_cpu(cpu)
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+	}
+	return 0;
+}
+
+int hashdist = HASHDIST_DEFAULT;
+
+#ifdef CONFIG_NUMA
+/*
+ * "hashdist=" boot parameter: parse the value as an unsigned number (base
+ * auto-detected) into hashdist. Returns 1 when a value was given, 0 for a
+ * NULL argument.
+ */
+static int __init set_hashdist(char *str)
+{
+	if (str) {
+		hashdist = simple_strtoul(str, &str, 0);
+		return 1;
+	}
+
+	return 0;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ * quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ *
+ * tablename: name used in the log line and in the panic message
+ * bucketsize: size of one bucket in bytes
+ * numentries: requested number of buckets; 0 derives it from
+ * nr_kernel_pages and scale
+ * scale: with numentries == 0, aim for one bucket per 2^scale bytes
+ * flags: HASH_EARLY allocates from bootmem; otherwise hashdist selects
+ * between __vmalloc() and __get_free_pages()
+ * _hash_shift: if non-NULL, receives log2 of the final bucket count
+ * _hash_mask: if non-NULL, receives the final bucket count minus one
+ * limit: upper bound on the bucket count; 0 caps the table at 1/16 of
+ * the memory described by nr_all_pages
+ *
+ * When an allocation fails the bucket count is halved and the allocation
+ * is retried, for as long as the table is larger than one page. Panics if
+ * no table could be allocated, otherwise returns the table.
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+ unsigned long bucketsize,
+ unsigned long numentries,
+ int scale,
+ int flags,
+ unsigned int *_hash_shift,
+ unsigned int *_hash_mask,
+ unsigned long limit)
+{
+ unsigned long long max = limit;
+ unsigned long log2qty, size;
+ void *table = NULL;
+
+ /* allow the kernel cmdline to have a say */
+ if (!numentries) {
+ /* round applicable memory size up to nearest megabyte */
+ numentries = nr_kernel_pages;
+ numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
+ numentries >>= 20 - PAGE_SHIFT;
+ numentries <<= 20 - PAGE_SHIFT;
+
+ /* limit to 1 bucket per 2^scale bytes of low memory */
+ if (scale > PAGE_SHIFT)
+ numentries >>= (scale - PAGE_SHIFT);
+ else
+ numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
+ }
+ numentries = roundup_pow_of_two(numentries);
+
+ /* limit allocation size to 1/16 total memory by default */
+ if (max == 0) {
+ max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+ do_div(max, bucketsize);
+ }
+
+ if (numentries > max)
+ numentries = max;
+
+ log2qty = ilog2(numentries);
+
+ do {
+ size = bucketsize << log2qty;
+ if (flags & HASH_EARLY)
+ table = alloc_bootmem_nopanic(size);
+ else if (hashdist)
+ table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ else {
+ unsigned long order = get_order(size);
+ table = (void*) __get_free_pages(GFP_ATOMIC, order);
+ /*
+ * If bucketsize is not a power-of-two, we may free
+ * some pages at the end of hash table.
+ */
+ if (table) {
+ unsigned long alloc_end = (unsigned long)table +
+ (PAGE_SIZE << order);
+ unsigned long used = (unsigned long)table +
+ PAGE_ALIGN(size);
+ split_page(virt_to_page(table), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+ }
+ } while (!table && size > PAGE_SIZE && --log2qty);
+
+ if (!table)
+ panic("Failed to allocate %s hash table\n", tablename);
+
+ printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+ tablename,
+ (1U << log2qty),
+ ilog2(size) - PAGE_SHIFT,
+ size);
+
+ if (_hash_shift)
+ *_hash_shift = log2qty;
+ if (_hash_mask)
+ *_hash_mask = (1 << log2qty) - 1;
+
+ return table;
+}
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/* Out-of-line, exported wrapper around __pfn_to_page(). */
+struct page *pfn_to_page(unsigned long pfn)
+{
+	struct page *page = __pfn_to_page(pfn);
+
+	return page;
+}
+EXPORT_SYMBOL(pfn_to_page);
+
+/* Out-of-line, exported wrapper around __page_to_pfn(). */
+unsigned long page_to_pfn(struct page *page)
+{
+	unsigned long pfn = __page_to_pfn(page);
+
+	return pfn;
+}
+EXPORT_SYMBOL(page_to_pfn);
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+/*
+ * Return the bitmap holding the pageblock flag bits for @pfn: the bitmap
+ * of the pfn's memory section with CONFIG_SPARSEMEM, the zone's bitmap
+ * otherwise.
+ */
+static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
+					unsigned long pfn)
+{
+	unsigned long *bitmap;
+
+#ifdef CONFIG_SPARSEMEM
+	bitmap = __pfn_to_section(pfn)->pageblock_flags;
+#else
+	bitmap = zone->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+	return bitmap;
+}
+
+/*
+ * Return the index, within the bitmap from get_pageblock_bitmap(), of
+ * the first flag bit of the pageblock containing @pfn. The pfn is taken
+ * relative to its section (CONFIG_SPARSEMEM) or to the start of the zone.
+ */
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+	unsigned long offset;
+
+#ifdef CONFIG_SPARSEMEM
+	offset = pfn & (PAGES_PER_SECTION-1);
+#else
+	offset = pfn - zone->zone_start_pfn;
+#endif /* CONFIG_SPARSEMEM */
+	return (offset >> pageblock_order) * NR_PAGEBLOCK_BITS;
+}
+
+/**
+ * get_pageblock_flags_group - Read a group of flag bits of the pageblock containing a page
+ * @page: The page within the block of interest
+ * @start_bitidx: The first bit of interest to retrieve
+ * @end_bitidx: The last bit of interest
+ *
+ * The bit at @start_bitidx becomes bit 0 of the result, the following one
+ * bit 1, and so on up to @end_bitidx.
+ * returns pageblock_bits flags
+ */
+unsigned long get_pageblock_flags_group(struct page *page,
+					int start_bitidx, int end_bitidx)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
+	unsigned long base = pfn_to_bitidx(zone, pfn);
+	unsigned long mask = 1;
+	unsigned long result = 0;
+	int bit;
+
+	for (bit = start_bitidx; bit <= end_bitidx; bit++) {
+		if (test_bit(base + bit, bitmap))
+			result |= mask;
+		mask <<= 1;
+	}
+
+	return result;
+}
+
+/**
+ * set_pageblock_flags_group - Write a group of flag bits of the pageblock containing a page
+ * @page: The page within the block of interest
+ * @start_bitidx: The first bit of interest
+ * @end_bitidx: The last bit of interest
+ * @flags: The flags to set
+ *
+ * Bit 0 of @flags is stored at @start_bitidx, bit 1 at the following
+ * index, and so on up to @end_bitidx; each bit is set or cleared to match.
+ */
+void set_pageblock_flags_group(struct page *page, unsigned long flags,
+					int start_bitidx, int end_bitidx)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
+	unsigned long base = pfn_to_bitidx(zone, pfn);
+	unsigned long mask = 1;
+	int bit;
+
+	/* the page has to lie inside the span of its zone */
+	VM_BUG_ON(pfn < zone->zone_start_pfn);
+	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
+
+	for (bit = start_bitidx; bit <= end_bitidx; bit++, mask <<= 1) {
+		if (flags & mask)
+			__set_bit(base + bit, bitmap);
+		else
+			__clear_bit(base + bit, bitmap);
+	}
+}
+
+/*
+ * Helpers for page_isolation.c (see there as well): set and clear the
+ * MIGRATE_ISOLATE type of a page block. The page allocator never
+ * allocates memory from an ISOLATE block.
+ *
+ * set_migratetype_isolate() returns 0 when the block was switched to
+ * MIGRATE_ISOLATE and -EBUSY when the block is not MIGRATE_MOVABLE.
+ */
+
+int set_migratetype_isolate(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long irqflags;
+	int isolated = 0;
+
+	spin_lock_irqsave(&zone->lock, irqflags);
+	/*
+	 * In future, more migrate types will be able to be isolation target.
+	 */
+	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) {
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		move_freepages_block(zone, page, MIGRATE_ISOLATE);
+		isolated = 1;
+	}
+	spin_unlock_irqrestore(&zone->lock, irqflags);
+
+	if (!isolated)
+		return -EBUSY;
+
+	/* only after a successful switch, and with the zone lock dropped */
+	drain_all_pages();
+	return 0;
+}
+
+/*
+ * Turn a MIGRATE_ISOLATE page block back into a MIGRATE_MOVABLE one.
+ * Blocks of any other migrate type are left untouched.
+ */
+void unset_migratetype_isolate(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&zone->lock, irqflags);
+	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) {
+		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		move_freepages_block(zone, page, MIGRATE_MOVABLE);
+	}
+	spin_unlock_irqrestore(&zone->lock, irqflags);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * All pages in the range must be isolated before calling this.
+ *
+ * Takes every free (buddy) chunk in [start_pfn, end_pfn) off its free
+ * list, adjusts the free page accounting and marks the pages reserved.
+ * Invalid pfns are skipped. The zone is taken from the first valid pfn
+ * and its lock is held while the range is walked.
+ */
+void
+__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct page *page;
+	struct zone *zone;
+	unsigned long irqflags;
+	unsigned long pfn = start_pfn;
+	int order, nr, idx;
+
+	/* find the first valid pfn */
+	while (pfn < end_pfn && !pfn_valid(pfn))
+		pfn++;
+	if (pfn == end_pfn)
+		return;
+
+	zone = page_zone(pfn_to_page(pfn));
+	spin_lock_irqsave(&zone->lock, irqflags);
+	for (pfn = start_pfn; pfn < end_pfn; ) {
+		if (!pfn_valid(pfn)) {
+			pfn++;
+			continue;
+		}
+		page = pfn_to_page(pfn);
+		/* every page in the range must be a free buddy page */
+		BUG_ON(page_count(page));
+		BUG_ON(!PageBuddy(page));
+		order = page_order(page);
+		nr = 1 << order;
+#ifdef CONFIG_DEBUG_VM
+		printk(KERN_INFO "remove from free list %lx %d %lx\n",
+		       pfn, nr, end_pfn);
+#endif
+		list_del(&page->lru);
+		rmv_page_order(page);
+		zone->free_area[order].nr_free--;
+		__mod_zone_page_state(zone, NR_FREE_PAGES,
+				      - (1UL << order));
+		for (idx = 0; idx < nr; idx++)
+			SetPageReserved((page+idx));
+		/* step over the whole chunk */
+		pfn += nr;
+	}
+	spin_unlock_irqrestore(&zone->lock, irqflags);
+}
+#endif
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 0000000..ab27ff7
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,275 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/bit_spinlock.h>
+#include <linux/page_cgroup.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/cgroup.h>
+
+/*
+ * Clear the flags and mem_cgroup of @pc and point it at the struct page
+ * of @pfn.
+ */
+static void __meminit
+__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
+{
+	pc->page = pfn_to_page(pfn);
+	pc->mem_cgroup = NULL;
+	pc->flags = 0;
+}
+/* Bytes of page_cgroup tables allocated so far; printed at init time. */
+static unsigned long total_usage;
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+ pgdat->node_page_cgroup = NULL; /* no table yet; alloc_node_page_cgroup() installs one */
+}
+
+/*
+ * Return the page_cgroup of @page: the entry of the node's table at the
+ * page's pfn offset from the node's first pfn. Returns NULL as long as
+ * the node has no table.
+ */
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+	struct page_cgroup *table = pgdat->node_page_cgroup;
+
+	if (unlikely(!table))
+		return NULL;
+
+	return table + (page_to_pfn(page) - pgdat->node_start_pfn);
+}
+
+/*
+ * Allocate from bootmem, and initialise, one page_cgroup per page spanned
+ * by node @nid, then publish the table in the node's node_page_cgroup.
+ * Returns 0 on success or for a node spanning no pages, -ENOMEM when the
+ * allocation fails.
+ */
+static int __init alloc_node_page_cgroup(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	unsigned long first_pfn = pgdat->node_start_pfn;
+	unsigned long count = pgdat->node_spanned_pages;
+	unsigned long bytes, i;
+	struct page_cgroup *table;
+
+	if (count == 0)
+		return 0;
+
+	bytes = sizeof(struct page_cgroup) * count;
+	table = __alloc_bootmem_node_nopanic(pgdat, bytes, PAGE_SIZE,
+					     __pa(MAX_DMA_ADDRESS));
+	if (table == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < count; i++)
+		__init_page_cgroup(table + i, first_pfn + i);
+
+	pgdat->node_page_cgroup = table;
+	total_usage += bytes;
+	return 0;
+}
+
+/*
+ * Allocate the page_cgroup table of every online node, unless the memory
+ * cgroup subsystem is disabled. Panics when an allocation fails.
+ */
+void __init page_cgroup_init(void)
+{
+	int nid;
+
+	if (mem_cgroup_subsys.disabled)
+		return;
+
+	for_each_online_node(nid) {
+		if (!alloc_node_page_cgroup(nid))
+			continue;
+		printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
+		printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
+		panic("Out of memory");
+	}
+
+	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+	printk(KERN_INFO "please try cgroup_disable=memory option if you"
+	" don't want\n");
+}
+
+#else /* CONFIG_SPARSEMEM */
+
+/*
+ * Return the page_cgroup of @page. The pointer kept in the memory section
+ * is stored biased by the section's pfn (init_section_page_cgroup()
+ * stores "base - pfn"), so the page's pfn indexes it directly.
+ */
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+
+	return __pfn_to_section(pfn)->page_cgroup + pfn;
+}
+
+/*
+ * Set up the page_cgroup table of the memory section containing @pfn.
+ *
+ * A section without a table gets one from kmalloc_node(), falling back
+ * to vmalloc_node(), or from bootmem while slab_is_available() is false.
+ * A section that already has a table keeps it; its entries are
+ * initialised again only when they no longer point at the current
+ * memmap.
+ *
+ * Returns 0 on success, -ENOMEM when no table could be obtained.
+ */
+int __init_refok init_section_page_cgroup(unsigned long pfn)
+{
+	struct mem_section *section = __pfn_to_section(pfn);
+	struct page_cgroup *table;
+	unsigned long bytes = 0;
+	int nid, i;
+
+	if (section->page_cgroup) {
+		/*
+		 * No new allocation is needed, but the address of the
+		 * memmap may have changed, in which case the entries have
+		 * to be initialised again.
+		 */
+		table = section->page_cgroup + pfn;
+		/* memmap unchanged: nothing to do */
+		if (table->page == pfn_to_page(pfn))
+			return 0;
+	} else {
+		nid = page_to_nid(pfn_to_page(pfn));
+		bytes = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+		if (!slab_is_available()) {
+			table = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+							     bytes,
+							     PAGE_SIZE,
+							     __pa(MAX_DMA_ADDRESS));
+		} else {
+			table = kmalloc_node(bytes, GFP_KERNEL, nid);
+			if (!table)
+				table = vmalloc_node(bytes, nid);
+		}
+	}
+
+	if (!table) {
+		printk(KERN_ERR "page cgroup allocation failure\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < PAGES_PER_SECTION; i++)
+		__init_page_cgroup(table + i, pfn + i);
+
+	/* stored biased by the pfn so lookups can index with the pfn */
+	section->page_cgroup = table - pfn;
+	total_usage += bytes;
+	return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Release the page_cgroup table of the memory section containing @pfn
+ * and clear the section's pointer. vmalloc addresses go to vfree(),
+ * others to kfree(). A table whose first page is marked reserved is
+ * taken to be a bootmem allocation and is left in place.
+ */
+void __free_page_cgroup(unsigned long pfn)
+{
+	struct mem_section *ms = __pfn_to_section(pfn);
+	struct page_cgroup *table;
+
+	if (!ms || !ms->page_cgroup)
+		return;
+
+	/* undo the pfn bias to get the real start of the table */
+	table = ms->page_cgroup + pfn;
+	if (is_vmalloc_addr(table)) {
+		vfree(table);
+	} else if (!PageReserved(virt_to_page(table))) { /* Is bootmem ? */
+		kfree(table);
+	} else {
+		/* reserved page: keep both the table and the pointer */
+		return;
+	}
+	ms->page_cgroup = NULL;
+}
+
+/*
+ * Initialise the page_cgroup table of every present memory section that
+ * overlaps [start_pfn, start_pfn + nr_pages). When one section fails,
+ * __free_page_cgroup() is run over the whole section-aligned range and
+ * -ENOMEM is returned. Returns 0 on success. @nid is not used.
+ */
+int __meminit online_page_cgroup(unsigned long start_pfn,
+			unsigned long nr_pages,
+			int nid)
+{
+	unsigned long first = start_pfn & ~(PAGES_PER_SECTION - 1);
+	unsigned long last = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+	unsigned long pfn;
+
+	for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		if (init_section_page_cgroup(pfn))
+			goto rollback;
+	}
+	return 0;
+
+rollback:
+	for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION)
+		__free_page_cgroup(pfn);
+
+	return -ENOMEM;
+}
+
+/*
+ * Run __free_page_cgroup() on every memory section that overlaps
+ * [start_pfn, start_pfn + nr_pages). Always returns 0. @nid is not used.
+ */
+int __meminit offline_page_cgroup(unsigned long start_pfn,
+		unsigned long nr_pages, int nid)
+{
+	unsigned long pfn = start_pfn & ~(PAGES_PER_SECTION - 1);
+	unsigned long last = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+
+	while (pfn < last) {
+		__free_page_cgroup(pfn);
+		pfn += PAGES_PER_SECTION;
+	}
+	return 0;
+
+}
+
+/*
+ * Memory hotplug notifier: set the tables up when memory is going
+ * online, release them once memory is offline, and do nothing for every
+ * other action. Returns NOTIFY_OK, or the errno of a failed setup
+ * converted with notifier_from_errno().
+ */
+static int __meminit page_cgroup_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+	int err = 0;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		err = online_page_cgroup(mn->start_pfn,
+				   mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_OFFLINE:
+		offline_page_cgroup(mn->start_pfn,
+				mn->nr_pages, mn->status_change_nid);
+		break;
+	default:
+		/*
+		 * MEM_CANCEL_ONLINE, MEM_GOING_OFFLINE, MEM_ONLINE,
+		 * MEM_CANCEL_OFFLINE and anything else: nothing to do.
+		 */
+		break;
+	}
+
+	return err ? notifier_from_errno(err) : NOTIFY_OK;
+}
+
+#endif
+
+/*
+ * Set up the page_cgroup table of every present memory section below
+ * max_pfn and register the memory hotplug callback, unless the memory
+ * cgroup subsystem is disabled. Panics when a section cannot be set up.
+ */
+void __init page_cgroup_init(void)
+{
+	unsigned long pfn;
+
+	if (mem_cgroup_subsys.disabled)
+		return;
+
+	for (pfn = 0; pfn < max_pfn; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		if (init_section_page_cgroup(pfn)) {
+			printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
+			panic("Out of memory");
+		}
+	}
+
+	/* reached only when every section was set up */
+	hotplug_memory_notifier(page_cgroup_callback, 0);
+	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+	printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
+	" want\n");
+}
+
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+ return; /* nothing per node: in this configuration the tables hang off the memory sections */
+}
+
+#endif
diff --git a/mm/page_io.c b/mm/page_io.c
new file mode 100644
index 0000000..065c448
--- /dev/null
+++ b/mm/page_io.c
@@ -0,0 +1,141 @@
+/*
+ * linux/mm/page_io.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Swap reorganised 29.12.95,
+ * Asynchronous swapping added 30.12.95. Stephen Tweedie
+ * Removed race in async swapping. 14.4.1996. Bruno Haible
+ * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
+ * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <asm/pgtable.h>
+
+/*
+ * Allocate a bio describing a single page of swap I/O.
+ *
+ * @index is the value of the swap entry; its type selects the swap
+ * device and its offset, run through map_swap_page(), gives the start
+ * sector. @end_io becomes the completion handler. Returns NULL when
+ * bio_alloc() fails.
+ */
+static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
+				struct page *page, bio_end_io_t end_io)
+{
+	swp_entry_t entry = { .val = index, };
+	struct swap_info_struct *sis;
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, 1);
+	if (!bio)
+		return NULL;
+
+	sis = get_swap_info_struct(swp_type(entry));
+	/* PAGE_SIZE >> 9: number of 512-byte sectors in one page */
+	bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
+				(PAGE_SIZE >> 9);
+	bio->bi_bdev = sis->bdev;
+
+	/* one segment covering the whole page */
+	bio->bi_io_vec[0].bv_page = page;
+	bio->bi_io_vec[0].bv_len = PAGE_SIZE;
+	bio->bi_io_vec[0].bv_offset = 0;
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = PAGE_SIZE;
+	bio->bi_end_io = end_io;
+
+	return bio;
+}
+
+/*
+ * Completion handler for swap writes: flag a failed write on the page,
+ * end writeback on the page and drop the bio.
+ */
+static void end_swap_bio_write(struct bio *bio, int err)
+{
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		SetPageError(page);
+		/*
+		 * The page did not make it out to swap-space. Mark it
+		 * dirty again so that it is not reclaimed, and print a
+		 * dire warning that things will go BAD (tm) very quickly.
+		 *
+		 * PG_reclaim is cleared as well, to avoid
+		 * rotate_reclaimable_page().
+		 */
+		set_page_dirty(page);
+		printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
+		       imajor(bio->bi_bdev->bd_inode),
+		       iminor(bio->bi_bdev->bd_inode),
+		       (unsigned long long)bio->bi_sector);
+		ClearPageReclaim(page);
+	}
+
+	end_page_writeback(page);
+	bio_put(bio);
+}
+
+/*
+ * Completion handler for swap reads: mark the page up to date, or flag
+ * the error and log it, then unlock the page and drop the bio.
+ */
+void end_swap_bio_read(struct bio *bio, int err)
+{
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		SetPageUptodate(page);
+	} else {
+		SetPageError(page);
+		ClearPageUptodate(page);
+		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+		       imajor(bio->bi_bdev->bd_inode),
+		       iminor(bio->bi_bdev->bd_inode),
+		       (unsigned long long)bio->bi_sector);
+	}
+
+	unlock_page(page);
+	bio_put(bio);
+}
+
+/*
+ * Write a swap cache page out to its swap device.
+ *
+ * Stale swap cache pages may still be in memory: they are noticed here
+ * and dropped without the unnecessary final write.
+ *
+ * Returns 0 when the write was submitted or turned out to be
+ * unnecessary, -ENOMEM when no bio could be allocated (the page is then
+ * marked dirty again). The page is unlocked on every path.
+ */
+int swap_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct bio *bio;
+	int rw = WRITE;
+
+	if (remove_exclusive_swap_page(page)) {
+		unlock_page(page);
+		return 0;
+	}
+
+	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+				end_swap_bio_write);
+	if (!bio) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		rw |= (1 << BIO_RW_SYNC);
+	count_vm_event(PSWPOUT);
+	/* writeback is flagged before the page lock is dropped */
+	set_page_writeback(page);
+	unlock_page(page);
+	submit_bio(rw, bio);
+	return 0;
+}
+
+/*
+ * Start reading a swap cache page in from its swap device. The page has
+ * to be locked and not yet up to date.
+ *
+ * Returns 0 once the read is submitted; end_swap_bio_read() unlocks the
+ * page on completion. Returns -ENOMEM, with the page unlocked, when no
+ * bio could be allocated. @file is not used.
+ */
+int swap_readpage(struct file *file, struct page *page)
+{
+	struct bio *bio;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageUptodate(page));
+
+	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+				end_swap_bio_read);
+	if (!bio) {
+		unlock_page(page);
+		return -ENOMEM;
+	}
+
+	count_vm_event(PSWPIN);
+	submit_bio(READ, bio);
+	return 0;
+}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 0000000..5e0ffd9
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,142 @@
+/*
+ * linux/mm/page_isolation.c
+ */
+
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+#include <linux/pageblock-flags.h>
+#include "internal.h"
+
+/*
+ * Return the struct page of the first pfn in [pfn, pfn + nr_pages) that
+ * passes pfn_valid_within(), or NULL when no pfn in the range does.
+ *
+ * The index has the same type as @nr_pages: callers such as
+ * test_pages_isolated() pass the length of a whole pfn range, which a
+ * signed int counter is not guaranteed to cover.
+ */
+static inline struct page *
+__first_valid_page(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		if (pfn_valid_within(pfn + i))
+			return pfn_to_page(pfn + i);
+	return NULL;
+}
+
+/*
+ * start_isolate_page_range() -- make page-allocation-type of range of pages
+ * to be MIGRATE_ISOLATE.
+ * @start_pfn: The lower PFN of the range to be isolated.
+ * @end_pfn: The upper PFN of the range to be isolated.
+ *
+ * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
+ * the range will never be allocated. Any free pages and pages freed in the
+ * future will not be allocated again.
+ *
+ * start_pfn/end_pfn must be aligned to pageblock_order.
+ * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
+ * On failure the page blocks isolated so far are made available again.
+ */
+int
+start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn;
+	unsigned long undo_pfn;
+	struct page *page;
+
+	BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
+	BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+	for (pfn = start_pfn;
+	     pfn < end_pfn;
+	     pfn += pageblock_nr_pages) {
+		/* a block without any valid page is skipped */
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page && set_migratetype_isolate(page)) {
+			undo_pfn = pfn;
+			goto undo;
+		}
+	}
+	return 0;
+undo:
+	for (pfn = start_pfn;
+	     pfn < undo_pfn;
+	     pfn += pageblock_nr_pages) {
+		/*
+		 * Look the page up exactly as the loop above did. The first
+		 * pfn of a block is not necessarily valid, so it must not be
+		 * handed to pfn_to_page() unchecked.
+		 */
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page)
+			unset_migratetype_isolate(page);
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * Make isolated pages available again.
+ *
+ * Every page block in [start_pfn, end_pfn) that has a valid page and is
+ * MIGRATE_ISOLATE is passed to unset_migratetype_isolate(). Both pfns
+ * must be aligned to pageblock_nr_pages. Always returns 0.
+ */
+int
+undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn = start_pfn;
+	struct page *page;
+
+	BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
+	BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+	while (pfn < end_pfn) {
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page && get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
+			unset_migratetype_isolate(page);
+		pfn += pageblock_nr_pages;
+	}
+	return 0;
+}
+/*
+ * Test whether all pages in the range are free (which means isolated).
+ * All pages in [pfn...end_pfn) must be in the same zone.
+ * zone->lock must be held before calling this.
+ *
+ * Returns 1 if every valid page in the range is either a free buddy page
+ * or an unreferenced page whose private field is MIGRATE_ISOLATE, and 0
+ * as soon as any other page is found.
+ */
+static int
+__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
+{
+	struct page *page;
+
+	for (; pfn < end_pfn; ) {
+		if (!pfn_valid_within(pfn)) {
+			pfn++;
+			continue;
+		}
+		page = pfn_to_page(pfn);
+		if (PageBuddy(page)) {
+			/* skip the whole free chunk */
+			pfn += 1 << page_order(page);
+			continue;
+		}
+		if (page_count(page) != 0 ||
+		    page_private(page) != MIGRATE_ISOLATE)
+			return 0;
+		pfn += 1;
+	}
+	return 1;
+}
+
+/*
+ * Check that [start_pfn, end_pfn) is completely isolated.
+ *
+ * Returns 0 if every page block with a valid page is MIGRATE_ISOLATE and
+ * every page passes __test_page_isolated_in_pageblock(); -EBUSY
+ * otherwise, including when the range has no valid page at all.
+ */
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn, irqflags;
+	struct page *page;
+	struct zone *zone;
+	int isolated;
+
+	/*
+	 * Note: pageblock_nr_pages != MAX_ORDER, so chunks of free pages
+	 * are not aligned to pageblock_nr_pages.
+	 * That is why the page block types are checked first.
+	 */
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+			return -EBUSY;
+	}
+
+	page = __first_valid_page(start_pfn, end_pfn - start_pfn);
+	if (!page)
+		return -EBUSY;
+
+	/* Check all pages are free or marked as ISOLATED */
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, irqflags);
+	isolated = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
+	spin_unlock_irqrestore(&zone->lock, irqflags);
+
+	return isolated ? 0 : -EBUSY;
+}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 0000000..d5878be
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,137 @@
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+
+/*
+ * Call walk->pte_entry() for every page-sized step of [addr, end) under
+ * @pmd. Stops at, and returns, the first non-zero callback result;
+ * returns 0 otherwise.
+ */
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pte_t *pte = pte_offset_map(pmd, addr);
+	int err;
+
+	for (;;) {
+		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+		addr += PAGE_SIZE;
+		/* the pte is not advanced past the last entry visited */
+		if (err || addr == end)
+			break;
+		pte++;
+	}
+
+	pte_unmap(pte);
+	return err;
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under @pud. A none/bad entry
+ * is reported through walk->pte_hole() if that is set; any other entry
+ * goes to walk->pmd_entry() and then, when walk->pte_entry is set, down
+ * to walk_pte_range(). Returns the first non-zero result, or 0.
+ */
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	unsigned long next;
+	int err = 0;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+		} else {
+			if (walk->pmd_entry)
+				err = walk->pmd_entry(pmd, addr, next, walk);
+			if (!err && walk->pte_entry)
+				err = walk_pte_range(pmd, addr, next, walk);
+		}
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+
+	return err;
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd. A none/bad entry
+ * is reported through walk->pte_hole() if that is set; any other entry
+ * goes to walk->pud_entry() and then, when a pmd or pte callback is set,
+ * down to walk_pmd_range(). Returns the first non-zero result, or 0.
+ */
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	unsigned long next;
+	int err = 0;
+
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+		} else {
+			if (walk->pud_entry)
+				err = walk->pud_entry(pud, addr, next, walk);
+			if (!err && (walk->pmd_entry || walk->pte_entry))
+				err = walk_pmd_range(pud, addr, next, walk);
+		}
+		if (err)
+			break;
+	} while (pud++, addr = next, addr != end);
+
+	return err;
+}
+
+/**
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @addr: starting address
+ * @end: ending address
+ * @walk: set of callbacks to invoke for each level of the tree; walk->mm
+ * names the memory map to walk
+ *
+ * Recursively walk the page table for the memory area in a VMA,
+ * calling supplied callbacks. Callbacks are called in-order (first
+ * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
+ * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * Ranges without a page table are reported through walk->pte_hole if it
+ * is set.
+ *
+ * Each callback receives an entry pointer and the start and end of the
+ * associated range, and a copy of the original mm_walk for access to
+ * the ->private or ->mm fields.
+ *
+ * No locks are taken, but the bottom level iterator will map PTE
+ * directories from highmem if necessary.
+ *
+ * If any callback returns a non-zero value, the walk is aborted and
+ * the return value is propagated back to the caller. Otherwise 0 is
+ * returned; an empty range (@addr >= @end) also returns 0. Returns
+ * -EINVAL when walk->mm is not set.
+ */
+int walk_page_range(unsigned long addr, unsigned long end,
+		    struct mm_walk *walk)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err = 0;
+
+	if (addr >= end)
+		return err;
+
+	if (!walk->mm)
+		return -EINVAL;
+
+	pgd = pgd_offset(walk->mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+		} else {
+			if (walk->pgd_entry)
+				err = walk->pgd_entry(pgd, addr, next, walk);
+			/* descend only if a lower-level callback exists */
+			if (!err && (walk->pud_entry || walk->pmd_entry ||
+				     walk->pte_entry))
+				err = walk_pud_range(pgd, addr, next, walk);
+		}
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	return err;
+}
diff --git a/mm/pdflush.c b/mm/pdflush.c
new file mode 100644
index 0000000..a0a14c4
--- /dev/null
+++ b/mm/pdflush.c
@@ -0,0 +1,241 @@
+/*
+ * mm/pdflush.c - worker threads for writing back filesystem data
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * 09Apr2002 Andrew Morton
+ * Initial version
+ * 29Feb2004 kaos@sgi.com
+ * Move worker thread creation to kthread to avoid chewing
+ * up stack space with nested calls to kernel_thread.
+ */
+
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h> /* Needed by writeback.h */
+#include <linux/writeback.h> /* Prototypes pdflush_operation() */
+#include <linux/kthread.h>
+#include <linux/cpuset.h>
+#include <linux/freezer.h>
+
+
+/*
+ * Minimum and maximum number of pdflush instances
+ */
+#define MIN_PDFLUSH_THREADS 2
+#define MAX_PDFLUSH_THREADS 8
+
+static void start_one_pdflush_thread(void);
+
+
+/*
+ * The pdflush threads are worker threads for writing back dirty data.
+ * Ideally, we'd like one thread per active disk spindle. But the disk
+ * topology is very hard to divine at this level. Instead, we take
+ * care in various places to prevent more than one pdflush thread from
+ * performing writeback against a single filesystem. pdflush threads
+ * have the PF_FLUSHER flag set in current->flags to aid in this.
+ */
+
+/*
+ * All the pdflush threads. Protected by pdflush_lock
+ */
+static LIST_HEAD(pdflush_list);
+static DEFINE_SPINLOCK(pdflush_lock);
+
+/*
+ * The count of currently-running pdflush threads. Protected
+ * by pdflush_lock.
+ *
+ * Readable by sysctl, but not writable. Published to userspace at
+ * /proc/sys/vm/nr_pdflush_threads.
+ */
+int nr_pdflush_threads = 0;
+
+/*
+ * The time at which the pdflush thread pool last went empty
+ */
+static unsigned long last_empty_jifs;
+
+/*
+ * The pdflush thread.
+ *
+ * Thread pool management algorithm:
+ *
+ * - The minimum and maximum number of pdflush instances are bound
+ * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
+ *
+ * - If there have been no idle pdflush instances for 1 second, create
+ * a new one.
+ *
+ * - If the least-recently-went-to-sleep pdflush thread has been asleep
+ * for more than one second, terminate a thread.
+ */
+
+/*
+ * A structure for passing work to a pdflush thread. Also for passing
+ * state information between pdflush threads. Protected by pdflush_lock.
+ *
+ * Each thread owns one instance, which lives on the stack of pdflush().
+ */
+struct pdflush_work {
+ struct task_struct *who; /* The thread; woken by pdflush_operation() */
+ void (*fn)(unsigned long); /* A callback function; NULL while no work is queued */
+ unsigned long arg0; /* An argument to the callback */
+ struct list_head list; /* On pdflush_list, when idle */
+ unsigned long when_i_went_to_sleep; /* jiffies when the thread last put itself on pdflush_list */
+};
+
+static int __pdflush(struct pdflush_work *my_work)
+{
+ current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+ my_work->fn = NULL; /* no work queued yet */
+ my_work->who = current;
+ INIT_LIST_HEAD(&my_work->list);
+
+ spin_lock_irq(&pdflush_lock);
+ nr_pdflush_threads++; /* this thread now counts as part of the pool */
+ for ( ; ; ) {
+ struct pdflush_work *pdf;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ list_move(&my_work->list, &pdflush_list); /* put ourselves on the idle list */
+ my_work->when_i_went_to_sleep = jiffies;
+ spin_unlock_irq(&pdflush_lock);
+ schedule(); /* sleep until woken, normally by pdflush_operation() */
+ try_to_freeze();
+ spin_lock_irq(&pdflush_lock);
+ if (!list_empty(&my_work->list)) {
+ /*
+ * Someone woke us up, but without removing our control
+ * structure from the global list. swsusp will do this
+ * in try_to_freeze()->refrigerator(). Handle it.
+ */
+ my_work->fn = NULL;
+ continue;
+ }
+ if (my_work->fn == NULL) {
+ printk("pdflush: bogus wakeup\n");
+ continue;
+ }
+ spin_unlock_irq(&pdflush_lock);
+
+ (*my_work->fn)(my_work->arg0); /* run the callback with pdflush_lock dropped */
+
+ /*
+ * Thread creation: For how long have there been zero
+ * available threads?
+ */
+ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
+ /* unlocked list_empty() test is OK here */
+ if (list_empty(&pdflush_list)) {
+ /* unlocked test is OK here */
+ if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
+ start_one_pdflush_thread();
+ }
+ }
+
+ spin_lock_irq(&pdflush_lock);
+ my_work->fn = NULL;
+
+ /*
+ * Thread destruction: For how long has the sleepiest
+ * thread slept?
+ */
+ if (list_empty(&pdflush_list))
+ continue;
+ if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
+ continue;
+ pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); /* tail of the idle list */
+ if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
+ /* Limit exit rate */
+ pdf->when_i_went_to_sleep = jiffies;
+ break; /* exeunt */
+ }
+ }
+ nr_pdflush_threads--; /* this thread is leaving the pool */
+ spin_unlock_irq(&pdflush_lock);
+ return 0;
+}
+
+/*
+ * Thread function of a pdflush worker.
+ *
+ * The work structure would naturally be a local of __pdflush(). It is
+ * kept in this separate frame in the hope of preventing the compiler
+ * from performing unfortunate optimisations on it, because it is visible
+ * to other tasks and CPUs. (No problem has actually been observed. This
+ * is just paranoia.)
+ */
+static int pdflush(void *dummy)
+{
+	cpumask_t allowed;
+	struct pdflush_work work;
+
+	/*
+	 * pdflush can spend a lot of time doing encryption via dm-crypt.
+	 * That should not happen at keventd's priority.
+	 */
+	set_user_nice(current, 0);
+
+	/*
+	 * Some configs put our parent kthread in a limited cpuset, which
+	 * kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. Our
+	 * needs are more modest - cut back to our cpuset's cpus_allowed.
+	 * This matters because pdflush threads are created and destroyed
+	 * dynamically; the boot time ones are easily placed without these
+	 * two calls.
+	 */
+	cpuset_cpus_allowed(current, &allowed);
+	set_cpus_allowed_ptr(current, &allowed);
+
+	return __pdflush(&work);
+}
+
+/*
+ * Try to hand fn(arg0) to an idle pdflush thread and wake that thread.
+ * Returns zero if a worker thread was found and given the payload, and
+ * -1 if no thread was idle.
+ */
+int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+{
+	struct pdflush_work *pdf;
+	unsigned long flags;
+	int ret = -1;
+
+	BUG_ON(fn == NULL);	/* Hard to diagnose if it's deferred */
+
+	spin_lock_irqsave(&pdflush_lock, flags);
+	if (!list_empty(&pdflush_list)) {
+		pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
+		list_del_init(&pdf->list);
+		/* remember when the pool of idle threads ran dry */
+		if (list_empty(&pdflush_list))
+			last_empty_jifs = jiffies;
+		pdf->fn = fn;
+		pdf->arg0 = arg0;
+		wake_up_process(pdf->who);
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&pdflush_lock, flags);
+
+	return ret;
+}
+
+static void start_one_pdflush_thread(void)
+{
+ kthread_run(pdflush, NULL, "pdflush"); /* NOTE(review): the kthread_run() result is not checked here */
+}
+
+/* Start the initial MIN_PDFLUSH_THREADS worker threads. Always returns 0. */
+static int __init pdflush_init(void)
+{
+	int remaining = MIN_PDFLUSH_THREADS;
+
+	while (remaining-- > 0)
+		start_one_pdflush_thread();
+	return 0;
+}
+
+module_init(pdflush_init);
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
new file mode 100644
index 0000000..603ae98
--- /dev/null
+++ b/mm/prio_tree.c
@@ -0,0 +1,207 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
+ *
+ * This file is released under the GPL v2.
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004 Initial version
+ */
+
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * See lib/prio_tree.c for details on the general radix priority search tree
+ * code.
+ */
+
+/*
+ * The following #defines are mirrored from lib/prio_tree.c. They're only used
+ * for debugging, and should be removed (along with the debugging code using
+ * them) when VMAs are also switched over to the regular prio_tree code.
+ */
+
+#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
+#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+/*
+ * Radix priority search tree for address_space->i_mmap
+ *
+ * For each vma that maps a unique set of file pages i.e., unique [radix_index,
+ * heap_index] value, we have a corresponding priority search tree node. If
+ * multiple vmas have identical [radix_index, heap_index] value, then one of
+ * them is used as a tree node and others are stored in a vm_set list. The tree
+ * node points to the first vma (head) of the list using vm_set.head.
+ *
+ * prio_tree_root
+ *      |
+ *      A       vm_set.head
+ *     / \      /
+ *    L   R -> H-I-J-K-M-N-O-P-Q-S
+ *    ^   ^    <-- vm_set.list -->
+ *  tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.vm_set.parent is used for identifying
+ * tree nodes and list head nodes.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.vm_set.parent != NULL ==> a tree node
+ * vma->shared.vm_set.head != NULL ==> list of others mapping same range
+ * vma->shared.vm_set.head == NULL ==> no others map the same range
+ *
+ * vma->shared.vm_set.parent == NULL
+ * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
+ * vma->shared.vm_set.head == NULL ==> a list node
+ */
+
+/*
+ * Add a new vma known to map the same set of pages as the old vma:
+ * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
+ * Note that it just happens to work correctly on i_mmap_nonlinear too.
+ */
+void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
+{
+ /* Leave these BUG_ONs till prio_tree patch stabilizes */
+ BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+ BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+ vma->shared.vm_set.head = NULL;
+ vma->shared.vm_set.parent = NULL;
+
+ if (!old->shared.vm_set.parent)
+ list_add(&vma->shared.vm_set.list,
+ &old->shared.vm_set.list);
+ else if (old->shared.vm_set.head)
+ list_add_tail(&vma->shared.vm_set.list,
+ &old->shared.vm_set.head->shared.vm_set.list);
+ else {
+ INIT_LIST_HEAD(&vma->shared.vm_set.list);
+ vma->shared.vm_set.head = old;
+ old->shared.vm_set.head = vma;
+ }
+}
+
+void vma_prio_tree_insert(struct vm_area_struct *vma,
+ struct prio_tree_root *root)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *old;
+
+ vma->shared.vm_set.head = NULL;
+
+ ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
+ if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
+ old = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ vma_prio_tree_add(vma, old);
+ }
+}
+
+void vma_prio_tree_remove(struct vm_area_struct *vma,
+ struct prio_tree_root *root)
+{
+ struct vm_area_struct *node, *head, *new_head;
+
+ if (!vma->shared.vm_set.head) {
+ if (!vma->shared.vm_set.parent)
+ list_del_init(&vma->shared.vm_set.list);
+ else
+ raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
+ } else {
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
+ if (vma->shared.vm_set.parent) {
+ head = vma->shared.vm_set.head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(
+ head->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ } else
+ new_head = NULL;
+
+ raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->shared.vm_set.head = new_head;
+ if (new_head)
+ new_head->shared.vm_set.head = head;
+
+ } else {
+ node = vma->shared.vm_set.head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ node->shared.vm_set.head = new_head;
+ new_head->shared.vm_set.head = node;
+ } else
+ node->shared.vm_set.head = NULL;
+ }
+ }
+}
+
+/*
+ * Helper function to enumerate vmas that map a given file page or a set of
+ * contiguous file pages. The function returns vmas that at least map a single
+ * page in the given range of contiguous file pages.
+ */
+struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+ struct prio_tree_iter *iter)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ if (!vma) {
+ /*
+ * First call is with NULL vma
+ */
+ ptr = prio_tree_next(iter);
+ if (ptr) {
+ next = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ prefetch(next->shared.vm_set.head);
+ return next;
+ } else
+ return NULL;
+ }
+
+ if (vma->shared.vm_set.parent) {
+ if (vma->shared.vm_set.head) {
+ next = vma->shared.vm_set.head;
+ prefetch(next->shared.vm_set.list.next);
+ return next;
+ }
+ } else {
+ next = list_entry(vma->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ if (!next->shared.vm_set.head) {
+ prefetch(next->shared.vm_set.list.next);
+ return next;
+ }
+ }
+
+ ptr = prio_tree_next(iter);
+ if (ptr) {
+ next = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ prefetch(next->shared.vm_set.head);
+ return next;
+ } else
+ return NULL;
+}
diff --git a/mm/quicklist.c b/mm/quicklist.c
new file mode 100644
index 0000000..8dbb680
--- /dev/null
+++ b/mm/quicklist.c
@@ -0,0 +1,103 @@
+/*
+ * Quicklist support.
+ *
+ * Quicklists are light weight lists of pages that have a defined state
+ * on alloc and free. Pages must be in the quicklist specific defined state
+ * (zero by default) when the page is freed. It seems that the initial idea
+ * for such lists first came from Dave Miller and then various other people
+ * improved on it.
+ *
+ * Copyright (C) 2007 SGI,
+ * Christoph Lameter <clameter@sgi.com>
+ * Generalized, added support for multiple lists and
+ * constructors / destructors.
+ */
+#include <linux/kernel.h>
+
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/quicklist.h>
+
+DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
+
+#define FRACTION_OF_NODE_MEM 16
+
+static unsigned long max_pages(unsigned long min_pages)
+{
+ unsigned long node_free_pages, max;
+ int node = numa_node_id();
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int num_cpus_on_node;
+ node_to_cpumask_ptr(cpumask_on_node, node);
+
+ node_free_pages =
+#ifdef CONFIG_ZONE_DMA
+ zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
+#endif
+ zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
+
+ max = node_free_pages / FRACTION_OF_NODE_MEM;
+
+ num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+ max /= num_cpus_on_node;
+
+ return max(max, min_pages);
+}
+
+static long min_pages_to_free(struct quicklist *q,
+ unsigned long min_pages, long max_free)
+{
+ long pages_to_free;
+
+ pages_to_free = q->nr_pages - max_pages(min_pages);
+
+ return min(pages_to_free, max_free);
+}
+
+/*
+ * Trim down the number of pages in the quicklist
+ */
+void quicklist_trim(int nr, void (*dtor)(void *),
+ unsigned long min_pages, unsigned long max_free)
+{
+ long pages_to_free;
+ struct quicklist *q;
+
+ q = &get_cpu_var(quicklist)[nr];
+ if (q->nr_pages > min_pages) {
+ pages_to_free = min_pages_to_free(q, min_pages, max_free);
+
+ while (pages_to_free > 0) {
+ /*
+ * We pass a gfp_t of 0 to quicklist_alloc here
+ * because we will never call into the page allocator.
+ */
+ void *p = quicklist_alloc(nr, 0, NULL);
+
+ if (dtor)
+ dtor(p);
+ free_page((unsigned long)p);
+ pages_to_free--;
+ }
+ }
+ put_cpu_var(quicklist);
+}
+
+unsigned long quicklist_total_size(void)
+{
+ unsigned long count = 0;
+ int cpu;
+ struct quicklist *ql, *q;
+
+ for_each_online_cpu(cpu) {
+ ql = per_cpu(quicklist, cpu);
+ for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
+ count += q->nr_pages;
+ }
+ return count;
+}
+
diff --git a/mm/readahead.c b/mm/readahead.c
new file mode 100644
index 0000000..bec83c1
--- /dev/null
+++ b/mm/readahead.c
@@ -0,0 +1,483 @@
+/*
+ * mm/readahead.c - address_space-level file readahead.
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 09Apr2002 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+
+void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
+struct backing_dev_info default_backing_dev_info = {
+ .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
+ .state = 0,
+ .capabilities = BDI_CAP_MAP_COPY,
+ .unplug_io_fn = default_unplug_io_fn,
+};
+EXPORT_SYMBOL_GPL(default_backing_dev_info);
+
+/*
+ * Initialise a struct file's readahead state. Assumes that the caller has
+ * memset *ra to zero.
+ */
+void
+file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
+{
+ ra->ra_pages = mapping->backing_dev_info->ra_pages;
+ ra->prev_pos = -1;
+}
+EXPORT_SYMBOL_GPL(file_ra_state_init);
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+
+/**
+ * read_cache_pages - populate an address space with some pages & start reads against them
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages. These
+ * pages have their ->index populated and are otherwise uninitialised.
+ * @filler: callback routine for filling a single page.
+ * @data: private data for the callback routine.
+ *
+ * Hides the details of the LRU cache etc from the filesystems.
+ */
+int read_cache_pages(struct address_space *mapping, struct list_head *pages,
+ int (*filler)(void *, struct page *), void *data)
+{
+ struct page *page;
+ int ret = 0;
+
+ while (!list_empty(pages)) {
+ page = list_to_page(pages);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL)) {
+ page_cache_release(page);
+ continue;
+ }
+ page_cache_release(page);
+
+ ret = filler(data, page);
+ if (unlikely(ret)) {
+ put_pages_list(pages);
+ break;
+ }
+ task_io_account_read(PAGE_CACHE_SIZE);
+ }
+ return ret;
+}
+
+EXPORT_SYMBOL(read_cache_pages);
+
+static int read_pages(struct address_space *mapping, struct file *filp,
+ struct list_head *pages, unsigned nr_pages)
+{
+ unsigned page_idx;
+ int ret;
+
+ if (mapping->a_ops->readpages) {
+ ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ /* Clean up the remaining pages */
+ put_pages_list(pages);
+ goto out;
+ }
+
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page = list_to_page(pages);
+ list_del(&page->lru);
+ if (!add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL)) {
+ mapping->a_ops->readpage(filp, page);
+ }
+ page_cache_release(page);
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * do_page_cache_readahead actually reads a chunk of disk. It allocates all
+ * the pages first, then submits them all for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ *
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ *
+ * do_page_cache_readahead() returns -1 if it encountered request queue
+ * congestion.
+ */
+static int
+__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size)
+{
+ struct inode *inode = mapping->host;
+ struct page *page;
+ unsigned long end_index; /* The last page we want to read */
+ LIST_HEAD(page_pool);
+ int page_idx;
+ int ret = 0;
+ loff_t isize = i_size_read(inode);
+
+ if (isize == 0)
+ goto out;
+
+ end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+ /*
+ * Preallocate as many pages as we will need.
+ */
+ for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
+ pgoff_t page_offset = offset + page_idx;
+
+ if (page_offset > end_index)
+ break;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ rcu_read_unlock();
+ if (page)
+ continue;
+
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ break;
+ page->index = page_offset;
+ list_add(&page->lru, &page_pool);
+ if (page_idx == nr_to_read - lookahead_size)
+ SetPageReadahead(page);
+ ret++;
+ }
+
+ /*
+ * Now start the IO. We ignore I/O errors - if the page is not
+ * uptodate then the caller will launch readpage again, and
+ * will then handle the error.
+ */
+ if (ret)
+ read_pages(mapping, filp, &page_pool, ret);
+ BUG_ON(!list_empty(&page_pool));
+out:
+ return ret;
+}
+
+/*
+ * Chunk the readahead into 2 megabyte units, so that we don't pin too much
+ * memory at once.
+ */
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t offset, unsigned long nr_to_read)
+{
+ int ret = 0;
+
+ if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
+ return -EINVAL;
+
+ while (nr_to_read) {
+ int err;
+
+ unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+
+ if (this_chunk > nr_to_read)
+ this_chunk = nr_to_read;
+ err = __do_page_cache_readahead(mapping, filp,
+ offset, this_chunk, 0);
+ if (err < 0) {
+ ret = err;
+ break;
+ }
+ ret += err;
+ offset += this_chunk;
+ nr_to_read -= this_chunk;
+ }
+ return ret;
+}
+
+/*
+ * This version skips the IO if the queue is read-congested, and will tell the
+ * block layer to abandon the readahead if request allocation would block.
+ *
+ * force_page_cache_readahead() will ignore queue congestion and will block on
+ * request queues.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t offset, unsigned long nr_to_read)
+{
+ if (bdi_read_congested(mapping->backing_dev_info))
+ return -1;
+
+ return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
+}
+
+/*
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
+ */
+unsigned long max_sane_readahead(unsigned long nr)
+{
+ return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+ + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+}
+
+static int __init readahead_init(void)
+{
+ int err;
+
+ err = bdi_init(&default_backing_dev_info);
+ if (!err)
+ bdi_register(&default_backing_dev_info, NULL, "default");
+
+ return err;
+}
+subsys_initcall(readahead_init);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ int actual;
+
+ actual = __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+
+ return actual;
+}
+
+/*
+ * Set the initial window size: round the request up to the next power of 2,
+ * then x 4 for small sizes (<= max/32), x 2 for medium sizes (<= max/4), and
+ * clamp to max for anything larger.
+ * With a 32 page (128k) max: 1 page -> 4, 2-8 pages -> 4-16, > 8 pages -> 32.
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+ unsigned long newsize = roundup_pow_of_two(size);
+
+ if (newsize <= max / 32)
+ newsize = newsize * 4;
+ else if (newsize <= max / 4)
+ newsize = newsize * 2;
+ else
+ newsize = max;
+
+ return newsize;
+}
+
+/*
+ * Get the previous window size, ramp it up, and
+ * return it as the new window size.
+ */
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
+ unsigned long max)
+{
+ unsigned long cur = ra->size;
+ unsigned long newsize;
+
+ if (cur < max / 16)
+ newsize = 4 * cur;
+ else
+ newsize = 2 * cur;
+
+ return min(newsize, max);
+}
+
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ *                        |<----- async_size ---------|
+ *     |------------------- size -------------------->|
+ *     |==================#===========================|
+ *     ^start             ^page marked with PG_readahead
+ *
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': Do not wait until the application consumed all
+ * readahead pages and stalled on the missing page at readahead_index;
+ * Instead, submit an asynchronous readahead I/O as soon as there are
+ * only async_size pages left in the readahead window. Normally async_size
+ * will be equal to size, for maximum pipelining.
+ *
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * be invalidating each other's readahead state. So we flag the new readahead
+ * page at (start+size-async_size) with PG_readahead, and use it as readahead
+ * indicator. The flag won't be set on already cached pages, to avoid the
+ * readahead-for-nothing fuss, saving pointless page cache lookups.
+ *
+ * prev_pos tracks the last visited byte in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * The code ramps up the readahead size aggressively at first, but slows down as
+ * it approaches max_readahead.
+ */
+
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
+ */
+static unsigned long
+ondemand_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ bool hit_readahead_marker, pgoff_t offset,
+ unsigned long req_size)
+{
+ int max = ra->ra_pages; /* max readahead pages */
+ pgoff_t prev_offset;
+ int sequential;
+
+ /*
+ * It's the expected callback offset, assume sequential access.
+ * Ramp up sizes, and push forward the readahead window.
+ */
+ if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+ offset == (ra->start + ra->size))) {
+ ra->start += ra->size;
+ ra->size = get_next_ra_size(ra, max);
+ ra->async_size = ra->size;
+ goto readit;
+ }
+
+ prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
+ sequential = offset - prev_offset <= 1UL || req_size > max;
+
+ /*
+ * Standalone, small read.
+ * Read as is, and do not pollute the readahead state.
+ */
+ if (!hit_readahead_marker && !sequential) {
+ return __do_page_cache_readahead(mapping, filp,
+ offset, req_size, 0);
+ }
+
+ /*
+ * Hit a marked page without valid readahead state.
+ * E.g. interleaved reads.
+ * Query the pagecache for async_size, which normally equals to
+ * readahead size. Ramp it up and use it as the new readahead size.
+ */
+ if (hit_readahead_marker) {
+ pgoff_t start;
+
+ rcu_read_lock();
+ start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
+ rcu_read_unlock();
+
+ if (!start || start - offset > max)
+ return 0;
+
+ ra->start = start;
+ ra->size = start - offset; /* old async_size */
+ ra->size = get_next_ra_size(ra, max);
+ ra->async_size = ra->size;
+ goto readit;
+ }
+
+ /*
+ * It may be one of
+ * - first read on start of file
+ * - sequential cache miss
+ * - oversize random read
+ * Start readahead for it.
+ */
+ ra->start = offset;
+ ra->size = get_init_ra_size(req_size, max);
+ ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+
+readit:
+ return ra_submit(ra, mapping, filp);
+}
+
+/**
+ * page_cache_sync_readahead - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * pagecache pages
+ *
+ * page_cache_sync_readahead() should be called when a cache miss happened:
+ * it will submit the read. The readahead logic may decide to piggyback more
+ * pages onto the read request if access patterns suggest it will improve
+ * performance.
+ */
+void page_cache_sync_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ pgoff_t offset, unsigned long req_size)
+{
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /* do read-ahead */
+ ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+
+/**
+ * page_cache_async_readahead - file readahead for marked pages
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset which has the PG_readahead flag set
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * pagecache pages
+ *
+ * page_cache_async_readahead() should be called when a page is used which
+ * has the PG_readahead flag; this is a marker to suggest that the application
+ * has used up enough of the readahead window that we should start pulling in
+ * more pages.
+ */
+void
+page_cache_async_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ struct page *page, pgoff_t offset,
+ unsigned long req_size)
+{
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /*
+ * Same bit is used for PG_readahead and PG_reclaim.
+ */
+ if (PageWriteback(page))
+ return;
+
+ ClearPageReadahead(page);
+
+ /*
+ * Defer asynchronous read-ahead on IO congestion.
+ */
+ if (bdi_read_congested(mapping->backing_dev_info))
+ return;
+
+ /* do read-ahead */
+ ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
new file mode 100644
index 0000000..1099394
--- /dev/null
+++ b/mm/rmap.c
@@ -0,0 +1,1236 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
+ * Released under the General Public License (GPL).
+ *
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
+ *
+ * Provides methods for unmapping each kind of mapped page:
+ * the anon methods track anonymous pages, and
+ * the file methods track pages belonging to an inode.
+ *
+ * Original design by Rik van Riel <riel@conectiva.com.br> 2001
+ * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
+ * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
+ * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ */
+
+/*
+ * Lock ordering in mm:
+ *
+ * inode->i_mutex (while writing or truncating, not reading or faulting)
+ *   inode->i_alloc_sem (vmtruncate_range)
+ *   mm->mmap_sem
+ *     page->flags PG_locked (lock_page)
+ *       mapping->i_mmap_lock
+ *         anon_vma->lock
+ *           mm->page_table_lock or pte_lock
+ *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *             swap_lock (in swap_duplicate, swap_info_get)
+ *               mmlist_lock (in mmput, drain_mmlist and others)
+ *               mapping->private_lock (in __set_page_dirty_buffers)
+ *               inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *                 sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                 mapping->tree_lock (widely used, in set_page_dirty,
+ *                           in arch-dependent flush_dcache_mmap_lock,
+ *                           within inode_lock in __sync_single_inode)
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+static struct kmem_cache *anon_vma_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+ return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+ kmem_cache_free(anon_vma_cachep, anon_vma);
+}
+
+/**
+ * anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, but
+ * if not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * and that may actually touch the spinlock even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ *
+ * This must be called with the mmap_sem held for reading.
+ */
+int anon_vma_prepare(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ might_sleep();
+ if (unlikely(!anon_vma)) {
+ struct mm_struct *mm = vma->vm_mm;
+ struct anon_vma *allocated;
+
+ anon_vma = find_mergeable_anon_vma(vma);
+ allocated = NULL;
+ if (!anon_vma) {
+ anon_vma = anon_vma_alloc();
+ if (unlikely(!anon_vma))
+ return -ENOMEM;
+ allocated = anon_vma;
+ }
+ spin_lock(&anon_vma->lock);
+
+ /* page_table_lock to protect against threads */
+ spin_lock(&mm->page_table_lock);
+ if (likely(!vma->anon_vma)) {
+ vma->anon_vma = anon_vma;
+ list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ allocated = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ spin_unlock(&anon_vma->lock);
+ if (unlikely(allocated))
+ anon_vma_free(allocated);
+ }
+ return 0;
+}
+
+void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+{
+ BUG_ON(vma->anon_vma != next->anon_vma);
+ list_del(&next->anon_vma_node);
+}
+
+void __anon_vma_link(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ if (anon_vma)
+ list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+}
+
+void anon_vma_link(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ if (anon_vma) {
+ spin_lock(&anon_vma->lock);
+ list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ spin_unlock(&anon_vma->lock);
+ }
+}
+
+void anon_vma_unlink(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int empty;
+
+ if (!anon_vma)
+ return;
+
+ spin_lock(&anon_vma->lock);
+ list_del(&vma->anon_vma_node);
+
+ /* We must garbage collect the anon_vma if it's empty */
+ empty = list_empty(&anon_vma->head);
+ spin_unlock(&anon_vma->lock);
+
+ if (empty)
+ anon_vma_free(anon_vma);
+}
+
+static void anon_vma_ctor(void *data)
+{
+ struct anon_vma *anon_vma = data;
+
+ spin_lock_init(&anon_vma->lock);
+ INIT_LIST_HEAD(&anon_vma->head);
+}
+
+void __init anon_vma_init(void)
+{
+ anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma relies on RCU to guard against the races.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long) page->mapping;
+ if (!(anon_mapping & PAGE_MAPPING_ANON))
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ spin_lock(&anon_vma->lock);
+ return anon_vma;
+out:
+ rcu_read_unlock();
+ return NULL;
+}
+
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ spin_unlock(&anon_vma->lock);
+ rcu_read_unlock();
+}
+
+/*
+ * At what user virtual address is page expected in @vma?
+ * Returns virtual address or -EFAULT if page's index/offset is not
+ * within the range mapped by the @vma.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ unsigned long address;
+
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ /* page should be within @vma mapping range */
+ return -EFAULT;
+ }
+ return address;
+}
+
+/*
+ * At what user virtual address is page expected in vma? checking that the
+ * page matches the vma: currently only used on anon pages, by unuse_vma;
+ */
+unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+ if (PageAnon(page)) {
+ if ((void *)vma->anon_vma !=
+ (void *)page->mapping - PAGE_MAPPING_ANON)
+ return -EFAULT;
+ } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+ if (!vma->vm_file ||
+ vma->vm_file->f_mapping != page->mapping)
+ return -EFAULT;
+ } else
+ return -EFAULT;
+ return vma_address(page, vma);
+}
+
+/*
+ * Check that @page is mapped at @address into @mm.
+ *
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
+ * On success returns with pte mapped and locked.
+ */
+pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+ unsigned long address, spinlock_t **ptlp, int sync)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return NULL;
+
+ pte = pte_offset_map(pmd, address);
+ /* Make a quick check before getting the lock */
+ if (!sync && !pte_present(*pte)) {
+ pte_unmap(pte);
+ return NULL;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+ *ptlp = ptl;
+ return pte;
+ }
+ pte_unmap_unlock(pte, ptl);
+ return NULL;
+}
+
+/**
+ * page_mapped_in_vma - report whether a VMA's page tables map a page
+ * @page: the page to look for
+ * @vma: the VMA whose page tables are inspected
+ *
+ * Returns 1 when a pte for @page is found at the address @page would
+ * occupy in @vma, and 0 otherwise.  Only valid for normal file or
+ * anonymous VMAs.
+ */
+static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+	unsigned long addr = vma_address(page, vma);
+
+	if (addr == -EFAULT)	/* page lies outside the vma's range */
+		return 0;
+
+	ptep = page_check_address(page, vma->vm_mm, addr, &ptl, 1);
+	if (ptep == NULL)	/* no pte for the page in this mm */
+		return 0;
+
+	/* only the answer is wanted: undo the map + lock straight away */
+	pte_unmap_unlock(ptep, ptl);
+	return 1;
+}
+
+/*
+ * Subfunctions of page_referenced: page_referenced_one called
+ * repeatedly from either page_referenced_anon or page_referenced_file.
+ *
+ * Looks for the pte that maps @page in @vma and returns the number of
+ * references found there (0, 1 or 2): one if
+ * ptep_clear_flush_young_notify() reports the pte as young, plus one if
+ * the vma's mm is not current->mm, holds the swap token and has its
+ * mmap_sem locked.  Returns 0 without touching *@mapcount when @page has
+ * no address in @vma or no matching pte is found.
+ *
+ * *@mapcount is decremented whenever a pte for @page was found, so the
+ * caller can stop walking once it reaches zero.  For a VM_LOCKED vma no
+ * reference is counted and *@mapcount ends up at zero, which ends the
+ * caller's walk.
+ */
+static int page_referenced_one(struct page *page,
+ struct vm_area_struct *vma, unsigned int *mapcount)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *pte;
+ spinlock_t *ptl;
+ int referenced = 0;
+
+ address = vma_address(page, vma);
+ if (address == -EFAULT)
+ goto out;
+
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ goto out;
+
+ /*
+ * Don't want to elevate referenced for mlocked page that gets this far,
+ * in order that it progresses to try_to_unmap and is moved to the
+ * unevictable list.
+ */
+ if (vma->vm_flags & VM_LOCKED) {
+ *mapcount = 1; /* break early from loop */
+ goto out_unmap;
+ }
+
+ if (ptep_clear_flush_young_notify(vma, address, pte))
+ referenced++;
+
+ /* Pretend the page is referenced if the task has the
+ swap token and is in the middle of a page fault. */
+ if (mm != current->mm && has_swap_token(mm) &&
+ rwsem_is_locked(&mm->mmap_sem))
+ referenced++;
+
+out_unmap:
+ (*mapcount)--;
+ pte_unmap_unlock(pte, ptl);
+out:
+ return referenced;
+}
+
+/*
+ * Sum the references to an anonymous page over every vma hanging off
+ * its anon_vma.  Returns 0 if the anon_vma cannot be locked.
+ */
+static int page_referenced_anon(struct page *page,
+				struct mem_cgroup *mem_cont)
+{
+	struct anon_vma *anon_vma = page_lock_anon_vma(page);
+	struct vm_area_struct *vma;
+	unsigned int remaining;
+	int total = 0;
+
+	if (anon_vma == NULL)
+		return total;
+
+	remaining = page_mapcount(page);
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		/*
+		 * When reclaiming on behalf of a cgroup, only references
+		 * coming from that cgroup are counted.
+		 */
+		if (!mem_cont || mm_match_cgroup(vma->vm_mm, mem_cont)) {
+			total += page_referenced_one(page, vma, &remaining);
+			if (remaining == 0)
+				break;	/* every mapping accounted for */
+		}
+	}
+	page_unlock_anon_vma(anon_vma);
+
+	return total;
+}
+
+/**
+ * page_referenced_file - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ * @mem_cont: target memory controller
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag. This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds. It returns the number
+ * of references it found.
+ *
+ * The page must be locked and must not be anonymous; both conditions are
+ * enforced with BUG_ON().  When @mem_cont is non-NULL, vmas whose mm does
+ * not match that cgroup are skipped.  The walk runs under
+ * mapping->i_mmap_lock and stops early once the mapcount sampled at the
+ * start has been counted down to zero by page_referenced_one().
+ *
+ * This function is only called from page_referenced for object-based pages.
+ */
+static int page_referenced_file(struct page *page,
+ struct mem_cgroup *mem_cont)
+{
+ unsigned int mapcount;
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+ int referenced = 0;
+
+ /*
+ * The caller's checks on page->mapping and !PageAnon have made
+ * sure that this is a file page: the check for page->mapping
+ * excludes the case just before it gets set on an anon page.
+ */
+ BUG_ON(PageAnon(page));
+
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_lock.
+ */
+ BUG_ON(!PageLocked(page));
+
+ spin_lock(&mapping->i_mmap_lock);
+
+ /*
+ * i_mmap_lock does not stabilize mapcount at all, but mapcount
+ * is more likely to be accurate if we note it after spinning.
+ */
+ mapcount = page_mapcount(page);
+
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
+ continue;
+ referenced += page_referenced_one(page, vma, &mapcount);
+ if (!mapcount)
+ break;
+ }
+
+ spin_unlock(&mapping->i_mmap_lock);
+ return referenced;
+}
+
+/**
+ * page_referenced - count and clear the references to a page
+ * @page: the page to test
+ * @is_locked: non-zero if the caller already holds the page lock
+ * @mem_cont: target memory controller
+ *
+ * Tests and clears the referenced state of @page itself and of every
+ * mapping of it, and returns how many references were seen.  A file
+ * page whose lock cannot be taken is counted as referenced once.
+ */
+int page_referenced(struct page *page, int is_locked,
+		    struct mem_cgroup *mem_cont)
+{
+	int count = TestClearPageReferenced(page) ? 1 : 0;
+
+	if (page_mapped(page) && page->mapping) {
+		if (PageAnon(page)) {
+			count += page_referenced_anon(page, mem_cont);
+		} else if (is_locked) {
+			count += page_referenced_file(page, mem_cont);
+		} else if (trylock_page(page)) {
+			/* recheck page->mapping now the page lock is held */
+			if (page->mapping)
+				count += page_referenced_file(page, mem_cont);
+			unlock_page(page);
+		} else {
+			/* page lock is contended: treat as referenced */
+			count++;
+		}
+	}
+
+	if (page_test_and_clear_young(page))
+		count++;
+
+	return count;
+}
+
+/*
+ * Write-protect and clean the pte mapping @page in @vma, if there is
+ * one and it is dirty or writable.  Returns 1 if the pte was changed,
+ * 0 otherwise.
+ */
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = vma_address(page, vma);
+	spinlock_t *ptl;
+	pte_t *ptep;
+	int cleaned = 0;
+
+	if (addr == -EFAULT)
+		return cleaned;
+
+	ptep = page_check_address(page, mm, addr, &ptl, 1);
+	if (ptep == NULL)
+		return cleaned;
+
+	if (pte_dirty(*ptep) || pte_write(*ptep)) {
+		pte_t entry;
+
+		flush_cache_page(vma, addr, pte_pfn(*ptep));
+		/* take the pte out, then put back a clean read-only copy */
+		entry = ptep_clear_flush_notify(vma, addr, ptep);
+		entry = pte_mkclean(pte_wrprotect(entry));
+		set_pte_at(mm, addr, ptep, entry);
+		cleaned = 1;
+	}
+	pte_unmap_unlock(ptep, ptl);
+
+	return cleaned;
+}
+
+/*
+ * Run page_mkclean_one() over every VM_SHARED vma of @mapping that
+ * covers @page's file offset; returns the number of ptes cleaned.
+ */
+static int page_mkclean_file(struct address_space *mapping, struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct prio_tree_iter iter;
+	struct vm_area_struct *vma;
+	int cleaned = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		/* vmas without VM_SHARED are left alone */
+		if (!(vma->vm_flags & VM_SHARED))
+			continue;
+		cleaned += page_mkclean_one(page, vma);
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	return cleaned;
+}
+
+/*
+ * Clean and write-protect all shared file mappings of a locked page.
+ * Returns 0 if the page is unmapped or has no mapping; otherwise the
+ * count from page_mkclean_file(), or 1 if page_test_dirty() was set
+ * (it is cleared here).
+ */
+int page_mkclean(struct page *page)
+{
+	struct address_space *mapping;
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_mapped(page))
+		return 0;
+
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
+	ret = page_mkclean_file(mapping, page);
+	if (page_test_dirty(page)) {
+		page_clear_dirty(page);
+		ret = 1;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(page_mkclean);
+
+/**
+ * __page_set_anon_rmap - initialise the anonymous rmap of a page
+ * @page: the page being mapped
+ * @vma: the vm area the mapping is added to; must have an anon_vma
+ * @address: the user virtual address mapped
+ *
+ * Points page->mapping at @vma's anon_vma (tagged with
+ * PAGE_MAPPING_ANON), records the page's linear index within @vma and
+ * accounts the page in NR_ANON_PAGES.
+ */
+static void __page_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *owner = vma->anon_vma;
+	void *tagged;
+
+	BUG_ON(!owner);
+
+	tagged = (void *)owner + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *)tagged;
+	page->index = linear_page_index(vma, address);
+
+	/*
+	 * This counter is not modified from interrupt context, so the
+	 * variant that leaves interrupts enabled is sufficient.
+	 */
+	__inc_zone_page_state(page, NR_ANON_PAGES);
+}
+
+/**
+ * __page_check_anon_rmap - sanity check anonymous rmap addition
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ *
+ * The whole body is compiled only with CONFIG_DEBUG_VM; without it this
+ * function does nothing.  With it, BUG()s unless page->mapping equals
+ * @vma's anon_vma tagged with PAGE_MAPPING_ANON and page->index equals
+ * linear_page_index(@vma, @address).
+ */
+static void __page_check_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The page's anon-rmap details (mapping and index) are guaranteed to
+ * be set up correctly at this point.
+ *
+ * We have exclusion against page_add_anon_rmap because the caller
+ * always holds the page locked, except if called from page_dup_rmap,
+ * in which case the page is already known to be setup.
+ *
+ * We have exclusion against page_add_new_anon_rmap because those pages
+ * are initially only visible via the pagetables, and the pte is locked
+ * over the call to page_add_new_anon_rmap.
+ */
+ struct anon_vma *anon_vma = vma->anon_vma;
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ BUG_ON(page->mapping != (struct address_space *)anon_vma);
+ BUG_ON(page->index != linear_page_index(vma, address));
+#endif
+}
+
+/**
+ * page_add_anon_rmap - account one more pte mapping of an anonymous page
+ * @page: the page being mapped
+ * @vma: the vm area the mapping is added to
+ * @address: the user virtual address mapped; must lie inside @vma
+ *
+ * The caller must hold the pte lock, and the page must be locked.
+ */
+void page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
+	if (!atomic_inc_and_test(&page->_mapcount)) {
+		/* not the first mapping: rmap is set up, just verify it */
+		__page_check_anon_rmap(page, vma, address);
+		return;
+	}
+
+	/* first mapping of the page: set up its anon rmap */
+	__page_set_anon_rmap(page, vma, address);
+}
+
+/**
+ * page_add_new_anon_rmap - map a brand new anonymous page
+ * @page: the page being mapped
+ * @vma: the vm area the mapping is added to
+ * @address: the user virtual address mapped; must lie inside @vma
+ *
+ * Variant of page_add_anon_rmap for pages that are known to be *new*:
+ * the mapcount is simply set instead of incremented and tested, and
+ * the page does not have to be locked.
+ */
+void page_add_new_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(!(address >= vma->vm_start && address < vma->vm_end));
+
+	/* _mapcount starts at -1, so 0 means "mapped exactly once" */
+	atomic_set(&page->_mapcount, 0);
+	__page_set_anon_rmap(page, vma, address);
+}
+
+/**
+ * page_add_file_rmap - account one more pte mapping of a file page
+ * @page: the page being mapped
+ *
+ * NR_FILE_MAPPED is bumped only when the increment brings _mapcount to
+ * zero.  The caller must hold the pte lock.
+ */
+void page_add_file_rmap(struct page *page)
+{
+	int first_mapping = atomic_inc_and_test(&page->_mapcount);
+
+	if (first_mapping)
+		__inc_zone_page_state(page, NR_FILE_MAPPED);
+}
+
+#ifdef CONFIG_DEBUG_VM
+/**
+ * page_dup_rmap - duplicate pte mapping to a page
+ * @page: the page to add the mapping to
+ * @vma: the vm area being duplicated
+ * @address: the user virtual address mapped
+ *
+ * For copy_page_range only: minimal extract from page_add_file_rmap /
+ * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
+ * quicker.
+ *
+ * This definition is only built with CONFIG_DEBUG_VM.  It BUG()s if
+ * page_mapcount(@page) is zero, sanity checks the anon rmap of anonymous
+ * pages with __page_check_anon_rmap(), and then increments _mapcount.
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(page_mapcount(page) == 0);
+ if (PageAnon(page))
+ __page_check_anon_rmap(page, vma, address);
+ atomic_inc(&page->_mapcount);
+}
+#endif
+
+/**
+ * page_remove_rmap - take down pte mapping from a page
+ * @page: page to remove mapping from
+ * @vma: the vm area in which the mapping is removed
+ *
+ * Decrements @page's _mapcount.  Further work is done only when that
+ * makes the counter negative: dirty state reported by page_test_dirty()
+ * is moved to the struct page (skipped for anon pages that are not in
+ * the swapcache), mem_cgroup_uncharge_page() is called for anon pages,
+ * and the NR_ANON_PAGES or NR_FILE_MAPPED zone counter is decremented.
+ * page->mapping is deliberately left as it is.
+ *
+ * @vma is used only for the diagnostics that are printed before BUG()
+ * when page_mapcount() reports a negative value.
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
+{
+ if (atomic_add_negative(-1, &page->_mapcount)) {
+ if (unlikely(page_mapcount(page) < 0)) {
+ printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+ printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
+ printk (KERN_EMERG " page->flags = %lx\n", page->flags);
+ printk (KERN_EMERG " page->count = %x\n", page_count(page));
+ printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
+ print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
+ if (vma->vm_ops) {
+ print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
+ }
+ if (vma->vm_file && vma->vm_file->f_op)
+ print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
+ BUG();
+ }
+
+ /*
+ * Now that the last pte has gone, s390 must transfer dirty
+ * flag from storage key to struct page. We can usually skip
+ * this if the page is anon, so about to be freed; but perhaps
+ * not if it's in swapcache - there might be another pte slot
+ * containing the swap entry, but page not yet written to swap.
+ */
+ if ((!PageAnon(page) || PageSwapCache(page)) &&
+ page_test_dirty(page)) {
+ page_clear_dirty(page);
+ set_page_dirty(page);
+ }
+ if (PageAnon(page))
+ mem_cgroup_uncharge_page(page);
+ __dec_zone_page_state(page,
+ PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ /*
+ * It would be tidy to reset the PageAnon mapping here,
+ * but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping
+ * before us: so leave the reset to free_hot_cold_page,
+ * and remember that it's only reliable while mapped.
+ * Leaving it set also helps swapoff to reinstate ptes
+ * faster for those pages still in swapcache.
+ */
+ }
+}
+
+/*
+ * Subfunctions of try_to_unmap: try_to_unmap_one called
+ * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
+ *
+ * Tries to remove the pte that maps @page in @vma.  Returns SWAP_AGAIN
+ * by default (also when @page has no address in @vma or no matching pte
+ * is found), SWAP_MLOCK if @vma is VM_LOCKED, or SWAP_FAIL if
+ * ptep_clear_flush_young_notify() reports the pte as young; the last two
+ * checks are skipped when @migration is set, and in both cases the pte
+ * is left in place.
+ *
+ * When the pte is removed, its dirty bit is transferred to the page and
+ * the pte slot is refilled as follows:
+ *  - anon page in the swapcache: the swap entry from page_private(), with
+ *    anon_rss decremented;
+ *  - other anon page (CONFIG_MIGRATION only, BUG()s unless @migration):
+ *    a migration entry;
+ *  - file page with @migration set (CONFIG_MIGRATION only): a migration
+ *    entry;
+ *  - any other file page: nothing, and file_rss is decremented.
+ * Finally the rmap and the page reference held for this mapping are
+ * dropped.
+ */
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ int migration)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *pte;
+ pte_t pteval;
+ spinlock_t *ptl;
+ int ret = SWAP_AGAIN;
+
+ address = vma_address(page, vma);
+ if (address == -EFAULT)
+ goto out;
+
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ goto out;
+
+ /*
+ * If the page is mlock()d, we cannot swap it out.
+ * If it's recently referenced (perhaps page_referenced
+ * skipped over this mm) then we should reactivate it.
+ */
+ if (!migration) {
+ if (vma->vm_flags & VM_LOCKED) {
+ ret = SWAP_MLOCK;
+ goto out_unmap;
+ }
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ }
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, page_to_pfn(page));
+ pteval = ptep_clear_flush_notify(vma, address, pte);
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ if (PageAnon(page)) {
+ swp_entry_t entry = { .val = page_private(page) };
+
+ if (PageSwapCache(page)) {
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ swap_duplicate(entry);
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
+ } else {
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ BUG_ON(!migration);
+ entry = make_migration_entry(page, pte_write(pteval));
+#endif
+ }
+ set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+ BUG_ON(pte_file(*pte));
+ } else
+#ifdef CONFIG_MIGRATION
+ if (migration) {
+ /* Establish migration entry for a file page */
+ swp_entry_t entry;
+ entry = make_migration_entry(page, pte_write(pteval));
+ set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+ } else
+#endif
+ dec_mm_counter(mm, file_rss);
+
+
+ page_remove_rmap(page, vma);
+ page_cache_release(page);
+
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+out:
+ return ret;
+}
+
+/*
+ * objrmap doesn't work for nonlinear VMAs because the assumption that
+ * offset-into-file correlates with offset-into-virtual-addresses does not hold.
+ * Consequently, given a particular page and its ->index, we cannot locate the
+ * ptes which are mapping that page without an exhaustive linear search.
+ *
+ * So what this code does is a mini "virtual scan" of each nonlinear VMA which
+ * maps the file to which the target page belongs. The ->vm_private_data field
+ * holds the current cursor into that scan. Successive searches will circulate
+ * around the vma's virtual address space.
+ *
+ * So as more replacement pressure is applied to the pages in a nonlinear VMA,
+ * more scanning pressure is placed against them as well. Eventually pages
+ * will become fully unmapped and are eligible for eviction.
+ *
+ * For very sparsely populated VMAs this is a little inefficient - chances are
+ * there won't be many ptes located within the scan cluster. In this case
+ * maybe we could scan further - to the end of the pte page, perhaps.
+ *
+ * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
+ * acquire it without blocking. If vma locked, mlock the pages in the cluster,
+ * rather than unmapping them. If we encounter the "check_page" that vmscan is
+ * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
+ *
+ * try_to_unmap_cluster() handles one such cluster: the CLUSTER_SIZE-aligned
+ * window that contains vma->vm_start + @cursor, clipped to the vma. It returns
+ * SWAP_AGAIN straight away if the pgd, pud or pmd for the window is not
+ * present. Ptes found young by ptep_clear_flush_young_notify() are skipped;
+ * *@mapcount is decremented once for every pte that is unmapped.
+ */
+#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
+#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
+
+static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
+ struct vm_area_struct *vma, struct page *check_page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ pte_t pteval;
+ spinlock_t *ptl;
+ struct page *page;
+ unsigned long address;
+ unsigned long end;
+ int ret = SWAP_AGAIN;
+ int locked_vma = 0;
+
+ address = (vma->vm_start + cursor) & CLUSTER_MASK;
+ end = address + CLUSTER_SIZE;
+ if (address < vma->vm_start)
+ address = vma->vm_start;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return ret;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return ret;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return ret;
+
+ /*
+ * MLOCK_PAGES => feature is configured.
+ * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
+ * keep the sem while scanning the cluster for mlocking pages.
+ */
+ if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
+ locked_vma = (vma->vm_flags & VM_LOCKED);
+ if (!locked_vma)
+ up_read(&vma->vm_mm->mmap_sem); /* don't need it */
+ }
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ for (; address < end; pte++, address += PAGE_SIZE) {
+ if (!pte_present(*pte))
+ continue;
+ page = vm_normal_page(vma, address, *pte);
+ BUG_ON(!page || PageAnon(page));
+
+ if (locked_vma) {
+ mlock_vma_page(page); /* no-op if already mlocked */
+ if (page == check_page)
+ ret = SWAP_MLOCK;
+ continue; /* don't unmap */
+ }
+
+ if (ptep_clear_flush_young_notify(vma, address, pte))
+ continue;
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ pteval = ptep_clear_flush_notify(vma, address, pte);
+
+ /* If nonlinear, store the file page offset in the pte. */
+ if (page->index != linear_page_index(vma, address))
+ set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ page_remove_rmap(page, vma);
+ page_cache_release(page);
+ dec_mm_counter(mm, file_rss);
+ (*mapcount)--;
+ }
+ pte_unmap_unlock(pte - 1, ptl); /* loop left pte one past the last slot visited */
+ if (locked_vma)
+ up_read(&vma->vm_mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Shared helper for pages found mapped in a VM_LOCKED vma.
+ *
+ * Returns 1 if the page was mlocked here, 0 if the vma's mmap_sem could
+ * not be taken for read without blocking or the vma turned out not to
+ * be VM_LOCKED once the semaphore was held.
+ */
+static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int mlocked;
+
+	if (!down_read_trylock(&mm->mmap_sem))
+		return 0;
+
+	mlocked = 0;
+	if (vma->vm_flags & VM_LOCKED) {
+		mlock_vma_page(page);
+		mlocked = 1;	/* really mlocked the page */
+	}
+	up_read(&mm->mmap_sem);
+
+	return mlocked;
+}
+
+/**
+ * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
+ * rmap method
+ * @page: the page to unmap/unlock
+ * @unlock: request for unlock rather than unmap [unlikely]
+ * @migration: unmapping for migration - ignored if @unlock
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * anonymous pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * VM_LOCKED.
+ *
+ * Returns SWAP_MLOCK only if try_to_mlock_page() actually mlocked the page;
+ * a VM_LOCKED vma that could not be handled that way is reported as
+ * SWAP_AGAIN. With @unlock set (and MLOCK_PAGES), SWAP_SUCCESS is returned
+ * when no VM_LOCKED vma mapping the page was seen. Otherwise the result of
+ * the last try_to_unmap_one() call is returned, or SWAP_AGAIN if the
+ * anon_vma could not be locked.
+ */
+static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+{
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+ unsigned int mlocked = 0;
+ int ret = SWAP_AGAIN;
+
+ if (MLOCK_PAGES && unlikely(unlock))
+ ret = SWAP_SUCCESS; /* default for try_to_munlock() */
+
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ return ret;
+
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ if (MLOCK_PAGES && unlikely(unlock)) {
+ if (!((vma->vm_flags & VM_LOCKED) &&
+ page_mapped_in_vma(page, vma)))
+ continue; /* must visit all unlocked vmas */
+ ret = SWAP_MLOCK; /* saw at least one mlocked vma */
+ } else {
+ ret = try_to_unmap_one(page, vma, migration);
+ if (ret == SWAP_FAIL || !page_mapped(page))
+ break;
+ }
+ if (ret == SWAP_MLOCK) {
+ mlocked = try_to_mlock_page(page, vma);
+ if (mlocked)
+ break; /* stop if actually mlocked page */
+ }
+ }
+
+ page_unlock_anon_vma(anon_vma);
+
+ if (mlocked)
+ ret = SWAP_MLOCK; /* actually mlocked the page */
+ else if (ret == SWAP_MLOCK)
+ ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
+
+ return ret;
+}
+
+/**
+ * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
+ * @page: the page to unmap/unlock
+ * @unlock: request for unlock rather than unmap [unlikely]
+ * @migration: unmapping for migration - ignored if @unlock
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * object-based pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * VM_LOCKED.
+ *
+ * The vmas found through the i_mmap prio tree are handled first. If that
+ * does not settle the outcome and the mapping has nonlinear vmas, those are
+ * scanned with try_to_unmap_cluster(), using vma->vm_private_data as the
+ * per-vma scan cursor; the whole walk runs under mapping->i_mmap_lock.
+ *
+ * Returns SWAP_MLOCK only when the page was actually mlocked; a VM_LOCKED
+ * vma that could not be handled that way is reported as SWAP_AGAIN.
+ * SWAP_FAIL is returned when try_to_unmap_one() fails, or when no nonlinear
+ * vma is eligible for scanning.
+ */
+static int try_to_unmap_file(struct page *page, int unlock, int migration)
+{
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+ int ret = SWAP_AGAIN;
+ unsigned long cursor;
+ unsigned long max_nl_cursor = 0;
+ unsigned long max_nl_size = 0;
+ unsigned int mapcount;
+ unsigned int mlocked = 0;
+
+ if (MLOCK_PAGES && unlikely(unlock))
+ ret = SWAP_SUCCESS; /* default for try_to_munlock() */
+
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ if (MLOCK_PAGES && unlikely(unlock)) {
+ if (!(vma->vm_flags & VM_LOCKED))
+ continue; /* must visit all vmas */
+ ret = SWAP_MLOCK;
+ } else {
+ ret = try_to_unmap_one(page, vma, migration);
+ if (ret == SWAP_FAIL || !page_mapped(page))
+ goto out;
+ }
+ if (ret == SWAP_MLOCK) {
+ mlocked = try_to_mlock_page(page, vma);
+ if (mlocked)
+ break; /* stop if actually mlocked page */
+ }
+ }
+
+ if (mlocked)
+ goto out;
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto out;
+
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (MLOCK_PAGES && unlikely(unlock)) {
+ if (!(vma->vm_flags & VM_LOCKED))
+ continue; /* must visit all vmas */
+ ret = SWAP_MLOCK; /* leave mlocked == 0 */
+ goto out; /* no need to look further */
+ }
+ if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ if (cursor > max_nl_cursor)
+ max_nl_cursor = cursor;
+ cursor = vma->vm_end - vma->vm_start;
+ if (cursor > max_nl_size)
+ max_nl_size = cursor;
+ }
+
+ if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
+ ret = SWAP_FAIL;
+ goto out;
+ }
+
+ /*
+ * We don't try to search for this page in the nonlinear vmas,
+ * and page_referenced wouldn't have found it anyway. Instead
+ * just walk the nonlinear vmas trying to age and unmap some.
+ * The mapcount of the page we came in with is irrelevant,
+ * but even so use it as a guide to how hard we should try?
+ */
+ mapcount = page_mapcount(page);
+ if (!mapcount)
+ goto out;
+ cond_resched_lock(&mapping->i_mmap_lock);
+
+ max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
+ if (max_nl_cursor == 0)
+ max_nl_cursor = CLUSTER_SIZE;
+
+ do {
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (!MLOCK_PAGES && !migration &&
+ (vma->vm_flags & VM_LOCKED))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ while ( cursor < max_nl_cursor &&
+ cursor < vma->vm_end - vma->vm_start) {
+ ret = try_to_unmap_cluster(cursor, &mapcount,
+ vma, page);
+ if (ret == SWAP_MLOCK)
+ mlocked = 2; /* to return below */
+ cursor += CLUSTER_SIZE;
+ vma->vm_private_data = (void *) cursor;
+ if ((int)mapcount <= 0)
+ goto out;
+ }
+ vma->vm_private_data = (void *) max_nl_cursor;
+ }
+ cond_resched_lock(&mapping->i_mmap_lock);
+ max_nl_cursor += CLUSTER_SIZE;
+ } while (max_nl_cursor <= max_nl_size);
+
+ /*
+ * Don't loop forever (perhaps all the remaining pages are
+ * in locked vmas). Reset cursor on all unreserved nonlinear
+ * vmas, now forgetting on which ones it had fallen behind.
+ */
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ vma->vm_private_data = NULL;
+out:
+ spin_unlock(&mapping->i_mmap_lock);
+ if (mlocked)
+ ret = SWAP_MLOCK; /* actually mlocked the page */
+ else if (ret == SWAP_MLOCK)
+ ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
+ return ret;
+}
+
+/**
+ * try_to_unmap - try to remove every page table mapping of a page
+ * @page: the page to get unmapped
+ * @migration: migration flag
+ *
+ * Used in the pageout path to tear down all the page table entries
+ * that map @page.  The caller must hold the page lock.
+ *
+ * Return values:
+ *
+ * SWAP_SUCCESS	- all mappings were removed
+ * SWAP_AGAIN	- a mapping was missed, try again later
+ * SWAP_FAIL	- the page is unswappable
+ * SWAP_MLOCK	- the page is mlocked
+ */
+int try_to_unmap(struct page *page, int migration)
+{
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	ret = PageAnon(page) ? try_to_unmap_anon(page, 0, migration)
+			     : try_to_unmap_file(page, 0, migration);
+
+	/* anything but SWAP_MLOCK is success once no mapping is left */
+	if (ret == SWAP_MLOCK || page_mapped(page))
+		return ret;
+	return SWAP_SUCCESS;
+}
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/**
+ * try_to_munlock - try to munlock a page
+ * @page: the page to be munlocked
+ *
+ * Called from the munlock code.  Walks all the VMAs that map @page to
+ * make sure nobody else has it mlocked; if no other vma does, the page
+ * is returned with PG_mlocked cleared.  The page must be locked and
+ * must not be on an LRU list.
+ *
+ * Return values:
+ *
+ * SWAP_SUCCESS	- no vma is holding the page mlocked
+ * SWAP_AGAIN	- page mapped in an mlocked vma, but mmap_sem could not
+ *		  be acquired
+ * SWAP_MLOCK	- the page is now mlocked
+ */
+int try_to_munlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page) || PageLRU(page));
+
+	return PageAnon(page) ? try_to_unmap_anon(page, 1, 0)
+			      : try_to_unmap_file(page, 1, 0);
+}
+#endif
diff --git a/mm/shmem.c b/mm/shmem.c
new file mode 100644
index 0000000..0ed0752
--- /dev/null
+++ b/mm/shmem.c
@@ -0,0 +1,2608 @@
+/*
+ * Resizable virtual memory filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ * 2000 Transmeta Corp.
+ * 2000-2001 Christoph Rohland
+ * 2000-2001 SAP AG
+ * 2002 Red Hat Inc.
+ * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2005 VERITAS Software Corporation.
+ * Copyright (C) 2004 Andi Kleen, SuSE Labs
+ *
+ * Extended attribute support for tmpfs:
+ * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+ * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+/*
+ * This virtual memory filesystem is heavily based on the ramfs. It
+ * extends ramfs by the ability to use swap and honor resource limits
+ * which makes it a completely usable filesystem.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+#include <linux/generic_acl.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/shmem_fs.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/vfs.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/swapops.h>
+#include <linux/mempolicy.h>
+#include <linux/namei.h>
+#include <linux/ctype.h>
+#include <linux/migrate.h>
+#include <linux/highmem.h>
+#include <linux/seq_file.h>
+#include <linux/magic.h>
+
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <asm/pgtable.h>
+
+#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) /* unsigned-long-sized slots in one index page */
+#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
+#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) /* 512-byte units per page, the unit used for i_blocks below */
+
+#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) /* direct + E*E/2 + E*E*E/2 entries, E = ENTRIES_PER_PAGE */
+#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
+
+#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) /* size in bytes, rounded up, expressed in pages */
+
+/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
+#define SHMEM_PAGEIN VM_READ
+#define SHMEM_TRUNCATE VM_WRITE
+
+/* Definition to limit shmem_truncate's steps between cond_rescheds */
+#define LATENCY_LIMIT 64
+
+/* Pretend that each entry is of this size in directory's i_size */
+#define BOGO_DIRENT_SIZE 20
+
+/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+enum sgp_type {
+ SGP_READ, /* don't exceed i_size, don't allocate page (nor index page in shmem_swp_alloc) */
+ SGP_CACHE, /* don't exceed i_size, may allocate page */
+ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
+ SGP_WRITE, /* may exceed i_size, may allocate page */
+};
+
+#ifdef CONFIG_TMPFS
+static unsigned long shmem_default_max_blocks(void)
+{
+ return totalram_pages / 2; /* default block limit: half of totalram_pages */
+}
+
+static unsigned long shmem_default_max_inodes(void)
+{
+ return min(totalram_pages - totalhigh_pages, totalram_pages / 2); /* half of totalram_pages, capped at totalram_pages minus totalhigh_pages */
+}
+#endif
+
+static int shmem_getpage(struct inode *inode, unsigned long idx,
+ struct page **pagep, enum sgp_type sgp, int *type);
+
+static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
+{
+ /*
+ * The above definition of ENTRIES_PER_PAGE, and the use of
+ * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
+ * might be reconsidered if it ever diverges from PAGE_SIZE.
+ *
+ * Mobility flags are masked out as swap vectors cannot move
+ */
+ return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, /* zeroed: shmem_swp_entry wants index pages preset to all zeros */
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+}
+
+static inline void shmem_dir_free(struct page *page)
+{
+ __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); /* same order as used by shmem_dir_alloc */
+}
+
+static struct page **shmem_dir_map(struct page *page)
+{
+ return (struct page **)kmap_atomic(page, KM_USER0); /* directory pages are viewed as arrays of page pointers; undo with shmem_dir_unmap */
+}
+
+static inline void shmem_dir_unmap(struct page **dir)
+{
+ kunmap_atomic(dir, KM_USER0); /* same KM_USER0 slot as shmem_dir_map */
+}
+
+static swp_entry_t *shmem_swp_map(struct page *page)
+{
+ return (swp_entry_t *)kmap_atomic(page, KM_USER1); /* KM_USER1, distinct from shmem_dir_map's KM_USER0; undo with shmem_swp_unmap */
+}
+
+static inline void shmem_swp_balance_unmap(void)
+{
+ /*
+ * When passing a pointer to an i_direct entry, to code which
+ * also handles indirect entries and so will shmem_swp_unmap,
+ * we must arrange for the preempt count to remain in balance.
+ * What kmap_atomic of a lowmem page does depends on config
+ * and architecture, so pretend to kmap_atomic some lowmem page.
+ */
+ (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); /* mapping address is deliberately discarded */
+}
+
+static inline void shmem_swp_unmap(swp_entry_t *entry)
+{
+ kunmap_atomic(entry, KM_USER1); /* pairs with shmem_swp_map or shmem_swp_balance_unmap */
+}
+
+static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
+{
+ return sb->s_fs_info; /* s_fs_info is used as the shmem_sb_info pointer */
+}
+
+/*
+ * shmem_file_setup pre-accounts the whole fixed size of a VM object,
+ * for shared memory and for shared anonymous (/dev/zero) mappings
+ * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
+ * consistent with the pre-accounting of private mappings ...
+ */
+static inline int shmem_acct_size(unsigned long flags, loff_t size)
+{
+ return (flags & VM_ACCOUNT) ? /* only VM_ACCOUNT objects are charged for their whole size; others return 0 */
+ security_vm_enough_memory_kern(VM_ACCT(size)) : 0;
+}
+
+static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+{
+ if (flags & VM_ACCOUNT) /* mirror of shmem_acct_size: only VM_ACCOUNT objects were charged */
+ vm_unacct_memory(VM_ACCT(size));
+}
+
+/*
+ * ... whereas tmpfs objects are accounted incrementally as
+ * pages are allocated, in order to allow huge sparse files.
+ * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
+ */
+static inline int shmem_acct_block(unsigned long flags)
+{
+ return (flags & VM_ACCOUNT) ? /* VM_ACCOUNT objects return 0 here: they go through shmem_acct_size instead */
+ 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE));
+}
+
+static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+{
+ if (!(flags & VM_ACCOUNT)) /* mirror of shmem_acct_block: only non-VM_ACCOUNT objects are charged per page */
+ vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+}
+
+static const struct super_operations shmem_ops;
+static const struct address_space_operations shmem_aops;
+static const struct file_operations shmem_file_operations;
+static const struct inode_operations shmem_inode_operations;
+static const struct inode_operations shmem_dir_inode_operations;
+static const struct inode_operations shmem_special_inode_operations;
+static struct vm_operations_struct shmem_vm_ops;
+
+static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
+ .ra_pages = 0, /* No readahead */
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, /* these keep regular writeback/sync from calling shmem_writepage (see comment there) */
+ .unplug_io_fn = default_unplug_io_fn,
+};
+
+static LIST_HEAD(shmem_swaplist); /* inodes added by shmem_writepage when a page goes to swap; walked by shmem_unuse */
+static DEFINE_MUTEX(shmem_swaplist_mutex); /* held around shmem_swaplist walks and modifications */
+
+static void shmem_free_blocks(struct inode *inode, long pages)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ if (sbinfo->max_blocks) { /* block accounting is kept only when max_blocks is non-zero */
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_blocks += pages;
+ inode->i_blocks -= pages*BLOCKS_PER_PAGE; /* i_blocks is counted in 512-byte units */
+ spin_unlock(&sbinfo->stat_lock);
+ }
+}
+
+static int shmem_reserve_inode(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) { /* inode accounting is kept only when max_inodes is non-zero */
+ spin_lock(&sbinfo->stat_lock);
+ if (!sbinfo->free_inodes) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC; /* no free inodes left on this superblock */
+ }
+ sbinfo->free_inodes--;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+ return 0; /* reserved, or no limit to check */
+}
+
+static void shmem_free_inode(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) { /* counterpart of shmem_reserve_inode: same max_inodes condition */
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_inodes++;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+}
+
+/**
+ * shmem_recalc_inode - recalculate the block usage of an inode
+ * @inode: inode to recalc
+ *
+ * We have to calculate the free blocks since the mm can drop
+ * undirtied hole pages behind our back.
+ *
+ * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
+ * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * It has to be called with the info->lock spinlock held.
+ */
+static void shmem_recalc_inode(struct inode *inode)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ long freed;
+
+ freed = info->alloced - info->swapped - inode->i_mapping->nrpages; /* pages accounted but neither in page cache nor in swap */
+ if (freed > 0) {
+ info->alloced -= freed;
+ shmem_unacct_blocks(info->flags, freed); /* undo the per-page memory charge */
+ shmem_free_blocks(inode, freed); /* and return the blocks to the superblock */
+ }
+}
+
+/**
+ * shmem_swp_entry - find the swap vector position in the info structure
+ * @info: info structure for the inode
+ * @index: index of the page to find
+ * @page: optional page to add to the structure. Has to be preset to
+ * all zeros
+ *
+ * If there is no space allocated yet it will return NULL when
+ * page is NULL, else it will use the page for the needed block,
+ * setting it to NULL on return to indicate that it has been used.
+ *
+ * The swap vector is organized the following way:
+ *
+ * There are SHMEM_NR_DIRECT entries directly stored in the
+ * shmem_inode_info structure. So small files do not need an additional
+ * allocation.
+ *
+ * For pages with index >= SHMEM_NR_DIRECT there is the pointer
+ * i_indirect which points to a page which holds in the first half
+ * doubly indirect blocks, in the second half triple indirect blocks:
+ *
+ * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
+ * following layout (for SHMEM_NR_DIRECT == 16):
+ *
+ * i_indirect -> dir --> 16-19
+ * | +-> 20-23
+ * |
+ * +-->dir2 --> 24-27
+ * | +-> 28-31
+ * | +-> 32-35
+ * | +-> 36-39
+ * |
+ * +-->dir3 --> 40-43
+ * +-> 44-47
+ * +-> 48-51
+ * +-> 52-55
+ */
+static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
+{
+ unsigned long offset;
+ struct page **dir;
+ struct page *subdir;
+
+ if (index < SHMEM_NR_DIRECT) {
+ shmem_swp_balance_unmap(); /* no real mapping needed, but callers will shmem_swp_unmap the result */
+ return info->i_direct+index;
+ }
+ if (!info->i_indirect) {
+ if (page) {
+ info->i_indirect = *page; /* consume the caller's page as the top directory */
+ *page = NULL;
+ }
+ return NULL; /* need another page */
+ }
+
+ index -= SHMEM_NR_DIRECT;
+ offset = index % ENTRIES_PER_PAGE; /* slot within the final swap-entry page */
+ index /= ENTRIES_PER_PAGE; /* which swap-entry page, counted from the first indirect one */
+ dir = shmem_dir_map(info->i_indirect);
+
+ if (index >= ENTRIES_PER_PAGE/2) { /* second half of i_indirect: one more directory level to walk */
+ index -= ENTRIES_PER_PAGE/2;
+ dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
+ index %= ENTRIES_PER_PAGE;
+ subdir = *dir;
+ if (!subdir) {
+ if (page) {
+ *dir = *page; /* consume the caller's page as the middle directory */
+ *page = NULL;
+ }
+ shmem_dir_unmap(dir);
+ return NULL; /* need another page */
+ }
+ shmem_dir_unmap(dir);
+ dir = shmem_dir_map(subdir);
+ }
+
+ dir += index;
+ subdir = *dir;
+ if (!subdir) {
+ if (!page || !(subdir = *page)) { /* no page offered, or the offered page was already used above */
+ shmem_dir_unmap(dir);
+ return NULL; /* need a page */
+ }
+ *dir = subdir;
+ *page = NULL;
+ }
+ shmem_dir_unmap(dir);
+ return shmem_swp_map(subdir) + offset; /* returned mapped: caller must shmem_swp_unmap it */
+}
+
+static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
+{
+ long incdec = value? 1: -1; /* storing a swap value counts +1, clearing to 0 counts -1 */
+
+ entry->val = value;
+ info->swapped += incdec;
+ if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { /* entry is not one of the i_direct slots */
+ struct page *page = kmap_atomic_to_page(entry);
+ set_page_private(page, page_private(page) + incdec); /* page_private of an index page tracks its number of swap entries */
+ }
+}
+
+/**
+ * shmem_swp_alloc - get the position of the swap entry for the page.
+ * @info: info structure for the inode
+ * @index: index of the page to find
+ * @sgp: check and recheck i_size? skip allocation?
+ *
+ * If the entry does not exist, allocate it. Returns an ERR_PTR on failure.
+ */
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
+{
+ struct inode *inode = &info->vfs_inode;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct page *page = NULL;
+ swp_entry_t *entry;
+
+ if (sgp != SGP_WRITE &&
+ ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ return ERR_PTR(-EINVAL); /* only SGP_WRITE may go beyond i_size */
+
+ while (!(entry = shmem_swp_entry(info, index, &page))) { /* NULL means another index page is needed */
+ if (sgp == SGP_READ)
+ return shmem_swp_map(ZERO_PAGE(0)); /* hand back a pointer into the mapped ZERO_PAGE instead of allocating */
+ /*
+ * Test free_blocks against 1 not 0, since we have 1 data
+ * page (and perhaps indirect index pages) yet to allocate:
+ * a waste to allocate index if we cannot allocate data.
+ */
+ if (sbinfo->max_blocks) {
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_blocks <= 1) {
+ spin_unlock(&sbinfo->stat_lock);
+ return ERR_PTR(-ENOSPC);
+ }
+ sbinfo->free_blocks--; /* the index page itself is charged as one block */
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+
+ spin_unlock(&info->lock); /* info->lock is dropped around the page allocation */
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
+ if (page)
+ set_page_private(page, 0); /* new index page holds no swap entries yet */
+ spin_lock(&info->lock);
+
+ if (!page) {
+ shmem_free_blocks(inode, 1); /* undo the block charge taken above */
+ return ERR_PTR(-ENOMEM);
+ }
+ if (sgp != SGP_WRITE &&
+ ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { /* recheck i_size after retaking the lock */
+ entry = ERR_PTR(-EINVAL);
+ break;
+ }
+ if (info->next_index <= index)
+ info->next_index = index + 1;
+ }
+ if (page) {
+ /* another task gave its page, or truncated the file */
+ shmem_free_blocks(inode, 1);
+ shmem_dir_free(page);
+ }
+ if (info->next_index <= index && !IS_ERR(entry))
+ info->next_index = index + 1; /* keep next_index above the highest index handed out */
+ return entry;
+}
+
+/**
+ * shmem_free_swp - free some swap entries in a directory
+ * @dir: pointer to the directory
+ * @edir: pointer after last entry of the directory
+ * @punch_lock: pointer to spinlock when needed for the holepunch case
+ *
+ * Returns the number of swap entries freed.
+ */
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
+ spinlock_t *punch_lock)
+{
+ spinlock_t *punch_unlock = NULL;
+ swp_entry_t *ptr;
+ int freed = 0;
+
+ for (ptr = dir; ptr < edir; ptr++) {
+ if (ptr->val) {
+ if (unlikely(punch_lock)) { /* first in-use entry seen: take the lock once, keep it until return */
+ punch_unlock = punch_lock;
+ punch_lock = NULL;
+ spin_lock(punch_unlock);
+ if (!ptr->val) /* entry was cleared before we got the lock */
+ continue;
+ }
+ free_swap_and_cache(*ptr);
+ *ptr = (swp_entry_t){0};
+ freed++;
+ }
+ }
+ if (punch_unlock)
+ spin_unlock(punch_unlock);
+ return freed;
+}
+
+static int shmem_map_and_free_swp(struct page *subdir, int offset,
+ int limit, struct page ***dir, spinlock_t *punch_lock)
+{
+ swp_entry_t *ptr;
+ int freed = 0; /* total entries freed in subdir[offset..limit), returned to the caller */
+
+ ptr = shmem_swp_map(subdir);
+ for (; offset < limit; offset += LATENCY_LIMIT) { /* work in chunks of at most LATENCY_LIMIT entries */
+ int size = limit - offset;
+ if (size > LATENCY_LIMIT)
+ size = LATENCY_LIMIT;
+ freed += shmem_free_swp(ptr+offset, ptr+offset+size,
+ punch_lock);
+ if (need_resched()) {
+ shmem_swp_unmap(ptr); /* release both atomic mappings before cond_resched */
+ if (*dir) {
+ shmem_dir_unmap(*dir);
+ *dir = NULL; /* tells the caller it must remap its directory */
+ }
+ cond_resched();
+ ptr = shmem_swp_map(subdir);
+ }
+ }
+ shmem_swp_unmap(ptr);
+ return freed;
+}
+
+static void shmem_free_pages(struct list_head *next)
+{
+ struct page *page;
+ int freed = 0; /* pages freed since the last cond_resched */
+
+ do {
+ page = container_of(next, struct page, lru);
+ next = next->next; /* step on before the page holding this link is freed */
+ shmem_dir_free(page);
+ freed++;
+ if (freed >= LATENCY_LIMIT) {
+ cond_resched();
+ freed = 0;
+ }
+ } while (next); /* shmem_truncate_range NULL-terminates the list before calling */
+}
+
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned long idx;
+ unsigned long size;
+ unsigned long limit;
+ unsigned long stage;
+ unsigned long diroff;
+ struct page **dir;
+ struct page *topdir;
+ struct page *middir;
+ struct page *subdir;
+ swp_entry_t *ptr;
+ LIST_HEAD(pages_to_free); /* index pages unhooked below, freed at the very end */
+ long nr_pages_to_free = 0;
+ long nr_swaps_freed = 0;
+ int offset;
+ int freed;
+ int punch_hole;
+ spinlock_t *needs_lock;
+ spinlock_t *punch_lock;
+ unsigned long upper_limit;
+
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* start rounded up to a page index */
+ if (idx >= info->next_index)
+ return; /* range begins at or beyond next_index */
+
+ spin_lock(&info->lock);
+ info->flags |= SHMEM_TRUNCATE;
+ if (likely(end == (loff_t) -1)) { /* end == -1: plain truncation, as called by shmem_truncate */
+ limit = info->next_index;
+ upper_limit = SHMEM_MAX_INDEX;
+ info->next_index = idx;
+ needs_lock = NULL;
+ punch_hole = 0;
+ } else {
+ if (end + 1 >= inode->i_size) { /* we may free a little more */
+ limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ upper_limit = SHMEM_MAX_INDEX;
+ } else {
+ limit = (end + 1) >> PAGE_CACHE_SHIFT;
+ upper_limit = limit;
+ }
+ needs_lock = &info->lock;
+ punch_hole = 1;
+ }
+
+ topdir = info->i_indirect;
+ if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { /* truncating at or below the first indirect index: top directory goes too */
+ info->i_indirect = NULL;
+ nr_pages_to_free++;
+ list_add(&topdir->lru, &pages_to_free);
+ }
+ spin_unlock(&info->lock);
+
+ if (info->swapped && idx < SHMEM_NR_DIRECT) { /* free swap held in the i_direct entries */
+ ptr = info->i_direct;
+ size = limit;
+ if (size > SHMEM_NR_DIRECT)
+ size = SHMEM_NR_DIRECT;
+ nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
+ }
+
+ /*
+ * If there are no indirect blocks or we are punching a hole
+ * below indirect blocks, nothing to be done.
+ */
+ if (!topdir || limit <= SHMEM_NR_DIRECT)
+ goto done2;
+
+ /*
+ * The truncation case has already dropped info->lock, and we're safe
+ * because i_size and next_index have already been lowered, preventing
+ * access beyond. But in the punch_hole case, we still need to take
+ * the lock when updating the swap directory, because there might be
+ * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
+ * shmem_writepage. However, whenever we find we can remove a whole
+ * directory page (not at the misaligned start or end of the range),
+ * we first NULLify its pointer in the level above, and then have no
+ * need to take the lock when updating its contents: needs_lock and
+ * punch_lock (either pointing to info->lock or NULL) manage this.
+ */
+
+ upper_limit -= SHMEM_NR_DIRECT; /* from here on, indices are relative to the first indirect entry */
+ limit -= SHMEM_NR_DIRECT;
+ idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
+ offset = idx % ENTRIES_PER_PAGE;
+ idx -= offset; /* idx now names the first entry of a swap-entry page, offset the slot within it */
+
+ dir = shmem_dir_map(topdir);
+ stage = ENTRIES_PER_PAGEPAGE/2;
+ if (idx < ENTRIES_PER_PAGEPAGE/2) { /* still in the first half of topdir */
+ middir = topdir;
+ diroff = idx/ENTRIES_PER_PAGE;
+ } else {
+ dir += ENTRIES_PER_PAGE/2;
+ dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
+ while (stage <= idx)
+ stage += ENTRIES_PER_PAGEPAGE;
+ middir = *dir;
+ if (*dir) {
+ diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
+ ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
+ if (!diroff && !offset && upper_limit >= stage) { /* whole middle directory lies inside the range */
+ if (needs_lock) {
+ spin_lock(needs_lock);
+ *dir = NULL;
+ spin_unlock(needs_lock);
+ needs_lock = NULL;
+ } else
+ *dir = NULL;
+ nr_pages_to_free++;
+ list_add(&middir->lru, &pages_to_free);
+ }
+ shmem_dir_unmap(dir);
+ dir = shmem_dir_map(middir);
+ } else {
+ diroff = 0;
+ offset = 0;
+ idx = stage; /* nothing under this slot: resume at the next stage boundary */
+ }
+ }
+
+ for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
+ if (unlikely(idx == stage)) { /* crossed into the next middle directory */
+ shmem_dir_unmap(dir);
+ dir = shmem_dir_map(topdir) +
+ ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+ while (!*dir) { /* skip empty middle-directory slots */
+ dir++;
+ idx += ENTRIES_PER_PAGEPAGE;
+ if (idx >= limit)
+ goto done1;
+ }
+ stage = idx + ENTRIES_PER_PAGEPAGE;
+ middir = *dir;
+ if (punch_hole)
+ needs_lock = &info->lock;
+ if (upper_limit >= stage) { /* whole middle directory lies inside the range */
+ if (needs_lock) {
+ spin_lock(needs_lock);
+ *dir = NULL;
+ spin_unlock(needs_lock);
+ needs_lock = NULL;
+ } else
+ *dir = NULL;
+ nr_pages_to_free++;
+ list_add(&middir->lru, &pages_to_free);
+ }
+ shmem_dir_unmap(dir);
+ cond_resched();
+ dir = shmem_dir_map(middir);
+ diroff = 0;
+ }
+ punch_lock = needs_lock;
+ subdir = dir[diroff];
+ if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { /* whole swap-entry page lies inside the range */
+ if (needs_lock) {
+ spin_lock(needs_lock);
+ dir[diroff] = NULL;
+ spin_unlock(needs_lock);
+ punch_lock = NULL;
+ } else
+ dir[diroff] = NULL;
+ nr_pages_to_free++;
+ list_add(&subdir->lru, &pages_to_free);
+ }
+ if (subdir && page_private(subdir) /* has swap entries */) {
+ size = limit - idx;
+ if (size > ENTRIES_PER_PAGE)
+ size = ENTRIES_PER_PAGE;
+ freed = shmem_map_and_free_swp(subdir,
+ offset, size, &dir, punch_lock);
+ if (!dir) /* shmem_map_and_free_swp unmapped it in order to reschedule */
+ dir = shmem_dir_map(middir);
+ nr_swaps_freed += freed;
+ if (offset || punch_lock) { /* page stays in use: adjust its swap-entry count under the lock */
+ spin_lock(&info->lock);
+ set_page_private(subdir,
+ page_private(subdir) - freed);
+ spin_unlock(&info->lock);
+ } else
+ BUG_ON(page_private(subdir) != freed);
+ }
+ offset = 0; /* only the first swap-entry page can start mid-page */
+ }
+done1:
+ shmem_dir_unmap(dir);
+done2:
+ if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
+ /*
+ * Call truncate_inode_pages again: racing shmem_unuse_inode
+ * may have swizzled a page in from swap since vmtruncate or
+ * generic_delete_inode did it, before we lowered next_index.
+ * Also, though shmem_getpage checks i_size before adding to
+ * cache, no recheck after: so fix the narrow window there too.
+ *
+ * Recalling truncate_inode_pages_range and unmap_mapping_range
+ * every time for punch_hole (which never got a chance to clear
+ * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
+ * yet hardly ever necessary: try to optimize them out later.
+ */
+ truncate_inode_pages_range(inode->i_mapping, start, end);
+ if (punch_hole)
+ unmap_mapping_range(inode->i_mapping, start,
+ end - start, 1);
+ }
+
+ spin_lock(&info->lock);
+ info->flags &= ~SHMEM_TRUNCATE;
+ info->swapped -= nr_swaps_freed;
+ if (nr_pages_to_free)
+ shmem_free_blocks(inode, nr_pages_to_free); /* index pages were charged as blocks in shmem_swp_alloc */
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
+ /*
+ * Empty swap vector directory pages to be freed?
+ */
+ if (!list_empty(&pages_to_free)) {
+ pages_to_free.prev->next = NULL; /* NULL-terminate: shmem_free_pages walks until next is NULL */
+ shmem_free_pages(pages_to_free.next);
+ }
+}
+
+static void shmem_truncate(struct inode *inode)
+{
+ shmem_truncate_range(inode, inode->i_size, (loff_t)-1); /* end of -1 selects the plain-truncation path, not hole punching */
+}
+
+static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct page *page = NULL; /* partial last page pinned across the truncation, if any */
+ int error;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ if (attr->ia_size < inode->i_size) { /* regular file being shrunk */
+ /*
+ * If truncating down to a partial page, then
+ * if that page is already allocated, hold it
+ * in memory until the truncation is over, so
+ * truncate_partial_page cannot miss it were
+ * it assigned to swap.
+ */
+ if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
+ (void) shmem_getpage(inode,
+ attr->ia_size>>PAGE_CACHE_SHIFT,
+ &page, SGP_READ, NULL); /* failure is ignored: page then stays NULL */
+ if (page)
+ unlock_page(page); /* keep the reference, dropped at the end of this function */
+ }
+ /*
+ * Reset SHMEM_PAGEIN flag so that shmem_truncate can
+ * detect if any pages might have been added to cache
+ * after truncate_inode_pages. But we needn't bother
+ * if it's being fully truncated to zero-length: the
+ * nrpages check is efficient enough in that case.
+ */
+ if (attr->ia_size) {
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ spin_lock(&info->lock);
+ info->flags &= ~SHMEM_PAGEIN;
+ spin_unlock(&info->lock);
+ }
+ }
+ }
+
+ error = inode_change_ok(inode, attr);
+ if (!error)
+ error = inode_setattr(inode, attr);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ if (!error && (attr->ia_valid & ATTR_MODE))
+ error = generic_acl_chmod(inode, &shmem_acl_ops);
+#endif
+ if (page)
+ page_cache_release(page); /* drop the reference taken by shmem_getpage above */
+ return error;
+}
+
+static void shmem_delete_inode(struct inode *inode)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if (inode->i_op->truncate == shmem_truncate) { /* only inodes whose i_op uses shmem_truncate carry pages/swap to release */
+ truncate_inode_pages(inode->i_mapping, 0);
+ shmem_unacct_size(info->flags, inode->i_size); /* undo shmem_acct_size pre-accounting, if any */
+ inode->i_size = 0;
+ shmem_truncate(inode); /* with i_size 0 this frees every swap entry and index page */
+ if (!list_empty(&info->swaplist)) { /* checked without the mutex; see the note in shmem_unuse_inode */
+ mutex_lock(&shmem_swaplist_mutex);
+ list_del_init(&info->swaplist);
+ mutex_unlock(&shmem_swaplist_mutex);
+ }
+ }
+ BUG_ON(inode->i_blocks); /* every block must have been given back by now */
+ shmem_free_inode(inode->i_sb);
+ clear_inode(inode);
+}
+
+static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
+{
+ swp_entry_t *ptr;
+
+ for (ptr = dir; ptr < edir; ptr++) { /* linear scan of [dir, edir) */
+ if (ptr->val == entry.val)
+ return ptr - dir; /* offset of the first match from dir */
+ }
+ return -1; /* entry is not in this range */
+}
+
+static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
+{
+ struct inode *inode;
+ unsigned long idx;
+ unsigned long size;
+ unsigned long limit;
+ unsigned long stage;
+ struct page **dir;
+ struct page *subdir;
+ swp_entry_t *ptr;
+ int offset;
+ int error;
+
+ idx = 0;
+ ptr = info->i_direct;
+ spin_lock(&info->lock);
+ if (!info->swapped) { /* inode holds no swap entries any more: take it off shmem_swaplist */
+ list_del_init(&info->swaplist);
+ goto lost2;
+ }
+ limit = info->next_index;
+ size = limit;
+ if (size > SHMEM_NR_DIRECT)
+ size = SHMEM_NR_DIRECT;
+ offset = shmem_find_swp(entry, ptr, ptr+size); /* search the i_direct entries first */
+ if (offset >= 0)
+ goto found;
+ if (!info->i_indirect)
+ goto lost2;
+
+ dir = shmem_dir_map(info->i_indirect);
+ stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; /* first index reached through a middle directory */
+
+ for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
+ if (unlikely(idx == stage)) {
+ shmem_dir_unmap(dir-1); /* dir was already advanced by the loop increment */
+ if (cond_resched_lock(&info->lock)) {
+ /* check it has not been truncated */
+ if (limit > info->next_index) {
+ limit = info->next_index;
+ if (idx >= limit)
+ goto lost2;
+ }
+ }
+ dir = shmem_dir_map(info->i_indirect) +
+ ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+ while (!*dir) { /* skip empty middle-directory slots */
+ dir++;
+ idx += ENTRIES_PER_PAGEPAGE;
+ if (idx >= limit)
+ goto lost1;
+ }
+ stage = idx + ENTRIES_PER_PAGEPAGE;
+ subdir = *dir;
+ shmem_dir_unmap(dir);
+ dir = shmem_dir_map(subdir);
+ }
+ subdir = *dir;
+ if (subdir && page_private(subdir)) { /* only pages that hold swap entries are searched */
+ ptr = shmem_swp_map(subdir);
+ size = limit - idx;
+ if (size > ENTRIES_PER_PAGE)
+ size = ENTRIES_PER_PAGE;
+ offset = shmem_find_swp(entry, ptr, ptr+size);
+ shmem_swp_unmap(ptr);
+ if (offset >= 0) {
+ shmem_dir_unmap(dir);
+ goto found;
+ }
+ }
+ }
+lost1:
+ shmem_dir_unmap(dir-1);
+lost2:
+ spin_unlock(&info->lock);
+ return 0; /* not found here; shmem_swaplist_mutex is still held for the caller */
+found:
+ idx += offset;
+ inode = igrab(&info->vfs_inode); /* may yield NULL, which is handled below */
+ spin_unlock(&info->lock);
+
+ /*
+ * Move _head_ to start search for next from here.
+ * But be careful: shmem_delete_inode checks list_empty without taking
+ * mutex, and there's an instant in list_move_tail when info->swaplist
+ * would appear empty, if it were the only one on shmem_swaplist. We
+ * could avoid doing it if inode NULL; or use this minor optimization.
+ */
+ if (shmem_swaplist.next != &info->swaplist)
+ list_move_tail(&shmem_swaplist, &info->swaplist);
+ mutex_unlock(&shmem_swaplist_mutex); /* the mutex taken in shmem_unuse is released on this path */
+
+ error = 1;
+ if (!inode)
+ goto out;
+ /* Precharge page using GFP_KERNEL while we can wait */
+ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+ if (error)
+ goto out;
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ goto out;
+ }
+ error = 1;
+
+ spin_lock(&info->lock);
+ ptr = shmem_swp_entry(info, idx, NULL);
+ if (ptr && ptr->val == entry.val) { /* recheck: info->lock was dropped since the search */
+ error = add_to_page_cache_locked(page, inode->i_mapping,
+ idx, GFP_NOWAIT);
+ /* does mem_cgroup_uncharge_cache_page on error */
+ } else /* we must compensate for our precharge above */
+ mem_cgroup_uncharge_cache_page(page);
+
+ if (error == -EEXIST) {
+ struct page *filepage = find_get_page(inode->i_mapping, idx);
+ error = 1;
+ if (filepage) {
+ /*
+ * There might be a more uptodate page coming down
+ * from a stacked writepage: forget our swappage if so.
+ */
+ if (PageUptodate(filepage))
+ error = 0;
+ page_cache_release(filepage);
+ }
+ }
+ if (!error) {
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ info->flags |= SHMEM_PAGEIN;
+ shmem_swp_set(info, ptr, 0); /* clear the swap entry and adjust the swapped counts */
+ swap_free(entry);
+ error = 1; /* not an error, but entry was found */
+ }
+ if (ptr)
+ shmem_swp_unmap(ptr);
+ spin_unlock(&info->lock);
+ radix_tree_preload_end();
+out:
+ unlock_page(page);
+ page_cache_release(page);
+ iput(inode); /* allows for NULL */
+ return error;
+}
+
+/*
+ * shmem_unuse() searches the inodes on shmem_swaplist for a swapped out shmem page.
+ */
+int shmem_unuse(swp_entry_t entry, struct page *page)
+{
+ struct list_head *p, *next;
+ struct shmem_inode_info *info;
+ int found = 0;
+
+ mutex_lock(&shmem_swaplist_mutex);
+ list_for_each_safe(p, next, &shmem_swaplist) { /* _safe: shmem_unuse_inode may unlink the current entry */
+ info = list_entry(p, struct shmem_inode_info, swaplist);
+ found = shmem_unuse_inode(info, entry, page);
+ cond_resched();
+ if (found)
+ goto out; /* shmem_unuse_inode has already dropped the mutex when it returns non-zero */
+ }
+ mutex_unlock(&shmem_swaplist_mutex);
+out: return found; /* 0 or 1 or -ENOMEM */
+}
+
+/*
+ * Move the page from the page cache to the swap cache.
+ */
+static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct shmem_inode_info *info;
+ swp_entry_t *entry, swap;
+ struct address_space *mapping;
+ unsigned long index;
+ struct inode *inode;
+
+ BUG_ON(!PageLocked(page));
+ mapping = page->mapping;
+ index = page->index;
+ inode = mapping->host;
+ info = SHMEM_I(inode);
+ if (info->flags & VM_LOCKED) /* inode flagged VM_LOCKED: never sent to swap */
+ goto redirty;
+ if (!total_swap_pages) /* no swap pages at all */
+ goto redirty;
+
+ /*
+ * shmem_backing_dev_info's capabilities prevent regular writeback or
+ * sync from ever calling shmem_writepage; but a stacking filesystem
+ * may use the ->writepage of its underlying filesystem, in which case
+ * tmpfs should write out to swap only in response to memory pressure,
+ * and not for pdflush or sync. However, in those cases, we do still
+ * want to check if there's a redundant swappage to be discarded.
+ */
+ if (wbc->for_reclaim)
+ swap = get_swap_page();
+ else
+ swap.val = 0; /* 0 means "no swap slot": the page will only be redirtied */
+
+ spin_lock(&info->lock);
+ if (index >= info->next_index) {
+ BUG_ON(!(info->flags & SHMEM_TRUNCATE)); /* only legitimate while a truncation is in progress */
+ goto unlock;
+ }
+ entry = shmem_swp_entry(info, index, NULL); /* used without a NULL check: an entry is expected for index < next_index */
+ if (entry->val) {
+ /*
+ * The more uptodate page coming down from a stacked
+ * writepage should replace our old swappage.
+ */
+ free_swap_and_cache(*entry);
+ shmem_swp_set(info, entry, 0);
+ }
+ shmem_recalc_inode(inode);
+
+ if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ remove_from_page_cache(page);
+ shmem_swp_set(info, entry, swap.val); /* record where the page went */
+ shmem_swp_unmap(entry);
+ if (list_empty(&info->swaplist)) /* not yet on shmem_swaplist: pin the inode so it can be added below */
+ inode = igrab(inode);
+ else
+ inode = NULL;
+ spin_unlock(&info->lock);
+ swap_duplicate(swap);
+ BUG_ON(page_mapped(page));
+ page_cache_release(page); /* pagecache ref */
+ set_page_dirty(page);
+ unlock_page(page);
+ if (inode) {
+ mutex_lock(&shmem_swaplist_mutex);
+ /* move instead of add in case we're racing */
+ list_move_tail(&info->swaplist, &shmem_swaplist);
+ mutex_unlock(&shmem_swaplist_mutex);
+ iput(inode);
+ }
+ return 0;
+ }
+
+ shmem_swp_unmap(entry);
+unlock:
+ spin_unlock(&info->lock);
+ swap_free(swap); /* give back the swap slot that was not used */
+redirty:
+ set_page_dirty(page);
+ if (wbc->for_reclaim)
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
+ unlock_page(page);
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+#ifdef CONFIG_TMPFS
+static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+ char buffer[64]; /* receives the policy text written by mpol_to_str */
+
+ if (!mpol || mpol->mode == MPOL_DEFAULT)
+ return; /* show nothing */
+
+ mpol_to_str(buffer, sizeof(buffer), mpol, 1);
+
+ seq_printf(seq, ",mpol=%s", buffer); /* appended in mount-option form */
+}
+
+static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ struct mempolicy *mpol = NULL; /* returned as NULL when the superblock has no policy */
+ if (sbinfo->mpol) {
+ spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ mpol = sbinfo->mpol; /* re-read under the lock */
+ mpol_get(mpol); /* presumably takes a reference for the caller -- confirm against mempolicy.h */
+ spin_unlock(&sbinfo->stat_lock);
+ }
+ return mpol;
+}
+#endif /* CONFIG_TMPFS */
+
+static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
+{
+ struct mempolicy mpol, *spol; /* mpol is on-stack storage handed to mpol_cond_copy */
+ struct vm_area_struct pvma; /* only the four fields set below are initialised */
+ struct page *page;
+
+ spol = mpol_cond_copy(&mpol,
+ mpol_shared_policy_lookup(&info->policy, idx));
+
+ /* Create a pseudo vma that just contains the policy */
+ pvma.vm_start = 0;
+ pvma.vm_pgoff = idx;
+ pvma.vm_ops = NULL;
+ pvma.vm_policy = spol;
+ page = swapin_readahead(entry, gfp, &pvma, 0);
+ return page;
+}
+
+static struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
+{
+ struct vm_area_struct pvma; /* only the four fields set below are initialised */
+
+ /* Create a pseudo vma that just contains the policy */
+ pvma.vm_start = 0;
+ pvma.vm_pgoff = idx;
+ pvma.vm_ops = NULL;
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); /* shared policy for this index of the inode */
+
+ /*
+ * alloc_page_vma() will drop the shared policy reference
+ */
+ return alloc_page_vma(gfp, &pvma, 0);
+}
+#else /* !CONFIG_NUMA */
+#ifdef CONFIG_TMPFS
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
+{ /* !CONFIG_NUMA stub: nothing is shown */
+}
+#endif /* CONFIG_TMPFS */
+
+static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
+{
+ return swapin_readahead(entry, gfp, NULL, 0); /* !CONFIG_NUMA: no pseudo vma; info and idx are unused */
+}
+
+static inline struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
+{
+ return alloc_page(gfp); /* !CONFIG_NUMA: plain allocation; info and idx are unused */
+}
+#endif /* CONFIG_NUMA */
+
+#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ return NULL; /* stub used when CONFIG_NUMA or CONFIG_TMPFS is off: no superblock policy */
+}
+#endif
+
+/*
+ * shmem_getpage - either get the page from swap or allocate a new one
+ *
+ * If we allocate a new one we do not mark it dirty. That's up to the
+ * vm. If we swap it in we mark it dirty since we also free the swap
+ * entry since a page cannot live in both the swap and page cache
+ *
+ * @inode: inode whose mapping is searched and, if needed, populated
+ * @idx:   page index in the file; -EFBIG is returned if >= SHMEM_MAX_INDEX
+ * @pagep: on entry NULL, or a page supplied by the caller (see the comment
+ *         above "repeat:"); on success receives the resulting page
+ * @sgp:   lookup mode, passed on to shmem_swp_alloc(); with SGP_READ a hole
+ *         is not allocated and *pagep may come back NULL; with SGP_DIRTY a
+ *         freshly allocated page is also marked dirty
+ * @type:  optional; zeroed on entry, VM_FAULT_MAJOR is or'ed in when the
+ *         page has to be read in from swap
+ *
+ * Returns 0 on success or a negative errno. On failure a page that was
+ * looked up here (i.e. not the one the caller passed in) is unlocked and
+ * released before returning. The callers in this file unlock the page
+ * they get back (or report it as VM_FAULT_LOCKED).
+ */
+static int shmem_getpage(struct inode *inode, unsigned long idx,
+ struct page **pagep, enum sgp_type sgp, int *type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo;
+ struct page *filepage = *pagep;
+ struct page *swappage;
+ swp_entry_t *entry;
+ swp_entry_t swap;
+ gfp_t gfp;
+ int error;
+
+ if (idx >= SHMEM_MAX_INDEX)
+ return -EFBIG;
+
+ if (type)
+ *type = 0;
+
+ /*
+ * Normally, filepage is NULL on entry, and either found
+ * uptodate immediately, or allocated and zeroed, or read
+ * in under swappage, which is then assigned to filepage.
+ * But shmem_readpage (required for splice) passes in a locked
+ * filepage, which may be found not uptodate by other callers
+ * too, and may need to be copied from the swappage read in.
+ */
+repeat:
+ if (!filepage)
+ filepage = find_lock_page(mapping, idx);
+ if (filepage && PageUptodate(filepage))
+ goto done;
+ error = 0;
+ gfp = mapping_gfp_mask(mapping);
+ if (!filepage) {
+ /*
+ * Try to preload while we can wait, to not make a habit of
+ * draining atomic reserves; but don't latch on to this cpu.
+ */
+ error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+ if (error)
+ goto failed;
+ radix_tree_preload_end();
+ }
+
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ entry = shmem_swp_alloc(info, idx, sgp);
+ if (IS_ERR(entry)) {
+ spin_unlock(&info->lock);
+ error = PTR_ERR(entry);
+ goto failed;
+ }
+ swap = *entry; /* snapshot of the swap entry recorded for idx */
+
+ if (swap.val) { /* non-zero: the page contents are out on swap */
+ /* Look it up and read it in.. */
+ swappage = lookup_swap_cache(swap);
+ if (!swappage) {
+ shmem_swp_unmap(entry);
+ /* here we actually do the io */
+ if (type && !(*type & VM_FAULT_MAJOR)) {
+ __count_vm_event(PGMAJFAULT);
+ *type |= VM_FAULT_MAJOR;
+ }
+ spin_unlock(&info->lock);
+ swappage = shmem_swapin(swap, gfp, info, idx);
+ if (!swappage) {
+ spin_lock(&info->lock);
+ entry = shmem_swp_alloc(info, idx, sgp);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ else {
+ if (entry->val == swap.val) /* entry unchanged: the swapin itself failed */
+ error = -ENOMEM;
+ shmem_swp_unmap(entry);
+ }
+ spin_unlock(&info->lock);
+ if (error)
+ goto failed;
+ goto repeat; /* entry changed while unlocked: start over */
+ }
+ wait_on_page_locked(swappage);
+ page_cache_release(swappage);
+ goto repeat;
+ }
+
+ /* We have to do this with page locked to prevent races */
+ if (!trylock_page(swappage)) {
+ shmem_swp_unmap(entry);
+ spin_unlock(&info->lock);
+ wait_on_page_locked(swappage);
+ page_cache_release(swappage);
+ goto repeat;
+ }
+ if (PageWriteback(swappage)) {
+ shmem_swp_unmap(entry);
+ spin_unlock(&info->lock);
+ wait_on_page_writeback(swappage);
+ unlock_page(swappage);
+ page_cache_release(swappage);
+ goto repeat;
+ }
+ if (!PageUptodate(swappage)) {
+ shmem_swp_unmap(entry);
+ spin_unlock(&info->lock);
+ unlock_page(swappage);
+ page_cache_release(swappage);
+ error = -EIO;
+ goto failed;
+ }
+
+ if (filepage) { /* a file page already exists: copy the swap data into it */
+ shmem_swp_set(info, entry, 0);
+ shmem_swp_unmap(entry);
+ delete_from_swap_cache(swappage);
+ spin_unlock(&info->lock);
+ copy_highpage(filepage, swappage);
+ unlock_page(swappage);
+ page_cache_release(swappage);
+ flush_dcache_page(filepage);
+ SetPageUptodate(filepage);
+ set_page_dirty(filepage);
+ swap_free(swap);
+ } else if (!(error = add_to_page_cache_locked(swappage, mapping,
+ idx, GFP_NOWAIT))) { /* no file page: the swap page itself becomes the file page */
+ info->flags |= SHMEM_PAGEIN;
+ shmem_swp_set(info, entry, 0);
+ shmem_swp_unmap(entry);
+ delete_from_swap_cache(swappage);
+ spin_unlock(&info->lock);
+ filepage = swappage;
+ set_page_dirty(filepage);
+ swap_free(swap);
+ } else {
+ shmem_swp_unmap(entry);
+ spin_unlock(&info->lock);
+ unlock_page(swappage);
+ page_cache_release(swappage);
+ if (error == -ENOMEM) {
+ /* allow reclaim from this memory cgroup */
+ error = mem_cgroup_shrink_usage(current->mm,
+ gfp);
+ if (error)
+ goto failed;
+ }
+ goto repeat;
+ }
+ } else if (sgp == SGP_READ && !filepage) { /* no swap entry, read-only lookup: never allocate */
+ shmem_swp_unmap(entry);
+ filepage = find_get_page(mapping, idx);
+ if (filepage &&
+ (!PageUptodate(filepage) || !trylock_page(filepage))) {
+ spin_unlock(&info->lock);
+ wait_on_page_locked(filepage);
+ page_cache_release(filepage);
+ filepage = NULL;
+ goto repeat;
+ }
+ spin_unlock(&info->lock);
+ } else { /* no swap entry: account one block and set up a zeroed page */
+ shmem_swp_unmap(entry);
+ sbinfo = SHMEM_SB(inode->i_sb);
+ if (sbinfo->max_blocks) {
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_blocks == 0 ||
+ shmem_acct_block(info->flags)) {
+ spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&info->lock);
+ error = -ENOSPC;
+ goto failed;
+ }
+ sbinfo->free_blocks--;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ spin_unlock(&sbinfo->stat_lock);
+ } else if (shmem_acct_block(info->flags)) {
+ spin_unlock(&info->lock);
+ error = -ENOSPC;
+ goto failed;
+ }
+
+ if (!filepage) {
+ int ret;
+
+ spin_unlock(&info->lock); /* dropped across the allocation and charge below */
+ filepage = shmem_alloc_page(gfp, info, idx);
+ if (!filepage) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ error = -ENOMEM;
+ goto failed;
+ }
+ SetPageSwapBacked(filepage);
+
+ /* Precharge page while we can wait, compensate after */
+ error = mem_cgroup_cache_charge(filepage, current->mm,
+ gfp & ~__GFP_HIGHMEM);
+ if (error) {
+ page_cache_release(filepage);
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ filepage = NULL;
+ goto failed;
+ }
+
+ spin_lock(&info->lock);
+ entry = shmem_swp_alloc(info, idx, sgp);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ else {
+ swap = *entry;
+ shmem_swp_unmap(entry);
+ }
+ ret = error || swap.val; /* non-zero swap.val: a swap entry appeared while unlocked */
+ if (ret)
+ mem_cgroup_uncharge_cache_page(filepage);
+ else
+ ret = add_to_page_cache_lru(filepage, mapping,
+ idx, GFP_NOWAIT);
+ /*
+ * At add_to_page_cache_lru() failure, uncharge will
+ * be done automatically.
+ */
+ if (ret) {
+ spin_unlock(&info->lock);
+ page_cache_release(filepage);
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ filepage = NULL;
+ if (error)
+ goto failed;
+ goto repeat;
+ }
+ info->flags |= SHMEM_PAGEIN;
+ }
+
+ info->alloced++;
+ spin_unlock(&info->lock);
+ clear_highpage(filepage);
+ flush_dcache_page(filepage);
+ SetPageUptodate(filepage);
+ if (sgp == SGP_DIRTY)
+ set_page_dirty(filepage);
+ }
+done:
+ *pagep = filepage;
+ return 0;
+
+failed:
+ if (*pagep != filepage) { /* only drop a page obtained here, never the caller's */
+ unlock_page(filepage);
+ page_cache_release(filepage);
+ }
+ return error;
+}
+
+/*
+ * Page fault handler for shmem mappings: look up (or create) the page at
+ * vmf->pgoff and hand it back locked in vmf->page.
+ */
+static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	int fault_type;
+	int err;
+
+	/* Nothing backs a fault at or beyond i_size */
+	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	err = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+			    &fault_type);
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (err)
+		return VM_FAULT_SIGBUS;
+
+	mark_page_accessed(vmf->page);
+	return fault_type | VM_FAULT_LOCKED;
+}
+
+#ifdef CONFIG_NUMA
+/* vm_ops->set_policy: record @new in the shared policy of the backing inode. */
+static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	return mpol_set_shared_policy(&info->policy, vma, new);
+}
+
+/*
+ * vm_ops->get_policy: translate @addr into a page index within the file
+ * and look that index up in the inode's shared policy.
+ */
+static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
+					  unsigned long addr)
+{
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	unsigned long pages_into_vma = (addr - vma->vm_start) >> PAGE_SHIFT;
+	unsigned long idx = pages_into_vma + vma->vm_pgoff;
+
+	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, idx);
+}
+#endif
+
+/*
+ * shmem_lock - set (@lock != 0) or clear (@lock == 0) VM_LOCKED on a file
+ *
+ * Locking charges inode->i_size to @user via user_shm_lock(); unlocking is
+ * only done when @user is non-NULL. Asking for the state the file is
+ * already in is a successful no-op.
+ *
+ * Returns 0 on success, -ENOMEM if user_shm_lock() refuses.
+ */
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	int err = 0;
+
+	spin_lock(&info->lock);
+	if (lock) {
+		if (!(info->flags & VM_LOCKED)) {
+			if (!user_shm_lock(inode->i_size, user)) {
+				err = -ENOMEM;
+				goto unlock;
+			}
+			info->flags |= VM_LOCKED;
+			mapping_set_unevictable(file->f_mapping);
+		}
+	} else if ((info->flags & VM_LOCKED) && user) {
+		user_shm_unlock(inode->i_size, user);
+		info->flags &= ~VM_LOCKED;
+		mapping_clear_unevictable(file->f_mapping);
+		scan_mapping_unevictable_pages(file->f_mapping);
+	}
+
+unlock:
+	spin_unlock(&info->lock);
+	return err;
+}
+
+/* file_operations->mmap: install the shmem vm_ops and allow nonlinear mappings. */
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	vma->vm_ops = &shmem_vm_ops;
+
+	return 0;
+}
+
+/*
+ * Allocate a new inode on @sb and initialise it for the file type encoded
+ * in @mode (@dev is only used for special files).
+ *
+ * Returns NULL if shmem_reserve_inode() fails or new_inode() cannot
+ * allocate; in the latter case the reservation is undone with
+ * shmem_free_inode().
+ */
+static struct inode *
+shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	struct shmem_inode_info *info;
+	struct inode *inode;
+
+	if (shmem_reserve_inode(sb))
+		return NULL;
+
+	inode = new_inode(sb);
+	if (!inode) {
+		shmem_free_inode(sb);
+		return NULL;
+	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_generation = get_seconds();
+
+	/* Zero the shmem-private fields that sit in front of the vfs inode */
+	info = SHMEM_I(inode);
+	memset(info, 0, (char *)inode - (char *)info);
+	spin_lock_init(&info->lock);
+	INIT_LIST_HEAD(&info->swaplist);
+
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_mapping->a_ops = &shmem_aops;
+		inode->i_op = &shmem_inode_operations;
+		inode->i_fop = &shmem_file_operations;
+		mpol_shared_policy_init(&info->policy,
+					shmem_get_sbmpol(sbinfo));
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		/* A directory of size 0 confuses some users */
+		inode->i_size = 2 * BOGO_DIRENT_SIZE;
+		inode->i_op = &shmem_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		break;
+	case S_IFLNK:
+		/*
+		 * Nothing may be loaded into the rbtree here, because
+		 * mpol_free_shared_policy will not be called.
+		 */
+		mpol_shared_policy_init(&info->policy, NULL);
+		break;
+	default:
+		inode->i_op = &shmem_special_inode_operations;
+		init_special_inode(inode, mode, dev);
+		break;
+	}
+
+	return inode;
+}
+
+#ifdef CONFIG_TMPFS
+static const struct inode_operations shmem_symlink_inode_operations; /* forward declaration; defined after shmem_put_link() */
+static const struct inode_operations shmem_symlink_inline_operations; /* forward declaration; defined after shmem_put_link() */
+
+/*
+ * tmpfs normally bypasses shmem_readpage and shmem_write_begin, but having
+ * them lets a tmpfs file be used for splice, sendfile and underneath the
+ * loop driver in the same generic way as many other filesystems.
+ *
+ * @page arrives locked and is unlocked here whether or not
+ * shmem_getpage() succeeded.
+ */
+static int shmem_readpage(struct file *file, struct page *page)
+{
+	struct inode *host = page->mapping->host;
+	int ret;
+
+	ret = shmem_getpage(host, page->index, &page, SGP_CACHE, NULL);
+	unlock_page(page);
+
+	return ret;
+}
+
+/*
+ * address_space_operations->write_begin: fetch (allocating if necessary)
+ * the page that contains @pos and return it through @pagep.
+ */
+static int
+shmem_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	pgoff_t pgidx = pos >> PAGE_CACHE_SHIFT;
+
+	/* Start from no page so that shmem_getpage() looks one up itself */
+	*pagep = NULL;
+
+	return shmem_getpage(mapping->host, pgidx, pagep, SGP_WRITE, NULL);
+}
+
+/*
+ * address_space_operations->write_end: extend i_size if the copy went past
+ * it, then unlock, dirty and release the page. Returns @copied.
+ */
+static int
+shmem_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *host = mapping->host;
+	loff_t new_end = pos + copied;
+
+	if (new_end > host->i_size)
+		i_size_write(host, new_end);
+
+	unlock_page(page);
+	set_page_dirty(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
+/*
+ * Feed file contents to @actor one page at a time, starting at *@ppos.
+ *
+ * The loop ends when i_size is reached, shmem_getpage() fails (the error
+ * is left in @desc->error, except -EINVAL which is reported as 0), the
+ * actor consumes less than it was offered, or @desc->count drops to zero.
+ * *@ppos is then updated to the position reached.
+ */
+static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long pg_index, pg_offset;
+	enum sgp_type sgp;
+
+	/*
+	 * A read issued with KERNEL_DS may come from a stacking filesystem.
+	 * In that case holes of a sparse file really have to be allocated,
+	 * and even marked dirty, so that max_blocks cannot be exceeded.
+	 */
+	sgp = segment_eq(get_fs(), KERNEL_DS) ? SGP_DIRTY : SGP_READ;
+
+	pg_index = *ppos >> PAGE_CACHE_SHIFT;
+	pg_offset = *ppos & ~PAGE_CACHE_MASK;
+
+	for (;;) {
+		struct page *page = NULL;
+		unsigned long last_index, bytes, consumed;
+		loff_t size = i_size_read(inode);
+
+		/* Give up early if we already sit at or beyond EOF */
+		last_index = size >> PAGE_CACHE_SHIFT;
+		if (pg_index > last_index)
+			break;
+		if (pg_index == last_index) {
+			bytes = size & ~PAGE_CACHE_MASK;
+			if (bytes <= pg_offset)
+				break;
+		}
+
+		desc->error = shmem_getpage(inode, pg_index, &page, sgp, NULL);
+		if (desc->error) {
+			if (desc->error == -EINVAL)
+				desc->error = 0;
+			break;
+		}
+		if (page)
+			unlock_page(page);
+
+		/*
+		 * i_size has to be sampled again now: unlike writes, reads
+		 * run without i_mutex protection against truncate.
+		 */
+		size = i_size_read(inode);
+		last_index = size >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE;
+		if (pg_index == last_index) {
+			bytes = size & ~PAGE_CACHE_MASK;
+			if (bytes <= pg_offset) {
+				if (page)
+					page_cache_release(page);
+				break;
+			}
+		}
+		bytes -= pg_offset;
+
+		if (!page) {
+			/* A hole: serve it from the zero page */
+			page = ZERO_PAGE(0);
+			page_cache_get(page);
+		} else {
+			/*
+			 * Users may be writing to this page through arbitrary
+			 * virtual addresses, so deal with potential aliasing
+			 * before the kernel reads it.
+			 */
+			if (mapping_writably_mapped(mapping))
+				flush_dcache_page(page);
+			/* Reading from the start of a page counts as an access */
+			if (!pg_offset)
+				mark_page_accessed(page);
+		}
+
+		/*
+		 * The page is in hand and up to date, so it can be copied
+		 * out now.
+		 *
+		 * The actor reports how many bytes it actually used. That
+		 * need not equal how much of a user buffer was filled (there
+		 * may be padding etc), so only the position is updated here;
+		 * the actor itself maintains the user buffer pointers and
+		 * the remaining count.
+		 */
+		consumed = actor(desc, page, pg_offset, bytes);
+		pg_offset += consumed;
+		pg_index += pg_offset >> PAGE_CACHE_SHIFT;
+		pg_offset &= ~PAGE_CACHE_MASK;
+
+		page_cache_release(page);
+		if (consumed != bytes || !desc->count)
+			break;
+
+		cond_resched();
+	}
+
+	*ppos = ((loff_t) pg_index << PAGE_CACHE_SHIFT) + pg_offset;
+	file_accessed(filp);
+}
+
+/*
+ * file_operations->aio_read: read into each iovec segment in turn through
+ * do_shmem_file_read(), advancing iocb->ki_pos.
+ *
+ * Returns the number of bytes read. An error is returned only if nothing
+ * at all was read before it occurred.
+ */
+static ssize_t shmem_file_aio_read(struct kiocb *iocb,
+		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	loff_t *ppos = &iocb->ki_pos;
+	unsigned long seg;
+	size_t count;
+	ssize_t total;
+
+	total = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+	if (total)
+		return total;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *vec = &iov[seg];
+		read_descriptor_t desc;
+
+		desc.written = 0;
+		desc.arg.buf = vec->iov_base;
+		desc.count = vec->iov_len;
+		if (!desc.count)
+			continue;
+		desc.error = 0;
+
+		do_shmem_file_read(filp, ppos, &desc, file_read_actor);
+		total += desc.written;
+
+		if (desc.error) {
+			if (!total)
+				total = desc.error;
+			break;
+		}
+		/* Segment not filled completely: stop here */
+		if (desc.count > 0)
+			break;
+	}
+
+	return total;
+}
+
+/*
+ * super_operations->statfs: report the block and inode limits of this
+ * instance. Limits that are zero (unlimited) are left as the caller
+ * passed them in, like simple_statfs.
+ */
+static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
+
+	buf->f_type = TMPFS_MAGIC;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_namelen = NAME_MAX;
+
+	spin_lock(&sbinfo->stat_lock);
+	if (sbinfo->max_blocks) {
+		buf->f_blocks = sbinfo->max_blocks;
+		buf->f_bfree = sbinfo->free_blocks;
+		buf->f_bavail = buf->f_bfree;
+	}
+	if (sbinfo->max_inodes) {
+		buf->f_files = sbinfo->max_inodes;
+		buf->f_ffree = sbinfo->free_inodes;
+	}
+	spin_unlock(&sbinfo->stat_lock);
+
+	return 0;
+}
+
+/*
+ * Create a new object in @dir: allocate the inode, initialise security and
+ * ACL state, and bind it to @dentry.
+ *
+ * Returns 0 on success, -ENOSPC if no inode is available, or the error
+ * from security_inode_init_security() (other than -EOPNOTSUPP, which is
+ * ignored) or shmem_acl_init().
+ */
+static int
+shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
+	int error;
+
+	if (!inode)
+		return -ENOSPC;
+
+	error = security_inode_init_security(inode, dir, NULL, NULL, NULL);
+	if (error && error != -EOPNOTSUPP)
+		goto out_iput;
+
+	error = shmem_acl_init(inode, dir);
+	if (error)
+		goto out_iput;
+
+	/* A setgid directory hands its group (and, for subdirs, the bit) down */
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	dir->i_size += BOGO_DIRENT_SIZE;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* extra reference keeps the dentry pinned in core */
+	return 0;
+
+out_iput:
+	iput(inode);
+	return error;
+}
+
+/* Create a directory through shmem_mknod() and bump the parent's link count. */
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0);
+
+	if (error)
+		return error;
+
+	inc_nlink(dir);
+	return 0;
+}
+
+/* Create a regular file: shmem_mknod() with S_IFREG and no device; @nd is unused. */
+static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	int file_mode = mode | S_IFREG;
+
+	return shmem_mknod(dir, dentry, file_mode, 0);
+}
+
+/*
+ * Create a hard link to the inode behind @old_dentry at @dentry in @dir.
+ */
+static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+
+	/*
+	 * Ordinary (disk based) filesystems do not count links as inodes.
+	 * Here, however, every new link needs a new dentry, which pins
+	 * lowmem, and tmpfs dentries cannot be pruned until unlinked.
+	 */
+	err = shmem_reserve_inode(inode->i_sb);
+	if (err)
+		return err;
+
+	dir->i_size += BOGO_DIRENT_SIZE;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inc_nlink(inode);
+	atomic_inc(&inode->i_count);	/* reference held by the new dentry */
+	dget(dentry);			/* extra count pins the new dentry */
+	d_instantiate(dentry, inode);
+
+	return 0;
+}
+
+/*
+ * Remove the name @dentry from @dir. For a non-directory that still has
+ * other links, the inode reservation taken by shmem_link() is given back.
+ */
+static int shmem_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *victim = dentry->d_inode;
+	int extra_link = victim->i_nlink > 1 && !S_ISDIR(victim->i_mode);
+
+	if (extra_link)
+		shmem_free_inode(victim->i_sb);
+
+	dir->i_size -= BOGO_DIRENT_SIZE;
+	victim->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	drop_nlink(victim);
+
+	/* Drops the pinning count taken at creation; this does all the work */
+	dput(dentry);
+	return 0;
+}
+
+/* Remove an empty directory; returns -ENOTEMPTY if it still has entries. */
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *victim = dentry->d_inode;
+
+	if (!simple_empty(dentry))
+		return -ENOTEMPTY;
+
+	drop_nlink(victim);
+	drop_nlink(dir);
+
+	return shmem_unlink(dir, dentry);
+}
+
+/*
+ * The VFS layer already takes care of the dentries for rename. All that
+ * is left to do here is to drop the usage count of the target, if there
+ * is one, so that the VFS frees it correctly when it gets overwritten,
+ * and to fix up link counts, sizes and timestamps of the directories.
+ */
+static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *moved = old_dentry->d_inode;
+	struct inode *target = new_dentry->d_inode;
+	int moving_dir = S_ISDIR(moved->i_mode);
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	if (target)
+		(void) shmem_unlink(new_dir, new_dentry);
+
+	if (moving_dir) {
+		drop_nlink(old_dir);
+		/* With no target replaced, new_dir gains a subdirectory */
+		if (!target)
+			inc_nlink(new_dir);
+	}
+
+	old_dir->i_size -= BOGO_DIRENT_SIZE;
+	new_dir->i_size += BOGO_DIRENT_SIZE;
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	moved->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
+/*
+ * Create a symlink at @dentry in @dir pointing to @symname.
+ *
+ * A target that fits (including its NUL) into the shmem-private area in
+ * front of the vfs inode is stored there directly; a longer one is stored
+ * in page 0 of the inode's mapping.
+ *
+ * Returns 0, -ENAMETOOLONG if the target exceeds a page, -ENOSPC if no
+ * inode is available, or an error from security_inode_init_security()
+ * (other than -EOPNOTSUPP) or shmem_getpage().
+ */
+static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct shmem_inode_info *info;
+	struct inode *inode;
+	struct page *page = NULL;
+	char *kaddr;
+	int error;
+	int len;
+
+	len = strlen(symname) + 1;
+	if (len > PAGE_CACHE_SIZE)
+		return -ENAMETOOLONG;
+
+	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+	if (!inode)
+		return -ENOSPC;
+
+	error = security_inode_init_security(inode, dir, NULL, NULL, NULL);
+	if (error && error != -EOPNOTSUPP)
+		goto out_iput;
+
+	info = SHMEM_I(inode);
+	inode->i_size = len-1;
+	if (len > (char *)inode - (char *)info) {
+		/* Too long to keep inline: store it in the first page */
+		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+		if (error)
+			goto out_iput;
+		unlock_page(page);
+		inode->i_mapping->a_ops = &shmem_aops;
+		inode->i_op = &shmem_symlink_inode_operations;
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(kaddr, symname, len);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_page_dirty(page);
+		page_cache_release(page);
+	} else {
+		/* Short enough: keep it inline */
+		memcpy(info, symname, len);
+		inode->i_op = &shmem_symlink_inline_operations;
+	}
+
+	if (dir->i_mode & S_ISGID)
+		inode->i_gid = dir->i_gid;
+	dir->i_size += BOGO_DIRENT_SIZE;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+
+out_iput:
+	iput(inode);
+	return error;
+}
+
+/*
+ * follow_link for inline symlinks: the target string sits at the start of
+ * the shmem_inode_info (see shmem_symlink()), so there is no cookie.
+ */
+static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+{
+	char *target = (char *)SHMEM_I(dentry->d_inode);
+
+	nd_set_link(nd, target);
+	return NULL;
+}
+
+/*
+ * follow_link for page-backed symlinks: map page 0 and hand its address
+ * (or an ERR_PTR on failure) to the nameidata. The page is returned as
+ * the cookie that shmem_put_link() later unmaps and releases.
+ */
+static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page = NULL;
+	char *link;
+	int err;
+
+	err = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+	if (err)
+		link = ERR_PTR(err);
+	else
+		link = kmap(page);
+	nd_set_link(nd, link);
+
+	if (page)
+		unlock_page(page);
+	return page;
+}
+
+/*
+ * put_link for page-backed symlinks: undo shmem_follow_link(). Nothing is
+ * done if the link recorded in @nd is an error pointer.
+ */
+static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	struct page *page = cookie;
+
+	if (IS_ERR(nd_get_link(nd)))
+		return;
+
+	kunmap(page);
+	mark_page_accessed(page);
+	page_cache_release(page);
+}
+
+static const struct inode_operations shmem_symlink_inline_operations = { /* set by shmem_symlink() when the target is stored inline */
+ .readlink = generic_readlink,
+ .follow_link = shmem_follow_link_inline, /* returns no cookie, hence no .put_link */
+};
+
+static const struct inode_operations shmem_symlink_inode_operations = { /* set by shmem_symlink() when the target is stored in page 0 */
+ .truncate = shmem_truncate,
+ .readlink = generic_readlink,
+ .follow_link = shmem_follow_link, /* maps the page and returns it as cookie */
+ .put_link = shmem_put_link, /* unmaps and releases that page */
+};
+
+#ifdef CONFIG_TMPFS_POSIX_ACL
+/*
+ * Superblocks without xattr inode operations will get security.* xattr
+ * support from the VFS "for free". As soon as we have any other xattrs
+ * like ACLs, we also need to implement the security.* handlers at
+ * filesystem level, though.
+ */
+
+/* xattr_handler->list for security.*: defer to the security module; @name and @name_len are unused. */
+static size_t shmem_xattr_security_list(struct inode *inode, char *list,
+					size_t list_len, const char *name,
+					size_t name_len)
+{
+	size_t used = security_inode_listsecurity(inode, list, list_len);
+
+	return used;
+}
+
+/* xattr_handler->get for security.*: reject an empty name with -EINVAL, else defer to xattr_getsecurity(). */
+static int shmem_xattr_security_get(struct inode *inode, const char *name,
+				    void *buffer, size_t size)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return xattr_getsecurity(inode, name, buffer, size);
+}
+
+/* xattr_handler->set for security.*: reject an empty name with -EINVAL, else defer to the security module. */
+static int shmem_xattr_security_set(struct inode *inode, const char *name,
+				    const void *value, size_t size, int flags)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return security_inode_setsecurity(inode, name, value, size, flags);
+}
+
+static struct xattr_handler shmem_xattr_security_handler = { /* handler for names carrying XATTR_SECURITY_PREFIX */
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = shmem_xattr_security_list,
+ .get = shmem_xattr_security_get,
+ .set = shmem_xattr_security_set,
+};
+
+static struct xattr_handler *shmem_xattr_handlers[] = { /* installed as sb->s_xattr by shmem_fill_super() */
+ &shmem_xattr_acl_access_handler,
+ &shmem_xattr_acl_default_handler,
+ &shmem_xattr_security_handler,
+ NULL /* terminates the list */
+};
+#endif
+
+/* export_operations->get_parent: always fails with -ESTALE. */
+static struct dentry *shmem_get_parent(struct dentry *child)
+{
+	struct dentry *stale = ERR_PTR(-ESTALE);
+
+	return stale;
+}
+
+/*
+ * ilookup5() test callback: @vfh points at the raw file handle words
+ * (generation, low half of the inode number, high half of the inode
+ * number). Returns 1 if @ino matches both generation and number, else 0.
+ */
+static int shmem_match(struct inode *ino, void *vfh)
+{
+	__u32 *words = vfh;
+	__u64 wanted_ino = ((__u64)words[2] << 32) | words[1];
+
+	if (ino->i_ino != wanted_ino)
+		return 0;
+
+	return words[0] == ino->i_generation;
+}
+
+/*
+ * shmem_fh_to_dentry - map a file handle from shmem_encode_fh() to a dentry
+ * @sb:      superblock to search
+ * @fid:     handle; raw[0] is the inode generation, raw[1] the low and
+ *           raw[2] the high 32 bits of the inode number
+ * @fh_len:  number of 32-bit words available in @fid
+ * @fh_type: unused
+ *
+ * Returns an alias dentry of the matching hashed inode, or NULL if the
+ * handle is shorter than three words, no such inode is found, or the
+ * inode has no alias.
+ */
+static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct inode *inode;
+	struct dentry *dentry = NULL;
+	u64 inum;
+
+	/*
+	 * The length must be checked before any of raw[0..2] is read,
+	 * otherwise a short handle makes us read past what was supplied.
+	 */
+	if (fh_len < 3)
+		return NULL;
+
+	inum = fid->raw[2];
+	inum = (inum << 32) | fid->raw[1];
+
+	/* Same hash value as shmem_encode_fh() used: ino + generation */
+	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
+			shmem_match, fid->raw);
+	if (inode) {
+		dentry = d_find_alias(inode);
+		iput(inode);
+	}
+
+	return dentry;
+}
+
+/*
+ * export_operations->encode_fh: emit a three-word handle made of the inode
+ * generation and the 64-bit inode number, hashing the inode first if that
+ * has not happened yet so that shmem_fh_to_dentry() can find it again.
+ *
+ * Returns 1 with *@len set to 3, or 255 if fewer than three words of
+ * space were offered. @connectable is unused.
+ */
+static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+				int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (*len < 3)
+		return 255;
+
+	if (hlist_unhashed(&inode->i_hash)) {
+		/*
+		 * Inodes are hashed here rather than at creation time, and
+		 * insert_inode_hash is unfortunately not idempotent, so a
+		 * lock makes sure it is attempted only once.
+		 */
+		static DEFINE_SPINLOCK(hash_once_lock);
+
+		spin_lock(&hash_once_lock);
+		if (hlist_unhashed(&inode->i_hash))
+			__insert_inode_hash(inode,
+					inode->i_ino + inode->i_generation);
+		spin_unlock(&hash_once_lock);
+	}
+
+	fh[2] = ((__u64)inode->i_ino) >> 32;
+	fh[1] = inode->i_ino;
+	fh[0] = inode->i_generation;
+	*len = 3;
+
+	return 1;
+}
+
+static const struct export_operations shmem_export_ops = { /* installed as sb->s_export_op by shmem_fill_super() */
+ .get_parent = shmem_get_parent, /* always returns -ESTALE */
+ .encode_fh = shmem_encode_fh,
+ .fh_to_dentry = shmem_fh_to_dentry,
+};
+
+static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
+ bool remount) /* remount: mode=, uid= and gid= are accepted but ignored */
+{ /* Parse a "name=value,..." option string into *sbinfo; returns 0 on success, 1 on any error (after logging it) */
+ char *this_char, *value, *rest;
+
+ while (options != NULL) {
+ this_char = options;
+ for (;;) {
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ *
+ * So a comma only ends the option when it is not
+ * followed by a digit.
+ */
+ options = strchr(options, ',');
+ if (options == NULL)
+ break;
+ options++;
+ if (!isdigit(*options)) {
+ options[-1] = '\0';
+ break;
+ }
+ }
+ if (!*this_char) /* empty option, e.g. two commas in a row */
+ continue;
+ if ((value = strchr(this_char,'=')) != NULL) {
+ *value++ = 0; /* split "name=value" into two strings */
+ } else {
+ printk(KERN_ERR
+ "tmpfs: No value for mount option '%s'\n",
+ this_char);
+ return 1;
+ }
+
+ if (!strcmp(this_char,"size")) {
+ unsigned long long size;
+ size = memparse(value,&rest);
+ if (*rest == '%') { /* "size=N%": N percent of totalram_pages, in bytes */
+ size <<= PAGE_SHIFT;
+ size *= totalram_pages;
+ do_div(size, 100);
+ rest++;
+ }
+ if (*rest)
+ goto bad_val;
+ sbinfo->max_blocks =
+ DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+ } else if (!strcmp(this_char,"nr_blocks")) {
+ sbinfo->max_blocks = memparse(value, &rest);
+ if (*rest)
+ goto bad_val;
+ } else if (!strcmp(this_char,"nr_inodes")) {
+ sbinfo->max_inodes = memparse(value, &rest);
+ if (*rest)
+ goto bad_val;
+ } else if (!strcmp(this_char,"mode")) {
+ if (remount)
+ continue;
+ sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; /* parsed as octal */
+ if (*rest)
+ goto bad_val;
+ } else if (!strcmp(this_char,"uid")) {
+ if (remount)
+ continue;
+ sbinfo->uid = simple_strtoul(value, &rest, 0);
+ if (*rest)
+ goto bad_val;
+ } else if (!strcmp(this_char,"gid")) {
+ if (remount)
+ continue;
+ sbinfo->gid = simple_strtoul(value, &rest, 0);
+ if (*rest)
+ goto bad_val;
+ } else if (!strcmp(this_char,"mpol")) {
+ if (mpol_parse_str(value, &sbinfo->mpol, 1)) /* NOTE(review): a policy already held in sbinfo->mpol is not released here - confirm callers account for that */
+ goto bad_val;
+ } else {
+ printk(KERN_ERR "tmpfs: Bad mount option %s\n",
+ this_char);
+ return 1;
+ }
+ }
+ return 0;
+
+bad_val:
+ printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+ value, this_char);
+ return 1;
+
+}
+
+/*
+ * shmem_remount_fs - apply new size/inode limits and memory policy
+ * @sb:    superblock being remounted
+ * @flags: unused
+ * @data:  option string, parsed by shmem_parse_options() in remount mode
+ *
+ * The options are parsed into a private copy of the superblock info and
+ * only committed if the new limits are consistent with current usage.
+ * The existing memory policy is kept unless a new one was given.
+ *
+ * Returns 0 on success, -EINVAL if the options are bad or the new limits
+ * are not acceptable.
+ */
+static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	struct shmem_sb_info config = *sbinfo;
+	unsigned long blocks;
+	unsigned long inodes;
+	int error = -EINVAL;
+
+	/*
+	 * The struct copy above duplicated sbinfo->mpol without taking a
+	 * reference. Clear it, so that a non-NULL config.mpol after parsing
+	 * always means "a new policy was specified and we own its reference".
+	 */
+	config.mpol = NULL;
+	if (shmem_parse_options(data, &config, true)) {
+		/* Drop a policy parsed before a later option failed */
+		mpol_put(config.mpol);
+		return error;
+	}
+
+	spin_lock(&sbinfo->stat_lock);
+	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+	if (config.max_blocks < blocks)
+		goto out;
+	if (config.max_inodes < inodes)
+		goto out;
+	/*
+	 * Those tests also disallow limited->unlimited while any are in
+	 * use, so i_blocks will always be zero when max_blocks is zero;
+	 * but we must separately disallow unlimited->limited, because
+	 * in that case we have no record of how much is already in use.
+	 */
+	if (config.max_blocks && !sbinfo->max_blocks)
+		goto out;
+	if (config.max_inodes && !sbinfo->max_inodes)
+		goto out;
+
+	error = 0;
+	sbinfo->max_blocks = config.max_blocks;
+	sbinfo->free_blocks = config.max_blocks - blocks;
+	sbinfo->max_inodes = config.max_inodes;
+	sbinfo->free_inodes = config.max_inodes - inodes;
+
+	/*
+	 * Only replace the policy if the remount named one. Putting the old
+	 * policy and then storing the stale copy of the same pointer back
+	 * would leave sbinfo->mpol pointing at a released object.
+	 */
+	if (config.mpol) {
+		mpol_put(sbinfo->mpol);
+		sbinfo->mpol = config.mpol;	/* transfers initial ref */
+		config.mpol = NULL;
+	}
+out:
+	spin_unlock(&sbinfo->stat_lock);
+	/* Still non-NULL only if the remount was rejected after parsing */
+	mpol_put(config.mpol);
+	return error;
+}
+
+/*
+ * super_operations->show_options: print every option whose current value
+ * differs from the default, followed by the memory policy.
+ */
+static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb);
+	int default_mode = S_IRWXUGO | S_ISVTX;
+
+	if (sbinfo->max_blocks != shmem_default_max_blocks()) {
+		/* Reported in kilobytes */
+		seq_printf(seq, ",size=%luk",
+			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+	}
+	if (sbinfo->max_inodes != shmem_default_max_inodes())
+		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+	if (sbinfo->mode != default_mode)
+		seq_printf(seq, ",mode=%03o", sbinfo->mode);
+	if (sbinfo->uid)
+		seq_printf(seq, ",uid=%u", sbinfo->uid);
+	if (sbinfo->gid)
+		seq_printf(seq, ",gid=%u", sbinfo->gid);
+
+	shmem_show_mpol(seq, sbinfo->mpol);
+	return 0;
+}
+#endif /* CONFIG_TMPFS */
+
+/*
+ * super_operations->put_super, also used by shmem_fill_super() to back out:
+ * release everything hanging off sb->s_fs_info.
+ */
+static void shmem_put_super(struct super_block *sb)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+	/*
+	 * The superblock holds a reference on its default memory policy
+	 * (set through the mpol= option); drop it before freeing sbinfo
+	 * so the policy is not leaked.
+	 */
+	if (sbinfo)
+		mpol_put(sbinfo->mpol);
+	kfree(sbinfo);
+	sb->s_fs_info = NULL;
+}
+
+/*
+ * Set up a new shmem/tmpfs superblock: allocate and initialise the
+ * per-superblock info, parse mount options for user-visible instances,
+ * and create the root directory.
+ *
+ * Returns 0 on success, -EINVAL for bad mount options, -ENOMEM otherwise.
+ * @silent is unused.
+ */
+static int shmem_fill_super(struct super_block *sb,
+			    void *data, int silent)
+{
+	struct shmem_sb_info *sbinfo;
+	struct inode *root_inode;
+	struct dentry *root_dentry;
+	int err = -ENOMEM;
+
+	/* Rounded up to L1_CACHE_BYTES to resist false sharing */
+	sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+				L1_CACHE_BYTES), GFP_KERNEL);
+	if (!sbinfo)
+		return -ENOMEM;
+
+	sbinfo->max_blocks = 0;
+	sbinfo->max_inodes = 0;
+	sbinfo->mode = S_IRWXUGO | S_ISVTX;
+	sbinfo->uid = current->fsuid;
+	sbinfo->gid = current->fsgid;
+	sbinfo->mpol = NULL;
+	sb->s_fs_info = sbinfo;
+
+#ifdef CONFIG_TMPFS
+	/*
+	 * By default a tmpfs instance may use at most half of the physical
+	 * ram, with inodes limited to one per page of lowmem; the internal
+	 * instance, however, is left unlimited.
+	 */
+	if (!(sb->s_flags & MS_NOUSER)) {
+		sbinfo->max_blocks = shmem_default_max_blocks();
+		sbinfo->max_inodes = shmem_default_max_inodes();
+		if (shmem_parse_options(data, sbinfo, false)) {
+			err = -EINVAL;
+			goto failed;
+		}
+	}
+	sb->s_export_op = &shmem_export_ops;
+#else
+	sb->s_flags |= MS_NOUSER;
+#endif
+
+	spin_lock_init(&sbinfo->stat_lock);
+	sbinfo->free_blocks = sbinfo->max_blocks;
+	sbinfo->free_inodes = sbinfo->max_inodes;
+
+	sb->s_maxbytes = SHMEM_MAX_BYTES;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = TMPFS_MAGIC;
+	sb->s_op = &shmem_ops;
+	sb->s_time_gran = 1;
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	sb->s_xattr = shmem_xattr_handlers;
+	sb->s_flags |= MS_POSIXACL;
+#endif
+
+	root_inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0);
+	if (!root_inode)
+		goto failed;
+	root_inode->i_uid = sbinfo->uid;
+	root_inode->i_gid = sbinfo->gid;
+
+	root_dentry = d_alloc_root(root_inode);
+	if (!root_dentry) {
+		iput(root_inode);
+		goto failed;
+	}
+	sb->s_root = root_dentry;
+	return 0;
+
+failed:
+	shmem_put_super(sb);
+	return err;
+}
+
+static struct kmem_cache *shmem_inode_cachep;
+
+/*
+ * super_operations->alloc_inode: allocate a shmem_inode_info from the slab
+ * cache and return the vfs inode embedded in it, or NULL on failure.
+ */
+static struct inode *shmem_alloc_inode(struct super_block *sb)
+{
+	struct shmem_inode_info *info;
+
+	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+	return info ? &info->vfs_inode : NULL;
+}
+
+/*
+ * super_operations->destroy_inode: release the shared policy (regular
+ * files only), the ACL state, and finally the shmem_inode_info itself.
+ */
+static void shmem_destroy_inode(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	/*
+	 * Only regular files get here with a policy to free; for an inline
+	 * symlink nothing but the struct inode part is valid.
+	 */
+	if (S_ISREG(inode->i_mode))
+		mpol_free_shared_policy(&info->policy);
+
+	shmem_acl_destroy_inode(inode);
+	kmem_cache_free(shmem_inode_cachep, info);
+}
+
+/* Slab constructor for shmem_inode_cache: one-time setup of each object. */
+static void init_once(void *foo)
+{
+	struct shmem_inode_info *info = foo;
+
+	inode_init_once(&info->vfs_inode);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	info->i_acl = NULL;
+	info->i_default_acl = NULL;
+#endif
+}
+
+/*
+ * Create the slab cache for shmem_inode_info objects. The cache is created
+ * with SLAB_PANIC and this function always returns 0.
+ */
+static int init_inodecache(void)
+{
+	struct kmem_cache *cache;
+
+	cache = kmem_cache_create("shmem_inode_cache",
+				  sizeof(struct shmem_inode_info),
+				  0, SLAB_PANIC, init_once);
+	shmem_inode_cachep = cache;
+
+	return 0;
+}
+
+/* Tear down the slab cache created by init_inodecache(). */
+static void destroy_inodecache(void)
+{
+	struct kmem_cache *cache = shmem_inode_cachep;
+
+	kmem_cache_destroy(cache);
+}
+
+static const struct address_space_operations shmem_aops = { /* set on regular files by shmem_get_inode() and on page-backed symlinks by shmem_symlink() */
+ .writepage = shmem_writepage,
+ .set_page_dirty = __set_page_dirty_no_writeback,
+#ifdef CONFIG_TMPFS
+ .readpage = shmem_readpage,
+ .write_begin = shmem_write_begin,
+ .write_end = shmem_write_end,
+#endif /* CONFIG_TMPFS */
+ .migratepage = migrate_page,
+};
+
+static const struct file_operations shmem_file_operations = { /* i_fop of regular files, set by shmem_get_inode() */
+ .mmap = shmem_mmap,
+#ifdef CONFIG_TMPFS
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = shmem_file_aio_read, /* shmem-specific read path */
+ .aio_write = generic_file_aio_write,
+ .fsync = simple_sync_file,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+#endif /* CONFIG_TMPFS */
+};
+
+static const struct inode_operations shmem_inode_operations = { /* i_op of regular files, set by shmem_get_inode() */
+ .truncate = shmem_truncate,
+ .setattr = shmem_notify_change,
+ .truncate_range = shmem_truncate_range,
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = shmem_permission,
+#endif /* CONFIG_TMPFS_POSIX_ACL */
+
+};
+
+static const struct inode_operations shmem_dir_inode_operations = { /* i_op of directories, set by shmem_get_inode() */
+#ifdef CONFIG_TMPFS
+ .create = shmem_create,
+ .lookup = simple_lookup,
+ .link = shmem_link,
+ .unlink = shmem_unlink,
+ .symlink = shmem_symlink,
+ .mkdir = shmem_mkdir,
+ .rmdir = shmem_rmdir,
+ .mknod = shmem_mknod,
+ .rename = shmem_rename,
+#endif /* CONFIG_TMPFS */
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_notify_change,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = shmem_permission,
+#endif /* CONFIG_TMPFS_POSIX_ACL */
+};
+
+static const struct inode_operations shmem_special_inode_operations = { /* same ACL/xattr set as the directory table; empty without CONFIG_TMPFS_POSIX_ACL */
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_notify_change,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = shmem_permission,
+#endif /* CONFIG_TMPFS_POSIX_ACL */
+};
+
+static const struct super_operations shmem_ops = { /* superblock methods for shmem */
+ .alloc_inode = shmem_alloc_inode, /* allocates from shmem_inode_cachep ... */
+ .destroy_inode = shmem_destroy_inode, /* ... and this returns the object to it */
+#ifdef CONFIG_TMPFS /* statfs/remount/show_options only with CONFIG_TMPFS */
+ .statfs = shmem_statfs,
+ .remount_fs = shmem_remount_fs,
+ .show_options = shmem_show_options,
+#endif /* CONFIG_TMPFS */
+ .delete_inode = shmem_delete_inode,
+ .drop_inode = generic_delete_inode,
+ .put_super = shmem_put_super,
+};
+
+static struct vm_operations_struct shmem_vm_ops = { /* installed as vma->vm_ops by shmem_zero_setup() */
+ .fault = shmem_fault,
+#ifdef CONFIG_NUMA /* memory-policy hooks exist only on NUMA builds */
+ .set_policy = shmem_set_policy,
+ .get_policy = shmem_get_policy,
+#endif /* CONFIG_NUMA */
+};
+
+
+/*
+ * ->get_sb for tmpfs: a device-less mount, so defer to get_sb_nodev() with
+ * shmem_fill_super as the superblock filler.  @dev_name is not used.
+ */
+static int shmem_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data,
+			struct vfsmount *mnt)
+{
+	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+}
+
+static struct file_system_type tmpfs_fs_type = { /* registered and kern-mounted by init_tmpfs() */
+ .owner = THIS_MODULE,
+ .name = "tmpfs",
+ .get_sb = shmem_get_sb,
+ .kill_sb = kill_litter_super,
+};
+static struct vfsmount *shm_mnt; /* internal mount from init_tmpfs(); holds an ERR_PTR() if setup failed */
+
+/*
+ * Bring up shmem/tmpfs: backing-dev info, inode slab cache, filesystem
+ * registration and finally the internal kernel mount stored in shm_mnt.
+ * Each failure unwinds the steps already completed, in reverse order, and
+ * leaves shm_mnt holding ERR_PTR(error) so later users can detect it.
+ */
+static int __init init_tmpfs(void)
+{
+	int err = bdi_init(&shmem_backing_dev_info);
+
+	if (err)
+		goto fail;
+
+	err = init_inodecache();
+	if (err)
+		goto undo_bdi;
+
+	err = register_filesystem(&tmpfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "Could not register tmpfs\n");
+		goto undo_inodecache;
+	}
+
+	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
+				 tmpfs_fs_type.name, NULL);
+	if (!IS_ERR(shm_mnt))
+		return 0;
+
+	err = PTR_ERR(shm_mnt);
+	printk(KERN_ERR "Could not kern_mount tmpfs\n");
+	unregister_filesystem(&tmpfs_fs_type);
+undo_inodecache:
+	destroy_inodecache();
+undo_bdi:
+	bdi_destroy(&shmem_backing_dev_info);
+fail:
+	shm_mnt = ERR_PTR(err);
+	return err;
+}
+module_init(init_tmpfs)
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: vm_flags
+ */
+struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+{
+	struct dentry *root, *dentry;
+	struct inode *inode;
+	struct file *file;
+	struct qstr qname;
+	int err;
+
+	/* init_tmpfs() failed: hand back the error it recorded in shm_mnt */
+	if (IS_ERR(shm_mnt))
+		return (void *)shm_mnt;
+
+	if (size < 0 || size > SHMEM_MAX_BYTES)
+		return ERR_PTR(-EINVAL);
+
+	/* charge the size up front; every later failure must uncharge it */
+	if (shmem_acct_size(flags, size))
+		return ERR_PTR(-ENOMEM);
+
+	qname.name = name;
+	qname.len = strlen(name);
+	qname.hash = 0; /* will go */
+	root = shm_mnt->mnt_root;
+
+	dentry = d_alloc(root, &qname);
+	if (!dentry) {
+		err = -ENOMEM;
+		goto undo_acct;
+	}
+
+	file = get_empty_filp();
+	if (!file) {
+		err = -ENFILE;
+		goto drop_dentry;
+	}
+
+	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode) {
+		err = -ENOSPC;
+		goto drop_file;
+	}
+
+	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0; /* It is unlinked */
+	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+		  &shmem_file_operations);
+	return file;
+
+drop_file:
+	put_filp(file);
+drop_dentry:
+	dput(dentry);
+undo_acct:
+	shmem_unacct_size(flags, size);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup);
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ *
+ * Creates an unlinked shmem file named "dev/zero" sized to the vma and
+ * attaches it to @vma together with shmem_vm_ops.  A file already attached
+ * to the vma is released with fput() first.  Returns 0 on success or the
+ * error code carried by shmem_file_setup()'s ERR_PTR().
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	loff_t len = vma->vm_end - vma->vm_start;
+	struct file *shm_file;
+
+	shm_file = shmem_file_setup("dev/zero", len, vma->vm_flags);
+	if (IS_ERR(shm_file))
+		return PTR_ERR(shm_file);
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_ops = &shmem_vm_ops;
+	vma->vm_file = shm_file;
+	return 0;
+}
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
new file mode 100644
index 0000000..8e5aadd
--- /dev/null
+++ b/mm/shmem_acl.c
@@ -0,0 +1,197 @@
+/*
+ * mm/shmem_acl.c
+ *
+ * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/xattr.h>
+#include <linux/generic_acl.h>
+
+/**
+ * shmem_get_acl - generic_acl_operations->getacl() operation
+ *
+ * Returns a new reference (posix_acl_dup()) to the access or default ACL
+ * of @inode, taken under inode->i_lock.  Any other @type yields NULL.
+ */
+static struct posix_acl *
+shmem_get_acl(struct inode *inode, int type)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct posix_acl *ref = NULL;
+
+	spin_lock(&inode->i_lock);
+	if (type == ACL_TYPE_ACCESS)
+		ref = posix_acl_dup(info->i_acl);
+	else if (type == ACL_TYPE_DEFAULT)
+		ref = posix_acl_dup(info->i_default_acl);
+	spin_unlock(&inode->i_lock);
+
+	return ref;
+}
+
+/**
+ * shmem_set_acl - generic_acl_operations->setacl() operation
+ *
+ * Installs a new reference to @acl as the access or default ACL of @inode
+ * under inode->i_lock, then drops the reference to the ACL it replaced
+ * after the lock is released.  An unrecognised @type changes nothing.
+ */
+static void
+shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct posix_acl **slot = NULL;
+	struct posix_acl *old = NULL;
+
+	if (type == ACL_TYPE_ACCESS)
+		slot = &info->i_acl;
+	else if (type == ACL_TYPE_DEFAULT)
+		slot = &info->i_default_acl;
+
+	spin_lock(&inode->i_lock);
+	if (slot) {
+		old = *slot;
+		*slot = posix_acl_dup(acl);
+	}
+	spin_unlock(&inode->i_lock);
+
+	posix_acl_release(old);
+}
+
+struct generic_acl_operations shmem_acl_ops = { /* passed to every generic_acl_*() helper called in this file */
+ .getacl = shmem_get_acl, /* returns a new reference taken under i_lock */
+ .setacl = shmem_set_acl, /* swaps the stored ACL under i_lock */
+};
+
+/**
+ * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
+ * shmem_xattr_acl_access_handler - plumbing code to implement the
+ * system.posix_acl_access xattr using the generic acl functions.
+ *
+ * Each callback forwards to the matching generic_acl_*() helper with
+ * shmem_acl_ops and ACL_TYPE_ACCESS.  get/set reject any non-empty name
+ * suffix with -EINVAL.
+ */
+
+/* ->list callback; @name and @name_len are not used. */
+static size_t
+shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
+		      const char *name, size_t name_len)
+{
+	return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
+				list, list_size);
+}
+
+/* ->get callback: only the bare attribute name (empty suffix) is valid. */
+static int
+shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
+		     size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+
+	return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
+			       buffer, size);
+}
+
+/* ->set callback: same name rule as ->get; @flags is not used. */
+static int
+shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
+		     size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+
+	return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
+			       value, size);
+}
+
+struct xattr_handler shmem_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= shmem_list_acl_access,
+	.get	= shmem_get_acl_access,
+	.set	= shmem_set_acl_access,
+};
+
+/**
+ * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
+ * shmem_xattr_acl_default_handler - plumbing code to implement the
+ * system.posix_acl_default xattr using the generic acl functions.
+ *
+ * Each callback forwards to the matching generic_acl_*() helper with
+ * shmem_acl_ops and ACL_TYPE_DEFAULT.  get/set reject any non-empty name
+ * suffix with -EINVAL.
+ */
+
+/* ->list callback; @name and @name_len are not used. */
+static size_t
+shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
+		       const char *name, size_t name_len)
+{
+	return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
+				list, list_size);
+}
+
+/* ->get callback: only the bare attribute name (empty suffix) is valid. */
+static int
+shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
+		      size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+
+	return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
+			       buffer, size);
+}
+
+/* ->set callback: same name rule as ->get; @flags is not used. */
+static int
+shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
+		      size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+
+	return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
+			       value, size);
+}
+
+struct xattr_handler shmem_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= shmem_list_acl_default,
+	.get	= shmem_get_acl_default,
+	.set	= shmem_set_acl_default,
+};
+
+/**
+ * shmem_acl_init - Initialize the acl(s) of a new inode
+ * @inode: the inode being set up
+ * @dir: the directory passed through to generic_acl_init()
+ *
+ * Thin wrapper: returns whatever generic_acl_init() returns when called
+ * with shmem_acl_ops.
+ */
+int
+shmem_acl_init(struct inode *inode, struct inode *dir)
+{
+	int ret = generic_acl_init(inode, dir, &shmem_acl_ops);
+
+	return ret;
+}
+
+/**
+ * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
+ * @inode: inode whose access and default ACL references are dropped
+ *
+ * This is done before destroying the actual inode.  Both ACL pointers are
+ * reset to NULL afterwards.
+ *
+ * posix_acl_release() is called without a NULL guard, the same way
+ * shmem_set_acl() calls it on a possibly-NULL pointer.
+ */
+
+void
+shmem_acl_destroy_inode(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	posix_acl_release(info->i_acl);
+	info->i_acl = NULL;
+	posix_acl_release(info->i_default_acl);
+	info->i_default_acl = NULL;
+}
+
+/**
+ * shmem_check_acl - check_acl() callback for generic_permission()
+ *
+ * Returns -EAGAIN when the inode has no access ACL; otherwise returns the
+ * verdict of posix_acl_permission() and drops the temporary reference.
+ */
+static int
+shmem_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
+	int verdict;
+
+	if (!acl)
+		return -EAGAIN;
+
+	verdict = posix_acl_permission(inode, acl, mask);
+	posix_acl_release(acl);
+	return verdict;
+}
+
+/**
+ * shmem_permission - permission() inode operation
+ *
+ * Delegates to generic_permission() with shmem_check_acl as the ACL
+ * callback and returns its result.
+ */
+int
+shmem_permission(struct inode *inode, int mask)
+{
+	int ret = generic_permission(inode, mask, shmem_check_acl);
+
+	return ret;
+}
diff --git a/mm/slab.c b/mm/slab.c
new file mode 100644
index 0000000..0918751
--- /dev/null
+++ b/mm/slab.c
@@ -0,0 +1,4522 @@
+/*
+ * linux/mm/slab.c
+ * Written by Mark Hemment, 1996/97.
+ * (markhe@nextd.demon.co.uk)
+ *
+ * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
+ *
+ * Major cleanup, different bufctl logic, per-cpu arrays
+ * (c) 2000 Manfred Spraul
+ *
+ * Cleanup, make the head arrays unconditional, preparation for NUMA
+ * (c) 2002 Manfred Spraul
+ *
+ * An implementation of the Slab Allocator as described in outline in;
+ * UNIX Internals: The New Frontiers by Uresh Vahalia
+ * Pub: Prentice Hall ISBN 0-13-101908-2
+ * or with a little more detail in;
+ * The Slab Allocator: An Object-Caching Kernel Memory Allocator
+ * Jeff Bonwick (Sun Microsystems).
+ * Presented at: USENIX Summer 1994 Technical Conference
+ *
+ * The memory is organized in caches, one cache for each object type.
+ * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
+ * Each cache consists of many slabs (they are small (usually one
+ * page long) and always contiguous), and each slab contains multiple
+ * initialized objects.
+ *
+ * This means, that your constructor is used only for newly allocated
+ * slabs and you must pass objects with the same initializations to
+ * kmem_cache_free.
+ *
+ * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
+ * normal). If you need a special memory type, then you must create a new
+ * cache for that memory type.
+ *
+ * In order to reduce fragmentation, the slabs are sorted in 3 groups:
+ * full slabs with 0 free objects
+ * partial slabs
+ * empty slabs with no allocated objects
+ *
+ * If partial slabs exist, then new allocations come from these slabs,
+ * otherwise from empty slabs or new slabs are allocated.
+ *
+ * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
+ * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
+ *
+ * Each cache has a short per-cpu head array, most allocs
+ * and frees go into that array, and if that array overflows, then 1/2
+ * of the entries in the array are given back into the global cache.
+ * The head array is strictly LIFO and should improve the cache hit rates.
+ * On SMP, it additionally reduces the spinlock operations.
+ *
+ * The c_cpuarray may not be read with enabled local interrupts -
+ * it's changed with a smp_call_function().
+ *
+ * SMP synchronization:
+ * constructors and destructors are called without any locking.
+ * Several members in struct kmem_cache and struct slab never change, they
+ * are accessed without any locking.
+ * The per-cpu arrays are never accessed from the wrong cpu, no locking,
+ * and local interrupts are disabled so slab code is preempt-safe.
+ * The non-constant members are protected with a per-cache irq spinlock.
+ *
+ * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
+ * in 2000 - many ideas in the current implementation are derived from
+ * his patch.
+ *
+ * Further notes from the original documentation:
+ *
+ * 11 April '97. Started multi-threading - markhe
+ * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
+ * The sem is only needed when accessing/extending the cache-chain, which
+ * can never happen inside an interrupt (kmem_cache_create(),
+ * kmem_cache_shrink() and kmem_cache_reap()).
+ *
+ * At present, each engine can be growing a cache. This should be blocked.
+ *
+ * 15 March 2005. NUMA slab allocator.
+ * Shai Fultheim <shai@scalex86.org>.
+ * Shobhit Dayal <shobhit@calsoftinc.com>
+ * Alok N Kataria <alokk@calsoftinc.com>
+ * Christoph Lameter <christoph@lameter.com>
+ *
+ * Modified the slab allocator to be node aware on NUMA systems.
+ * Each node has its own list of partial, free and full slabs.
+ * All object allocations for a node occur from node specific slab lists.
+ */
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/poison.h>
+#include <linux/swap.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/cpuset.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/cpu.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex.h>
+#include <linux/fault-inject.h>
+#include <linux/rtmutex.h>
+#include <linux/reciprocal_div.h>
+#include <linux/debugobjects.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+
+/*
+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS - 1 to collect stats for /proc/slabinfo.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+ */
+
+#ifdef CONFIG_DEBUG_SLAB
+#define DEBUG 1
+#define STATS 1
+#define FORCED_DEBUG 1
+#else
+#define DEBUG 0
+#define STATS 0
+#define FORCED_DEBUG 0
+#endif
+
+/*
+ * Shouldn't this be in a header file somewhere?
+ *
+ * BYTES_PER_WORD is the size of a pointer; REDZONE_ALIGN is the larger of
+ * that and the alignment of unsigned long long.
+ */
+#define BYTES_PER_WORD sizeof(void *)
+#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
+
+#ifndef ARCH_KMALLOC_MINALIGN
+/*
+ * Enforce a minimum alignment for the kmalloc caches.
+ * Usually, the kmalloc caches are cache_line_size() aligned, except when
+ * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
+ * Some archs want to perform DMA into kmalloc caches and need a guaranteed
+ * alignment larger than the alignment of a 64-bit integer.
+ * ARCH_KMALLOC_MINALIGN allows that.
+ * Note that increasing this value may disable some debug features.
+ */
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+/*
+ * Enforce a minimum alignment for all caches.
+ * Intended for archs that get misalignment faults even for BYTES_PER_WORD
+ * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
+ * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
+ * some debug features.
+ */
+#define ARCH_SLAB_MINALIGN 0
+#endif
+
+#ifndef ARCH_KMALLOC_FLAGS
+#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
+#endif
+
+/*
+ * Legal flag mask for kmem_cache_create().
+ *
+ * SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER are part of the mask only
+ * when DEBUG is enabled; the remaining flags are legal in both variants.
+ */
+#if DEBUG
+# define CREATE_MASK (SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_HWCACHE_ALIGN | \
+ SLAB_CACHE_DMA | \
+ SLAB_STORE_USER | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_DEBUG_OBJECTS)
+#else /* !DEBUG */
+# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
+ SLAB_CACHE_DMA | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_DEBUG_OBJECTS)
+#endif /* DEBUG */
+
+/*
+ * kmem_bufctl_t:
+ *
+ * Bufctl's are used for linking objs within a slab
+ * using linked offsets.
+ *
+ * This implementation relies on "struct page" for locating the cache &
+ * slab an object belongs to.
+ * This allows the bufctl structure to be small (one int), but limits
+ * the number of objects a slab (not a cache) can contain when off-slab
+ * bufctls are used. The limit is the size of the largest general cache
+ * that does not use off-slab slabs.
+ * For 32bit archs with 4 kB pages, this is 56.
+ * This is not serious, as it is only for large objects, when it is unwise
+ * to have too many per slab.
+ * Note: This limit can be raised by introducing a general cache whose size
+ * is less than 512 (PAGE_SIZE<<3), but greater than 256.
+ */
+
+typedef unsigned int kmem_bufctl_t; /* the "one int" bufctl entry described above */
+#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) /* largest kmem_bufctl_t value */
+#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) /* largest value minus 1 */
+#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) /* largest value minus 2 */
+#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* largest value minus 3; NOTE(review): presumably the upper bound on objects per slab - confirm at use sites */
+
+/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from a general cache.
+ * Slabs are chained into three lists: fully used, partial, fully free slabs.
+ */
+struct slab {
+ struct list_head list; /* chains the slab into one of the three slab lists */
+ unsigned long colouroff; /* NOTE(review): presumably this slab's colour offset - confirm where slabs are set up */
+ void *s_mem; /* including colour offset; base address used by index_to_obj()/obj_to_index() */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free; /* NOTE(review): presumably the index of the first free object - confirm */
+ unsigned short nodeid; /* NOTE(review): presumably the NUMA node owning the slab - confirm */
+};
+
+/*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU. This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking. We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ */
+struct slab_rcu {
+ struct rcu_head head; /* RCU callback linkage */
+ struct kmem_cache *cachep; /* NOTE(review): presumably the cache the pages are freed for - confirm in slab_destroy */
+ void *addr; /* NOTE(review): presumably the address handed to kmem_freepages - confirm */
+};
+
+/*
+ * struct array_cache
+ *
+ * Purpose:
+ * - LIFO ordering, to hand out cache-warm objects from _alloc
+ * - reduce the number of linked list operations
+ * - reduce spinlock operations
+ *
+ * The limit is stored in the per-cpu structure to reduce the data cache
+ * footprint.
+ *
+ */
+struct array_cache {
+ unsigned int avail; /* NOTE(review): presumably the number of object pointers currently in entry[] - confirm */
+ unsigned int limit; /* kept here, per the comment above, to reduce the data cache footprint */
+ unsigned int batchcount; /* NOTE(review): presumably the transfer batch size - confirm */
+ unsigned int touched; /* NOTE(review): presumably a recent-use marker for the reaper - confirm */
+ spinlock_t lock;
+ void *entry[]; /*
+ * Must have this definition in here for the proper
+ * alignment of array_cache. Also simplifies accessing
+ * the entries.
+ */
+};
+
+/*
+ * bootstrap: The caches do not work without cpuarrays anymore, but the
+ * cpuarrays are allocated from the generic caches...
+ */
+#define BOOT_CPUCACHE_ENTRIES 1 /* also used as the bootstrap .limit in initarray_* and cache_cache */
+struct arraycache_init {
+ struct array_cache cache; /* ends in the flexible array entry[] */
+ void *entries[BOOT_CPUCACHE_ENTRIES]; /* NOTE(review): presumably the storage behind cache.entry[] during bootstrap - confirm */
+};
+
+/*
+ * The slab lists for all objects.
+ */
+struct kmem_list3 {
+ struct list_head slabs_partial; /* partial list first, better asm code */
+ struct list_head slabs_full;
+ struct list_head slabs_free;
+ unsigned long free_objects; /* zeroed by kmem_list3_init() */
+ unsigned int free_limit; /* not touched by kmem_list3_init() */
+ unsigned int colour_next; /* Per-node cache coloring */
+ spinlock_t list_lock; /* NOTE(review): presumably protects the three lists above - confirm */
+ struct array_cache *shared; /* shared per node */
+ struct array_cache **alien; /* on other nodes */
+ unsigned long next_reap; /* updated without locking */
+ int free_touched; /* updated without locking */
+};
+
+/*
+ * Need this for bootstrapping a per node allocator.
+ */
+#define NUM_INIT_LISTS (3 * MAX_NUMNODES) /* three groups of MAX_NUMNODES entries each */
+struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+#define CACHE_CACHE 0 /* NOTE(review): 0, MAX_NUMNODES and 2*MAX_NUMNODES look like the base indices of the three groups - confirm at use sites */
+#define SIZE_AC MAX_NUMNODES
+#define SIZE_L3 (2 * MAX_NUMNODES)
+
+static int drain_freelist(struct kmem_cache *cache,
+ struct kmem_list3 *l3, int tofree);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len,
+ int node);
+static int enable_cpucache(struct kmem_cache *cachep);
+static void cache_reap(struct work_struct *unused);
+
+/*
+ * This function must be completely optimized away if a constant is passed to
+ * it. Mostly the same as what is in linux/slab.h except it returns an index.
+ *
+ * For a compile-time constant @size, each CACHE(x) entry pulled in from
+ * <linux/kmalloc_sizes.h> expands to "if (size <= x) return i; else i++;",
+ * so the result is the position of the first listed size that is >= @size.
+ * Every other path ends in a call to __bad_size(), which is only declared
+ * here (NOTE(review): presumably left undefined so that misuse fails at
+ * link time - confirm).
+ */
+static __always_inline int index_of(const size_t size)
+{
+ extern void __bad_size(void);
+
+ if (__builtin_constant_p(size)) {
+ int i = 0;
+
+#define CACHE(x) \
+ if (size <=x) \
+ return i; \
+ else \
+ i++;
+#include <linux/kmalloc_sizes.h>
+#undef CACHE
+ __bad_size(); /* constant @size larger than every listed size */
+ } else
+ __bad_size(); /* @size is not a compile-time constant */
+ return 0;
+}
+
+static int slab_early_init = 1; /* NOTE(review): presumably cleared once early boot setup is finished - confirm */
+
+#define INDEX_AC index_of(sizeof(struct arraycache_init)) /* index of the kmalloc size that fits an arraycache_init */
+#define INDEX_L3 index_of(sizeof(struct kmem_list3)) /* index of the kmalloc size that fits a kmem_list3 */
+
+/*
+ * Put a kmem_list3 into its empty state: three empty slab lists, no shared
+ * or alien array caches, zeroed counters and an initialised list_lock.
+ * free_limit and next_reap are left untouched.
+ */
+static void kmem_list3_init(struct kmem_list3 *parent)
+{
+	struct list_head *const heads[] = {
+		&parent->slabs_full,
+		&parent->slabs_partial,
+		&parent->slabs_free,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(heads); i++)
+		INIT_LIST_HEAD(heads[i]);
+
+	parent->shared = NULL;
+	parent->alien = NULL;
+	parent->colour_next = 0;
+	spin_lock_init(&parent->list_lock);
+	parent->free_objects = 0;
+	parent->free_touched = 0;
+}
+
+/*
+ * MAKE_LIST - initialise @listp and splice onto it the @slab list (a
+ * kmem_list3 member name: slabs_full, slabs_partial or slabs_free) of
+ * @cachep's node @nodeid.
+ *
+ * The @cachep, @listp and @nodeid arguments are parenthesised so that
+ * arbitrary expressions expand safely.  @slab is a member name and is
+ * pasted as-is.
+ */
+#define MAKE_LIST(cachep, listp, slab, nodeid) \
+	do { \
+		INIT_LIST_HEAD((listp)); \
+		list_splice(&((cachep)->nodelists[(nodeid)]->slab), (listp)); \
+	} while (0)
+
+/*
+ * MAKE_ALL_LISTS - apply MAKE_LIST to all three slab lists of @ptr, taking
+ * the source lists from @cachep's node @nodeid.
+ */
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
+	do { \
+		MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, (nodeid)); \
+		MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, (nodeid)); \
+		MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, (nodeid)); \
+	} while (0)
+
+/*
+ * struct kmem_cache
+ *
+ * manages a cache.
+ */
+
+struct kmem_cache {
+/* 1) per-cpu data, touched during every alloc/free */
+ struct array_cache *array[NR_CPUS]; /* indexed by smp_processor_id() in cpu_cache_get() */
+/* 2) Cache tunables. Protected by cache_chain_mutex */
+ unsigned int batchcount;
+ unsigned int limit;
+ unsigned int shared;
+
+ unsigned int buffer_size; /* distance between consecutive objects, see index_to_obj() */
+ u32 reciprocal_buffer_size; /* lets obj_to_index() use reciprocal_divide() instead of a divide */
+/* 3) touched by every alloc & free from the backend */
+
+ unsigned int flags; /* constant flags */
+ unsigned int num; /* # of objs per slab */
+
+/* 4) cache_grow/shrink */
+ /* order of pgs per slab (2^n) */
+ unsigned int gfporder;
+
+ /* force GFP flags, e.g. GFP_DMA */
+ gfp_t gfpflags;
+
+ size_t colour; /* cache colouring range */
+ unsigned int colour_off; /* colour offset */
+ struct kmem_cache *slabp_cache;
+ unsigned int slab_size;
+ unsigned int dflags; /* dynamic flags */
+
+ /* constructor func */
+ void (*ctor)(void *obj);
+
+/* 5) cache creation/removal */
+ const char *name;
+ struct list_head next;
+
+/* 6) statistics */
+#if STATS
+ unsigned long num_active;
+ unsigned long num_allocations;
+ unsigned long high_mark; /* highest num_active seen, see STATS_SET_HIGH() */
+ unsigned long grown;
+ unsigned long reaped;
+ unsigned long errors;
+ unsigned long max_freeable; /* largest value passed to STATS_SET_FREEABLE() */
+ unsigned long node_allocs;
+ unsigned long node_frees;
+ unsigned long node_overflow;
+ atomic_t allochit;
+ atomic_t allocmiss;
+ atomic_t freehit;
+ atomic_t freemiss;
+#endif /* STATS */
+#if DEBUG
+ /*
+ * If debugging is enabled, then the allocator can add additional
+ * fields and/or padding to every object. buffer_size contains the total
+ * object size including these internal fields, the following two
+ * variables contain the offset to the user object and its size.
+ */
+ int obj_offset;
+ int obj_size;
+#endif /* DEBUG */
+ /*
+ * We put nodelists[] at the end of kmem_cache, because we want to size
+ * this array to nr_node_ids slots instead of MAX_NUMNODES
+ * (see kmem_cache_init())
+ * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
+ * is statically defined, so we reserve the max number of nodes.
+ */
+ struct kmem_list3 *nodelists[MAX_NUMNODES];
+ /*
+ * Do not add fields after nodelists[]
+ */
+};
+
+#define CFLGS_OFF_SLAB (0x80000000UL) /* bit 31 of the cache flags word */
+#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) /* non-zero when CFLGS_OFF_SLAB is set in x->flags */
+
+#define BATCHREFILL_LIMIT 16
+/*
+ * Optimization question: fewer reaps means less probability for unnecessary
+ * cpucache drain/refill cycles.
+ *
+ * OTOH the cpuarrays can contain lots of objects,
+ * which could lock up otherwise freeable slabs.
+ */
+#define REAPTIMEOUT_CPUC (2*HZ) /* two seconds, in jiffies */
+#define REAPTIMEOUT_LIST3 (4*HZ) /* four seconds, in jiffies */
+
+/*
+ * Statistics hooks.  With STATS enabled they update the counters in
+ * struct kmem_cache; otherwise every hook expands to an empty statement so
+ * the call sites need no #ifdefs.
+ *
+ * STATS_SET_HIGH() and STATS_SET_FREEABLE() only ever raise their field
+ * (running maximum).  Macro arguments are parenthesised so that compound
+ * expressions can be passed safely.
+ */
+#if STATS
+#define STATS_INC_ACTIVE(x) ((x)->num_active++)
+#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
+#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
+#define STATS_INC_GROWN(x) ((x)->grown++)
+#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
+#define STATS_SET_HIGH(x) \
+	do { \
+		if ((x)->num_active > (x)->high_mark) \
+			(x)->high_mark = (x)->num_active; \
+	} while (0)
+#define STATS_INC_ERR(x) ((x)->errors++)
+#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
+#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
+#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
+#define STATS_SET_FREEABLE(x, i) \
+	do { \
+		if ((x)->max_freeable < (i)) \
+			(x)->max_freeable = (i); \
+	} while (0)
+#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
+#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
+#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
+#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
+#else /* !STATS */
+#define STATS_INC_ACTIVE(x) do { } while (0)
+#define STATS_DEC_ACTIVE(x) do { } while (0)
+#define STATS_INC_ALLOCED(x) do { } while (0)
+#define STATS_INC_GROWN(x) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_SET_HIGH(x) do { } while (0)
+#define STATS_INC_ERR(x) do { } while (0)
+#define STATS_INC_NODEALLOCS(x) do { } while (0)
+#define STATS_INC_NODEFREES(x) do { } while (0)
+#define STATS_INC_ACOVERFLOW(x) do { } while (0)
+#define STATS_SET_FREEABLE(x, i) do { } while (0)
+#define STATS_INC_ALLOCHIT(x) do { } while (0)
+#define STATS_INC_ALLOCMISS(x) do { } while (0)
+#define STATS_INC_FREEHIT(x) do { } while (0)
+#define STATS_INC_FREEMISS(x) do { } while (0)
+#endif /* STATS */
+
+#if DEBUG
+
+/*
+ * memory layout of objects:
+ * 0 : objp
+ * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
+ * the end of an object is aligned with the end of the real
+ * allocation. Catches writes behind the end of the allocation.
+ * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
+ * redzone word.
+ * cachep->obj_offset: The real object.
+ * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ * [BYTES_PER_WORD long]
+ *
+ * NOTE(review): the layout above is written in units of BYTES_PER_WORD,
+ * while dbg_redzone1()/dbg_redzone2() below compute the redzone positions
+ * with sizeof(unsigned long long) and REDZONE_ALIGN.  The two agree only
+ * where those sizes coincide.
+ */
+
+/* Offset of the user-visible object inside the debug-padded buffer. */
+static int obj_offset(struct kmem_cache *cachep)
+{
+	return cachep->obj_offset;
+}
+
+/* Size of the user-visible object (buffer_size includes the debug fields). */
+static int obj_size(struct kmem_cache *cachep)
+{
+	return cachep->obj_size;
+}
+
+/*
+ * Address of the leading redzone: the unsigned long long immediately in
+ * front of the user object.  BUGs if the cache has no SLAB_RED_ZONE.
+ */
+static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
+{
+	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+	return (unsigned long long*) (objp + obj_offset(cachep) -
+				      sizeof(unsigned long long));
+}
+
+/*
+ * Address of the trailing redzone: the last unsigned long long of the
+ * buffer, moved down by a further REDZONE_ALIGN bytes when SLAB_STORE_USER
+ * is set.  BUGs if the cache has no SLAB_RED_ZONE.
+ */
+static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
+{
+	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+	if (cachep->flags & SLAB_STORE_USER)
+		return (unsigned long long *)(objp + cachep->buffer_size -
+					      sizeof(unsigned long long) -
+					      REDZONE_ALIGN);
+	return (unsigned long long *) (objp + cachep->buffer_size -
+				       sizeof(unsigned long long));
+}
+
+/*
+ * Address of the "last caller" slot: the final BYTES_PER_WORD bytes of the
+ * buffer.  BUGs if the cache has no SLAB_STORE_USER.
+ */
+static void **dbg_userword(struct kmem_cache *cachep, void *objp)
+{
+	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
+	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
+}
+
+#else
+
+/*
+ * Non-debug build: objects carry no extra fields, so the offset is 0 and the
+ * object size equals buffer_size.  The dbg_*() accessors BUG() if reached.
+ * The obj_size() argument is parenthesised so any pointer expression
+ * expands correctly.
+ */
+#define obj_offset(x) 0
+#define obj_size(cachep) ((cachep)->buffer_size)
+#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
+#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
+#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
+
+#endif
+
+/*
+ * Do not go above this order unless 0 objects fit into the slab.
+ *
+ * slab_break_gfp_order starts at the LO value; BREAK_GFP_ORDER_HI is the
+ * alternative setting (NOTE(review): presumably selected at init time on
+ * larger-memory machines - confirm in kmem_cache_init()).
+ */
+#define BREAK_GFP_ORDER_HI 1
+#define BREAK_GFP_ORDER_LO 0
+static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+
+/*
+ * Functions for storing/retrieving the cachep and or slab from the page
+ * allocator. These are used to find the slab an obj belongs to. With kfree(),
+ * these are used to find the cache which an obj belongs to.
+ */
+/* Record the owning cache of a slab page in page->lru.next. */
+static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
+{
+	struct list_head *cookie = (struct list_head *)cache;
+
+	page->lru.next = cookie;
+}
+
+/*
+ * Fetch the cache stored by page_set_cache().  The lookup is done on the
+ * compound head, which must be a slab page (BUG otherwise).
+ */
+static inline struct kmem_cache *page_get_cache(struct page *page)
+{
+	struct page *head = compound_head(page);
+
+	BUG_ON(!PageSlab(head));
+	return (struct kmem_cache *)head->lru.next;
+}
+
+/* Record the slab descriptor of a slab page in page->lru.prev. */
+static inline void page_set_slab(struct page *page, struct slab *slab)
+{
+	struct list_head *cookie = (struct list_head *)slab;
+
+	page->lru.prev = cookie;
+}
+
+/*
+ * Fetch the slab descriptor stored by page_set_slab().  @page itself must
+ * be a slab page (BUG otherwise); no compound-head lookup is done here.
+ */
+static inline struct slab *page_get_slab(struct page *page)
+{
+	struct list_head *cookie;
+
+	BUG_ON(!PageSlab(page));
+	cookie = page->lru.prev;
+	return (struct slab *)cookie;
+}
+
+/* Map an object address to the cache recorded on its head page. */
+static inline struct kmem_cache *virt_to_cache(const void *obj)
+{
+	return page_get_cache(virt_to_head_page(obj));
+}
+
+/* Map an object address to the slab descriptor recorded on its head page. */
+static inline struct slab *virt_to_slab(const void *obj)
+{
+	return page_get_slab(virt_to_head_page(obj));
+}
+
+/* Address of object number @idx in @slab: s_mem plus idx * buffer_size. */
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
+				 unsigned int idx)
+{
+	unsigned int offset = cache->buffer_size * idx;
+
+	return slab->s_mem + offset;
+}
+
+/*
+ * Inverse of index_to_obj(): turn an object address into its index within
+ * @slab.
+ *
+ * We want to avoid an expensive divide : (offset / cache->buffer_size)
+ * Using the fact that buffer_size is a constant for a particular cache,
+ * we can replace (offset / cache->buffer_size) by
+ * reciprocal_divide(offset, cache->reciprocal_buffer_size)
+ */
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct slab *slab, void *obj)
+{
+	const u32 delta = obj - slab->s_mem;
+
+	return reciprocal_divide(delta, cache->reciprocal_buffer_size);
+}
+
+/*
+ * These are the default caches for kmalloc. Custom caches can have other sizes.
+ *
+ * One entry per CACHE(x) line of <linux/kmalloc_sizes.h>, followed by a
+ * terminating entry whose cs_size is ULONG_MAX; init_lock_keys() and
+ * __find_general_cachep() both rely on that sentinel to stop their scans.
+ */
+struct cache_sizes malloc_sizes[] = {
+#define CACHE(x) { .cs_size = (x) },
+#include <linux/kmalloc_sizes.h>
+ CACHE(ULONG_MAX)
+#undef CACHE
+};
+EXPORT_SYMBOL(malloc_sizes);
+
+/*
+ * Must match cache_sizes above. Out of line to keep cache footprint low.
+ *
+ * Built from the same <linux/kmalloc_sizes.h> list: entry x gets the names
+ * "size-x" and "size-x(DMA)"; the table ends with a NULL-name entry.
+ */
+struct cache_names {
+ char *name;
+ char *name_dma;
+};
+
+static struct cache_names __initdata cache_names[] = {
+#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
+#include <linux/kmalloc_sizes.h>
+ {NULL,}
+#undef CACHE
+};
+
+/*
+ * Bootstrap per-cpu arrays: empty (avail 0), limit BOOT_CPUCACHE_ENTRIES,
+ * batchcount 1, not touched.  Remaining members are zero-initialised.
+ */
+static struct arraycache_init initarray_cache __initdata = {
+	.cache = {
+		.avail		= 0,
+		.limit		= BOOT_CPUCACHE_ENTRIES,
+		.batchcount	= 1,
+		.touched	= 0,
+	},
+};
+static struct arraycache_init initarray_generic = {
+	.cache = {
+		.avail		= 0,
+		.limit		= BOOT_CPUCACHE_ENTRIES,
+		.batchcount	= 1,
+		.touched	= 0,
+	},
+};
+
+/*
+ * internal cache of cache description objs
+ *
+ * Statically defined, which is why struct kmem_cache reserves
+ * nodelists[MAX_NUMNODES]. Its objects are struct kmem_cache themselves.
+ */
+static struct kmem_cache cache_cache = {
+ .batchcount = 1,
+ .limit = BOOT_CPUCACHE_ENTRIES,
+ .shared = 1,
+ .buffer_size = sizeof(struct kmem_cache),
+ .name = "kmem_cache",
+};
+
+#define BAD_ALIEN_MAGIC 0x01020304ul
+
+#ifdef CONFIG_LOCKDEP
+
+/*
+ * Slab sometimes uses the kmalloc slabs to store the slab headers
+ * for other slabs "off slab".
+ * The locking for this is tricky in that it nests within the locks
+ * of all other slabs in a few places; to deal with this special
+ * locking we put on-slab caches into a separate lock-class.
+ *
+ * We set lock class for alien array caches which are up during init.
+ * The lock annotation will be lost if all cpus of a node go down and
+ * then come back up during hotplug
+ */
+static struct lock_class_key on_slab_l3_key;
+static struct lock_class_key on_slab_alc_key;
+
+/*
+ * Walk every general (kmalloc) cache up to the ULONG_MAX sentinel and, for
+ * each node of each on-slab cache, assign the dedicated lockdep classes to
+ * the node's list_lock and to the locks of its alien array caches.
+ */
+static inline void init_lock_keys(void)
+{
+	struct cache_sizes *sizep;
+	int node;
+
+	for (sizep = malloc_sizes; sizep->cs_size != ULONG_MAX; sizep++) {
+		for_each_node(node) {
+			struct kmem_list3 *l3 = sizep->cs_cachep->nodelists[node];
+			struct array_cache **alien;
+			int other;
+
+			if (!l3 || OFF_SLAB(sizep->cs_cachep))
+				continue;
+			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
+
+			/*
+			 * FIXME: This check for BAD_ALIEN_MAGIC
+			 * should go away when common slab code is taught to
+			 * work even without alien caches.
+			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+			 * for alloc_alien_cache,
+			 */
+			alien = l3->alien;
+			if (!alien || (unsigned long)alien == BAD_ALIEN_MAGIC)
+				continue;
+
+			for_each_node(other) {
+				if (alien[other])
+					lockdep_set_class(&alien[other]->lock,
+							  &on_slab_alc_key);
+			}
+		}
+	}
+}
+#else
+/* Without lockdep there are no lock classes to assign. */
+static inline void init_lock_keys(void)
+{
+}
+#endif
+
+/*
+ * Guard access to the cache-chain.
+ *
+ * cache_chain is the list of caches, linked through kmem_cache.next.
+ * cache_chain_mutex is taken around walks of that list, e.g. by the CPU
+ * hotplug callback and by the final step of kmem_cache_init().
+ */
+static DEFINE_MUTEX(cache_chain_mutex);
+static struct list_head cache_chain;
+
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ *
+ * g_cpucache_up records how far the bootstrap has progressed.
+ * kmem_cache_init() sets it to FULL when it is done, and
+ * slab_is_available() reports true only in that state.
+ */
+static enum {
+ NONE,
+ PARTIAL_AC,
+ PARTIAL_L3,
+ FULL
+} g_cpucache_up;
+
+/*
+ * Lets boot code ask whether the slab based allocator can be used yet:
+ * returns 1 once the bootstrap state is FULL and 0 before that.
+ */
+int slab_is_available(void)
+{
+	if (g_cpucache_up != FULL)
+		return 0;
+	return 1;
+}
+
+static DEFINE_PER_CPU(struct delayed_work, reap_work);
+
+/* Return @cachep's array cache for the CPU this code is running on. */
+static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
+{
+	const int this_cpu = smp_processor_id();
+
+	return cachep->array[this_cpu];
+}
+
+/*
+ * Map a request size and gfp flags to the general cache serving it.
+ * A zero @size yields ZERO_SIZE_PTR.
+ */
+static inline struct kmem_cache *__find_general_cachep(size_t size,
+							gfp_t gfpflags)
+{
+	struct cache_sizes *entry;
+
+#if DEBUG
+	/*
+	 * Trips if someone calls kmem_cache_create() or __kmalloc()
+	 * before the generic caches are initialized.
+	 */
+	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
+#endif
+	if (size == 0)
+		return ZERO_SIZE_PTR;
+
+	/* Walk to the first size class that can hold the request. */
+	for (entry = malloc_sizes; entry->cs_size < size; entry++)
+		;
+
+	/*
+	 * Really subtle: the final entry has cs_size == ULONG_MAX and
+	 * NULL cache pointers, so oversized requests need no special
+	 * case here.
+	 */
+#ifdef CONFIG_ZONE_DMA
+	if (unlikely(gfpflags & GFP_DMA))
+		return entry->cs_dmacachep;
+#endif
+	return entry->cs_cachep;
+}
+
+/* Out-of-line wrapper around __find_general_cachep(). */
+static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
+{
+	struct kmem_cache *cachep = __find_general_cachep(size, gfpflags);
+
+	return cachep;
+}
+
+/*
+ * Size of the management area for a slab of @nr_objs objects: one
+ * struct slab plus one kmem_bufctl_t per object, rounded up to @align.
+ */
+static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+{
+	size_t raw = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
+
+	return ALIGN(raw, align);
+}
+
+/*
+ * Work out how many objects of @buffer_size fit into a slab of order
+ * @gfporder and how many bytes remain unused.
+ *
+ * @num receives the object count, @left_over the unused byte count.
+ */
+static void cache_estimate(unsigned long gfporder, size_t buffer_size,
+			   size_t align, int flags, size_t *left_over,
+			   unsigned int *num)
+{
+	const size_t slab_size = PAGE_SIZE << gfporder;
+	const int off_slab = (flags & CFLGS_OFF_SLAB) != 0;
+	size_t mgmt_size = 0;
+	int nr_objs;
+
+	/*
+	 * With on-slab management the slab's memory holds, in order:
+	 *
+	 * - the struct slab
+	 * - one kmem_bufctl_t per object
+	 * - padding up to @align
+	 * - @buffer_size bytes per object
+	 *
+	 * With off-slab management the alignment is already part of the
+	 * object size and, slabs being page aligned, the objects come out
+	 * correctly aligned without any management overhead here.
+	 */
+	if (off_slab) {
+		nr_objs = slab_size / buffer_size;
+	} else {
+		/*
+		 * First guess without the padding. The padding is at most
+		 * @align-1 bytes and @buffer_size is at least @align, so
+		 * the guess is either right or exactly one too high.
+		 */
+		nr_objs = (slab_size - sizeof(struct slab)) /
+			  (buffer_size + sizeof(kmem_bufctl_t));
+
+		if (slab_mgmt_size(nr_objs, align) + nr_objs * buffer_size
+		    > slab_size)
+			nr_objs--;
+	}
+
+	if (nr_objs > SLAB_LIMIT)
+		nr_objs = SLAB_LIMIT;
+
+	/* The management size depends on the final, clamped object count. */
+	if (!off_slab)
+		mgmt_size = slab_mgmt_size(nr_objs, align);
+
+	*num = nr_objs;
+	*left_over = slab_size - nr_objs * buffer_size - mgmt_size;
+}
+
+#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
+
+/*
+ * Report a slab consistency problem: print the reporting function, the
+ * cache name and @msg at KERN_ERR, then dump the stack.
+ *
+ * @msg is only read, and callers pass string literals, so it is taken
+ * as a pointer to const.
+ */
+static void __slab_error(const char *function, struct kmem_cache *cachep,
+			 const char *msg)
+{
+	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+	       function, cachep->name, msg);
+	dump_stack();
+}
+
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when using fake NUMA setup to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line.
+ *
+ * The "noaliencache" boot parameter clears use_alien_caches.
+ * kmem_cache_init() additionally clears both use_alien_caches and
+ * numa_platform when there is only one possible node.
+ */
+
+static int use_alien_caches __read_mostly = 1;
+static int numa_platform __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+ use_alien_caches = 0;
+ return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+/* Seed @cpu's reap_node with the online node following the CPU's own. */
+static void init_reap_node(int cpu)
+{
+	int nid = next_node(cpu_to_node(cpu), node_online_map);
+
+	/* Ran past the last online node: wrap to the first one. */
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
+
+	per_cpu(reap_node, cpu) = nid;
+}
+
+/* Advance this CPU's reap_node to the next online node, wrapping around. */
+static void next_reap_node(void)
+{
+	int nid = next_node(__get_cpu_var(reap_node), node_online_map);
+
+	if (unlikely(nid >= MAX_NUMNODES))
+		nid = first_node(node_online_map);
+
+	__get_cpu_var(reap_node) = nid;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
+/*
+ * Start the reap timer on the target CPU. It runs at around 1 to 2Hz via
+ * the workqueue/eventd. The CPU number is mixed into the expiration time
+ * so the CPUs are less likely to fall into lockstep and contend for the
+ * global cache chain lock.
+ */
+static void __cpuinit start_cpu_timer(int cpu)
+{
+	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+
+	/*
+	 * When called from do_initcalls via cpucache_init(),
+	 * init_workqueues() has already run, so keventd is set up by then.
+	 * Nothing to do while keventd is down or if this CPU's work item
+	 * already has a handler installed.
+	 */
+	if (!keventd_up() || reap_work->work.func != NULL)
+		return;
+
+	init_reap_node(cpu);
+	INIT_DELAYED_WORK(reap_work, cache_reap);
+	schedule_delayed_work_on(cpu, reap_work,
+				 __round_jiffies_relative(HZ, cpu));
+}
+
+/*
+ * Allocate, on @node, an array cache with room for @entries object
+ * pointers and initialise it as empty with the given @batchcount.
+ * Returns NULL if the allocation fails.
+ */
+static struct array_cache *alloc_arraycache(int node, int entries,
+					    int batchcount)
+{
+	int memsize = sizeof(struct array_cache) + entries * sizeof(void *);
+	struct array_cache *nc;
+
+	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	if (!nc)
+		return NULL;
+
+	nc->avail = 0;
+	nc->limit = entries;
+	nc->batchcount = batchcount;
+	nc->touched = 0;
+	spin_lock_init(&nc->lock);
+
+	return nc;
+}
+
+/*
+ * Move up to @max object pointers from the top of @from into @to,
+ * limited by what @from holds and by the free room in @to.
+ * The caller is responsible for locking.
+ *
+ * Returns the number of entries moved.
+ */
+static int transfer_objects(struct array_cache *to,
+		struct array_cache *from, unsigned int max)
+{
+	unsigned int room = to->limit - to->avail;
+	int nr = min(min(from->avail, max), room);
+
+	if (nr == 0)
+		return 0;
+
+	/* Take the last @nr entries of @from and append them to @to. */
+	memcpy(&to->entry[to->avail], &from->entry[from->avail - nr],
+	       nr * sizeof(void *));
+
+	to->avail += nr;
+	from->avail -= nr;
+	to->touched = 1;
+
+	return nr;
+}
+
+#ifndef CONFIG_NUMA
+
+/* Without CONFIG_NUMA the alien cache helpers below are no-op stubs. */
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+/* Returns the BAD_ALIEN_MAGIC marker cast to a pointer, never a real table. */
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+ return (struct array_cache **)BAD_ALIEN_MAGIC;
+}
+
+/* Nothing was allocated, so there is nothing to free. */
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
+/* Always returns 0 without touching @objp. */
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ return 0;
+}
+
+/* Always returns NULL. */
+static inline void *alternate_node_alloc(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ return NULL;
+}
+
+/* Always returns NULL. */
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
+{
+ return NULL;
+}
+
+#else /* CONFIG_NUMA */
+
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
+
+/*
+ * Allocate the table of alien array caches for @node.
+ *
+ * The table has nr_node_ids slots. Every online node other than @node
+ * gets an array cache; all other slots are NULL. Any @limit above 1 is
+ * replaced by 12.
+ *
+ * Returns the table, or NULL if any allocation failed (in which case
+ * everything allocated so far has been freed again).
+ */
+static struct array_cache **alloc_alien_cache(int node, int limit)
+{
+	struct array_cache **ac_ptr;
+	int memsize = sizeof(void *) * nr_node_ids;
+	int i;
+
+	if (limit > 1)
+		limit = 12;
+
+	/*
+	 * The table must start out zeroed: for_each_node() only visits
+	 * possible nodes, so with a sparse node map some slots are never
+	 * written below, yet the unwind loop kfree()s every index below
+	 * the failing one.
+	 */
+	ac_ptr = kzalloc_node(memsize, GFP_KERNEL, node);
+	if (!ac_ptr)
+		return NULL;
+
+	for_each_node(i) {
+		/* No alien cache for ourselves or for offline nodes. */
+		if (i == node || !node_online(i))
+			continue;
+
+		ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+		if (!ac_ptr[i]) {
+			for (i--; i >= 0; i--)
+				kfree(ac_ptr[i]);
+			kfree(ac_ptr);
+			return NULL;
+		}
+	}
+	return ac_ptr;
+}
+
+/* Free every per-node array cache in @ac_ptr and the table itself. */
+static void free_alien_cache(struct array_cache **ac_ptr)
+{
+	int nid;
+
+	if (ac_ptr == NULL)
+		return;
+
+	for_each_node(nid) {
+		kfree(ac_ptr[nid]);
+	}
+	kfree(ac_ptr);
+}
+
+/*
+ * Hand all objects held in alien cache @ac back to @node, under that
+ * node's list_lock.
+ */
+static void __drain_alien_cache(struct kmem_cache *cachep,
+				struct array_cache *ac, int node)
+{
+	struct kmem_list3 *remote_l3 = cachep->nodelists[node];
+
+	if (!ac->avail)
+		return;
+
+	spin_lock(&remote_l3->list_lock);
+	/*
+	 * Try the remote node's shared array first: that can save putting
+	 * the objects on the free lists only to fetch them back later.
+	 */
+	if (remote_l3->shared)
+		transfer_objects(remote_l3->shared, ac, ac->limit);
+
+	/* Whatever did not fit goes back to the slabs. */
+	free_block(cachep, ac->entry, ac->avail, node);
+	ac->avail = 0;
+	spin_unlock(&remote_l3->list_lock);
+}
+
+/*
+ * Called from cache_reap(): drain the alien cache of the node this CPU
+ * is currently pointing at (reap_node), if it holds anything and its
+ * lock can be taken without waiting.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+	struct array_cache *ac;
+
+	if (!l3->alien)
+		return;
+
+	ac = l3->alien[node];
+	if (!ac || !ac->avail)
+		return;
+
+	if (!spin_trylock_irq(&ac->lock))
+		return;
+
+	__drain_alien_cache(cachep, ac, node);
+	spin_unlock_irq(&ac->lock);
+}
+
+/* Drain the alien cache of every online node found in table @alien. */
+static void drain_alien_cache(struct kmem_cache *cachep,
+			      struct array_cache **alien)
+{
+	unsigned long flags;
+	int nid;
+
+	for_each_online_node(nid) {
+		struct array_cache *ac = alien[nid];
+
+		if (!ac)
+			continue;
+
+		spin_lock_irqsave(&ac->lock, flags);
+		__drain_alien_cache(cachep, ac, nid);
+		spin_unlock_irqrestore(&ac->lock, flags);
+	}
+}
+
+/*
+ * Free @objp if it belongs to a node other than the one we run on.
+ *
+ * Returns 0 when the object is local (nothing was done) and 1 when the
+ * object was handled here, either by staging it in the alien cache for
+ * its node or by freeing it straight to that node's lists.
+ */
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+	struct slab *slabp = virt_to_slab(objp);
+	int nodeid = slabp->nodeid;
+	int node = numa_node_id();
+	struct kmem_list3 *l3;
+	struct array_cache *alien;
+
+	/*
+	 * Objects from this node are not ours to handle; they must not
+	 * end up in another node's array cache either.
+	 */
+	if (likely(nodeid == node))
+		return 0;
+
+	l3 = cachep->nodelists[node];
+	STATS_INC_NODEFREES(cachep);
+
+	if (!l3->alien || !l3->alien[nodeid]) {
+		/* No alien cache for that node: free directly. */
+		struct kmem_list3 *home = cachep->nodelists[nodeid];
+
+		spin_lock(&home->list_lock);
+		free_block(cachep, &objp, 1, nodeid);
+		spin_unlock(&home->list_lock);
+		return 1;
+	}
+
+	alien = l3->alien[nodeid];
+	spin_lock(&alien->lock);
+	if (unlikely(alien->avail == alien->limit)) {
+		/* Full: make room before staging the object. */
+		STATS_INC_ACOVERFLOW(cachep);
+		__drain_alien_cache(cachep, alien, nodeid);
+	}
+	alien->entry[alien->avail++] = objp;
+	spin_unlock(&alien->lock);
+
+	return 1;
+}
+#endif
+
+/*
+ * Release the per-CPU slab state of @cpu in every cache on cache_chain.
+ *
+ * For each cache the CPU's array cache is detached and its objects are
+ * given to free_block(). If the cpumask of the CPU's node is then empty,
+ * the node's shared array and alien caches are drained and freed too.
+ * A second pass calls drain_freelist() on the node's lists of each cache.
+ *
+ * cpuup_callback() calls this with cache_chain_mutex held; cpuup_prepare()
+ * uses it to unwind after an allocation failure.
+ */
+static void __cpuinit cpuup_canceled(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+ node_to_cpumask_ptr(mask, node);
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
+
+ /* cpu is dead; no one can alloc from it. */
+ nc = cachep->array[cpu];
+ cachep->array[cpu] = NULL;
+ l3 = cachep->nodelists[node];
+
+ if (!l3)
+ goto free_array_cache;
+
+ spin_lock_irq(&l3->list_lock);
+
+ /* Free limit for this kmem_list3 */
+ l3->free_limit -= cachep->batchcount;
+ if (nc)
+ free_block(cachep, nc->entry, nc->avail, node);
+
+ /* Node-wide state is only torn down when the node's cpumask is empty. */
+ if (!cpus_empty(*mask)) {
+ spin_unlock_irq(&l3->list_lock);
+ goto free_array_cache;
+ }
+
+ shared = l3->shared;
+ if (shared) {
+ free_block(cachep, shared->entry,
+ shared->avail, node);
+ l3->shared = NULL;
+ }
+
+ alien = l3->alien;
+ l3->alien = NULL;
+
+ spin_unlock_irq(&l3->list_lock);
+
+ /* The detached arrays are freed after list_lock has been dropped. */
+ kfree(shared);
+ if (alien) {
+ drain_alien_cache(cachep, alien);
+ free_alien_cache(alien);
+ }
+free_array_cache:
+ kfree(nc);
+ }
+ /*
+ * In the previous loop, all the objects were freed to
+ * the respective cache's slabs, now we can go ahead and
+ * shrink each nodelist to its limit.
+ */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+ drain_freelist(cachep, l3, l3->free_objects);
+ }
+}
+
+/*
+ * Allocate the slab state @cpu needs before it is brought up.
+ *
+ * First pass: make sure every cache on cache_chain has a kmem_list3 for
+ * the CPU's node and recompute that list's free_limit. Second pass:
+ * allocate the CPU's array cache plus candidate shared and (when
+ * use_alien_caches is set) alien caches; the candidates are installed
+ * only if the node has none yet and are freed again otherwise.
+ *
+ * Returns 0 on success. On allocation failure the work done so far is
+ * unwound with cpuup_canceled() and -ENOMEM is returned.
+ */
+static int __cpuinit cpuup_prepare(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+ const int memsize = sizeof(struct kmem_list3);
+
+ /*
+ * We need to do this right in the beginning since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_list3 and not this cpu's kmem_list3
+ */
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ /*
+ * Set up the size64 kmemlist for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
+ goto bad;
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+
+ /*
+ * Now we can go ahead with allocating the shared arrays and
+ * array caches
+ */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct array_cache *nc;
+ struct array_cache *shared = NULL;
+ struct array_cache **alien = NULL;
+
+ nc = alloc_arraycache(node, cachep->limit,
+ cachep->batchcount);
+ if (!nc)
+ goto bad;
+ if (cachep->shared) {
+ shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount,
+ 0xbaadf00d);
+ if (!shared) {
+ kfree(nc);
+ goto bad;
+ }
+ }
+ if (use_alien_caches) {
+ alien = alloc_alien_cache(node, cachep->limit);
+ if (!alien) {
+ kfree(shared);
+ kfree(nc);
+ goto bad;
+ }
+ }
+ cachep->array[cpu] = nc;
+ l3 = cachep->nodelists[node];
+ BUG_ON(!l3);
+
+ spin_lock_irq(&l3->list_lock);
+ if (!l3->shared) {
+ /*
+ * We are serialised from CPU_DEAD or
+ * CPU_UP_CANCELLED by the cpucontrol lock
+ */
+ l3->shared = shared;
+ shared = NULL;
+ }
+#ifdef CONFIG_NUMA
+ if (!l3->alien) {
+ l3->alien = alien;
+ alien = NULL;
+ }
+#endif
+ spin_unlock_irq(&l3->list_lock);
+ /*
+ * Whatever was installed above has been set to NULL, so these
+ * two calls only release candidates that were not needed.
+ */
+ kfree(shared);
+ free_alien_cache(alien);
+ }
+ return 0;
+bad:
+ cpuup_canceled(cpu);
+ return -ENOMEM;
+}
+
+/*
+ * CPU hotplug notifier callback for the slab allocator.
+ *
+ * UP_PREPARE allocates the per-CPU state via cpuup_prepare(); ONLINE and
+ * DOWN_FAILED start the reap timer; DOWN_PREPARE stops it; DEAD and
+ * UP_CANCELED release the per-CPU state via cpuup_canceled(). The
+ * *_FROZEN variants are handled like their plain counterparts.
+ *
+ * Returns NOTIFY_BAD if cpuup_prepare() failed, NOTIFY_OK otherwise.
+ */
+static int __cpuinit cpuup_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ int err = 0;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ mutex_lock(&cache_chain_mutex);
+ err = cpuup_prepare(cpu);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ start_cpu_timer(cpu);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ /*
+ * Shutdown cache reaper. Note that the cache_chain_mutex is
+ * held so that if cache_reap() is invoked it cannot do
+ * anything expensive but will only modify reap_work
+ * and reschedule the timer.
+ *
+ * NOTE(review): this case does not take cache_chain_mutex in
+ * the code below, so the remark above looks stale - confirm.
+ */
+ cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
+ /* Now the cache_reaper is guaranteed to be not running. */
+ per_cpu(reap_work, cpu).work.func = NULL;
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ start_cpu_timer(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ /*
+ * Even if all the cpus of a node are down, we don't free the
+ * kmem_list3 of any cache. This to avoid a race between
+ * cpu_down, and a kmalloc allocation from another cpu for
+ * memory from the node of the cpu going down. The list3
+ * structure is usually allocated from kmem_cache_create() and
+ * gets destroyed at kmem_cache_destroy().
+ */
+ /* fall through */
+#endif
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ mutex_lock(&cache_chain_mutex);
+ cpuup_canceled(cpu);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ }
+ return err ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+/* Notifier block that routes CPU hotplug events to cpuup_callback(). */
+static struct notifier_block __cpuinitdata cpucache_notifier = {
+	.notifier_call	= &cpuup_callback,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+/*
+ * Replace the static bootstrap kmem_list3 @list of @cachep on node
+ * @nodeid with a kmalloc'ed copy.
+ */
+static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+		      int nodeid)
+{
+	struct kmem_list3 *ptr = kmalloc_node(sizeof(*ptr), GFP_KERNEL, nodeid);
+
+	BUG_ON(!ptr);
+
+	local_irq_disable();
+	memcpy(ptr, list, sizeof(*ptr));
+	/*
+	 * A spinlock copied with memcpy cannot be assumed to be properly
+	 * initialized, so set it up again.
+	 */
+	spin_lock_init(&ptr->list_lock);
+
+	MAKE_ALL_LISTS(cachep, ptr, nodeid);
+	cachep->nodelists[nodeid] = ptr;
+	local_irq_enable();
+}
+
+/*
+ * Point @cachep's per-node lists at the static bootstrap kmem_list3
+ * entries starting at @index, one per online node. Used for caches
+ * whose buffer_size equals the size of a kmem_list3.
+ */
+static void __init set_up_list3s(struct kmem_cache *cachep, int index)
+{
+	int node;
+
+	for_each_online_node(node) {
+		struct kmem_list3 *l3 = &initkmem_list3[index + node];
+
+		cachep->nodelists[node] = l3;
+		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+	}
+}
+
+/*
+ * Initialisation. Called after the page allocator has been initialised and
+ * before smp_init().
+ *
+ * Bootstraps cache_cache and the general caches from static data, then
+ * swaps the static head arrays and kmem_list3s for kmalloc'ed ones, sizes
+ * the head arrays, marks the allocator as fully up and registers the CPU
+ * hotplug notifier. Any failure along the way is fatal (BUG/SLAB_PANIC).
+ */
+void __init kmem_cache_init(void)
+{
+	size_t left_over;
+	struct cache_sizes *sizes;
+	struct cache_names *names;
+	int i;
+	int order;
+	int node;
+
+	if (num_possible_nodes() == 1) {
+		use_alien_caches = 0;
+		numa_platform = 0;
+	}
+
+	for (i = 0; i < NUM_INIT_LISTS; i++) {
+		kmem_list3_init(&initkmem_list3[i]);
+		if (i < MAX_NUMNODES)
+			cache_cache.nodelists[i] = NULL;
+	}
+	set_up_list3s(&cache_cache, CACHE_CACHE);
+
+	/*
+	 * Fragmentation resistance on low memory - only use bigger
+	 * page orders on machines with more than 32MB of memory.
+	 */
+	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+
+	/* Bootstrap is tricky, because several objects are allocated
+	 * from caches that do not exist yet:
+	 * 1) initialize the cache_cache cache: it contains the struct
+	 *    kmem_cache structures of all caches, except cache_cache itself:
+	 *    cache_cache is statically allocated.
+	 *    Initially an __init data area is used for the head array and the
+	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
+	 *    array at the end of the bootstrap.
+	 * 2) Create the first kmalloc cache.
+	 *    The struct kmem_cache for the new cache is allocated normally.
+	 *    An __init data area is used for the head array.
+	 * 3) Create the remaining kmalloc caches, with minimally sized
+	 *    head arrays.
+	 * 4) Replace the __init data head arrays for cache_cache and the first
+	 *    kmalloc cache with kmalloc allocated arrays.
+	 * 5) Replace the __init data for kmem_list3 for cache_cache and
+	 *    the other caches with kmalloc allocated memory.
+	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
+	 */
+
+	node = numa_node_id();
+
+	/* 1) create the cache_cache */
+	INIT_LIST_HEAD(&cache_chain);
+	list_add(&cache_cache.next, &cache_chain);
+	cache_cache.colour_off = cache_line_size();
+	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
+
+	/*
+	 * struct kmem_cache size depends on nr_node_ids, which
+	 * can be less than MAX_NUMNODES.
+	 */
+	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
+				 nr_node_ids * sizeof(struct kmem_list3 *);
+#if DEBUG
+	cache_cache.obj_size = cache_cache.buffer_size;
+#endif
+	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+					cache_line_size());
+	cache_cache.reciprocal_buffer_size =
+		reciprocal_value(cache_cache.buffer_size);
+
+	/* Smallest page order that holds at least one object. */
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
+	BUG_ON(!cache_cache.num);
+	cache_cache.gfporder = order;
+	cache_cache.colour = left_over / cache_cache.colour_off;
+	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
+				      sizeof(struct slab), cache_line_size());
+
+	/* 2+3) create the kmalloc caches */
+	sizes = malloc_sizes;
+	names = cache_names;
+
+	/*
+	 * Initialize the caches that provide memory for the array cache and the
+	 * kmem_list3 structures first.  Without this, further allocations will
+	 * bug.
+	 */
+
+	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
+					sizes[INDEX_AC].cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					NULL);
+
+	if (INDEX_AC != INDEX_L3) {
+		sizes[INDEX_L3].cs_cachep =
+			kmem_cache_create(names[INDEX_L3].name,
+				sizes[INDEX_L3].cs_size,
+				ARCH_KMALLOC_MINALIGN,
+				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+				NULL);
+	}
+
+	slab_early_init = 0;
+
+	while (sizes->cs_size != ULONG_MAX) {
+		/*
+		 * For performance, all the general caches are L1 aligned.
+		 * This should be particularly beneficial on SMP boxes, as it
+		 * eliminates "false sharing".
+		 * Note for systems short on memory removing the alignment will
+		 * allow tighter packing of the smaller caches.
+		 */
+		if (!sizes->cs_cachep) {
+			sizes->cs_cachep = kmem_cache_create(names->name,
+					sizes->cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					NULL);
+		}
+#ifdef CONFIG_ZONE_DMA
+		sizes->cs_dmacachep = kmem_cache_create(
+					names->name_dma,
+					sizes->cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
+						SLAB_PANIC,
+					NULL);
+#endif
+		sizes++;
+		names++;
+	}
+	/* 4) Replace the bootstrap head arrays */
+	{
+		struct array_cache *ptr;
+
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		/* Fail loudly, as init_list() does, instead of copying to NULL. */
+		BUG_ON(!ptr);
+
+		local_irq_disable();
+		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
+		memcpy(ptr, cpu_cache_get(&cache_cache),
+		       sizeof(struct arraycache_init));
+		/*
+		 * Do not assume that spinlocks can be initialized via memcpy:
+		 */
+		spin_lock_init(&ptr->lock);
+
+		cache_cache.array[smp_processor_id()] = ptr;
+		local_irq_enable();
+
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		BUG_ON(!ptr);
+
+		local_irq_disable();
+		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
+		       != &initarray_generic.cache);
+		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
+		       sizeof(struct arraycache_init));
+		/*
+		 * Do not assume that spinlocks can be initialized via memcpy:
+		 */
+		spin_lock_init(&ptr->lock);
+
+		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
+		    ptr;
+		local_irq_enable();
+	}
+	/* 5) Replace the bootstrap kmem_list3's */
+	{
+		int nid;
+
+		for_each_online_node(nid) {
+			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
+
+			init_list(malloc_sizes[INDEX_AC].cs_cachep,
+				  &initkmem_list3[SIZE_AC + nid], nid);
+
+			if (INDEX_AC != INDEX_L3) {
+				init_list(malloc_sizes[INDEX_L3].cs_cachep,
+					  &initkmem_list3[SIZE_L3 + nid], nid);
+			}
+		}
+	}
+
+	/* 6) resize the head arrays to their final sizes */
+	{
+		struct kmem_cache *cachep;
+		mutex_lock(&cache_chain_mutex);
+		list_for_each_entry(cachep, &cache_chain, next)
+			if (enable_cpucache(cachep))
+				BUG();
+		mutex_unlock(&cache_chain_mutex);
+	}
+
+	/* Annotate slab for lockdep -- annotate the malloc caches */
+	init_lock_keys();
+
+
+	/* Done! */
+	g_cpucache_up = FULL;
+
+	/*
+	 * Register a cpu startup notifier callback that initializes
+	 * cpu_cache_get for all new cpus
+	 */
+	register_cpu_notifier(&cpucache_notifier);
+
+	/*
+	 * The reap timers are started later, with a module init call: That part
+	 * of the kernel is not yet operational.
+	 */
+}
+
+/* Initcall: start the reap timer on every CPU that is online by now. */
+static int __init cpucache_init(void)
+{
+	int cpu;
+
+	/*
+	 * These are the timers that return unneeded pages to the page
+	 * allocator.
+	 */
+	for_each_online_cpu(cpu) {
+		start_cpu_timer(cpu);
+	}
+
+	return 0;
+}
+__initcall(cpucache_init);
+
+/*
+ * Get the pages for one slab of @cachep from the page allocator, on node
+ * @nodeid. The cache-lock need not be held.
+ *
+ * A request for dmaable memory is honoured. A request without it may
+ * still be handed dmaable memory, but that is relatively rare and can be
+ * ignored.
+ *
+ * Returns the address of the first page, or NULL on failure.
+ */
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+	const int reclaimable = (cachep->flags & SLAB_RECLAIM_ACCOUNT) != 0;
+	struct page *page;
+	int nr_pages;
+	int i;
+
+#ifndef CONFIG_MMU
+	/*
+	 * Nommu uses slabs for process anonymous memory allocations and so
+	 * needs __GFP_COMP to refcount higher order allocations properly.
+	 */
+	flags |= __GFP_COMP;
+#endif
+
+	flags |= cachep->gfpflags;
+	if (reclaimable)
+		flags |= __GFP_RECLAIMABLE;
+
+	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+	if (!page)
+		return NULL;
+
+	nr_pages = 1 << cachep->gfporder;
+
+	/* Account the pages to the matching slab zone counter. */
+	if (reclaimable)
+		add_zone_page_state(page_zone(page),
+				    NR_SLAB_RECLAIMABLE, nr_pages);
+	else
+		add_zone_page_state(page_zone(page),
+				    NR_SLAB_UNRECLAIMABLE, nr_pages);
+
+	for (i = 0; i < nr_pages; i++)
+		__SetPageSlab(&page[i]);
+
+	return page_address(page);
+}
+
+/*
+ * Give the pages of one slab, starting at @addr, back to the page
+ * allocator and undo the accounting done when they were obtained.
+ */
+static void kmem_freepages(struct kmem_cache *cachep, void *addr)
+{
+	const unsigned long nr_freed = (1 << cachep->gfporder);
+	struct page *page = virt_to_page(addr);
+	unsigned long idx;
+
+	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+		sub_zone_page_state(page_zone(page),
+				    NR_SLAB_RECLAIMABLE, nr_freed);
+	else
+		sub_zone_page_state(page_zone(page),
+				    NR_SLAB_UNRECLAIMABLE, nr_freed);
+
+	/* Every page of the slab must still carry the slab flag. */
+	for (idx = 0; idx < nr_freed; idx++) {
+		BUG_ON(!PageSlab(&page[idx]));
+		__ClearPageSlab(&page[idx]);
+	}
+
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += nr_freed;
+
+	free_pages((unsigned long)addr, cachep->gfporder);
+}
+
+/*
+ * RCU callback queued by slab_destroy(): release the slab's pages and,
+ * for off-slab caches, the separately allocated slab descriptor.
+ */
+static void kmem_rcu_free(struct rcu_head *head)
+{
+	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
+	struct kmem_cache *cachep = slab_rcu->cachep;
+	void *pages = slab_rcu->addr;
+
+	kmem_freepages(cachep, pages);
+	if (!OFF_SLAB(cachep))
+		return;
+
+	kmem_cache_free(cachep->slabp_cache, slab_rcu);
+}
+
+#if DEBUG
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+/*
+ * Record debugging information in the body of the object at @addr:
+ * a start marker, @caller, the current CPU id, then kernel text
+ * addresses found by scanning the stack from the slot of @caller up to
+ * the stack end, and finally an end marker.
+ *
+ * Objects smaller than five longs are left untouched.
+ */
+static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
+ unsigned long caller)
+{
+ int size = obj_size(cachep);
+
+ addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
+
+ if (size < 5 * sizeof(unsigned long))
+ return;
+
+ *addr++ = 0x12345678;
+ *addr++ = caller;
+ *addr++ = smp_processor_id();
+ size -= 3 * sizeof(unsigned long);
+ {
+ unsigned long *sptr = &caller;
+ unsigned long svalue;
+
+ while (!kstack_end(sptr)) {
+ svalue = *sptr++;
+ if (kernel_text_address(svalue)) {
+ *addr++ = svalue;
+ size -= sizeof(unsigned long);
+ /* Stop while there is still room for the end marker. */
+ if (size <= sizeof(unsigned long))
+ break;
+ }
+ }
+
+ }
+ *addr++ = 0x87654321;
+}
+#endif
+
+/*
+ * Fill the payload of the object at @addr with @val and put POISON_END
+ * in its last byte.
+ */
+static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
+{
+	int size = obj_size(cachep);
+	unsigned char *payload = (unsigned char *)addr + obj_offset(cachep);
+
+	memset(payload, val, size);
+	payload[size - 1] = POISON_END;
+}
+
+/*
+ * Hexdump @limit bytes of @data starting at @offset on one line. If
+ * exactly one byte differs from POISON_FREE, and by a single bit only,
+ * add a hint that the RAM may be bad.
+ */
+static void dump_line(char *data, int offset, int limit)
+{
+	char *line = data + offset;
+	unsigned char error = 0;
+	int bad_count = 0;
+	int i;
+
+	printk(KERN_ERR "%03x:", offset);
+	for (i = 0; i < limit; i++) {
+		if (line[i] != POISON_FREE) {
+			error = line[i];
+			bad_count++;
+		}
+		printk(" %02x", (unsigned char)line[i]);
+	}
+	printk("\n");
+
+	if (bad_count != 1)
+		return;
+
+	/* More than one bit set in the difference: not a single bit flip. */
+	error ^= POISON_FREE;
+	if (error & (error - 1))
+		return;
+
+	printk(KERN_ERR "Single bit error detected. Probably "
+			"bad RAM.\n");
+#ifdef CONFIG_X86
+	printk(KERN_ERR "Run memtest86+ or a similar memory "
+			"test tool.\n");
+#else
+	printk(KERN_ERR "Run a memory test tool.\n");
+#endif
+}
+#endif
+
+#if DEBUG
+
+/*
+ * Print the debug information kept for @objp: the red zone words and
+ * the last user if the cache records them, followed by a hexdump of at
+ * most @lines 16-byte lines of the object.
+ */
+static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
+{
+	char *realobj;
+	int size;
+	int pos;
+
+	if (cachep->flags & SLAB_RED_ZONE) {
+		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
+			*dbg_redzone1(cachep, objp),
+			*dbg_redzone2(cachep, objp));
+	}
+
+	if (cachep->flags & SLAB_STORE_USER) {
+		printk(KERN_ERR "Last user: [<%p>]",
+			*dbg_userword(cachep, objp));
+		print_symbol("(%s)",
+				(unsigned long)*dbg_userword(cachep, objp));
+		printk("\n");
+	}
+
+	realobj = (char *)objp + obj_offset(cachep);
+	size = obj_size(cachep);
+
+	for (pos = 0; pos < size && lines; pos += 16, lines--) {
+		/* The final line may be shorter than 16 bytes. */
+		int limit = (pos + 16 > size) ? size - pos : 16;
+
+		dump_line(realobj, pos, limit);
+	}
+}
+
+/*
+ * Verify that the payload of the free object @objp still holds the
+ * poison pattern: POISON_FREE in every byte except the last, which must
+ * be POISON_END.
+ *
+ * On a mismatch a header is printed once and each 16-byte line that
+ * contains a bad byte is hexdumped. If anything was wrong, the
+ * neighbouring objects in the same slab are dumped as well.
+ */
+static void check_poison_obj(struct kmem_cache *cachep, void *objp)
+{
+	char *realobj;
+	int size, i;
+	int lines = 0;
+
+	realobj = (char *)objp + obj_offset(cachep);
+	size = obj_size(cachep);
+
+	for (i = 0; i < size; i++) {
+		char exp = POISON_FREE;
+		int limit;
+
+		if (i == size - 1)
+			exp = POISON_END;
+		if (realobj[i] == exp)
+			continue;
+
+		/* Mismatch ! */
+		/* Print header */
+		if (lines == 0) {
+			printk(KERN_ERR
+				"Slab corruption: %s start=%p, len=%d\n",
+				cachep->name, realobj, size);
+			print_objinfo(cachep, objp, 0);
+		}
+		/* Hexdump the affected line */
+		i = (i / 16) * 16;
+		limit = 16;
+		if (i + limit > size)
+			limit = size - i;
+		dump_line(realobj, i, limit);
+		/*
+		 * Move to the last byte of the line just dumped; the loop
+		 * increment then lands on the first byte of the next line.
+		 * Advancing by a full 16 here would skip that byte.
+		 */
+		i += 15;
+		lines++;
+		/* Stop once more than 5 lines have been dumped */
+		if (lines > 5)
+			break;
+	}
+	if (lines != 0) {
+		/* Print some data about the neighboring objects, if they
+		 * exist:
+		 */
+		struct slab *slabp = virt_to_slab(objp);
+		unsigned int objnr;
+
+		objnr = obj_to_index(cachep, slabp, objp);
+		if (objnr) {
+			objp = index_to_obj(cachep, slabp, objnr - 1);
+			realobj = (char *)objp + obj_offset(cachep);
+			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
+			       realobj, size);
+			print_objinfo(cachep, objp, 2);
+		}
+		if (objnr + 1 < cachep->num) {
+			objp = index_to_obj(cachep, slabp, objnr + 1);
+			realobj = (char *)objp + obj_offset(cachep);
+			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
+			       realobj, size);
+			print_objinfo(cachep, objp, 2);
+		}
+	}
+}
+#endif
+
+#if DEBUG
+/*
+ * Debug checks run on every object of a slab that is about to be
+ * destroyed: poisoned objects must still hold the poison pattern and
+ * red zones must still be marked inactive.
+ */
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
+{
+	int idx;
+
+	for (idx = 0; idx < cachep->num; idx++) {
+		void *objp = index_to_obj(cachep, slabp, idx);
+
+		if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+			if (cachep->buffer_size % PAGE_SIZE == 0 &&
+					OFF_SLAB(cachep)) {
+				kernel_map_pages(virt_to_page(objp),
+					cachep->buffer_size / PAGE_SIZE, 1);
+			} else {
+				check_poison_obj(cachep, objp);
+			}
+#else
+			check_poison_obj(cachep, objp);
+#endif
+		}
+
+		if (!(cachep->flags & SLAB_RED_ZONE))
+			continue;
+
+		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
+			slab_error(cachep, "start of a freed object "
+				   "was overwritten");
+		if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
+			slab_error(cachep, "end of a freed object "
+				   "was overwritten");
+	}
+}
+#else
+/* Nothing to check when slab debugging is compiled out. */
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
+{
+}
+#endif
+
+/**
+ * slab_destroy - destroy and release all objects in a slab
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
+ * Runs the debug checks on the slab's objects and gives its memory back
+ * to the system, either right away or through an RCU callback for
+ * SLAB_DESTROY_BY_RCU caches. The slab must already have been unlinked
+ * from the cache. The cache-lock is not held/needed.
+ */
+static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
+{
+	void *addr = slabp->s_mem - slabp->colouroff;
+	struct slab_rcu *slab_rcu;
+
+	slab_destroy_debugcheck(cachep, slabp);
+
+	if (likely(!(cachep->flags & SLAB_DESTROY_BY_RCU))) {
+		kmem_freepages(cachep, addr);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+		return;
+	}
+
+	/* Reuse the slab descriptor as the RCU callback's bookkeeping. */
+	slab_rcu = (struct slab_rcu *)slabp;
+	slab_rcu->cachep = cachep;
+	slab_rcu->addr = addr;
+	call_rcu(&slab_rcu->head, kmem_rcu_free);
+}
+
+/*
+ * Free everything hanging off @cachep (the array caches of the online
+ * CPUs and the list3 structures of the online nodes) and then the
+ * cache descriptor itself.
+ */
+static void __kmem_cache_destroy(struct kmem_cache *cachep)
+{
+	int cpu;
+	int node;
+
+	for_each_online_cpu(cpu)
+		kfree(cachep->array[cpu]);
+
+	/* NUMA: free the list3 structures */
+	for_each_online_node(node) {
+		struct kmem_list3 *l3 = cachep->nodelists[node];
+
+		if (!l3)
+			continue;
+
+		kfree(l3->shared);
+		free_alien_cache(l3->alien);
+		kfree(l3);
+	}
+
+	kmem_cache_free(&cache_cache, cachep);
+}
+
+
+/**
+ * calculate_slab_order - calculate size (page order) of slabs
+ * @cachep: pointer to the cache that is being created
+ * @size: size of objects to be created in this cache.
+ * @align: required alignment for the objects.
+ * @flags: slab allocation flags
+ *
+ * Also calculates the number of objects per slab.
+ *
+ * This could be made much more intelligent. For now, try to avoid using
+ * high order pages for slabs. When the gfp() functions are more friendly
+ * towards high-order requests, this should be changed.
+ */
+static size_t calculate_slab_order(struct kmem_cache *cachep,
+			size_t size, size_t align, unsigned long flags)
+{
+	size_t left_over = 0;
+	int order;
+
+	for (order = 0; order <= KMALLOC_MAX_ORDER; order++) {
+		unsigned int objs;
+		size_t waste;
+
+		cache_estimate(order, size, align, flags, &waste, &objs);
+		if (objs == 0)
+			continue;
+
+		if (flags & CFLGS_OFF_SLAB) {
+			/*
+			 * Max number of objs-per-slab for caches which
+			 * use off-slab slabs. Needed to avoid a possible
+			 * looping condition in cache_grow().
+			 */
+			unsigned long offslab_limit =
+				(size - sizeof(struct slab)) /
+				sizeof(kmem_bufctl_t);
+
+			if (objs > offslab_limit)
+				break;
+		}
+
+		/* This order is usable - remember it. */
+		cachep->num = objs;
+		cachep->gfporder = order;
+		left_over = waste;
+
+		/*
+		 * Stop at this order when any of the following holds:
+		 *  - the cache is VFS-reclaimable: most of its allocations
+		 *    are GFP_NOFS, and higher-order pages are unwanted when
+		 *    the dcache cannot be shrunk;
+		 *  - the order has reached slab_break_gfp_order: very large
+		 *    slabs are currently bad for the gfp()s;
+		 *  - internal fragmentation is acceptable (at most 1/8 of
+		 *    the slab is left over).
+		 */
+		if ((flags & SLAB_RECLAIM_ACCOUNT) ||
+		    order >= slab_break_gfp_order ||
+		    left_over * 8 <= (PAGE_SIZE << order))
+			break;
+	}
+	return left_over;
+}
+
+/*
+ * setup_cpu_cache - give a newly created cache its initial per-cpu array
+ * cache and per-node lists.
+ * @cachep: the cache being set up
+ *
+ * Once g_cpucache_up is FULL this simply defers to enable_cpucache() and
+ * returns its result. Before that, bootstrap storage is used and the function
+ * returns 0; allocation failures on this path are fatal (BUG).
+ */
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+{
+	if (g_cpucache_up == FULL)
+		return enable_cpucache(cachep);
+
+	if (g_cpucache_up == NONE) {
+		/*
+		 * Note: the first kmem_cache_create must create the cache
+		 * that's used by kmalloc(24), otherwise the creation of
+		 * further caches will BUG().
+		 */
+		cachep->array[smp_processor_id()] = &initarray_generic.cache;
+
+		/*
+		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
+		 * the first cache, then we need to set up all its list3s,
+		 * otherwise the creation of further caches will BUG().
+		 */
+		set_up_list3s(cachep, SIZE_AC);
+		if (INDEX_AC == INDEX_L3)
+			g_cpucache_up = PARTIAL_L3;
+		else
+			g_cpucache_up = PARTIAL_AC;
+	} else {
+		cachep->array[smp_processor_id()] =
+			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		/*
+		 * The array cache is dereferenced unconditionally below, so
+		 * fail loudly here rather than through a NULL dereference.
+		 * This mirrors the check on the kmalloc_node() result below.
+		 */
+		BUG_ON(!cachep->array[smp_processor_id()]);
+
+		if (g_cpucache_up == PARTIAL_AC) {
+			set_up_list3s(cachep, SIZE_L3);
+			g_cpucache_up = PARTIAL_L3;
+		} else {
+			int node;
+			for_each_online_node(node) {
+				cachep->nodelists[node] =
+				    kmalloc_node(sizeof(struct kmem_list3),
+						GFP_KERNEL, node);
+				BUG_ON(!cachep->nodelists[node]);
+				kmem_list3_init(cachep->nodelists[node]);
+			}
+		}
+	}
+	/* Stagger the first reap time using the cache's address. */
+	cachep->nodelists[numa_node_id()]->next_reap =
+			jiffies + REAPTIMEOUT_LIST3 +
+			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+	/* Start with an empty, minimally sized per-cpu array. */
+	cpu_cache_get(cachep)->avail = 0;
+	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+	cpu_cache_get(cachep)->batchcount = 1;
+	cpu_cache_get(cachep)->touched = 0;
+	cachep->batchcount = 1;
+	cachep->limit = BOOT_CPUCACHE_ENTRIES;
+	return 0;
+}
+
+/**
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ *
+ * Returns a ptr to the cache on success, NULL on failure.
+ * Cannot be called within an interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * @name must be valid until the cache is destroyed. This implies that
+ * the module calling this has to destroy the cache before getting unloaded.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ */
+struct kmem_cache *
+kmem_cache_create (const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ size_t left_over, slab_size, ralign;
+ struct kmem_cache *cachep = NULL, *pc;
+
+ /*
+ * Sanity checks... these are all serious usage bugs.
+ */
+ if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
+ size > KMALLOC_MAX_SIZE) {
+ printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
+ name);
+ BUG();
+ }
+
+ /*
+ * We use cache_chain_mutex to ensure a consistent view of
+ * cpu_online_map as well. Please see cpuup_callback
+ */
+ get_online_cpus();
+ mutex_lock(&cache_chain_mutex);
+
+ /* Walk the existing caches and refuse to create a duplicate name. */
+ list_for_each_entry(pc, &cache_chain, next) {
+ char tmp;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ res = probe_kernel_address(pc->name, tmp);
+ if (res) {
+ printk(KERN_ERR
+ "SLAB: cache with size %d has lost its name\n",
+ pc->buffer_size);
+ continue;
+ }
+
+ if (!strcmp(pc->name, name)) {
+ printk(KERN_ERR
+ "kmem_cache_create: duplicate cache %s\n", name);
+ dump_stack();
+ goto oops;
+ }
+ }
+
+#if DEBUG
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+#if FORCED_DEBUG
+ /*
+ * Enable redzoning and last user accounting, except for caches with
+ * large objects, if the increased size would increase the object size
+ * above the next power of two: caches with object sizes just above a
+ * power of two have a significant amount of internal fragmentation.
+ */
+ if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
+ 2 * sizeof(unsigned long long)))
+ flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
+ if (!(flags & SLAB_DESTROY_BY_RCU))
+ flags |= SLAB_POISON;
+#endif
+ /* Poisoning and SLAB_DESTROY_BY_RCU must not be combined. */
+ if (flags & SLAB_DESTROY_BY_RCU)
+ BUG_ON(flags & SLAB_POISON);
+#endif
+ /*
+ * Always checks flags, a caller might be expecting debug support which
+ * isn't available.
+ */
+ BUG_ON(flags & ~CREATE_MASK);
+
+ /*
+ * Check that size is in terms of words. This is needed to avoid
+ * unaligned accesses for some archs when redzoning is used, and makes
+ * sure any on-slab bufctl's are also correctly aligned.
+ */
+ if (size & (BYTES_PER_WORD - 1)) {
+ size += (BYTES_PER_WORD - 1);
+ size &= ~(BYTES_PER_WORD - 1);
+ }
+
+ /* calculate the final buffer alignment: */
+
+ /* 1) arch recommendation: can be overridden for debug */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ /*
+ * Default alignment: as specified by the arch code. Except if
+ * an object is really small, then squeeze multiple objects into
+ * one cacheline.
+ */
+ ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ } else {
+ ralign = BYTES_PER_WORD;
+ }
+
+ /*
+ * Redzoning and user store require word alignment or possibly larger.
+ * Note this will be overridden by architecture or caller mandated
+ * alignment if either is greater than BYTES_PER_WORD.
+ */
+ if (flags & SLAB_STORE_USER)
+ ralign = BYTES_PER_WORD;
+
+ if (flags & SLAB_RED_ZONE) {
+ ralign = REDZONE_ALIGN;
+ /* If redzoning, ensure that the second redzone is suitably
+ * aligned, by adjusting the object size accordingly. */
+ size += REDZONE_ALIGN - 1;
+ size &= ~(REDZONE_ALIGN - 1);
+ }
+
+ /* 2) arch mandated alignment */
+ if (ralign < ARCH_SLAB_MINALIGN) {
+ ralign = ARCH_SLAB_MINALIGN;
+ }
+ /* 3) caller mandated alignment */
+ if (ralign < align) {
+ ralign = align;
+ }
+ /* disable debug if necessary */
+ if (ralign > __alignof__(unsigned long long))
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+ /*
+ * 4) Store it.
+ */
+ align = ralign;
+
+ /* Get cache's description obj. */
+ cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+ if (!cachep)
+ goto oops;
+
+#if DEBUG
+ cachep->obj_size = size;
+
+ /*
+ * Both debugging options require word-alignment which is calculated
+ * into align above.
+ */
+ if (flags & SLAB_RED_ZONE) {
+ /* add space for red zone words */
+ cachep->obj_offset += sizeof(unsigned long long);
+ size += 2 * sizeof(unsigned long long);
+ }
+ if (flags & SLAB_STORE_USER) {
+ /* user store requires one word storage behind the end of
+ * the real object. But if the second red zone needs to be
+ * aligned to 64 bits, we must allow that much space.
+ */
+ if (flags & SLAB_RED_ZONE)
+ size += REDZONE_ALIGN;
+ else
+ size += BYTES_PER_WORD;
+ }
+#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
+ /* Pad the object out to a whole page, placing it at the end of the page. */
+ if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
+ && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - size;
+ size = PAGE_SIZE;
+ }
+#endif
+#endif
+
+ /*
+ * Determine if the slab management is 'on' or 'off' slab.
+ * (bootstrapping cannot cope with offslab caches so don't do
+ * it too early on.)
+ */
+ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ flags |= CFLGS_OFF_SLAB;
+
+ size = ALIGN(size, align);
+
+ /*
+ * Picks the page order and stores cachep->num and cachep->gfporder.
+ * cachep was zero-allocated, so num stays 0 if no order could hold
+ * an object.
+ */
+ left_over = calculate_slab_order(cachep, size, align, flags);
+
+ if (!cachep->num) {
+ printk(KERN_ERR
+ "kmem_cache_create: couldn't create cache %s.\n", name);
+ kmem_cache_free(&cache_cache, cachep);
+ cachep = NULL;
+ goto oops;
+ }
+ /* Size of the on-slab management area: descriptor plus one bufctl per object. */
+ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ + sizeof(struct slab), align);
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
+ flags &= ~CFLGS_OFF_SLAB;
+ left_over -= slab_size;
+ }
+
+ if (flags & CFLGS_OFF_SLAB) {
+ /* really off slab. No need for manual alignment */
+ slab_size =
+ cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+ }
+
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < align)
+ cachep->colour_off = align;
+ /* Number of distinct colour offsets that fit in the left-over space. */
+ cachep->colour = left_over / cachep->colour_off;
+ cachep->slab_size = slab_size;
+ cachep->flags = flags;
+ cachep->gfpflags = 0;
+ if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+ cachep->gfpflags |= GFP_DMA;
+ cachep->buffer_size = size;
+ cachep->reciprocal_buffer_size = reciprocal_value(size);
+
+ if (flags & CFLGS_OFF_SLAB) {
+ cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
+ /*
+ * This is a possibility for one of the malloc_sizes caches.
+ * But since we go off slab only for object size greater than
+ * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
+ * this should not happen at all.
+ * But leave a BUG_ON for some lucky dude.
+ */
+ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
+ }
+ cachep->ctor = ctor;
+ cachep->name = name;
+
+ if (setup_cpu_cache(cachep)) {
+ __kmem_cache_destroy(cachep);
+ cachep = NULL;
+ goto oops;
+ }
+
+ /* cache setup completed, link it into the list */
+ list_add(&cachep->next, &cache_chain);
+ /*
+ * Common exit for success and failure. cachep is NULL on every failure
+ * path, which panics when the caller passed SLAB_PANIC.
+ */
+oops:
+ if (!cachep && (flags & SLAB_PANIC))
+ panic("kmem_cache_create(): failed to create slab `%s'\n",
+ name);
+ mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
+ return cachep;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+#if DEBUG
+/* Debug assertion: BUG() unless local interrupts are disabled. */
+static void check_irq_off(void)
+{
+ BUG_ON(!irqs_disabled());
+}
+
+/* Debug assertion: BUG() if local interrupts are disabled. */
+static void check_irq_on(void)
+{
+ BUG_ON(irqs_disabled());
+}
+
+/*
+ * Debug assertion (SMP only): interrupts are off and the list_lock of the
+ * list3 for the current node (numa_node_id()) is held.
+ */
+static void check_spinlock_acquired(struct kmem_cache *cachep)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+#endif
+}
+
+/*
+ * Debug assertion (SMP only): interrupts are off and the list_lock of the
+ * list3 for @node is held.
+ */
+static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ assert_spin_locked(&cachep->nodelists[node]->list_lock);
+#endif
+}
+
+#else
+/* Non-DEBUG build: all of the assertions above compile to nothing. */
+#define check_irq_off() do { } while(0)
+#define check_irq_on() do { } while(0)
+#define check_spinlock_acquired(x) do { } while(0)
+#define check_spinlock_acquired_node(x, y) do { } while(0)
+#endif
+
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+ struct array_cache *ac,
+ int force, int node);
+
+/*
+ * Flush the calling CPU's array cache for the cache passed in @arg back to
+ * the current node's lists. Expects interrupts to be disabled.
+ */
+static void do_drain(void *arg)
+{
+	struct kmem_cache *cachep = arg;
+	int node = numa_node_id();
+	struct kmem_list3 *l3;
+	struct array_cache *ac;
+
+	check_irq_off();
+	ac = cpu_cache_get(cachep);
+	l3 = cachep->nodelists[node];
+
+	spin_lock(&l3->list_lock);
+	free_block(cachep, ac->entry, ac->avail, node);
+	spin_unlock(&l3->list_lock);
+
+	ac->avail = 0;
+}
+
+/*
+ * Empty every cached object store of @cachep: the per-cpu arrays on all CPUs,
+ * then each node's alien caches, then each node's shared array.
+ */
+static void drain_cpu_caches(struct kmem_cache *cachep)
+{
+	struct kmem_list3 *l3;
+	int nid;
+
+	on_each_cpu(do_drain, cachep, 1);
+	check_irq_on();
+
+	/* First pass: alien caches on every node. */
+	for_each_online_node(nid) {
+		l3 = cachep->nodelists[nid];
+		if (!l3 || !l3->alien)
+			continue;
+		drain_alien_cache(cachep, l3->alien);
+	}
+
+	/* Second pass: the per-node shared arrays. */
+	for_each_online_node(nid) {
+		l3 = cachep->nodelists[nid];
+		if (!l3)
+			continue;
+		drain_array(cachep, l3, l3->shared, 1, nid);
+	}
+}
+
+/*
+ * Remove slabs from the list of free slabs.
+ * Specify the number of slabs to drain in tofree.
+ *
+ * Returns the actual number of slabs released.
+ */
+static int drain_freelist(struct kmem_cache *cache,
+			struct kmem_list3 *l3, int tofree)
+{
+	int released = 0;
+
+	while (released < tofree && !list_empty(&l3->slabs_free)) {
+		struct list_head *tail;
+		struct slab *victim;
+
+		spin_lock_irq(&l3->list_lock);
+		tail = l3->slabs_free.prev;
+		if (tail == &l3->slabs_free) {
+			/* The list turned out to be empty once locked. */
+			spin_unlock_irq(&l3->list_lock);
+			break;
+		}
+
+		victim = list_entry(tail, struct slab, list);
+#if DEBUG
+		BUG_ON(victim->inuse);
+#endif
+		list_del(&victim->list);
+		/*
+		 * The slab is unlinked from the cache, so the lock can be
+		 * dropped before the slab is actually destroyed.
+		 */
+		l3->free_objects -= cache->num;
+		spin_unlock_irq(&l3->list_lock);
+
+		slab_destroy(cache, victim);
+		released++;
+	}
+
+	return released;
+}
+
+/*
+ * Drain all cached objects of @cachep and release its free slabs.
+ * Returns 1 if any node still has full or partial slabs, 0 otherwise.
+ *
+ * Called with cache_chain_mutex held to protect against cpu hotplug.
+ */
+static int __cache_shrink(struct kmem_cache *cachep)
+{
+	int busy = 0;
+	int node = 0;
+
+	drain_cpu_caches(cachep);
+
+	check_irq_on();
+	for_each_online_node(node) {
+		struct kmem_list3 *l3 = cachep->nodelists[node];
+
+		if (!l3)
+			continue;
+
+		drain_freelist(cachep, l3, l3->free_objects);
+
+		if (!list_empty(&l3->slabs_full) ||
+		    !list_empty(&l3->slabs_partial))
+			busy++;
+	}
+
+	return busy ? 1 : 0;
+}
+
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+	int still_in_use;
+
+	BUG_ON(!cachep || in_interrupt());
+
+	/* Hold off CPU hotplug and take the chain mutex around the shrink. */
+	get_online_cpus();
+	mutex_lock(&cache_chain_mutex);
+
+	still_in_use = __cache_shrink(cachep);
+
+	mutex_unlock(&cache_chain_mutex);
+	put_online_cpus();
+
+	return still_in_use;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+/**
+ * kmem_cache_destroy - delete a cache
+ * @cachep: the cache to destroy
+ *
+ * Remove a &struct kmem_cache object from the slab cache.
+ *
+ * It is expected this function will be called by a module when it is
+ * unloaded. This will remove the cache completely, and avoid a duplicate
+ * cache being allocated each time a module is loaded and unloaded, if the
+ * module doesn't have persistent in-kernel storage across loads and unloads.
+ *
+ * The cache must be empty before calling this function.
+ *
+ * The caller must guarantee that no one will allocate memory from the cache
+ * during the kmem_cache_destroy().
+ */
+void kmem_cache_destroy(struct kmem_cache *cachep)
+{
+	BUG_ON(!cachep || in_interrupt());
+
+	/* Find the cache in the chain of caches. */
+	get_online_cpus();
+	mutex_lock(&cache_chain_mutex);
+	/*
+	 * the chain is never empty, cache_cache is never destroyed
+	 */
+	list_del(&cachep->next);
+	if (__cache_shrink(cachep)) {
+		/* Objects are still in use: put the cache back and give up. */
+		slab_error(cachep, "Can't free all objects");
+		list_add(&cachep->next, &cache_chain);
+		mutex_unlock(&cache_chain_mutex);
+		put_online_cpus();
+		return;
+	}
+
+	/*
+	 * For SLAB_DESTROY_BY_RCU caches, slab_destroy() queues
+	 * kmem_rcu_free() via call_rcu() with a pointer to this cache.
+	 * synchronize_rcu() only waits for a grace period, not for those
+	 * callbacks to run, so the cache could be freed below while a
+	 * callback still references it. rcu_barrier() waits until all
+	 * previously queued callbacks have completed.
+	 */
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+		rcu_barrier();
+
+	__kmem_cache_destroy(cachep);
+	mutex_unlock(&cache_chain_mutex);
+	put_online_cpus();
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+/*
+ * Get the memory for a slab management obj.
+ * For a slab cache when the slab descriptor is off-slab, slab descriptors
+ * always come from malloc_sizes caches. The slab descriptor cannot
+ * come from the same cache which is getting created because,
+ * when we are searching for an appropriate cache for these
+ * descriptors in kmem_cache_create, we search through the malloc_sizes array.
+ * If we are creating a malloc_sizes cache here it would not be visible to
+ * kmem_find_general_cachep till the initialization is complete.
+ * Hence we cannot have slabp_cache same as the original cache.
+ */
+static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
+				   int colour_off, gfp_t local_flags,
+				   int nodeid)
+{
+	struct slab *mgmt;
+
+	if (!OFF_SLAB(cachep)) {
+		/*
+		 * On-slab: the descriptor sits at the colour offset and the
+		 * objects start after the management area.
+		 */
+		mgmt = objp + colour_off;
+		colour_off += cachep->slab_size;
+	} else {
+		/* Off-slab: the descriptor comes from a separate cache. */
+		mgmt = kmem_cache_alloc_node(cachep->slabp_cache,
+					     local_flags & ~GFP_THISNODE,
+					     nodeid);
+		if (!mgmt)
+			return NULL;
+	}
+
+	mgmt->colouroff = colour_off;
+	mgmt->s_mem = objp + colour_off;
+	mgmt->nodeid = nodeid;
+	mgmt->inuse = 0;
+	mgmt->free = 0;
+
+	return mgmt;
+}
+
+/* The bufctl array is stored immediately after the slab descriptor. */
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+{
+	return (kmem_bufctl_t *)&slabp[1];
+}
+
+/*
+ * Initialise every object of a new slab: apply the debug markings selected by
+ * the cache flags, run the constructor if one is set, and chain all objects
+ * into the slab's bufctl free list (index i points at i + 1, the last entry
+ * is BUFCTL_END).
+ */
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct slab *slabp)
+{
+ int i;
+
+ for (i = 0; i < cachep->num; i++) {
+ void *objp = index_to_obj(cachep, slabp, i);
+#if DEBUG
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON)
+ poison_obj(cachep, objp, POISON_FREE);
+ if (cachep->flags & SLAB_STORE_USER)
+ *dbg_userword(cachep, objp) = NULL;
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ *dbg_redzone1(cachep, objp) = RED_INACTIVE;
+ *dbg_redzone2(cachep, objp) = RED_INACTIVE;
+ }
+ /*
+ * Constructors are not allowed to allocate memory from the same
+ * cache which they are a constructor for. Otherwise, deadlock.
+ * They must also be threaded.
+ */
+ /* With SLAB_POISON the constructor is not run here. */
+ if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+ cachep->ctor(objp + obj_offset(cachep));
+
+ /* Re-check the red zones in case the constructor wrote past the object. */
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "constructor overwrote the"
+ " end of an object");
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "constructor overwrote the"
+ " start of an object");
+ }
+ /* Page-sized, off-slab, poisoned objects: pass 0 to kernel_map_pages() for their pages. */
+ if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
+ OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
+ kernel_map_pages(virt_to_page(objp),
+ cachep->buffer_size / PAGE_SIZE, 0);
+#else
+ if (cachep->ctor)
+ cachep->ctor(objp);
+#endif
+ slab_bufctl(slabp)[i] = i + 1;
+ }
+ /* Terminate the free chain at the last object. */
+ slab_bufctl(slabp)[i - 1] = BUFCTL_END;
+}
+
+/*
+ * BUG() unless the GFP_DMA bit of the allocation flags agrees with the
+ * GFP_DMA bit of the cache. Does nothing when CONFIG_ZONE_DMA_FLAG is 0.
+ */
+static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
+{
+	if (!CONFIG_ZONE_DMA_FLAG)
+		return;
+
+	if (flags & GFP_DMA)
+		BUG_ON(!(cachep->gfpflags & GFP_DMA));
+	else
+		BUG_ON(cachep->gfpflags & GFP_DMA);
+}
+
+/*
+ * Take the object at the head of the slab's free chain, advance the chain
+ * to its successor and return the object.
+ */
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+				int nodeid)
+{
+	kmem_bufctl_t *bufctl = slab_bufctl(slabp);
+	kmem_bufctl_t head = slabp->free;
+	void *obj = index_to_obj(cachep, slabp, head);
+	kmem_bufctl_t successor = bufctl[head];
+
+	slabp->inuse++;
+#if DEBUG
+	bufctl[head] = BUFCTL_FREE;
+	WARN_ON(slabp->nodeid != nodeid);
+#endif
+	slabp->free = successor;
+
+	return obj;
+}
+
+/*
+ * Return @objp to its slab: the object becomes the new head of the slab's
+ * free chain and the in-use count drops by one.
+ */
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+				void *objp, int nodeid)
+{
+	kmem_bufctl_t *bufctl = slab_bufctl(slabp);
+	unsigned int idx = obj_to_index(cachep, slabp, objp);
+
+#if DEBUG
+	/* Verify that the slab belongs to the intended node */
+	WARN_ON(slabp->nodeid != nodeid);
+
+	if (bufctl[idx] + 1 <= SLAB_LIMIT + 1) {
+		printk(KERN_ERR "slab: double free detected in cache "
+				"'%s', objp %p\n", cachep->name, objp);
+		BUG();
+	}
+#endif
+	bufctl[idx] = slabp->free;
+	slabp->free = idx;
+	slabp->inuse--;
+}
+
+/*
+ * Map pages beginning at addr to the given cache and slab. This is required
+ * for the slab allocator to be able to lookup the cache and slab of a
+ * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ */
+static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
+			   void *addr)
+{
+	struct page *pg = virt_to_page(addr);
+	int remaining = 1;
+
+	/* A non-compound allocation spans 1 << gfporder separate pages. */
+	if (likely(!PageCompound(pg)))
+		remaining <<= cache->gfporder;
+
+	do {
+		page_set_cache(pg, cache);
+		page_set_slab(pg, slab);
+		pg++;
+		remaining--;
+	} while (remaining);
+}
+
+/*
+ * Grow (by 1) the number of slabs within a cache. This is called by
+ * kmem_cache_alloc() when there are no active objs left in a cache.
+ */
+/*
+ * Returns 1 when a new slab was added to the free list of node @nodeid and
+ * 0 on failure. If @objp is non-NULL it is used as the slab's memory instead
+ * of calling kmem_getpages(). Entered with interrupts disabled; they are
+ * enabled around the allocation work when __GFP_WAIT is set and are
+ * disabled again before returning.
+ */
+static int cache_grow(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid, void *objp)
+{
+ struct slab *slabp;
+ size_t offset;
+ gfp_t local_flags;
+ struct kmem_list3 *l3;
+
+ /*
+ * Be lazy and only check for valid flags here, keeping it out of the
+ * critical path in kmem_cache_alloc().
+ */
+ BUG_ON(flags & GFP_SLAB_BUG_MASK);
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+
+ /* Take the l3 list lock to change the colour_next on this node */
+ check_irq_off();
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);
+
+ /* Get colour for the slab, and calculate the next value. */
+ offset = l3->colour_next;
+ l3->colour_next++;
+ if (l3->colour_next >= cachep->colour)
+ l3->colour_next = 0;
+ spin_unlock(&l3->list_lock);
+
+ /* Convert the colour index into a byte offset. */
+ offset *= cachep->colour_off;
+
+ if (local_flags & __GFP_WAIT)
+ local_irq_enable();
+
+ /*
+ * The test for missing atomic flag is performed here, rather than
+ * the more obvious place, simply to reduce the critical path length
+ * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
+ * will eventually be caught here (where it matters).
+ */
+ kmem_flagcheck(cachep, flags);
+
+ /*
+ * Get mem for the objs. Attempt to allocate a physical page from
+ * 'nodeid'.
+ */
+ if (!objp)
+ objp = kmem_getpages(cachep, local_flags, nodeid);
+ if (!objp)
+ goto failed;
+
+ /* Get slab management. */
+ slabp = alloc_slabmgmt(cachep, objp, offset,
+ local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+ if (!slabp)
+ goto opps1;
+
+ slab_map_pages(cachep, slabp, objp);
+
+ cache_init_objs(cachep, slabp);
+
+ /* Restore the interrupts-off state before taking the list lock again. */
+ if (local_flags & __GFP_WAIT)
+ local_irq_disable();
+ check_irq_off();
+ spin_lock(&l3->list_lock);
+
+ /* Make slab active. */
+ list_add_tail(&slabp->list, &(l3->slabs_free));
+ STATS_INC_GROWN(cachep);
+ l3->free_objects += cachep->num;
+ spin_unlock(&l3->list_lock);
+ return 1;
+ /* Slab descriptor allocation failed: give the pages back. */
+opps1:
+ kmem_freepages(cachep, objp);
+failed:
+ if (local_flags & __GFP_WAIT)
+ local_irq_disable();
+ return 0;
+}
+
+#if DEBUG
+
+/*
+ * Perform extra freeing checks:
+ * - detect bad pointers.
+ * - POISON/RED_ZONE checking
+ */
+static void kfree_debugcheck(const void *objp)
+{
+	if (virt_addr_valid(objp))
+		return;
+
+	/* Not a valid kernel virtual address: report it and stop. */
+	printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
+	       (unsigned long)objp);
+	BUG();
+}
+
+/*
+ * Check both red zones of an object that is being freed. Both must read
+ * RED_ACTIVE; otherwise report a double free (both RED_INACTIVE) or an
+ * overwrite, and print the two values.
+ */
+static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
+{
+	unsigned long long head = *dbg_redzone1(cache, obj);
+	unsigned long long tail = *dbg_redzone2(cache, obj);
+
+	/* Redzone is ok. */
+	if (head == RED_ACTIVE && tail == RED_ACTIVE)
+		return;
+
+	if (head != RED_INACTIVE || tail != RED_INACTIVE)
+		slab_error(cache, "memory outside object was overwritten");
+	else
+		slab_error(cache, "double free detected");
+
+	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
+			obj, head, tail);
+}
+
+/*
+ * Debug-build checks on an object being freed to @cachep. @objp is the
+ * pointer handed to the caller; it is moved back by obj_offset() and that
+ * adjusted pointer is what is checked, marked free and returned.
+ * @caller is recorded in the user word when SLAB_STORE_USER is set.
+ */
+static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
+ void *caller)
+{
+ struct page *page;
+ unsigned int objnr;
+ struct slab *slabp;
+
+ /* The object must belong to the cache it is being freed to. */
+ BUG_ON(virt_to_cache(objp) != cachep);
+
+ objp -= obj_offset(cachep);
+ kfree_debugcheck(objp);
+ page = virt_to_head_page(objp);
+
+ slabp = page_get_slab(page);
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ verify_redzone_free(cachep, objp);
+ *dbg_redzone1(cachep, objp) = RED_INACTIVE;
+ *dbg_redzone2(cachep, objp) = RED_INACTIVE;
+ }
+ if (cachep->flags & SLAB_STORE_USER)
+ *dbg_userword(cachep, objp) = caller;
+
+ objnr = obj_to_index(cachep, slabp, objp);
+
+ /* The pointer must map to an in-range index and sit exactly on an object boundary. */
+ BUG_ON(objnr >= cachep->num);
+ BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
+ if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /* Page-sized off-slab objects: pass 0 to kernel_map_pages() for their pages instead of poisoning. */
+ if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
+ store_stackinfo(cachep, objp, (unsigned long)caller);
+ kernel_map_pages(virt_to_page(objp),
+ cachep->buffer_size / PAGE_SIZE, 0);
+ } else {
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#else
+ poison_obj(cachep, objp, POISON_FREE);
+#endif
+ }
+ return objp;
+}
+
+/*
+ * Debug consistency check of a slab's free chain. Walks the chain from
+ * slabp->free to BUFCTL_END and requires that every index is below
+ * cachep->num and that the chain length equals cachep->num - slabp->inuse.
+ * On any mismatch, hexdumps the slab descriptor and bufctl array, then BUG()s.
+ */
+static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
+{
+ kmem_bufctl_t i;
+ int entries = 0;
+
+ /* Check slab's freelist to see if this obj is there. */
+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
+ entries++;
+ /* Too many entries (a loop) or an out-of-range index: corrupt chain. */
+ if (entries > cachep->num || i >= cachep->num)
+ goto bad;
+ }
+ if (entries != cachep->num - slabp->inuse) {
+ /* Also entered directly from the walk above via goto. */
+bad:
+ printk(KERN_ERR "slab: Internal list corruption detected in "
+ "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+ cachep->name, cachep->num, slabp, slabp->inuse);
+ /* Dump the descriptor and its bufctl array, 16 bytes per line. */
+ for (i = 0;
+ i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
+ i++) {
+ if (i % 16 == 0)
+ printk("\n%03x:", i);
+ printk(" %02x", ((unsigned char *)slabp)[i]);
+ }
+ printk("\n");
+ BUG();
+ }
+}
+#else
+/*
+ * Non-DEBUG variants: the checks expand to nothing, and
+ * cache_free_debugcheck() evaluates to the object pointer unchanged.
+ */
+#define kfree_debugcheck(x) do { } while(0)
+#define cache_free_debugcheck(x,objp,z) (objp)
+#define check_slabp(x,y) do { } while(0)
+#endif
+
+/*
+ * Refill the current CPU's (empty) array cache for @cachep and return one
+ * object from it. Objects are taken from the node's shared array if
+ * possible, otherwise from partial and then free slabs; if none are
+ * available the cache is grown. Returns NULL when growing fails and no
+ * objects have appeared in the array cache. Expects interrupts disabled.
+ */
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+{
+ int batchcount;
+ struct kmem_list3 *l3;
+ struct array_cache *ac;
+ int node;
+
+retry:
+ check_irq_off();
+ node = numa_node_id();
+ ac = cpu_cache_get(cachep);
+ batchcount = ac->batchcount;
+ if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
+ /*
+ * If there was little recent activity on this cache, then
+ * perform only a partial refill. Otherwise we could generate
+ * refill bouncing.
+ */
+ batchcount = BATCHREFILL_LIMIT;
+ }
+ l3 = cachep->nodelists[node];
+
+ BUG_ON(ac->avail > 0 || !l3);
+ spin_lock(&l3->list_lock);
+
+ /* See if we can refill from the shared array */
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ goto alloc_done;
+
+ while (batchcount > 0) {
+ struct list_head *entry;
+ struct slab *slabp;
+ /* Get slab alloc is to come from. */
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ /* No partial slabs: fall back to the free list. */
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
+ goto must_grow;
+ }
+
+ slabp = list_entry(entry, struct slab, list);
+ check_slabp(cachep, slabp);
+ check_spinlock_acquired(cachep);
+
+ /*
+ * The slab was either on partial or free list so
+ * there must be at least one object available for
+ * allocation.
+ */
+ BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
+
+ /* Move objects from this slab into the array cache. */
+ while (slabp->inuse < cachep->num && batchcount--) {
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
+ node);
+ }
+ check_slabp(cachep, slabp);
+
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+ if (slabp->free == BUFCTL_END)
+ list_add(&slabp->list, &l3->slabs_full);
+ else
+ list_add(&slabp->list, &l3->slabs_partial);
+ }
+
+ /* Reached when the slab lists ran dry or the batch was filled. */
+must_grow:
+ l3->free_objects -= ac->avail;
+alloc_done:
+ spin_unlock(&l3->list_lock);
+
+ if (unlikely(!ac->avail)) {
+ int x;
+ x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+
+ /* cache_grow can reenable interrupts, then ac could change. */
+ ac = cpu_cache_get(cachep);
+ if (!x && ac->avail == 0) /* no objects in sight? abort */
+ return NULL;
+
+ if (!ac->avail) /* objects refilled by interrupt? */
+ goto retry;
+ }
+ ac->touched = 1;
+ return ac->entry[--ac->avail];
+}
+
+/*
+ * Checks run before an allocation: flag a possible sleep when __GFP_WAIT is
+ * set and, in DEBUG builds, validate the flags against the cache with
+ * kmem_flagcheck().
+ */
+static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ might_sleep_if(flags & __GFP_WAIT);
+#if DEBUG
+ kmem_flagcheck(cachep, flags);
+#endif
+}
+
+#if DEBUG
+/*
+ * cache_alloc_debugcheck_after - debug checks on a freshly allocated object
+ * @cachep: cache the object was taken from
+ * @flags: allocation flags (not examined here)
+ * @objp: raw object pointer, before obj_offset() is applied; may be NULL
+ * @caller: value stored in the user word when SLAB_STORE_USER is set
+ *
+ * Verifies poison and red zones, marks the object as in use, and runs the
+ * constructor for poisoned caches. Returns NULL if @objp is NULL, otherwise
+ * @objp advanced by obj_offset().
+ */
+static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
+				gfp_t flags, void *objp, void *caller)
+{
+	if (!objp)
+		return objp;
+	if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+			kernel_map_pages(virt_to_page(objp),
+					 cachep->buffer_size / PAGE_SIZE, 1);
+		else
+			check_poison_obj(cachep, objp);
+#else
+		check_poison_obj(cachep, objp);
+#endif
+		poison_obj(cachep, objp, POISON_INUSE);
+	}
+	if (cachep->flags & SLAB_STORE_USER)
+		*dbg_userword(cachep, objp) = caller;
+
+	if (cachep->flags & SLAB_RED_ZONE) {
+		/* A free object must still carry inactive red zones. */
+		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
+				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
+			printk(KERN_ERR
+				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+				objp, *dbg_redzone1(cachep, objp),
+				*dbg_redzone2(cachep, objp));
+		}
+		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
+		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
+	}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	{
+		struct slab *slabp;
+		unsigned objnr;
+
+		slabp = page_get_slab(virt_to_head_page(objp));
+		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+	}
+#endif
+	objp += obj_offset(cachep);
+	/* For poisoned caches the constructor runs here, at allocation time. */
+	if (cachep->ctor && cachep->flags & SLAB_POISON)
+		cachep->ctor(objp);
+#if ARCH_SLAB_MINALIGN
+	/*
+	 * Cast through unsigned long, the pointer-sized integer type: a
+	 * (u32) cast is a cast to an integer of a different size on 64-bit
+	 * builds.
+	 */
+	if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1)) {
+		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+		       objp, ARCH_SLAB_MINALIGN);
+	}
+#endif
+	return objp;
+}
+#else
+/* Non-DEBUG build: no checks, the object pointer is passed through. */
+#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
+#endif
+
+#ifdef CONFIG_FAILSLAB
+
+/* Fault-injection state for slab allocations (CONFIG_FAILSLAB only). */
+static struct failslab_attr {
+
+ /* Generic fault-injection attributes, consulted via should_fail(). */
+ struct fault_attr attr;
+
+ /* When non-zero, should_failslab() never fails __GFP_WAIT allocations. */
+ u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+ /* debugfs entry for ignore_gfp_wait, created by failslab_debugfs(). */
+ struct dentry *ignore_gfp_wait_file;
+#endif
+
+} failslab = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+};
+
+/*
+ * Handler for the "failslab=" boot parameter: passes the option string to
+ * setup_fault_attr() for failslab.attr and returns its result.
+ */
+static int __init setup_failslab(char *str)
+{
+ return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+/*
+ * Decide whether fault injection should fail this allocation. Never fails
+ * allocations from cache_cache, __GFP_NOFAIL allocations, or (when
+ * ignore_gfp_wait is set) __GFP_WAIT allocations; everything else is left
+ * to should_fail().
+ */
+static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+	int exempt = cachep == &cache_cache ||
+		     (flags & __GFP_NOFAIL) ||
+		     (failslab.ignore_gfp_wait && (flags & __GFP_WAIT));
+
+	if (exempt)
+		return 0;
+
+	return should_fail(&failslab.attr, obj_size(cachep));
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+/*
+ * Create the debugfs entries for failslab: the generic fault-attr files plus
+ * an "ignore-gfp-wait" boolean. Returns 0 on success, the error from
+ * init_fault_attr_dentries(), or -ENOMEM if the boolean cannot be created.
+ */
+static int __init failslab_debugfs(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&failslab.attr, "failslab");
+	if (err)
+		return err;
+
+	dir = failslab.attr.dentries.dir;
+	failslab.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &failslab.ignore_gfp_wait);
+	if (failslab.ignore_gfp_wait_file)
+		return 0;
+
+	/* Creation failed: undo the generic entries as well. */
+	err = -ENOMEM;
+	debugfs_remove(failslab.ignore_gfp_wait_file);
+	cleanup_fault_attr_dentries(&failslab.attr);
+
+	return err;
+}
+
+late_initcall(failslab_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAILSLAB */
+
+/* Without CONFIG_FAILSLAB no allocation is ever failed artificially. */
+static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FAILSLAB */
+
+/*
+ * Allocate one object from the current CPU's array cache, refilling it via
+ * cache_alloc_refill() when it is empty. Expects interrupts disabled.
+ */
+static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	struct array_cache *ac;
+
+	check_irq_off();
+
+	ac = cpu_cache_get(cachep);
+	if (unlikely(!ac->avail)) {
+		/* Per-cpu array is empty: take the refill path. */
+		STATS_INC_ALLOCMISS(cachep);
+		return cache_alloc_refill(cachep, flags);
+	}
+
+	STATS_INC_ALLOCHIT(cachep);
+	ac->touched = 1;
+	return ac->entry[--ac->avail];
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Called for tasks with PF_SPREAD_SLAB or PF_MEMPOLICY set: pick the node
+ * dictated by cpuset memory spreading or by the task's mempolicy and, if
+ * that is not the local node, allocate there.
+ *
+ * Nothing is done (NULL is returned) when running in interrupt context,
+ * where the task's cpuset and mempolicy should not steer the allocation,
+ * when __GFP_THISNODE is set, or when the chosen node is the local one.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_here, nid_alloc;
+
+	if (in_interrupt() || (flags & __GFP_THISNODE))
+		return NULL;
+
+	nid_here = numa_node_id();
+	nid_alloc = nid_here;
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+
+	if (nid_alloc == nid_here)
+		return NULL;
+
+	return ____cache_alloc_node(cachep, flags, nid_alloc);
+}
+
+/*
+ * Fallback function if there was no memory available and no objects on a
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
+ *
+ * Returns the object, or NULL if __GFP_THISNODE is set in @flags, if no
+ * pages could be obtained, or if cache_grow() did not accept the new
+ * pages. When __GFP_WAIT is set, interrupts are enabled around the page
+ * allocation and disabled again afterwards.
+ */
+static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+ struct zonelist *zonelist;
+ gfp_t local_flags;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(flags);
+ void *obj = NULL;
+ int nid;
+
+ if (flags & __GFP_THISNODE)
+ return NULL;
+
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+
+retry:
+ /*
+ * Look through allowed nodes for objects available
+ * from existing per node queues. The scan stops at the first
+ * node that actually yields an object.
+ */
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ nid = zone_to_nid(zone);
+
+ if (cpuset_zone_allowed_hardwall(zone, flags) &&
+ cache->nodelists[nid] &&
+ cache->nodelists[nid]->free_objects) {
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ if (obj)
+ break;
+ }
+ }
+
+ if (!obj) {
+ /*
+ * This allocation will be performed within the constraints
+ * of the current cpuset / memory policy requirements.
+ * We may trigger various forms of reclaim on the allowed
+ * set and go into memory reserves if necessary.
+ */
+ if (local_flags & __GFP_WAIT)
+ local_irq_enable();
+ kmem_flagcheck(cache, flags);
+ obj = kmem_getpages(cache, local_flags, -1); /* -1: no node specified */
+ if (local_flags & __GFP_WAIT)
+ local_irq_disable();
+ if (obj) {
+ /*
+ * Insert into the appropriate per node queues
+ */
+ nid = page_to_nid(virt_to_page(obj)); /* node the pages really came from */
+ if (cache_grow(cache, flags, nid, obj)) {
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ if (!obj)
+ /*
+ * Another processor may allocate the
+ * objects in the slab since we are
+ * not holding any locks.
+ */
+ goto retry;
+ } else {
+ /* cache_grow already freed obj */
+ obj = NULL;
+ }
+ }
+ }
+ return obj;
+}
+
+/*
+ * Allocate one object from the slab lists of node @nodeid.
+ *
+ * A slab is taken from the node's partial list, or from its free list when
+ * no partial slab exists. If both lists are empty the cache is grown on
+ * that node (GFP_THISNODE) and the lookup is retried; if growing fails the
+ * request is handed to fallback_alloc() with the caller's original flags.
+ * The node's kmem_list3 must already exist (BUG_ON otherwise).
+ */
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid)
+{
+ struct list_head *entry;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ void *obj;
+ int x;
+
+ l3 = cachep->nodelists[nodeid];
+ BUG_ON(!l3);
+
+retry:
+ check_irq_off();
+ spin_lock(&l3->list_lock);
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) { /* partial list is empty */
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free) /* free list is empty as well */
+ goto must_grow;
+ }
+
+ slabp = list_entry(entry, struct slab, list);
+ check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
+
+ STATS_INC_NODEALLOCS(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ BUG_ON(slabp->inuse == cachep->num);
+
+ obj = slab_get_obj(cachep, slabp, nodeid);
+ check_slabp(cachep, slabp);
+ l3->free_objects--;
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+
+ if (slabp->free == BUFCTL_END) /* no free object left in this slab */
+ list_add(&slabp->list, &l3->slabs_full);
+ else
+ list_add(&slabp->list, &l3->slabs_partial);
+
+ spin_unlock(&l3->list_lock);
+ goto done;
+
+must_grow:
+ spin_unlock(&l3->list_lock);
+ x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ if (x) /* a new slab was added: look again */
+ goto retry;
+
+ return fallback_alloc(cachep, flags);
+
+done:
+ return obj;
+}
+
+/**
+ * __cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ * @nodeid: node number of the target node; -1 selects the current node.
+ * @caller: return address of caller, used for debug information
+ *
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
+ *
+ * Returns NULL without touching the cache when should_failslab() asks for
+ * a failure. The object is zeroed when __GFP_ZERO is set in @flags.
+ */
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+ void *caller)
+{
+ unsigned long save_flags;
+ void *ptr;
+
+ if (should_failslab(cachep, flags))
+ return NULL;
+
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+
+ if (unlikely(nodeid == -1))
+ nodeid = numa_node_id();
+
+ if (unlikely(!cachep->nodelists[nodeid])) {
+ /* Node not bootstrapped yet */
+ ptr = fallback_alloc(cachep, flags);
+ goto out;
+ }
+
+ if (nodeid == numa_node_id()) {
+ /*
+ * Use the locally cached objects if possible.
+ * However ____cache_alloc does not allow fallback
+ * to other nodes. It may fail while we still have
+ * objects on other nodes available.
+ */
+ ptr = ____cache_alloc(cachep, flags);
+ if (ptr)
+ goto out;
+ }
+ /* ___cache_alloc_node can fall back to other nodes */
+ ptr = ____cache_alloc_node(cachep, flags, nodeid);
+ out:
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+
+ if (unlikely((flags & __GFP_ZERO) && ptr))
+ memset(ptr, 0, obj_size(cachep));
+
+ return ptr;
+}
+
+/*
+ * NUMA allocation policy, tried in this order:
+ *  1. for PF_SPREAD_SLAB / PF_MEMPOLICY tasks, the node selected by
+ *     alternate_node_alloc();
+ *  2. the local per-cpu array via ____cache_alloc();
+ *  3. ____cache_alloc_node() on the local node, which knows how to
+ *     locate memory on other nodes when the local one has run out.
+ */
+static __always_inline void *
+__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+	void *objp;
+
+	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cache, flags);
+		if (objp)
+			return objp;
+	}
+
+	objp = ____cache_alloc(cache, flags);
+	if (objp)
+		return objp;
+
+	return ____cache_alloc_node(cache, flags, numa_node_id());
+}
+#else
+
+/* Non-NUMA build: the per-cpu array path is the only allocation policy. */
+static __always_inline void *
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	void *objp = ____cache_alloc(cachep, flags);
+
+	return objp;
+}
+
+#endif /* CONFIG_NUMA */
+
+/*
+ * Common allocation path behind kmem_cache_alloc() and __do_kmalloc().
+ *
+ * Returns NULL straight away when should_failslab() asks for a failure.
+ * Otherwise the object is obtained from __do_cache_alloc() with local
+ * interrupts disabled, run through the post-allocation debug check with
+ * @caller, and zeroed if __GFP_ZERO is set in @flags.
+ */
+static __always_inline void *
+__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+{
+	unsigned long irq_state;
+	void *obj;
+
+	if (should_failslab(cachep, flags))
+		return NULL;
+
+	cache_alloc_debugcheck_before(cachep, flags);
+
+	local_irq_save(irq_state);
+	obj = __do_cache_alloc(cachep, flags);
+	local_irq_restore(irq_state);
+
+	obj = cache_alloc_debugcheck_after(cachep, flags, obj, caller);
+	prefetchw(obj);
+
+	if (unlikely((flags & __GFP_ZERO) && obj))
+		memset(obj, 0, obj_size(cachep));
+
+	return obj;
+}
+
+/*
+ * Return @nr_objects objects from @objpp to their slabs on @node.
+ *
+ * Caller needs to acquire correct kmem_list's list_lock
+ *
+ * Each affected slab is unlinked and re-queued according to its new state:
+ * a slab with no object left in use goes to the free list, or is destroyed
+ * when the node already holds more than free_limit free objects; any other
+ * slab goes to the tail of the partial list.
+ */
+static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
+ int node)
+{
+ int i;
+ struct kmem_list3 *l3;
+
+ for (i = 0; i < nr_objects; i++) {
+ void *objp = objpp[i];
+ struct slab *slabp;
+
+ slabp = virt_to_slab(objp);
+ l3 = cachep->nodelists[node];
+ list_del(&slabp->list);
+ check_spinlock_acquired_node(cachep, node);
+ check_slabp(cachep, slabp);
+ slab_put_obj(cachep, slabp, objp, node);
+ STATS_DEC_ACTIVE(cachep);
+ l3->free_objects++;
+ check_slabp(cachep, slabp);
+
+ /* fixup slab chains */
+ if (slabp->inuse == 0) {
+ if (l3->free_objects > l3->free_limit) {
+ l3->free_objects -= cachep->num; /* the whole slab's objects leave the node */
+ /* No need to drop any previously held
+ * lock here, even if we have a off-slab slab
+ * descriptor it is guaranteed to come from
+ * a different cache, refer to comments before
+ * alloc_slabmgmt.
+ */
+ slab_destroy(cachep, slabp);
+ } else {
+ list_add(&slabp->list, &l3->slabs_free);
+ }
+ } else {
+ /* Unconditionally move a slab to the end of the
+ * partial list on free - maximum time for the
+ * other objects to be freed, too.
+ */
+ list_add_tail(&slabp->list, &l3->slabs_partial);
+ }
+ }
+}
+
+/*
+ * Make room in the per-cpu array @ac: the first ac->batchcount entries are
+ * either copied into the local node's shared array (as many as fit, when
+ * it has any room) or returned to their slabs with free_block(). The
+ * remaining entries are then shifted to the front of @ac.
+ *
+ * Runs with interrupts off and takes the local node's list_lock.
+ */
+static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
+{
+	int node = numa_node_id();
+	int batchcount = ac->batchcount;
+	struct array_cache *shared_array;
+	struct kmem_list3 *l3;
+	int room = 0;
+
+#if DEBUG
+	BUG_ON(!batchcount || batchcount > ac->avail);
+#endif
+	check_irq_off();
+	l3 = cachep->nodelists[node];
+	spin_lock(&l3->list_lock);
+
+	shared_array = l3->shared;
+	if (shared_array)
+		room = shared_array->limit - shared_array->avail;
+
+	if (room) {
+		if (batchcount > room)
+			batchcount = room;
+		memcpy(&(shared_array->entry[shared_array->avail]),
+		       ac->entry, sizeof(void *) * batchcount);
+		shared_array->avail += batchcount;
+	} else {
+		free_block(cachep, ac->entry, batchcount, node);
+	}
+
+#if STATS
+	{
+		/* Count the completely free slabs for the statistics. */
+		struct slab *slabp;
+		int nr_free = 0;
+
+		list_for_each_entry(slabp, &l3->slabs_free, list) {
+			BUG_ON(slabp->inuse);
+			nr_free++;
+		}
+		STATS_SET_FREEABLE(cachep, nr_free);
+	}
+#endif
+	spin_unlock(&l3->list_lock);
+	ac->avail -= batchcount;
+	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
+}
+
+/*
+ * Hand an object back to its cache. An object with constructed state must
+ * be in that state _before_ it is released. Interrupts must be disabled
+ * by the caller.
+ */
+static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+{
+	struct array_cache *ac = cpu_cache_get(cachep);
+
+	check_irq_off();
+	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+
+	/*
+	 * cache_free_alien() is skipped on non-NUMA platforms. Testing the
+	 * global numa_platform flag (most likely already cached) avoids the
+	 * cache miss of touching slabp, a per-page memory reference, just
+	 * to learn the node id.
+	 */
+	if (numa_platform && cache_free_alien(cachep, objp))
+		return;
+
+	if (unlikely(ac->avail >= ac->limit)) {
+		/* Array is full: push a batch out first. */
+		STATS_INC_FREEMISS(cachep);
+		cache_flusharray(cachep, ac);
+	} else {
+		STATS_INC_FREEHIT(cachep);
+	}
+	ac->entry[ac->avail++] = objp;
+}
+
+/**
+ * kmem_cache_alloc - Allocate an object
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate one object from @cachep. @flags only come into play when the
+ * cache has no object available.
+ */
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	void *caller = __builtin_return_address(0);
+
+	return __cache_alloc(cachep, flags, caller);
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+/**
+ * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
+ * @cachep: the cache we're checking against
+ * @ptr: pointer to validate
+ *
+ * Sanity-checks an untrusted pointer. A positive answer is _not_ a
+ * guarantee that the pointer really is an object of @cachep; it only
+ * means that the pointer can be dereferenced and looks half-way sane.
+ *
+ * Currently only used for dentry validation.
+ *
+ * Returns 1 if every check passes, 0 otherwise.
+ */
+int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
+{
+	unsigned long addr = (unsigned long)ptr;
+	unsigned long lowest = PAGE_OFFSET;
+	unsigned long mask = BYTES_PER_WORD - 1;
+	unsigned long size = cachep->buffer_size;
+	struct page *page;
+
+	/* Range: not below PAGE_OFFSET, object ends by high_memory. */
+	if (unlikely(addr < lowest))
+		return 0;
+	if (unlikely(addr > (unsigned long)high_memory - size))
+		return 0;
+
+	/* Alignment: a multiple of BYTES_PER_WORD. */
+	if (unlikely(addr & mask))
+		return 0;
+
+	/* First and last byte of the object must be valid addresses. */
+	if (unlikely(!kern_addr_valid(addr)))
+		return 0;
+	if (unlikely(!kern_addr_valid(addr + size - 1)))
+		return 0;
+
+	/* The page must be a slab page that belongs to this cache. */
+	page = virt_to_page(ptr);
+	if (unlikely(!PageSlab(page)))
+		return 0;
+	if (unlikely(page_get_cache(page) != cachep))
+		return 0;
+
+	return 1;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Exported wrapper around __cache_alloc_node() that passes its own return
+ * address as the caller argument.
+ */
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+	void *caller = __builtin_return_address(0);
+
+	return __cache_alloc_node(cachep, flags, nodeid, caller);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+/*
+ * Common implementation of the __kmalloc_node() variants: look up the
+ * general cache serving @size and @flags and allocate one object from it
+ * on @node.
+ *
+ * @caller is forwarded to __cache_alloc_node(), the same way
+ * __do_kmalloc() forwards it to __cache_alloc(), so that the call site
+ * supplied by __kmalloc_node_track_caller() is not replaced by an address
+ * inside this file.
+ *
+ * A ZERO_SIZE_PTR or NULL result of the cache lookup is returned as is.
+ */
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
+{
+	struct kmem_cache *cachep;
+
+	cachep = kmem_find_general_cachep(size, flags);
+	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+		return cachep;
+	return __cache_alloc_node(cachep, flags, node, caller);
+}
+
+#ifdef CONFIG_DEBUG_SLAB
+/* CONFIG_DEBUG_SLAB variant: passes its own return address as the caller. */
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	void *caller = __builtin_return_address(0);
+
+	return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+/* Like __kmalloc_node(), but the caller address is supplied explicitly. */
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+				  int node, void *caller)
+{
+	void *objp = __do_kmalloc_node(size, flags, node, caller);
+
+	return objp;
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+/* Variant without CONFIG_DEBUG_SLAB: no caller address is passed (NULL). */
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	void *objp = __do_kmalloc_node(size, flags, node, NULL);
+
+	return objp;
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
+
+/**
+ * __do_kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ * @caller: function caller for debug tracking of the caller
+ *
+ * A ZERO_SIZE_PTR or NULL result of the cache lookup is returned as is.
+ */
+static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
+					  void *caller)
+{
+	/*
+	 * To save a few bytes of .text, replace the __ prefix of the lookup
+	 * below with kmem_: kmalloc then goes through the uninlined
+	 * function instead of the inline one.
+	 */
+	struct kmem_cache *cache = __find_general_cachep(size, flags);
+
+	if (unlikely(ZERO_OR_NULL_PTR(cache)))
+		return cache;
+
+	return __cache_alloc(cache, flags, caller);
+}
+
+
+#ifdef CONFIG_DEBUG_SLAB
+/* CONFIG_DEBUG_SLAB variant: passes its own return address as the caller. */
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	void *caller = __builtin_return_address(0);
+
+	return __do_kmalloc(size, flags, caller);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+/* Like __kmalloc(), but the caller address is supplied explicitly. */
+void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
+{
+	void *objp = __do_kmalloc(size, flags, caller);
+
+	return objp;
+}
+EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#else
+/* Variant without CONFIG_DEBUG_SLAB: no caller address is passed (NULL). */
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	void *objp = __do_kmalloc(size, flags, NULL);
+
+	return objp;
+}
+EXPORT_SYMBOL(__kmalloc);
+#endif
+
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Return an object that was allocated from @cachep to that cache.
+ * Local interrupts are disabled for the duration of the free.
+ */
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+	unsigned long irq_flags;
+
+	local_irq_save(irq_flags);
+	debug_check_no_locks_freed(objp, obj_size(cachep));
+	/* The object-tracking check is skipped for SLAB_DEBUG_OBJECTS caches. */
+	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+		debug_check_no_obj_freed(objp, obj_size(cachep));
+	__cache_free(cachep, objp);
+	local_irq_restore(irq_flags);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+/**
+ * kfree - free previously allocated memory
+ * @objp: pointer returned by kmalloc.
+ *
+ * Nothing is done if @objp is NULL or ZERO_SIZE_PTR.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *objp)
+{
+	struct kmem_cache *cachep;
+	unsigned long irq_flags;
+
+	if (unlikely(ZERO_OR_NULL_PTR(objp)))
+		return;
+
+	local_irq_save(irq_flags);
+	kfree_debugcheck(objp);
+	/* The owning cache is derived from the object's address. */
+	cachep = virt_to_cache(objp);
+	debug_check_no_locks_freed(objp, obj_size(cachep));
+	debug_check_no_obj_freed(objp, obj_size(cachep));
+	__cache_free(cachep, (void *)objp);
+	local_irq_restore(irq_flags);
+}
+EXPORT_SYMBOL(kfree);
+
+/* Report the object size of @cachep as given by obj_size(). */
+unsigned int kmem_cache_size(struct kmem_cache *cachep)
+{
+	unsigned int size = obj_size(cachep);
+
+	return size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+/* Return the name string stored in @cachep (not a copy). */
+const char *kmem_cache_name(struct kmem_cache *cachep)
+{
+	const char *name = cachep->name;
+
+	return name;
+}
+EXPORT_SYMBOL_GPL(kmem_cache_name);
+
+/*
+ * This initializes kmem_list3 or resizes various caches for all nodes.
+ *
+ * For every online node a shared array (if cachep->shared is nonzero) and,
+ * when use_alien_caches is set, an alien cache are allocated. A node that
+ * already has a kmem_list3 gets its old shared array flushed with
+ * free_block() and replaced; otherwise a new kmem_list3 is allocated and
+ * installed.
+ *
+ * Returns 0 on success or -ENOMEM. On failure the nodes handled so far are
+ * rolled back only if cachep->next.next is NULL.
+ */
+static int alloc_kmemlist(struct kmem_cache *cachep)
+{
+ int node;
+ struct kmem_list3 *l3;
+ struct array_cache *new_shared;
+ struct array_cache **new_alien = NULL;
+
+ for_each_online_node(node) {
+
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit);
+ if (!new_alien)
+ goto fail;
+ }
+
+ new_shared = NULL;
+ if (cachep->shared) {
+ new_shared = alloc_arraycache(node,
+ cachep->shared*cachep->batchcount,
+ 0xbaadf00d);
+ if (!new_shared) {
+ free_alien_cache(new_alien);
+ goto fail;
+ }
+ }
+
+ l3 = cachep->nodelists[node];
+ if (l3) { /* node already set up: swap in the new arrays */
+ struct array_cache *shared = l3->shared;
+
+ spin_lock_irq(&l3->list_lock);
+
+ if (shared)
+ free_block(cachep, shared->entry,
+ shared->avail, node);
+
+ l3->shared = new_shared;
+ if (!l3->alien) { /* an existing alien cache is kept */
+ l3->alien = new_alien;
+ new_alien = NULL;
+ }
+ l3->free_limit = (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&l3->list_lock);
+ kfree(shared);
+ free_alien_cache(new_alien); /* only non-NULL if it was not installed above */
+ continue;
+ }
+ l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+ if (!l3) {
+ free_alien_cache(new_alien);
+ kfree(new_shared);
+ goto fail;
+ }
+
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ l3->shared = new_shared;
+ l3->alien = new_alien;
+ l3->free_limit = (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ cachep->nodelists[node] = l3;
+ }
+ return 0;
+
+fail:
+ if (!cachep->next.next) {
+ /* Cache is not active yet. Roll back what we did */
+ node--; /* the node that failed has nothing installed */
+ while (node >= 0) {
+ if (cachep->nodelists[node]) {
+ l3 = cachep->nodelists[node];
+
+ kfree(l3->shared);
+ free_alien_cache(l3->alien);
+ kfree(l3);
+ cachep->nodelists[node] = NULL;
+ }
+ node--;
+ }
+ }
+ return -ENOMEM;
+}
+
+struct ccupdate_struct { /* argument block handed to do_ccupdate_local() */
+ struct kmem_cache *cachep; /* cache whose per-cpu arrays are being swapped */
+ struct array_cache *new[NR_CPUS]; /* in: new array per cpu; out: the array it replaced */
+};
+
+/*
+ * Executed on each CPU with interrupts off: install this CPU's new array
+ * cache from @info (a struct ccupdate_struct) and leave the array it
+ * replaces in the same slot for the caller to dispose of.
+ */
+static void do_ccupdate_local(void *info)
+{
+	struct ccupdate_struct *update = info;
+	struct array_cache *prev;
+	int cpu;
+
+	check_irq_off();
+	prev = cpu_cache_get(update->cachep);
+	cpu = smp_processor_id();
+
+	update->cachep->array[cpu] = update->new[cpu];
+	update->new[cpu] = prev;
+}
+
+/*
+ * Replace the per-cpu array caches of @cachep with new ones sized by @limit
+ * and @batchcount, record the new tunables (including @shared) and rebuild
+ * the per-node lists through alloc_kmemlist().
+ *
+ * Always called with the cache_chain_mutex held
+ *
+ * Returns -ENOMEM if an allocation fails before anything was changed,
+ * otherwise the result of alloc_kmemlist().
+ */
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared)
+{
+ struct ccupdate_struct *new;
+ int i;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ for_each_online_cpu(i) {
+ new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+ batchcount);
+ if (!new->new[i]) {
+ for (i--; i >= 0; i--) /* unused slots are NULL from kzalloc() */
+ kfree(new->new[i]);
+ kfree(new);
+ return -ENOMEM;
+ }
+ }
+ new->cachep = cachep;
+
+ on_each_cpu(do_ccupdate_local, (void *)new, 1); /* each cpu swaps its array; old ones end up in new->new[] */
+
+ check_irq_on();
+ cachep->batchcount = batchcount;
+ cachep->limit = limit;
+ cachep->shared = shared;
+
+ for_each_online_cpu(i) {
+ struct array_cache *ccold = new->new[i]; /* the array this cpu used before */
+ if (!ccold)
+ continue;
+ spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
+ spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ kfree(ccold);
+ }
+ kfree(new);
+ return alloc_kmemlist(cachep);
+}
+
+/*
+ * Choose default tunables for @cachep from its object size and apply them
+ * with do_tune_cpucache(). Called with cache_chain_mutex held always.
+ *
+ * Returns the result of do_tune_cpucache(); a failure is also logged.
+ */
+static int enable_cpucache(struct kmem_cache *cachep)
+{
+	int limit, batchcount, shared;
+	int ret;
+
+	/*
+	 * The head array serves three purposes:
+	 * - create a LIFO ordering, i.e. return objects that are cache-warm
+	 * - reduce the number of spinlock operations.
+	 * - reduce the number of linked list operations on the slab and
+	 *   bufctl chains: array operations are cheaper.
+	 * The numbers are guessed, we should auto-tune as described by
+	 * Bonwick.
+	 */
+	limit = cachep->buffer_size > 131072 ? 1 :
+		cachep->buffer_size > PAGE_SIZE ? 8 :
+		cachep->buffer_size > 1024 ? 24 :
+		cachep->buffer_size > 256 ? 54 : 120;
+
+	/*
+	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
+	 * allocation behaviour: most allocations on one cpu, most frees on
+	 * another. These cases need an efficient way to pass objects between
+	 * cpus, which the shared array provides; it replaces Bonwick's
+	 * magazine layer.
+	 * On uniprocessor it is functionally equivalent to (but less
+	 * efficient than) a larger limit, hence it is off by default there.
+	 */
+	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
+		shared = 8;
+	else
+		shared = 0;
+
+#if DEBUG
+	/*
+	 * With debugging enabled, a large batchcount leads to excessively
+	 * long periods with local interrupts disabled, so cap it.
+	 */
+	if (limit > 32)
+		limit = 32;
+#endif
+	batchcount = (limit + 1) / 2;
+
+	ret = do_tune_cpucache(cachep, limit, batchcount, shared);
+	if (ret)
+		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+		       cachep->name, -ret);
+	return ret;
+}
+
+/*
+ * Drain an array cache, taking the l3 list_lock only when there is really
+ * something to free. The l3 list_lock also protects the array_cache when
+ * drain_array() is used on the shared array.
+ *
+ * An array that was touched since the last call is only marked untouched
+ * unless @force is set. Otherwise all entries (@force) or about a fifth of
+ * the array's limit are freed; if that is more than what is available,
+ * half of the available entries (rounded up) are freed instead.
+ */
+void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+		 struct array_cache *ac, int force, int node)
+{
+	int tofree;
+
+	if (!ac || !ac->avail)
+		return;
+
+	if (ac->touched && !force) {
+		ac->touched = 0;
+		return;
+	}
+
+	spin_lock_irq(&l3->list_lock);
+	/* Re-check under the lock: the array may have emptied meanwhile. */
+	if (ac->avail) {
+		if (force)
+			tofree = ac->avail;
+		else
+			tofree = (ac->limit + 4) / 5;
+		if (tofree > ac->avail)
+			tofree = (ac->avail + 1) / 2;
+
+		free_block(cachep, ac->entry, tofree, node);
+		ac->avail -= tofree;
+		memmove(ac->entry, &(ac->entry[tofree]),
+			sizeof(void *) * ac->avail);
+	}
+	spin_unlock_irq(&l3->list_lock);
+}
+
+/**
+ * cache_reap - Reclaim memory from caches.
+ * @w: work descriptor
+ *
+ * Called from workqueue/eventd every few seconds.
+ * Purpose:
+ * - clear the per-cpu caches for this CPU.
+ * - return freeable pages to the main free memory pool.
+ *
+ * If we cannot acquire the cache chain mutex then just give up - we'll try
+ * again on the next iteration.
+ *
+ * Whether or not the mutex was obtained, the work item re-queues itself
+ * with a delay of round_jiffies_relative(REAPTIMEOUT_CPUC).
+ */
+static void cache_reap(struct work_struct *w)
+{
+ struct kmem_cache *searchp;
+ struct kmem_list3 *l3;
+ int node = numa_node_id();
+ struct delayed_work *work =
+ container_of(w, struct delayed_work, work);
+
+ if (!mutex_trylock(&cache_chain_mutex))
+ /* Give up. Setup the next iteration. */
+ goto out;
+
+ list_for_each_entry(searchp, &cache_chain, next) {
+ check_irq_on();
+
+ /*
+ * We only take the l3 lock if absolutely necessary and we
+ * have established with reasonable certainty that
+ * we can do some work if the lock was obtained.
+ */
+ l3 = searchp->nodelists[node];
+
+ reap_alien(searchp, l3);
+
+ drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); /* this cpu's array, not forced */
+
+ /*
+ * These are racy checks but it does not matter
+ * if we skip one check or scan twice.
+ */
+ if (time_after(l3->next_reap, jiffies)) /* node list not due yet */
+ goto next;
+
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
+
+ drain_array(searchp, l3, l3->shared, 0, node);
+
+ if (l3->free_touched) /* free list was used recently: only clear the mark */
+ l3->free_touched = 0;
+ else {
+ int freed;
+
+ freed = drain_freelist(searchp, l3, (l3->free_limit +
+ 5 * searchp->num - 1) / (5 * searchp->num)); /* free_limit / (5 * num), rounded up */
+ STATS_ADD_REAPED(searchp, freed);
+ }
+next:
+ cond_resched();
+ }
+ check_irq_on();
+ mutex_unlock(&cache_chain_mutex);
+ next_reap_node();
+out:
+ /* Set up the next iteration */
+ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+}
+
+#ifdef CONFIG_SLABINFO
+
+static void print_slabinfo_header(struct seq_file *m)
+{
+ /*
+ * Output format version, so at least we can change it
+ * without _too_ many complaints.
+ *
+ * The version line is followed by a single line of column names
+ * for the values s_show() prints; the globalstat and cpustat
+ * columns are only named when STATS is set.
+ */
+#if STATS
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+ seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+#if STATS
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+ "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+#endif
+ seq_putc(m, '\n');
+}
+
+/*
+ * seq_file start callback for slabinfo: takes cache_chain_mutex (dropped
+ * again in s_stop()), prints the header when reading starts at position
+ * zero and returns the cache_chain entry at *pos.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&cache_chain_mutex);
+
+	if (*pos == 0)
+		print_slabinfo_header(m);
+
+	return seq_list_start(&cache_chain, *pos);
+}
+
+/* seq_file next callback: advance to the following cache_chain entry. */
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	void *next = seq_list_next(p, &cache_chain, pos);
+
+	return next;
+}
+
+/* seq_file stop callback: release the mutex taken by the start callback. */
+static void s_stop(struct seq_file *m, void *p)
+{
+	struct mutex *chain_lock = &cache_chain_mutex;
+
+	mutex_unlock(chain_lock);
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
+ struct slab *slabp;
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs = 0;
+ unsigned long num_slabs, free_objects = 0, shared_avail = 0;
+ const char *name;
+ char *error = NULL; /* first accounting inconsistency found, if any */
+ int node;
+ struct kmem_list3 *l3;
+ /*
+ * Print one slabinfo line for the cache behind @p. The slab lists
+ * of every online node are walked under that node's list_lock to
+ * count slabs and objects; the first inconsistency noticed on the
+ * way is reported with printk() but does not stop the output.
+ * Always returns 0.
+ */
+ active_objs = 0;
+ num_slabs = 0;
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ check_irq_on();
+ spin_lock_irq(&l3->list_lock);
+
+ list_for_each_entry(slabp, &l3->slabs_full, list) {
+ if (slabp->inuse != cachep->num && !error)
+ error = "slabs_full accounting error";
+ active_objs += cachep->num;
+ active_slabs++;
+ }
+ list_for_each_entry(slabp, &l3->slabs_partial, list) {
+ if (slabp->inuse == cachep->num && !error)
+ error = "slabs_partial inuse accounting error";
+ if (!slabp->inuse && !error)
+ error = "slabs_partial/inuse accounting error";
+ active_objs += slabp->inuse;
+ active_slabs++;
+ }
+ list_for_each_entry(slabp, &l3->slabs_free, list) {
+ if (slabp->inuse && !error)
+ error = "slabs_free/inuse accounting error";
+ num_slabs++; /* free slabs only; active ones are added after the loop */
+ }
+ free_objects += l3->free_objects;
+ if (l3->shared)
+ shared_avail += l3->shared->avail;
+
+ spin_unlock_irq(&l3->list_lock);
+ }
+ num_slabs += active_slabs;
+ num_objs = num_slabs * cachep->num;
+ if (num_objs - active_objs != free_objects && !error)
+ error = "free_objects accounting error";
+
+ name = cachep->name;
+ if (error)
+ printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+ name, active_objs, num_objs, cachep->buffer_size,
+ cachep->num, (1 << cachep->gfporder));
+ seq_printf(m, " : tunables %4u %4u %4u",
+ cachep->limit, cachep->batchcount, cachep->shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ active_slabs, num_slabs, shared_avail);
+#if STATS
+ { /* list3 stats; NOTE: the format string below is continued with a backslash, so the leading whitespace of its second line is part of the output */
+ unsigned long high = cachep->high_mark;
+ unsigned long allocs = cachep->num_allocations;
+ unsigned long grown = cachep->grown;
+ unsigned long reaped = cachep->reaped;
+ unsigned long errors = cachep->errors;
+ unsigned long max_freeable = cachep->max_freeable;
+ unsigned long node_allocs = cachep->node_allocs;
+ unsigned long node_frees = cachep->node_frees;
+ unsigned long overflows = cachep->node_overflow;
+
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+ %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
+ }
+ /* cpu stats */
+ {
+ unsigned long allochit = atomic_read(&cachep->allochit);
+ unsigned long allocmiss = atomic_read(&cachep->allocmiss);
+ unsigned long freehit = atomic_read(&cachep->freehit);
+ unsigned long freemiss = atomic_read(&cachep->freemiss);
+
+ seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
+ allochit, allocmiss, freehit, freemiss);
+ }
+#endif
+ seq_putc(m, '\n');
+ return 0;
+}
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout, one line per cache (column names as printed by
+ * print_slabinfo_header()):
+ * name
+ * active_objs
+ * num_objs
+ * objsize
+ * objperslab
+ * pagesperslab
+ * tunables: limit, batchcount, sharedfactor
+ * slabdata: active_slabs, num_slabs, sharedavail
+ * + globalstat and cpustat values when STATS is set
+ */
+
+static const struct seq_operations slabinfo_op = {
+ .start = s_start, /* takes cache_chain_mutex, prints the header at position 0 */
+ .next = s_next,
+ .stop = s_stop, /* releases cache_chain_mutex */
+ .show = s_show, /* one line per cache */
+};
+
+#define MAX_SLABINFO_WRITE 128
+/**
+ * slabinfo_write - Tuning for the slab allocator
+ * @file: unused
+ * @buffer: user buffer
+ * @count: data length
+ * @ppos: unused
+ *
+ * Expects "<cache name> <limit> <batchcount> <shared>" and applies the
+ * three values to the named cache with do_tune_cpucache().
+ *
+ * Returns @count on success. Values that fail the range check (limit < 1,
+ * batchcount < 1, batchcount > limit or shared < 0) are ignored and @count
+ * is still returned. Returns -EINVAL if the input is longer than
+ * MAX_SLABINFO_WRITE, is malformed, or names no known cache; -EFAULT if
+ * the user buffer cannot be read; or the error from do_tune_cpucache().
+ */
+ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+		       size_t count, loff_t *ppos)
+{
+	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
+	int limit, batchcount, shared, res;
+	struct kmem_cache *cachep;
+
+	if (count > MAX_SLABINFO_WRITE)
+		return -EINVAL;
+	if (copy_from_user(&kbuf, buffer, count))
+		return -EFAULT;
+	/*
+	 * Terminate directly after the bytes that were copied in. The rest
+	 * of kbuf is uninitialised stack and must never be parsed.
+	 */
+	kbuf[count] = '\0';
+
+	/* Split "<name> <numbers...>" at the first blank. */
+	tmp = strchr(kbuf, ' ');
+	if (!tmp)
+		return -EINVAL;
+	*tmp = '\0';
+	tmp++;
+	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
+		return -EINVAL;
+
+	/* Find the cache in the chain of caches. */
+	mutex_lock(&cache_chain_mutex);
+	res = -EINVAL;
+	list_for_each_entry(cachep, &cache_chain, next) {
+		if (!strcmp(cachep->name, kbuf)) {
+			if (limit < 1 || batchcount < 1 ||
+					batchcount > limit || shared < 0) {
+				/* Out-of-range values are silently ignored. */
+				res = 0;
+			} else {
+				res = do_tune_cpucache(cachep, limit,
+						       batchcount, shared);
+			}
+			break;
+		}
+	}
+	mutex_unlock(&cache_chain_mutex);
+	if (res >= 0)
+		res = count;
+	return res;
+}
+
+/* open() handler for slabinfo: attach the slabinfo_op seq_file iterator. */
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &slabinfo_op);
+
+	return ret;
+}
+
+static const struct file_operations proc_slabinfo_operations = { /* registered as "slabinfo" by slab_proc_init() */
+ .open = slabinfo_open,
+ .read = seq_read,
+ .write = slabinfo_write, /* cache tuning, see slabinfo_write() */
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+/*
+ * seq_file start callback for the slab_allocators file: takes
+ * cache_chain_mutex and returns the cache_chain entry at *pos.
+ */
+static void *leaks_start(struct seq_file *m, loff_t *pos)
+{
+	void *entry;
+
+	mutex_lock(&cache_chain_mutex);
+	entry = seq_list_start(&cache_chain, *pos);
+
+	return entry;
+}
+
+static inline int add_caller(unsigned long *n, unsigned long v)
+{ /*
+ * Count one occurrence of caller address v in table n.
+ * n[0] is the limit that n[1] (the number of entries) is compared
+ * against; entries are (address, count) pairs starting at n[2] and
+ * kept sorted by address. Returns 1 if v was counted (or is 0 and
+ * therefore ignored), 0 if adding a new entry made n[1] reach n[0];
+ * in that case nothing is inserted.
+ */
+ unsigned long *p; /* start of the range still being searched */
+ int l; /* number of entries in that range */
+ if (!v)
+ return 1;
+ l = n[1];
+ p = n + 2;
+ while (l) { /* binary search for v */
+ int i = l/2;
+ unsigned long *q = p + 2 * i; /* middle entry */
+ if (*q == v) {
+ q[1]++;
+ return 1;
+ }
+ if (*q > v) {
+ l = i;
+ } else {
+ p = q + 2;
+ l -= i + 1;
+ }
+ }
+ if (++n[1] == n[0])
+ return 0;
+ memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); /* shift everything from p onwards up by one pair */
+ p[0] = v;
+ p[1] = 1;
+ return 1;
+}
+
+/*
+ * Feed the recorded caller of every active object in slab @s to
+ * add_caller(). Does nothing when the table is already marked full
+ * (n[0] == n[1]) and stops as soon as add_caller() reports failure.
+ */
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+{
+	void *p;
+	int i;
+
+	if (n[0] == n[1])
+		return;
+
+	p = s->s_mem;
+	for (i = 0; i < c->num; i++, p += c->buffer_size) {
+		if (slab_bufctl(s)[i] == BUFCTL_ACTIVE &&
+		    !add_caller(n, (unsigned long)*dbg_userword(c, p)))
+			return;
+	}
+}
+
+/*
+ * Print @address symbolically as "name+offset/size", followed by
+ * " [module]" when a module name was returned. Without CONFIG_KALLSYMS,
+ * or when the lookup fails, the raw pointer value is printed instead.
+ */
+static void show_symbol(struct seq_file *m, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char name[KSYM_NAME_LEN], modname[MODULE_NAME_LEN];
+	unsigned long size, offset;
+	int unresolved;
+
+	unresolved = lookup_symbol_attrs(address, &size, &offset, modname, name);
+	if (!unresolved) {
+		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
+		if (modname[0])
+			seq_printf(m, " [%s]", modname);
+		return;
+	}
+#endif
+	seq_printf(m, "%p", (void *)address);
+}
+
+static int leaks_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ const char *name;
+ unsigned long *n = m->private; /* caller table, see add_caller() */
+ int node;
+ int i;
+ /*
+ * Print, for one cache, each recorded caller address together with
+ * the number of active objects it accounts for. Caches without both
+ * SLAB_STORE_USER and SLAB_RED_ZONE are skipped. If the caller table
+ * fills up, a table twice as large is installed and the entry is
+ * arranged to be shown again.
+ */
+ if (!(cachep->flags & SLAB_STORE_USER))
+ return 0;
+ if (!(cachep->flags & SLAB_RED_ZONE))
+ return 0;
+
+ /* OK, we can do it */
+
+ n[1] = 0; /* start with an empty table for this cache */
+
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ check_irq_on();
+ spin_lock_irq(&l3->list_lock);
+
+ list_for_each_entry(slabp, &l3->slabs_full, list)
+ handle_slab(n, cachep, slabp);
+ list_for_each_entry(slabp, &l3->slabs_partial, list)
+ handle_slab(n, cachep, slabp);
+ spin_unlock_irq(&l3->list_lock);
+ }
+ name = cachep->name;
+ if (n[0] == n[1]) {
+ /* Increase the buffer size */
+ mutex_unlock(&cache_chain_mutex); /* dropped around the GFP_KERNEL allocation */
+ m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+ if (!m->private) {
+ /* Too bad, we are really out */
+ m->private = n;
+ mutex_lock(&cache_chain_mutex);
+ return -ENOMEM;
+ }
+ *(unsigned long *)m->private = n[0] * 2; /* new table holds twice as many pairs */
+ kfree(n);
+ mutex_lock(&cache_chain_mutex);
+ /* Now make sure this entry will be retried */
+ m->count = m->size;
+ return 0;
+ }
+ for (i = 0; i < n[1]; i++) {
+ seq_printf(m, "%s: %lu ", name, n[2*i+3]); /* n[2*i+3]: count of entry i */
+ show_symbol(m, n[2*i+2]); /* n[2*i+2]: caller address of entry i */
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+
+static const struct seq_operations slabstats_op = { /* iterator behind proc_slabstats_operations */
+ .start = leaks_start,
+ .next = s_next, /* shared with slabinfo_op */
+ .stop = s_stop, /* shared with slabinfo_op */
+ .show = leaks_show,
+};
+
+/*
+ * open() handler for the slab_allocators file: allocate a zeroed page as
+ * the caller table used by leaks_show(), store its limit in the first
+ * word and attach it to the seq_file as private data.
+ *
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
+ * error from seq_open() (the table is freed again in that case).
+ */
+static int slabstats_open(struct inode *inode, struct file *file)
+{
+	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	struct seq_file *m;
+	int ret;
+
+	if (!n)
+		return -ENOMEM;
+
+	ret = seq_open(file, &slabstats_op);
+	if (ret) {
+		kfree(n);
+		return ret;
+	}
+
+	m = file->private_data;
+	*n = PAGE_SIZE / (2 * sizeof(unsigned long));
+	m->private = n;
+	return 0;
+}
+
+static const struct file_operations proc_slabstats_operations = { /* registered as "slab_allocators" by slab_proc_init() */
+ .open = slabstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private, /* NOTE(review): presumably also frees m->private, the table set up in slabstats_open() - confirm */
+};
+#endif
+
+static int __init slab_proc_init(void)
+{ /* creates the proc entries served by this file; always returns 0 */
+ proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); /* return value is not checked */
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); /* return value is not checked */
+#endif
+ return 0;
+}
+module_init(slab_proc_init);
+#endif
+
+/**
+ * ksize - get the actual amount of memory allocated for a given object
+ * @objp: Pointer to the object
+ *
+ * kmalloc may round a request up internally and return more memory than
+ * was asked for. ksize() reports the amount that was really allocated;
+ * the caller may use all of it, even though a smaller size was passed to
+ * kmalloc. @objp must point to a valid object previously allocated with
+ * kmalloc() or kmem_cache_alloc(), and the object must not be freed while
+ * the call is in progress.
+ *
+ * Returns 0 for ZERO_SIZE_PTR. A NULL @objp is a BUG.
+ */
+size_t ksize(const void *objp)
+{
+	struct kmem_cache *cachep;
+
+	BUG_ON(!objp);
+	if (unlikely(objp == ZERO_SIZE_PTR))
+		return 0;
+
+	cachep = virt_to_cache(objp);
+	return obj_size(cachep);
+}
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 0000000..bf7e8fc
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,647 @@
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall <mpm@selenic.com> 12/30/03
+ *
+ * NUMA support by Paul Mundt, 2007.
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is as little as 2 bytes, however typically most architectures
+ * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
+ *
+ * The slob heap is a set of linked lists of pages from alloc_pages(),
+ * and within each page, there is a singly-linked list of free blocks
+ * (slob_t). The heap is grown on demand. To reduce fragmentation,
+ * heap pages are segregated into three lists, with objects less than
+ * 256 bytes, objects less than 1024 bytes, and all other objects.
+ *
+ * Allocation from heap involves first searching for a page with
+ * sufficient free blocks (using a next-fit-like approach) followed by
+ * a first-fit scan of the page. Deallocation inserts objects back
+ * into the free list in address order, so this is effectively an
+ * address-ordered first fit.
+ *
+ * Above this is an implementation of kmalloc/kfree. Blocks returned
+ * from kmalloc are prepended with a 4-byte header with the kmalloc size.
+ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
+ * alloc_pages() directly, allocating compound pages so the page order
+ * does not have to be separately tracked, and also stores the exact
+ * allocation size in page->private so that it can be used to accurately
+ * provide ksize(). These objects are detected in kfree() because slob_page()
+ * is false for them.
+ *
+ * SLAB is emulated on top of SLOB by simply calling constructors and
+ * destructors for every SLAB allocation. Objects are returned with the
+ * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
+ * case the low-level allocator will fragment blocks to create the proper
+ * alignment. Again, objects of page-size or greater are allocated by
+ * calling alloc_pages(). As SLAB objects know their size, no separate
+ * size bookkeeping is necessary and there is essentially no allocation
+ * space overhead, and compound pages aren't needed for multi-page
+ * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, alloc_pages_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted in to the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+/*
+ * slob_block has a field 'units', which indicates size of block if +ve,
+ * or offset of next block if -ve (in SLOB_UNITs).
+ *
+ * Free blocks of size 1 unit simply contain the offset of the next block.
+ * Those with larger size contain their size in the first SLOB_UNIT of
+ * memory, and the offset of the next free block in the second SLOB_UNIT.
+ */
+ /*
+ * slobidx_t holds block sizes and in-page offsets counted in units of
+ * sizeof(slob_t). A signed 16-bit index is enough as long as a page is
+ * no larger than 32767 two-byte units; bigger pages need 32 bits.
+ */
+ #if PAGE_SIZE <= (32767 * 2)
+ typedef s16 slobidx_t;
+ #else
+ typedef s32 slobidx_t;
+ #endif
+
+struct slob_block {
+ slobidx_t units;
+};
+typedef struct slob_block slob_t;
+
+/*
+ * We use struct page fields to manage some slob allocation aspects,
+ * however to avoid the horrible mess in include/linux/mm_types.h, we'll
+ * just define our own struct page type variant here.
+ */
+ struct slob_page {
+ /* The anonymous struct overlays the real struct page stored in 'page'. */
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ slobidx_t units; /* free units left in page */
+ unsigned long pad[2];
+ slob_t *free; /* first free slob_t in page */
+ struct list_head list; /* linked list of free pages */
+ };
+ struct page page;
+ };
+ };
+ /*
+ * Compile-time check only: the build fails if struct slob_page and
+ * struct page ever differ in size.
+ */
+ static inline void struct_slob_page_wrong_size(void)
+ { BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
+
+/*
+ * free_slob_page: call before a slob_page is returned to the page allocator.
+ */
+static inline void free_slob_page(struct slob_page *sp)
+{
+ reset_page_mapcount(&sp->page);
+ sp->page.mapping = NULL;
+}
+
+/*
+ * All partially free slob pages go on these lists.
+ */
+#define SLOB_BREAK1 256
+#define SLOB_BREAK2 1024
+static LIST_HEAD(free_slob_small);
+static LIST_HEAD(free_slob_medium);
+static LIST_HEAD(free_slob_large);
+
+/*
+ * slob_page: True for all slob pages (false for bigblock pages)
+ */
+static inline int slob_page(struct slob_page *sp)
+{
+ return PageSlobPage((struct page *)sp);
+}
+
+ /* Mark the page as belonging to the slob heap. */
+ static inline void set_slob_page(struct slob_page *sp)
+ {
+ 	struct page *page = &sp->page;
+
+ 	__SetPageSlobPage(page);
+ }
+
+ /* Remove the slob-heap marking from the page. */
+ static inline void clear_slob_page(struct slob_page *sp)
+ {
+ 	struct page *page = &sp->page;
+
+ 	__ClearPageSlobPage(page);
+ }
+
+/*
+ * slob_page_free: true for pages on free_slob_pages list.
+ */
+static inline int slob_page_free(struct slob_page *sp)
+{
+ return PageSlobFree((struct page *)sp);
+}
+
+ /* Link the page at the head of the given free list and flag it as free. */
+ static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
+ {
+ 	struct page *page = &sp->page;
+
+ 	list_add(&sp->list, list);
+ 	__SetPageSlobFree(page);
+ }
+
+ /* Unlink the page from its free list and drop the free flag. */
+ static inline void clear_slob_page_free(struct slob_page *sp)
+ {
+ 	struct page *page = &sp->page;
+
+ 	list_del(&sp->list);
+ 	__ClearPageSlobFree(page);
+ }
+
+ /* Allocation granule: the size of one slob_t. */
+ #define SLOB_UNIT sizeof(slob_t)
+ /* Number of SLOB_UNITs needed to hold 'size' bytes, rounded up. */
+ #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
+ /* Alignment kmem_cache_create() applies to SLAB_HWCACHE_ALIGN caches. */
+ #define SLOB_ALIGN L1_CACHE_BYTES
+
+/*
+ * struct slob_rcu is inserted at the tail of allocated slob blocks, which
+ * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
+ * the block using call_rcu.
+ */
+struct slob_rcu {
+ struct rcu_head head;
+ int size;
+};
+
+/*
+ * slob_lock protects all slob allocator structures.
+ */
+static DEFINE_SPINLOCK(slob_lock);
+
+/*
+ * Encode the given size and next info into a free slob block s.
+ */
+static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t offset = next - base;
+
+ if (size > 1) {
+ s[0].units = size;
+ s[1].units = offset;
+ } else
+ s[0].units = -offset;
+}
+
+/*
+ * Return the size of a slob block.
+ */
+static slobidx_t slob_units(slob_t *s)
+{
+ if (s->units > 0)
+ return s->units;
+ return 1;
+}
+
+/*
+ * Return the next free slob block pointer after this one.
+ */
+static slob_t *slob_next(slob_t *s)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t next;
+
+ if (s[0].units < 0)
+ next = -s[0].units;
+ else
+ next = s[1].units;
+ return base+next;
+}
+
+/*
+ * Returns true if s is the last free block in its page.
+ */
+static int slob_last(slob_t *s)
+{
+ return !((unsigned long)slob_next(s) & ~PAGE_MASK);
+}
+
+ /*
+ * Get 2^order pages from the page allocator and return their kernel
+ * virtual address, or NULL on failure. With CONFIG_NUMA a node other than
+ * -1 directs the allocation to that node.
+ */
+ static void *slob_new_page(gfp_t gfp, int order, int node)
+ {
+ 	void *page;
+
+ #ifdef CONFIG_NUMA
+ 	if (node != -1)
+ 		page = alloc_pages_node(node, gfp, order);
+ 	else
+ #endif
+ 		page = alloc_pages(gfp, order);
+
+ 	return page ? page_address(page) : NULL;
+ }
+
+/*
+ * Allocate a slob block within a given slob_page sp.
+ */
+static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
+{
+ slob_t *prev, *cur, *aligned = 0;
+ int delta = 0, units = SLOB_UNITS(size);
+
+ for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
+ slobidx_t avail = slob_units(cur);
+
+ if (align) {
+ aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ delta = aligned - cur;
+ }
+ if (avail >= units + delta) { /* room enough? */
+ slob_t *next;
+
+ if (delta) { /* need to fragment head to align? */
+ next = slob_next(cur);
+ set_slob(aligned, avail - delta, next);
+ set_slob(cur, delta, aligned);
+ prev = cur;
+ cur = aligned;
+ avail = slob_units(cur);
+ }
+
+ next = slob_next(cur);
+ if (avail == units) { /* exact fit? unlink. */
+ if (prev)
+ set_slob(prev, slob_units(prev), next);
+ else
+ sp->free = next;
+ } else { /* fragment */
+ if (prev)
+ set_slob(prev, slob_units(prev), cur + units);
+ else
+ sp->free = cur + units;
+ set_slob(cur + units, avail - units, next);
+ }
+
+ sp->units -= units;
+ if (!sp->units)
+ clear_slob_page_free(sp);
+ return cur;
+ }
+ if (slob_last(cur))
+ return NULL;
+ }
+}
+
+/*
+ * slob_alloc: entry point into the slob allocator.
+ */
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+{
+ struct slob_page *sp;
+ struct list_head *prev;
+ struct list_head *slob_list;
+ slob_t *b = NULL;
+ unsigned long flags;
+
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+
+ spin_lock_irqsave(&slob_lock, flags);
+ /* Iterate through each partially free page, try to find room */
+ list_for_each_entry(sp, slob_list, list) {
+#ifdef CONFIG_NUMA
+ /*
+ * If there's a node specification, search for a partial
+ * page with a matching node id in the freelist.
+ */
+ if (node != -1 && page_to_nid(&sp->page) != node)
+ continue;
+#endif
+ /* Enough room on this page? */
+ if (sp->units < SLOB_UNITS(size))
+ continue;
+
+ /* Attempt to alloc */
+ prev = sp->list.prev;
+ b = slob_page_alloc(sp, size, align);
+ if (!b)
+ continue;
+
+ /* Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449) */
+ if (prev != slob_list->prev &&
+ slob_list->next != prev->next)
+ list_move_tail(slob_list, prev->next);
+ break;
+ }
+ spin_unlock_irqrestore(&slob_lock, flags);
+
+ /* Not enough space: must allocate a new page */
+ if (!b) {
+ b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);
+ if (!b)
+ return 0;
+ sp = (struct slob_page *)virt_to_page(b);
+ set_slob_page(sp);
+
+ spin_lock_irqsave(&slob_lock, flags);
+ sp->units = SLOB_UNITS(PAGE_SIZE);
+ sp->free = b;
+ INIT_LIST_HEAD(&sp->list);
+ set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+ set_slob_page_free(sp, slob_list);
+ b = slob_page_alloc(sp, size, align);
+ BUG_ON(!b);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ }
+ if (unlikely((gfp & __GFP_ZERO) && b))
+ memset(b, 0, size);
+ return b;
+}
+
+/*
+ * slob_free: entry point into the slob allocator.
+ */
+static void slob_free(void *block, int size)
+{
+ struct slob_page *sp;
+ slob_t *prev, *next, *b = (slob_t *)block;
+ slobidx_t units;
+ unsigned long flags;
+
+ if (unlikely(ZERO_OR_NULL_PTR(block)))
+ return;
+ BUG_ON(!size);
+
+ sp = (struct slob_page *)virt_to_page(block);
+ units = SLOB_UNITS(size);
+
+ spin_lock_irqsave(&slob_lock, flags);
+
+ if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
+ /* Go directly to page allocator. Do not pass slob allocator */
+ if (slob_page_free(sp))
+ clear_slob_page_free(sp);
+ clear_slob_page(sp);
+ free_slob_page(sp);
+ free_page((unsigned long)b);
+ goto out;
+ }
+
+ if (!slob_page_free(sp)) {
+ /* This slob page is about to become partially free. Easy! */
+ sp->units = units;
+ sp->free = b;
+ set_slob(b, units,
+ (void *)((unsigned long)(b +
+ SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
+ set_slob_page_free(sp, &free_slob_small);
+ goto out;
+ }
+
+ /*
+ * Otherwise the page is already partially free, so find reinsertion
+ * point.
+ */
+ sp->units += units;
+
+ if (b < sp->free) {
+ if (b + units == sp->free) {
+ units += slob_units(sp->free);
+ sp->free = slob_next(sp->free);
+ }
+ set_slob(b, units, sp->free);
+ sp->free = b;
+ } else {
+ prev = sp->free;
+ next = slob_next(prev);
+ while (b > next) {
+ prev = next;
+ next = slob_next(prev);
+ }
+
+ if (!slob_last(prev) && b + units == next) {
+ units += slob_units(next);
+ set_slob(b, units, slob_next(next));
+ } else
+ set_slob(b, units, next);
+
+ if (prev + slob_units(prev) == b) {
+ units = slob_units(b) + slob_units(prev);
+ set_slob(prev, units, slob_next(b));
+ } else
+ set_slob(prev, slob_units(prev), b);
+ }
+out:
+ spin_unlock_irqrestore(&slob_lock, flags);
+}
+
+/*
+ * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
+ */
+
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
+#endif
+
+ /*
+ * kmalloc backend.
+ *
+ * Requests smaller than PAGE_SIZE minus the header are served from the
+ * slob heap: an align-byte header holding the requested size is placed in
+ * front of the object and the pointer just past it is returned. Larger
+ * requests go straight to the page allocator as compound pages with the
+ * size recorded in page->private. A zero size returns ZERO_SIZE_PTR.
+ *
+ * Returns NULL on allocation failure.
+ */
+ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+ {
+ 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ 	unsigned int *hdr;
+ 	void *ret;
+
+ 	if (size >= PAGE_SIZE - align) {
+ 		ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
+ 		if (ret) {
+ 			struct page *page = virt_to_page(ret);
+
+ 			page->private = size;
+ 		}
+ 		return ret;
+ 	}
+
+ 	if (!size)
+ 		return ZERO_SIZE_PTR;
+
+ 	hdr = slob_alloc(size + align, gfp, align, node);
+ 	if (!hdr)
+ 		return NULL;
+
+ 	*hdr = size;
+ 	return (void *)hdr + align;
+ }
+EXPORT_SYMBOL(__kmalloc_node);
+
+ /*
+ * Free memory obtained from kmalloc. NULL and ZERO_SIZE_PTR are ignored.
+ * Objects living in slob pages are returned to the slob heap using the
+ * size stored in the header in front of them; anything else is a page
+ * allocator block and is released with put_page().
+ */
+ void kfree(const void *block)
+ {
+ 	struct slob_page *sp;
+ 	unsigned int *hdr;
+ 	int align;
+
+ 	if (unlikely(ZERO_OR_NULL_PTR(block)))
+ 		return;
+
+ 	sp = (struct slob_page *)virt_to_page(block);
+ 	if (!slob_page(sp)) {
+ 		put_page(&sp->page);
+ 		return;
+ 	}
+
+ 	align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ 	hdr = (unsigned int *)(block - align);
+ 	slob_free(hdr, *hdr + align);
+ }
+EXPORT_SYMBOL(kfree);
+
+ /*
+ * Usable size of a kmalloc'ed block. Only valid for kmalloc memory, not
+ * for objects from kmem_cache_alloc. For slob-heap objects this is the
+ * recorded request size rounded up to whole SLOB_UNITs; for page
+ * allocator blocks it is the size kept in page->private.
+ * A NULL block triggers BUG(); ZERO_SIZE_PTR yields 0.
+ */
+ size_t ksize(const void *block)
+ {
+ 	struct slob_page *sp;
+ 	unsigned int *hdr;
+ 	int align;
+
+ 	BUG_ON(!block);
+ 	if (unlikely(block == ZERO_SIZE_PTR))
+ 		return 0;
+
+ 	sp = (struct slob_page *)virt_to_page(block);
+ 	if (!slob_page(sp))
+ 		return sp->page.private;
+
+ 	align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ 	hdr = (unsigned int *)(block - align);
+ 	return SLOB_UNITS(*hdr) * SLOB_UNIT;
+ }
+
+ /*
+ * Cache descriptor of the slob SLAB emulation, filled in by
+ * kmem_cache_create().
+ */
+ struct kmem_cache {
+ /* size: object size in bytes, including the slob_rcu footer for
+ * SLAB_DESTROY_BY_RCU caches; align: alignment used for each object */
+ unsigned int size, align;
+ /* SLAB_* flags passed at creation */
+ unsigned long flags;
+ /* caller's name string; the pointer is stored, not a copy */
+ const char *name;
+ /* optional constructor run on every allocated object */
+ void (*ctor)(void *);
+ };
+
+ /*
+ * Create a cache descriptor for objects of the given size.
+ *
+ * The descriptor itself comes from the slob heap. SLAB_DESTROY_BY_RCU
+ * caches get sizeof(struct slob_rcu) added to the object size for the RCU
+ * footer. The effective alignment is the largest of ARCH_SLAB_MINALIGN,
+ * the caller's align, and SLOB_ALIGN when SLAB_HWCACHE_ALIGN is set.
+ *
+ * Returns the new cache, or NULL on allocation failure (panics instead if
+ * SLAB_PANIC is set).
+ */
+ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+ 	size_t align, unsigned long flags, void (*ctor)(void *))
+ {
+ 	struct kmem_cache *c;
+
+ 	c = slob_alloc(sizeof(struct kmem_cache),
+ 		GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
+ 	if (!c) {
+ 		if (flags & SLAB_PANIC)
+ 			panic("Cannot create slab cache %s\n", name);
+ 		return c;
+ 	}
+
+ 	c->name = name;
+ 	c->flags = flags;
+ 	c->ctor = ctor;
+
+ 	c->size = size;
+ 	/* leave room for rcu footer at the end of object */
+ 	if (flags & SLAB_DESTROY_BY_RCU)
+ 		c->size += sizeof(struct slob_rcu);
+
+ 	/* ignore alignment unless it's forced */
+ 	c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+ 	if (c->align < ARCH_SLAB_MINALIGN)
+ 		c->align = ARCH_SLAB_MINALIGN;
+ 	if (c->align < align)
+ 		c->align = align;
+
+ 	return c;
+ }
+EXPORT_SYMBOL(kmem_cache_create);
+
+ /*
+ * Release a cache descriptor created by kmem_cache_create(). Only the
+ * descriptor is freed; objects allocated from the cache are not tracked.
+ */
+ void kmem_cache_destroy(struct kmem_cache *c)
+ {
+ 	slob_free(c, sizeof(*c));
+ }
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+ /*
+ * Allocate one object from cache c, optionally on a specific NUMA node.
+ *
+ * Objects smaller than a page come from the slob heap with the cache's
+ * alignment; larger ones come directly from the page allocator. The
+ * cache's constructor, if any, is run on the new object.
+ *
+ * Returns the object, or NULL if the allocation failed. The constructor
+ * is not invoked in that case, so it never sees a NULL pointer.
+ */
+ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+ {
+ 	void *b;
+
+ 	if (c->size < PAGE_SIZE)
+ 		b = slob_alloc(c->size, flags, c->align, node);
+ 	else
+ 		b = slob_new_page(flags, get_order(c->size), node);
+
+ 	if (b && c->ctor)
+ 		c->ctor(b);
+
+ 	return b;
+ }
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+ /*
+ * Release a cache object of the given size: page-sized and larger objects
+ * go back to the page allocator, smaller ones back to the slob heap.
+ */
+ static void __kmem_cache_free(void *b, int size)
+ {
+ 	if (size < PAGE_SIZE) {
+ 		slob_free(b, size);
+ 		return;
+ 	}
+
+ 	free_pages((unsigned long)b, get_order(size));
+ }
+
+ /*
+ * RCU callback queued by kmem_cache_free(). The rcu_head sits at the
+ * start of the slob_rcu footer at the tail of the object; step back from
+ * the footer to the start of the object and free it.
+ */
+ static void kmem_rcu_free(struct rcu_head *head)
+ {
+ 	struct slob_rcu *footer = container_of(head, struct slob_rcu, head);
+ 	void *obj = (void *)footer - (footer->size - sizeof(struct slob_rcu));
+
+ 	__kmem_cache_free(obj, footer->size);
+ }
+
+ /*
+ * Free object b back to cache c. For SLAB_DESTROY_BY_RCU caches the
+ * actual release is deferred through call_rcu(), using the slob_rcu
+ * footer reserved at the end of the object; otherwise it is freed
+ * immediately.
+ */
+ void kmem_cache_free(struct kmem_cache *c, void *b)
+ {
+ 	struct slob_rcu *footer;
+
+ 	if (likely(!(c->flags & SLAB_DESTROY_BY_RCU))) {
+ 		__kmem_cache_free(b, c->size);
+ 		return;
+ 	}
+
+ 	footer = b + (c->size - sizeof(struct slob_rcu));
+ 	INIT_RCU_HEAD(&footer->head);
+ 	footer->size = c->size;
+ 	call_rcu(&footer->head, kmem_rcu_free);
+ }
+EXPORT_SYMBOL(kmem_cache_free);
+
+ /*
+ * Object size recorded for the cache. For SLAB_DESTROY_BY_RCU caches this
+ * includes the slob_rcu footer added by kmem_cache_create().
+ */
+ unsigned int kmem_cache_size(struct kmem_cache *c)
+ {
+ return c->size;
+ }
+EXPORT_SYMBOL(kmem_cache_size);
+
+ /* Name pointer that was passed to kmem_cache_create() for this cache. */
+ const char *kmem_cache_name(struct kmem_cache *c)
+ {
+ return c->name;
+ }
+EXPORT_SYMBOL(kmem_cache_name);
+
+ /* No-op in slob: does nothing with the cache and always returns 0. */
+ int kmem_cache_shrink(struct kmem_cache *d)
+ {
+ return 0;
+ }
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+ /* Stub in slob: performs no check on the pointer and always returns 0. */
+ int kmem_ptr_validate(struct kmem_cache *a, const void *b)
+ {
+ return 0;
+ }
+
+ /* Set to 1 by kmem_cache_init(); zero until then. */
+ static unsigned int slob_ready __read_mostly;
+
+ /* Returns nonzero once kmem_cache_init() has run. */
+ int slab_is_available(void)
+ {
+ return slob_ready;
+ }
+
+ /*
+ * The only initialisation done here is to flag the allocator as
+ * available (see slab_is_available()).
+ */
+ void __init kmem_cache_init(void)
+ {
+ slob_ready = 1;
+ }
diff --git a/mm/slub.c b/mm/slub.c
new file mode 100644
index 0000000..a2cd47d
--- /dev/null
+++ b/mm/slub.c
@@ -0,0 +1,4515 @@
+/*
+ * SLUB: A slab allocator that limits cache line use instead of queuing
+ * objects in per cpu and per node lists.
+ *
+ * The allocator synchronizes using per slab locks and only
+ * uses a centralized lock to manage a pool of partial slabs.
+ *
+ * (C) 2007 SGI, Christoph Lameter
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/bit_spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/ctype.h>
+#include <linux/debugobjects.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/math64.h>
+
+/*
+ * Lock order:
+ * 1. slab_lock(page)
+ * 2. slab->list_lock
+ *
+ * The slab_lock protects operations on the object of a particular
+ * slab and its metadata in the page struct. If the slab lock
+ * has been taken then no allocations nor frees can be performed
+ * on the objects in the slab nor can the slab be added or removed
+ * from the partial or full lists since this would mean modifying
+ * the page_struct of the slab.
+ *
+ * The list_lock protects the partial and full list on each node and
+ * the partial slab counter. If taken then no new slabs may be added or
+ * removed from the lists nor make the number of partial slabs be modified.
+ * (Note that the total number of slabs is an atomic value that may be
+ * modified without taking the list lock).
+ *
+ * The list_lock is a centralized lock and thus we avoid taking it as
+ * much as possible. As long as SLUB does not have to handle partial
+ * slabs, operations can continue without any centralized lock. F.e.
+ * allocating a long series of objects that fill up slabs does not require
+ * the list lock.
+ *
+ * The lock order is sometimes inverted when we are trying to get a slab
+ * off a list. We take the list_lock and then look for a page on the list
+ * to use. While we do that objects in the slabs may be freed. We can
+ * only operate on the slab if we have also taken the slab_lock. So we use
+ * a slab_trylock() on the slab. If trylock was successful then no frees
+ * can occur anymore and we can use the slab for allocations etc. If the
+ * slab_trylock() does not succeed then frees are in progress in the slab and
+ * we must stay away from it for a while since we may cause a bouncing
+ * cacheline if we try to acquire the lock. So go onto the next slab.
+ * If all pages are busy then we may allocate a new slab instead of reusing
+ * a partial slab. A new slab has no one operating on it and thus there is
+ * no danger of cacheline contention.
+ *
+ * Interrupts are disabled during allocation and deallocation in order to
+ * make the slab allocator safe to use in the context of an irq. In addition
+ * interrupts are disabled to ensure that the processor does not change
+ * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * SLUB assigns one slab for allocation to each processor.
+ * Allocations only occur from these slabs called cpu slabs.
+ *
+ * Slabs with free elements are kept on a partial list and during regular
+ * operations no list for full slabs is used. If an object in a full slab is
+ * freed then the slab will show up again on the partial lists.
+ * We track full slabs for debugging purposes though because otherwise we
+ * cannot scan all objects.
+ *
+ * Slabs are freed when they become empty. Teardown and setup is
+ * minimal so we rely on the page allocators per cpu caches for
+ * fast frees and allocs.
+ *
+ * Overloading of page flags that are otherwise used for LRU management.
+ *
+ * PageActive The slab is frozen and exempt from list processing.
+ * This means that the slab is dedicated to a purpose
+ * such as satisfying allocations for a specific
+ * processor. Objects may be freed in the slab while
+ * it is frozen but slab_free will then skip the usual
+ * list operations. It is up to the processor holding
+ * the slab to integrate the slab into the slab lists
+ * when the slab is no longer needed.
+ *
+ * One use of this flag is to mark slabs that are
+ * used for allocations. Then such a slab becomes a cpu
+ * slab. The cpu slab may be equipped with an additional
+ * freelist that allows lockless access to
+ * free objects in addition to the regular freelist
+ * that requires the slab lock.
+ *
+ * PageError Slab requires special handling due to debug
+ * options set. This moves slab handling out of
+ * the fast path and disables lockless freelists.
+ */
+
+#ifdef CONFIG_SLUB_DEBUG
+#define SLABDEBUG 1
+#else
+#define SLABDEBUG 0
+#endif
+
+/*
+ * Issues still to be resolved:
+ *
+ * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
+ *
+ * - Variable sizing of the per node arrays
+ */
+
+/* Enable to test recovery from slab corruption on boot */
+#undef SLUB_RESILIENCY_TEST
+
+/*
+ * Minimum number of partial slabs. These will be left on the partial
+ * lists even if they are empty. kmem_cache_shrink may reclaim them.
+ */
+#define MIN_PARTIAL 5
+
+/*
+ * Maximum number of desirable partial slabs.
+ * The existence of more partial slabs makes kmem_cache_shrink
+ * sort the partial list by the number of objects in use.
+ */
+#define MAX_PARTIAL 10
+
+#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_STORE_USER)
+
+/*
+ * Set of flags that will prevent slab merging
+ */
+#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DESTROY_BY_RCU)
+
+#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_CACHE_DMA)
+
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
+#endif
+
+/* Internal SLUB flags */
+#define __OBJECT_POISON 0x80000000 /* Poison object */
+#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
+
+static int kmem_size = sizeof(struct kmem_cache);
+
+#ifdef CONFIG_SMP
+static struct notifier_block slab_notifier;
+#endif
+
+static enum {
+ DOWN, /* No slab functionality available */
+ PARTIAL, /* kmem_cache_open() works but kmalloc does not */
+ UP, /* Everything works but does not show up in sysfs */
+ SYSFS /* Sysfs up */
+} slab_state = DOWN;
+
+/* A list of all slab caches on the system */
+static DECLARE_RWSEM(slub_lock);
+static LIST_HEAD(slab_caches);
+
+/*
+ * Tracking user of a slab.
+ */
+struct track {
+ void *addr; /* Called from address */
+ int cpu; /* Was running on cpu */
+ int pid; /* Pid context */
+ unsigned long when; /* When did the operation occur */
+};
+
+enum track_item { TRACK_ALLOC, TRACK_FREE };
+
+ /*
+ * sysfs hooks. With CONFIG_SLUB_DEBUG these are only declared here and
+ * defined elsewhere in the file; without it they are stubs: add and alias
+ * report success without doing anything, and remove just frees the cache
+ * structure.
+ */
+ #ifdef CONFIG_SLUB_DEBUG
+ static int sysfs_slab_add(struct kmem_cache *);
+ static int sysfs_slab_alias(struct kmem_cache *, const char *);
+ static void sysfs_slab_remove(struct kmem_cache *);
+
+ #else
+ static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+ { return 0; }
+ static inline void sysfs_slab_remove(struct kmem_cache *s)
+ {
+ kfree(s);
+ }
+
+ #endif
+
+ /*
+ * Increment statistics counter si of the given per-cpu structure.
+ * Compiles to an empty function unless CONFIG_SLUB_STATS is enabled.
+ */
+ static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
+ {
+ #ifdef CONFIG_SLUB_STATS
+ c->stat[si]++;
+ #endif
+ }
+
+/********************************************************************
+ * Core slab cache functions
+ *******************************************************************/
+
+ /* Returns nonzero once slab_state has reached UP (or SYSFS). */
+ int slab_is_available(void)
+ {
+ return slab_state >= UP;
+ }
+
+ /*
+ * Per-node structure of cache s. With CONFIG_NUMA this is s->node[node];
+ * otherwise the node argument is ignored and the single embedded
+ * local_node is returned.
+ */
+ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
+ {
+ #ifdef CONFIG_NUMA
+ return s->node[node];
+ #else
+ return &s->local_node;
+ #endif
+ }
+
+ /*
+ * Per-cpu structure of cache s. With CONFIG_SMP this is s->cpu_slab[cpu];
+ * otherwise the cpu argument is ignored and the single embedded cpu_slab
+ * is returned.
+ */
+ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
+ {
+ #ifdef CONFIG_SMP
+ return s->cpu_slab[cpu];
+ #else
+ return &s->cpu_slab;
+ #endif
+ }
+
+ /*
+ * Check that object is a plausible object address for the slab in page:
+ * inside the area covered by page->objects objects of s->size bytes and
+ * on an object boundary. NULL is accepted. Returns 1 if valid, 0 if not.
+ */
+ static inline int check_valid_pointer(struct kmem_cache *s,
+ 	struct page *page, const void *object)
+ {
+ 	void *first, *limit;
+
+ 	if (!object)
+ 		return 1;
+
+ 	first = page_address(page);
+ 	limit = first + page->objects * s->size;
+
+ 	if (object < first)
+ 		return 0;
+ 	if (object >= limit)
+ 		return 0;
+
+ 	return ((object - first) % s->size) == 0;
+ }
+
+/*
+ * Slow version of get and set free pointer.
+ *
+ * This version requires touching the cache lines of kmem_cache which
+ * we avoid to do in the fast alloc free paths. There we obtain the offset
+ * from the page struct.
+ */
+ /* Read the free pointer stored s->offset bytes into object. */
+ static inline void *get_freepointer(struct kmem_cache *s, void *object)
+ {
+ 	void **slot = object + s->offset;
+
+ 	return *slot;
+ }
+
+ /* Store fp as the free pointer s->offset bytes into object. */
+ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
+ {
+ 	void **slot = object + s->offset;
+
+ 	*slot = fp;
+ }
+
+/* Loop over all objects in a slab */
+#define for_each_object(__p, __s, __addr, __objects) \
+ for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
+ __p += (__s)->size)
+
+/* Scan freelist */
+#define for_each_free_object(__p, __s, __free) \
+ for (__p = (__free); __p; __p = get_freepointer((__s), __p))
+
+ /*
+ * Determine object index from a given position: the byte distance of p
+ * from addr divided by the per-object size s->size. addr is presumably
+ * the address of the slab's first object -- confirm against callers.
+ */
+ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
+ {
+ return (p - addr) / s->size;
+ }
+
+ /*
+ * Pack a page order and the number of size-byte objects that fit into a
+ * slab of that order into one value: order in bits 16 and up, object
+ * count in the low bits.
+ */
+ static inline struct kmem_cache_order_objects oo_make(int order,
+ 	unsigned long size)
+ {
+ 	struct kmem_cache_order_objects oo;
+
+ 	oo.x = (order << 16) + (PAGE_SIZE << order) / size;
+ 	return oo;
+ }
+
+ /* Extract the page order stored above bit 15 by oo_make(). */
+ static inline int oo_order(struct kmem_cache_order_objects x)
+ {
+ return x.x >> 16;
+ }
+
+ /* Extract the object count kept in the low 16 bits by oo_make(). */
+ static inline int oo_objects(struct kmem_cache_order_objects x)
+ {
+ return x.x & ((1 << 16) - 1);
+ }
+
+#ifdef CONFIG_SLUB_DEBUG
+/*
+ * Debug settings:
+ */
+#ifdef CONFIG_SLUB_DEBUG_ON
+static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#else
+static int slub_debug;
+#endif
+
+static char *slub_debug_slabs;
+
+/*
+ * Object debugging
+ */
+ /*
+ * Hex dump length bytes starting at addr to the kernel log, 16 bytes per
+ * line. Each line starts with the label text and the address of its first
+ * byte, and ends with an ASCII column in which bytes that fail isgraph()
+ * are shown as '.'. A final partial line is padded so the ASCII column
+ * still lines up.
+ */
+ static void print_section(char *text, u8 *addr, unsigned int length)
+ {
+ int i, offset;
+ int newline = 1;
+ char ascii[17];
+
+ ascii[16] = 0;
+
+ for (i = 0; i < length; i++) {
+ if (newline) {
+ printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
+ newline = 0;
+ }
+ printk(KERN_CONT " %02x", addr[i]);
+ offset = i % 16;
+ ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
+ if (offset == 15) {
+ printk(KERN_CONT " %s\n", ascii);
+ newline = 1;
+ }
+ }
+ /* flush an incomplete last line, padding both columns */
+ if (!newline) {
+ i %= 16;
+ while (i < 16) {
+ printk(KERN_CONT " ");
+ ascii[i] = ' ';
+ i++;
+ }
+ printk(KERN_CONT " %s\n", ascii);
+ }
+ }
+
+ /*
+ * Locate the tracking record of the requested kind for object. The
+ * records sit after the free pointer when s->offset is nonzero and at
+ * s->inuse otherwise; alloc indexes the record within that array.
+ */
+ static struct track *get_track(struct kmem_cache *s, void *object,
+ 	enum track_item alloc)
+ {
+ 	struct track *records;
+
+ 	records = s->offset ? object + s->offset + sizeof(void *)
+ 			    : object + s->inuse;
+
+ 	return records + alloc;
+ }
+
+ /*
+ * Fill in the tracking record of the requested kind for object. With a
+ * non-NULL addr the record gets the caller address, current cpu, current
+ * pid and jiffies; with a NULL addr the record is cleared.
+ */
+ static void set_track(struct kmem_cache *s, void *object,
+ 	enum track_item alloc, void *addr)
+ {
+ 	struct track *t = get_track(s, object, alloc);
+
+ 	if (!addr) {
+ 		memset(t, 0, sizeof(struct track));
+ 		return;
+ 	}
+
+ 	t->addr = addr;
+ 	t->cpu = smp_processor_id();
+ 	t->pid = current->pid;
+ 	t->when = jiffies;
+ }
+
+ /*
+ * Clear both tracking records (free, then alloc) of object. Does nothing
+ * for caches without SLAB_STORE_USER.
+ */
+ static void init_tracking(struct kmem_cache *s, void *object)
+ {
+ 	if (s->flags & SLAB_STORE_USER) {
+ 		set_track(s, object, TRACK_FREE, NULL);
+ 		set_track(s, object, TRACK_ALLOC, NULL);
+ 	}
+ }
+
+ /*
+ * Log one tracking record, labelled s. Records with no caller address
+ * (never set or cleared) are skipped.
+ */
+ static void print_track(const char *s, struct track *t)
+ {
+ 	if (t->addr)
+ 		printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ 			s, t->addr, jiffies - t->when, t->cpu, t->pid);
+ }
+
+ /*
+ * Log the allocation and free records of object. Does nothing for caches
+ * without SLAB_STORE_USER.
+ */
+ static void print_tracking(struct kmem_cache *s, void *object)
+ {
+ 	if (s->flags & SLAB_STORE_USER) {
+ 		print_track("Allocated", get_track(s, object, TRACK_ALLOC));
+ 		print_track("Freed", get_track(s, object, TRACK_FREE));
+ 	}
+ }
+
+ /*
+ * Log a one-line summary of a slab page: its address, object count,
+ * objects in use, freelist head and page flags.
+ */
+ static void print_page_info(struct page *page)
+ {
+ printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->objects, page->inuse, page->freelist, page->flags);
+
+ }
+
+ /*
+ * Log a banner announcing a bug in cache s. The printf-style message is
+ * formatted into a 100-byte buffer, so longer messages are truncated.
+ */
+ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+ {
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "========================================"
+ "=====================================\n");
+ printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+ printk(KERN_ERR "----------------------------------------"
+ "-------------------------------------\n\n");
+ }
+
+ /*
+ * Log a "FIX <cache>: ..." line describing a repair that is being made.
+ * The printf-style message is formatted into a 100-byte buffer, so longer
+ * messages are truncated.
+ */
+ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+ {
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
+ }
+
+ /*
+ * Dump everything known about object p of cache s in slab page: tracking
+ * records, page summary, free pointer, the 16 bytes before the object
+ * (when p is more than 16 bytes into the page), the object itself (at
+ * most PAGE_SIZE bytes), the red zone, any padding after the metadata,
+ * and finally a stack trace.
+ */
+ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
+ {
+ unsigned int off; /* Offset of last byte */
+ u8 *addr = page_address(page);
+
+ print_tracking(s, p);
+
+ print_page_info(page);
+
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
+
+ if (p > addr + 16)
+ print_section("Bytes b4", p - 16, 16);
+
+ print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
+
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone", p + s->objsize,
+ s->inuse - s->objsize);
+
+ /* work out where the metadata ends so the remainder can be shown as padding */
+ if (s->offset)
+ off = s->offset + sizeof(void *);
+ else
+ off = s->inuse;
+
+ if (s->flags & SLAB_STORE_USER)
+ off += 2 * sizeof(struct track);
+
+ if (off != s->size)
+ /* Beginning of the filler is the free pointer */
+ print_section("Padding", p + off, s->size - off);
+
+ dump_stack();
+ }
+
+ /*
+ * Report a problem with a specific object: print the bug banner with
+ * reason as the message, then the full object dump.
+ */
+ static void object_err(struct kmem_cache *s, struct page *page,
+ u8 *object, char *reason)
+ {
+ slab_bug(s, "%s", reason);
+ print_trailer(s, page, object);
+ }
+
+ /*
+ * Report a problem with a whole slab page: print the bug banner with the
+ * formatted message (truncated to the 100-byte buffer), the page summary
+ * and a stack trace.
+ */
+ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
+ {
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ slab_bug(s, "%s", buf);
+ print_page_info(page);
+ dump_stack();
+ }
+
+ /*
+ * Write the debug patterns into an object. With __OBJECT_POISON the
+ * payload is filled with POISON_FREE and terminated with POISON_END.
+ * With SLAB_RED_ZONE the bytes between objsize and inuse are filled with
+ * the active or inactive red zone value depending on 'active'.
+ */
+ static void init_object(struct kmem_cache *s, void *object, int active)
+ {
+ u8 *p = object;
+
+ if (s->flags & __OBJECT_POISON) {
+ memset(p, POISON_FREE, s->objsize - 1);
+ p[s->objsize - 1] = POISON_END;
+ }
+
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p + s->objsize,
+ active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
+ s->inuse - s->objsize);
+ }
+
+ /*
+ * Scan 'bytes' bytes from start for the first one that differs from the
+ * low 8 bits of value. Returns a pointer to that byte, or NULL if every
+ * byte matches (including when bytes is 0).
+ */
+ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+ {
+ 	unsigned int i;
+
+ 	for (i = 0; i < bytes; i++) {
+ 		if (start[i] != (u8)value)
+ 			return start + i;
+ 	}
+
+ 	return NULL;
+ }
+
+ /*
+ * Overwrite the range [from, to) with data and log the repair. The
+ * message argument is not used by this function.
+ */
+ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+ void *from, void *to)
+ {
+ slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ memset(from, data, to - from);
+ }
+
+ /*
+ * Verify that 'bytes' bytes at start all hold value. On a mismatch the
+ * corrupted range (first bad byte up to the last bad byte) is reported
+ * together with a dump of the object, and then rewritten with value.
+ *
+ * Returns 1 if the region was intact, 0 if corruption was found.
+ */
+ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+ u8 *object, char *what,
+ u8 *start, unsigned int value, unsigned int bytes)
+ {
+ u8 *fault;
+ u8 *end;
+
+ fault = check_bytes(start, value, bytes);
+ if (!fault)
+ return 1;
+
+ /* trim intact bytes off the tail so only the damaged span is reported */
+ end = start + bytes;
+ while (end > fault && end[-1] == value)
+ end--;
+
+ slab_bug(s, "%s overwritten", what);
+ printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault[0], value);
+ print_trailer(s, page, object);
+
+ restore_bytes(s, what, value, fault, end);
+ return 0;
+ }
+
+/*
+ * Object layout:
+ *
+ * object address
+ * Bytes of the object to be managed.
+ * If the freepointer may overlay the object then the free
+ * pointer is the first word of the object.
+ *
+ * Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ * 0xa5 (POISON_END)
+ *
+ * object + s->objsize
+ * Padding to reach word boundary. This is also used for Redzoning.
+ * Padding is extended by another word if Redzoning is enabled and
+ * objsize == inuse.
+ *
+ * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
+ * 0xcc (RED_ACTIVE) for objects in use.
+ *
+ * object + s->inuse
+ * Meta data starts here.
+ *
+ * A. Free pointer (if we cannot overwrite object on free)
+ * B. Tracking data for SLAB_STORE_USER
+ * C. Padding to reach required alignment boundary or at minimum
+ * one word if debugging is on to be able to detect writes
+ * before the word boundary.
+ *
+ * Padding is done using 0x5a (POISON_INUSE)
+ *
+ * object + s->size
+ * Nothing is used beyond s->size.
+ *
+ * If slabcaches are merged then the objsize and inuse boundaries are mostly
+ * ignored. And therefore no slab options that rely on these boundaries
+ * may be used with merged slabcaches.
+ */
+
+/*
+ * Check the filler bytes that follow an object's metadata.
+ *
+ * The metadata ends at s->inuse, plus one word when the free pointer lives
+ * behind the object (s->offset != 0), plus two struct track records when
+ * SLAB_STORE_USER is set. Everything from there up to s->size must hold
+ * POISON_INUSE. Returns 1 if there is no filler or it is intact, otherwise
+ * the result of check_bytes_and_report().
+ */
+static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
+{
+	unsigned long meta_end = s->inuse;
+
+	/* A free pointer placed after the object occupies one word. */
+	meta_end += s->offset ? sizeof(void *) : 0;
+
+	/* User tracking adds two struct track records. */
+	if (s->flags & SLAB_STORE_USER)
+		meta_end += 2 * sizeof(struct track);
+
+	if (meta_end != s->size)
+		return check_bytes_and_report(s, page, p, "Object padding",
+				p + meta_end, POISON_INUSE, s->size - meta_end);
+
+	return 1;
+}
+
+/*
+ * Check the pad bytes at the end of a slab page.
+ *
+ * The slab spans PAGE_SIZE << compound_order(page) bytes; whatever is left
+ * over after the last whole object (length % s->size) is padding and must
+ * hold POISON_INUSE when SLAB_POISON is enabled.
+ *
+ * Returns 1 if poisoning is off, there is no padding, or the padding is
+ * intact. Otherwise the damage is reported, the padding is dumped, only the
+ * damaged span is re-poisoned, and 0 is returned.
+ */
+static int slab_pad_check(struct kmem_cache *s, struct page *page)
+{
+	u8 *start;
+	u8 *pad;
+	u8 *fault;
+	u8 *end;
+	int length;
+	int remainder;
+
+	if (!(s->flags & SLAB_POISON))
+		return 1;
+
+	start = page_address(page);
+	length = (PAGE_SIZE << compound_order(page));
+	end = start + length;
+	remainder = length % s->size;
+	if (!remainder)
+		return 1;
+
+	/* Remember where the padding starts; 'end' is moved below. */
+	pad = end - remainder;
+	fault = check_bytes(pad, POISON_INUSE, remainder);
+	if (!fault)
+		return 1;
+	/* Trim still-intact bytes off the tail of the damaged span. */
+	while (end > fault && end[-1] == POISON_INUSE)
+		end--;
+
+	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+	print_section("Padding", pad, remainder);
+
+	/*
+	 * Repair only the damaged padding bytes. Restoring from the start of
+	 * the page would overwrite every object in the slab with poison.
+	 */
+	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
+	return 0;
+}
+
+/*
+ * Run the enabled consistency checks on one object.
+ *
+ * @active selects the expected red zone value (SLUB_RED_ACTIVE versus
+ * SLUB_RED_INACTIVE) and whether the object poison is checked (only when
+ * !active). Returns 0 if the red zone, the object poison or the free pointer
+ * is found corrupt, 1 otherwise. Damage to the alignment padding or the
+ * object padding is reported and repaired by the helpers but does not change
+ * the return value.
+ */
+static int check_object(struct kmem_cache *s, struct page *page,
+ void *object, int active)
+{
+ u8 *p = object;
+ u8 *endobject = object + s->objsize;
+
+ if (s->flags & SLAB_RED_ZONE) {
+ unsigned int red =
+ active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
+ endobject, red, s->inuse - s->objsize))
+ return 0;
+ } else {
+ /* Without red zoning the same bytes are plain alignment padding. */
+ if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
+ check_bytes_and_report(s, page, p, "Alignment padding",
+ endobject, POISON_INUSE, s->inuse - s->objsize);
+ }
+ }
+
+ if (s->flags & SLAB_POISON) {
+ /* Payload poison: POISON_FREE everywhere, POISON_END in the last byte. */
+ if (!active && (s->flags & __OBJECT_POISON) &&
+ (!check_bytes_and_report(s, page, p, "Poison", p,
+ POISON_FREE, s->objsize - 1) ||
+ !check_bytes_and_report(s, page, p, "Poison",
+ p + s->objsize - 1, POISON_END, 1)))
+ return 0;
+ /*
+ * check_pad_bytes cleans up on its own.
+ */
+ check_pad_bytes(s, page, p);
+ }
+
+ if (!s->offset && active)
+ /*
+ * Object and freepointer overlap. Cannot check
+ * freepointer while object is allocated.
+ */
+ return 1;
+
+ /* Check free pointer validity */
+ if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
+ object_err(s, page, p, "Freepointer corrupt");
+ /*
+ * No choice but to zap it and thus lose the remainder
+ * of the free objects in this slab. May cause
+ * another error because the object count is now wrong.
+ */
+ set_freepointer(s, p, NULL);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Sanity check the bookkeeping of a slab page. Must be called with
+ * interrupts disabled (asserted via VM_BUG_ON).
+ *
+ * Returns 0 if the page is not marked as a slab page, if page->objects
+ * exceeds what fits into the page at s->size bytes per object, or if
+ * page->inuse exceeds page->objects. Returns 1 otherwise; damage to the
+ * slab padding is reported and repaired by slab_pad_check() but does not
+ * fail the check.
+ */
+static int check_slab(struct kmem_cache *s, struct page *page)
+{
+	int maxobj;
+
+	VM_BUG_ON(!irqs_disabled());
+
+	if (!PageSlab(page)) {
+		slab_err(s, page, "Not a valid slab page");
+		return 0;
+	}
+
+	maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+	if (page->objects > maxobj) {
+		/* Arguments must line up with the two %u conversions. */
+		slab_err(s, page, "objects %u > max %u",
+			page->objects, maxobj);
+		return 0;
+	}
+	if (page->inuse > page->objects) {
+		slab_err(s, page, "inuse %u > max %u",
+			page->inuse, page->objects);
+		return 0;
+	}
+	/* Slab_pad_check fixes things up after itself */
+	slab_pad_check(s, page);
+	return 1;
+}
+
+/*
+ * Determine if a certain object on a page is on the freelist. Must hold the
+ * slab lock to guarantee that the chains are in a consistent state.
+ *
+ * The walk also validates the chain. If the very first freelist pointer is
+ * invalid, the freelist is cleared, all objects are marked in use and 0 is
+ * returned. If a later link is invalid, the chain is cut at the last good
+ * object. Afterwards page->objects and page->inuse are corrected if they
+ * disagree with what was computed and counted.
+ *
+ * Returns 1 if @search was found on the freelist. Otherwise returns 1 only
+ * when @search is NULL (a pure consistency pass), else 0.
+ */
+static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
+{
+	int nr = 0;
+	void *fp = page->freelist;
+	void *object = NULL;
+	unsigned long max_objects;
+
+	while (fp && nr <= page->objects) {
+		if (fp == search)
+			return 1;
+		if (!check_valid_pointer(s, page, fp)) {
+			if (!object) {
+				/* Head of the freelist itself is bad. */
+				slab_err(s, page, "Freepointer corrupt");
+				page->freelist = NULL;
+				page->inuse = page->objects;
+				slab_fix(s, "Freelist cleared");
+				return 0;
+			}
+			/* Cut the chain after the last valid object. */
+			object_err(s, page, object,
+					"Freechain corrupt");
+			set_freepointer(s, object, NULL);
+			break;
+		}
+		object = fp;
+		fp = get_freepointer(s, object);
+		nr++;
+	}
+
+	max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+	if (max_objects > 65535)
+		max_objects = 65535;
+
+	if (page->objects != max_objects) {
+		/* max_objects is unsigned long, so it needs %lu. */
+		slab_err(s, page, "Wrong number of objects. Found %d but "
+			"should be %lu", page->objects, max_objects);
+		page->objects = max_objects;
+		slab_fix(s, "Number of objects adjusted.");
+	}
+	if (page->inuse != page->objects - nr) {
+		slab_err(s, page, "Wrong object count. Counter is %d but "
+			"counted were %d", page->inuse, page->objects - nr);
+		page->inuse = page->objects - nr;
+		slab_fix(s, "Object count adjusted.");
+	}
+	return search == NULL;
+}
+
+/*
+ * When SLAB_TRACE is set for the cache, log one line for the alloc (@alloc
+ * non-zero) or free of @object, dump the object contents on free, and print
+ * a stack trace. Does nothing otherwise.
+ */
+static void trace(struct kmem_cache *s, struct page *page, void *object,
+ int alloc)
+{
+ if (s->flags & SLAB_TRACE) {
+ printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+ s->name,
+ alloc ? "alloc" : "free",
+ object, page->inuse,
+ page->freelist);
+
+ if (!alloc)
+ print_section("Object", (void *)object, s->objsize);
+
+ dump_stack();
+ }
+}
+
+/*
+ * Tracking of fully allocated slabs for debugging purposes.
+ *
+ * add_full() links @page into the node's full list under n->list_lock.
+ */
+static void add_full(struct kmem_cache_node *n, struct page *page)
+{
+ spin_lock(&n->list_lock);
+ list_add(&page->lru, &n->full);
+ spin_unlock(&n->list_lock);
+}
+
+/*
+ * Unlink @page from the list it is on via page->lru, under the list_lock of
+ * the page's node. Only acts when SLAB_STORE_USER is set for the cache.
+ */
+static void remove_full(struct kmem_cache *s, struct page *page)
+{
+ struct kmem_cache_node *n;
+
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ n = get_node(s, page_to_nid(page));
+
+ spin_lock(&n->list_lock);
+ list_del(&page->lru);
+ spin_unlock(&n->list_lock);
+}
+
+/* Tracking of the number of slabs for debugging purposes */
+/* Returns the current value of the nr_slabs counter of @node for cache @s. */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ return atomic_long_read(&n->nr_slabs);
+}
+
+/*
+ * Account one more slab holding @objects objects on @node: bumps nr_slabs by
+ * one and total_objects by @objects.
+ */
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ /*
+ * May be called early in order to allocate a slab for the
+ * kmem_cache_node structure. Solve the chicken-egg
+ * dilemma by deferring the increment of the count during
+ * bootstrap (see early_kmem_cache_node_alloc).
+ */
+ if (!NUMA_BUILD || n) {
+ atomic_long_inc(&n->nr_slabs);
+ atomic_long_add(objects, &n->total_objects);
+ }
+}
+/*
+ * Reverse of inc_slabs_node(): drops nr_slabs by one and total_objects by
+ * @objects. Unlike the increment there is no NULL check on the node
+ * structure here.
+ */
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ atomic_long_dec(&n->nr_slabs);
+ atomic_long_sub(objects, &n->total_objects);
+}
+
+/* Object debug checks for alloc/free paths */
+/*
+ * Prepare the debug state of a newly set up object: fill patterns for an
+ * inactive object plus init_tracking(). Skipped entirely unless one of
+ * SLAB_STORE_USER, SLAB_RED_ZONE or __OBJECT_POISON is set.
+ */
+static void setup_object_debug(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ return;
+
+ init_object(s, object, 0);
+ init_tracking(s, object);
+}
+
+/*
+ * Debug checks performed when @object is about to be handed out from @page.
+ *
+ * Verifies the slab, that the object is on the freelist, that its address is
+ * valid and that it still looks like an inactive object. On success the
+ * allocation is recorded (if SLAB_STORE_USER), traced, the object is
+ * re-initialised as active, and 1 is returned. On any failure 0 is returned
+ * and, if the page is a slab page, all of its objects are marked used.
+ */
+static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
+ void *object, void *addr)
+{
+ if (!check_slab(s, page))
+ goto bad;
+
+ if (!on_freelist(s, page, object)) {
+ object_err(s, page, object, "Object already allocated");
+ goto bad;
+ }
+
+ if (!check_valid_pointer(s, page, object)) {
+ object_err(s, page, object, "Freelist Pointer check fails");
+ goto bad;
+ }
+
+ if (!check_object(s, page, object, 0))
+ goto bad;
+
+ /* Success perform special debug activities for allocs */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_ALLOC, addr);
+ trace(s, page, object, 1);
+ init_object(s, object, 1);
+ return 1;
+
+bad:
+ if (PageSlab(page)) {
+ /*
+ * If this is a slab page then lets do the best we can
+ * to avoid issues in the future. Marking all objects
+ * as used avoids touching the remaining objects.
+ */
+ slab_fix(s, "Marking all objects used");
+ page->inuse = page->objects;
+ page->freelist = NULL;
+ }
+ return 0;
+}
+
+/*
+ * Debug checks performed when @object is being freed back to @page.
+ *
+ * Returns 1 if the free may proceed; the free is then recorded (if
+ * SLAB_STORE_USER), traced and the object re-initialised as inactive.
+ * Returns 0 if the object must not be freed. Note that a check_object()
+ * failure returns 0 directly, without the "not freed" slab_fix() message
+ * that the other failure paths emit.
+ */
+static int free_debug_processing(struct kmem_cache *s, struct page *page,
+ void *object, void *addr)
+{
+ if (!check_slab(s, page))
+ goto fail;
+
+ if (!check_valid_pointer(s, page, object)) {
+ slab_err(s, page, "Invalid object pointer 0x%p", object);
+ goto fail;
+ }
+
+ if (on_freelist(s, page, object)) {
+ object_err(s, page, object, "Object already free");
+ goto fail;
+ }
+
+ if (!check_object(s, page, object, 1))
+ return 0;
+
+ /* The page must belong to the cache the caller is freeing to. */
+ if (unlikely(s != page->slab)) {
+ if (!PageSlab(page)) {
+ slab_err(s, page, "Attempt to free object(0x%p) "
+ "outside of slab", object);
+ } else if (!page->slab) {
+ printk(KERN_ERR
+ "SLUB <none>: no slab for object 0x%p.\n",
+ object);
+ dump_stack();
+ } else
+ object_err(s, page, object,
+ "page slab pointer corrupt.");
+ goto fail;
+ }
+
+ /* Special debug activities for freeing objects */
+ /* An unfrozen page with an empty freelist was full until now. */
+ if (!PageSlubFrozen(page) && !page->freelist)
+ remove_full(s, page);
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_FREE, addr);
+ trace(s, page, object, 0);
+ init_object(s, object, 0);
+ return 1;
+
+fail:
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return 0;
+}
+
+/*
+ * Parse the "slub_debug" kernel command line option.
+ *
+ * Accepted forms, as handled below:
+ *   slub_debug              all default debug flags
+ *   slub_debug=,<slabs>     all default debug flags, restricted to <slabs>
+ *   slub_debug=-            no debug flags
+ *   slub_debug=<letters>[,<slabs>]
+ *                           f, z, p, u, t (case-insensitive) select
+ *                           SLAB_DEBUG_FREE, SLAB_RED_ZONE, SLAB_POISON,
+ *                           SLAB_STORE_USER and SLAB_TRACE respectively
+ *
+ * Unknown letters are reported and skipped. Always returns 1.
+ */
+static int __init setup_slub_debug(char *str)
+{
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ if (*str == ',')
+ /*
+ * No options but restriction on slabs. This means full
+ * debugging for slabs matching a pattern.
+ */
+ goto check_slabs;
+
+ slub_debug = 0;
+ if (*str == '-')
+ /*
+ * Switch off all debugging measures.
+ */
+ goto out;
+
+ /*
+ * Determine which debug features should be switched on
+ */
+ for (; *str && *str != ','; str++) {
+ switch (tolower(*str)) {
+ case 'f':
+ slub_debug |= SLAB_DEBUG_FREE;
+ break;
+ case 'z':
+ slub_debug |= SLAB_RED_ZONE;
+ break;
+ case 'p':
+ slub_debug |= SLAB_POISON;
+ break;
+ case 'u':
+ slub_debug |= SLAB_STORE_USER;
+ break;
+ case 't':
+ slub_debug |= SLAB_TRACE;
+ break;
+ default:
+ printk(KERN_ERR "slub_debug option '%c' "
+ "unknown. skipped\n", *str);
+ }
+ }
+
+check_slabs:
+ /* Whatever follows the comma is kept as the slab name prefix. */
+ if (*str == ',')
+ slub_debug_slabs = str + 1;
+out:
+ return 1;
+}
+
+__setup("slub_debug", setup_slub_debug);
+
+/*
+ * Compute the effective flags for a cache being created.
+ *
+ * If debugging was selected on the kernel command line, the slub_debug bits
+ * are added to @flags, either for every cache (no slab filter given) or only
+ * for caches whose @name starts with the slub_debug_slabs string. @objsize
+ * and @ctor are not consulted.
+ */
+static unsigned long kmem_cache_flags(unsigned long objsize,
+	unsigned long flags, const char *name,
+	void (*ctor)(void *))
+{
+	if (!slub_debug)
+		return flags;
+
+	/* A filter is set and this cache's name does not start with it. */
+	if (slub_debug_slabs &&
+	    strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) != 0)
+		return flags;
+
+	return flags | slub_debug;
+}
+#else
+/*
+ * No-op replacements used when the debug code above is compiled out.
+ * The alloc/free processing stubs return 0, the check helpers return 1,
+ * kmem_cache_flags() passes @flags through unchanged, slub_debug is the
+ * constant 0 and the per-node slab counters are not maintained.
+ */
+static inline void setup_object_debug(struct kmem_cache *s,
+ struct page *page, void *object) {}
+
+static inline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, void *addr) { return 0; }
+
+static inline int free_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, void *addr) { return 0; }
+
+static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+ { return 1; }
+static inline int check_object(struct kmem_cache *s, struct page *page,
+ void *object, int active) { return 1; }
+static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
+static inline unsigned long kmem_cache_flags(unsigned long objsize,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *))
+{
+ return flags;
+}
+#define slub_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+ { return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
+#endif
+
+/*
+ * Slab allocation and freeing
+ */
+/*
+ * Obtain pages of the order encoded in @oo from the page allocator.
+ * A @node of -1 means no node preference (alloc_pages()); any other value
+ * is passed on to alloc_pages_node().
+ */
+static inline struct page *alloc_slab_page(gfp_t flags, int node,
+					struct kmem_cache_order_objects oo)
+{
+	int order = oo_order(oo);
+
+	if (node != -1)
+		return alloc_pages_node(node, flags, order);
+
+	return alloc_pages(flags, order);
+}
+
+/*
+ * Get the pages for a new slab of cache @s.
+ *
+ * The preferred size s->oo is tried first with __GFP_NOWARN and
+ * __GFP_NORETRY added; if that fails the minimum size s->min is tried with
+ * the plain flags and the fallback is counted. On success page->objects is
+ * set from the size actually used and the zone's slab page counter is
+ * raised by the number of pages. Returns NULL if both attempts fail.
+ */
+static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ struct page *page;
+ struct kmem_cache_order_objects oo = s->oo;
+
+ flags |= s->allocflags;
+
+ page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node,
+ oo);
+ if (unlikely(!page)) {
+ oo = s->min;
+ /*
+ * Allocation may have failed due to fragmentation.
+ * Try a lower order alloc if possible
+ */
+ page = alloc_slab_page(flags, node, oo);
+ if (!page)
+ return NULL;
+
+ stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
+ }
+ page->objects = oo_objects(oo);
+ mod_zone_page_state(page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ 1 << oo_order(oo));
+
+ return page;
+}
+
+/*
+ * Initialise one object of a new slab: debug state first, then the cache's
+ * constructor if one is set.
+ */
+static void setup_object(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ setup_object_debug(s, page, object);
+ if (unlikely(s->ctor))
+ s->ctor(object);
+}
+
+/*
+ * Allocate and fully initialise a new slab for cache @s.
+ *
+ * Only the reclaim and constraint bits of @flags reach the page allocator;
+ * flags in GFP_SLAB_BUG_MASK trigger BUG_ON. The new page is accounted on
+ * its node, tagged as a slab page (and as a debug slab if any debug flag is
+ * set), optionally filled with POISON_INUSE, and all of its objects are set
+ * up and chained into page->freelist with page->inuse == 0.
+ * Returns NULL if no pages could be allocated.
+ */
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ struct page *page;
+ void *start;
+ void *last;
+ void *p;
+
+ BUG_ON(flags & GFP_SLAB_BUG_MASK);
+
+ page = allocate_slab(s,
+ flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+ if (!page)
+ goto out;
+
+ inc_slabs_node(s, page_to_nid(page), page->objects);
+ page->slab = s;
+ page->flags |= 1 << PG_slab;
+ if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
+ SLAB_STORE_USER | SLAB_TRACE))
+ __SetPageSlubDebug(page);
+
+ start = page_address(page);
+
+ if (unlikely(s->flags & SLAB_POISON))
+ memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
+
+ /*
+ * Each iteration sets up the previous object and points its free
+ * pointer at the current one; the final object is handled after the
+ * loop and terminates the chain with NULL.
+ */
+ last = start;
+ for_each_object(p, s, start, page->objects) {
+ setup_object(s, page, last);
+ set_freepointer(s, last, p);
+ last = p;
+ }
+ setup_object(s, page, last);
+ set_freepointer(s, last, NULL);
+
+ page->freelist = start;
+ page->inuse = 0;
+out:
+ return page;
+}
+
+/*
+ * Give the pages of a slab back to the page allocator.
+ *
+ * For debug slabs the slab padding and every object are checked one last
+ * time (as inactive objects) before the debug flag is cleared. The zone's
+ * slab page counter is lowered by the number of pages, the slab flag is
+ * cleared and the page mapcount reset before the pages are freed.
+ */
+static void __free_slab(struct kmem_cache *s, struct page *page)
+{
+ int order = compound_order(page);
+ int pages = 1 << order;
+
+ if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
+ void *p;
+
+ slab_pad_check(s, page);
+ for_each_object(p, s, page_address(page),
+ page->objects)
+ check_object(s, page, p, 0);
+ __ClearPageSlubDebug(page);
+ }
+
+ mod_zone_page_state(page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ -pages);
+
+ __ClearPageSlab(page);
+ reset_page_mapcount(page);
+ __free_pages(page, order);
+}
+
+/*
+ * RCU callback queued by free_slab(). The rcu_head handed in is really the
+ * page's lru field, so the page is recovered with container_of() and then
+ * freed for the cache recorded in page->slab.
+ */
+static void rcu_free_slab(struct rcu_head *h)
+{
+ struct page *page;
+
+ page = container_of((struct list_head *)h, struct page, lru);
+ __free_slab(page->slab, page);
+}
+
+/*
+ * Free a slab, either immediately or, for SLAB_DESTROY_BY_RCU caches,
+ * through call_rcu() with rcu_free_slab() as the callback.
+ */
+static void free_slab(struct kmem_cache *s, struct page *page)
+{
+ if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
+ /*
+ * RCU free overloads the RCU head over the LRU
+ */
+ struct rcu_head *head = (void *)&page->lru;
+
+ call_rcu(head, rcu_free_slab);
+ } else
+ __free_slab(s, page);
+}
+
+/* Remove a slab from the per-node accounting and then free it. */
+static void discard_slab(struct kmem_cache *s, struct page *page)
+{
+ dec_slabs_node(s, page_to_nid(page), page->objects);
+ free_slab(s, page);
+}
+
+/*
+ * Per slab locking using the pagelock
+ */
+/* Take the per-slab lock: a bit spinlock on PG_locked in page->flags. */
+static __always_inline void slab_lock(struct page *page)
+{
+ bit_spin_lock(PG_locked, &page->flags);
+}
+
+/* Release the per-slab lock taken by slab_lock() or slab_trylock(). */
+static __always_inline void slab_unlock(struct page *page)
+{
+ __bit_spin_unlock(PG_locked, &page->flags);
+}
+
+/*
+ * Try to take the per-slab lock without spinning. Returns whatever
+ * bit_spin_trylock() reports for PG_locked in page->flags.
+ */
+static __always_inline int slab_trylock(struct page *page)
+{
+	return bit_spin_trylock(PG_locked, &page->flags);
+}
+
+/*
+ * Management of partially allocated slabs
+ */
+/*
+ * Put @page on the node's partial list under n->list_lock and count it in
+ * n->nr_partial. A non-zero @tail queues it at the end of the list,
+ * otherwise it goes to the front.
+ */
+static void add_partial(struct kmem_cache_node *n,
+				struct page *page, int tail)
+{
+	spin_lock(&n->list_lock);
+	n->nr_partial++;
+	if (!tail)
+		list_add(&page->lru, &n->partial);
+	else
+		list_add_tail(&page->lru, &n->partial);
+	spin_unlock(&n->list_lock);
+}
+
+/*
+ * Take @page off the partial list of its node and lower n->nr_partial,
+ * both under the node's list_lock.
+ */
+static void remove_partial(struct kmem_cache *s, struct page *page)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+
+ spin_lock(&n->list_lock);
+ list_del(&page->lru);
+ n->nr_partial--;
+ spin_unlock(&n->list_lock);
+}
+
+/*
+ * Lock slab and remove from the partial list.
+ *
+ * Must hold list_lock.
+ *
+ * Returns 1 with the slab locked, unlinked and marked frozen if the slab
+ * lock could be taken without waiting; returns 0 and changes nothing
+ * otherwise.
+ */
+static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
+							struct page *page)
+{
+	if (!slab_trylock(page))
+		return 0;
+
+	list_del(&page->lru);
+	n->nr_partial--;
+	__SetPageSlubFrozen(page);
+	return 1;
+}
+
+/*
+ * Try to allocate a partial slab from a specific node.
+ *
+ * Returns the first slab on the node's partial list whose lock could be
+ * taken (now locked and frozen), or NULL if @n is NULL, the list looks
+ * empty, or no slab could be locked.
+ */
+static struct page *get_partial_node(struct kmem_cache_node *n)
+{
+ struct page *page;
+
+ /*
+ * Racy check. If we mistakenly see no partial slabs then we
+ * just allocate an empty slab. If we mistakenly try to get a
+ * partial slab and there is none available then get_partials()
+ * will return NULL.
+ */
+ if (!n || !n->nr_partial)
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ list_for_each_entry(page, &n->partial, lru)
+ if (lock_and_freeze_slab(n, page))
+ goto out;
+ /* Walked the whole list without locking anything. */
+ page = NULL;
+out:
+ spin_unlock(&n->list_lock);
+ return page;
+}
+
+/*
+ * Get a page from somewhere. Search in increasing NUMA distances.
+ *
+ * Without CONFIG_NUMA this always returns NULL. With it, the zonelist of
+ * the node chosen by slab_node() is walked and the first node that has more
+ * than min_partial partial slabs, and is allowed by the cpuset, is asked
+ * for a partial slab.
+ */
+static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(flags);
+ struct page *page;
+
+ /*
+ * The defrag ratio allows a configuration of the tradeoffs between
+ * inter node defragmentation and node local allocations. A lower
+ * defrag_ratio increases the tendency to do local allocations
+ * instead of attempting to obtain partial slabs from other nodes.
+ *
+ * If the defrag_ratio is set to 0 then kmalloc() always
+ * returns node local objects. If the ratio is higher then kmalloc()
+ * may return off node objects because partial slabs are obtained
+ * from other nodes and filled up.
+ *
+ * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
+ * defrag_ratio = 1000) then every (well almost) allocation will
+ * first attempt to defrag slab caches on other nodes. This means
+ * scanning over all nodes to look for partial slabs which may be
+ * expensive if we do it every time we are trying to find a slab
+ * with available objects.
+ */
+ /* The cycle counter modulo 1024 is compared against the ratio. */
+ if (!s->remote_node_defrag_ratio ||
+ get_cycles() % 1024 > s->remote_node_defrag_ratio)
+ return NULL;
+
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ n->nr_partial > n->min_partial) {
+ page = get_partial_node(n);
+ if (page)
+ return page;
+ }
+ }
+#endif
+ return NULL;
+}
+
+/*
+ * Get a partial page, lock it and return it.
+ *
+ * The requested node is tried first (the local node when @node is -1).
+ * If that yields nothing, other nodes are searched via get_any_partial()
+ * unless the caller passed __GFP_THISNODE, in which case NULL is returned.
+ */
+static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
+{
+	int searchnode = node;
+	struct page *page;
+
+	if (searchnode == -1)
+		searchnode = numa_node_id();
+
+	page = get_partial_node(get_node(s, searchnode));
+	if (page)
+		return page;
+
+	if (flags & __GFP_THISNODE)
+		return NULL;
+
+	return get_any_partial(s, flags);
+}
+
+/*
+ * Move a page back to the lists.
+ *
+ * Must be called with the slab lock held.
+ *
+ * On exit the slab lock will have been dropped.
+ *
+ * A slab with objects in use goes to the partial list if it still has free
+ * objects, or to the full list when user tracking is enabled on a debug
+ * slab. An empty slab is kept on the partial list only while the node has
+ * fewer than min_partial partial slabs; otherwise it is discarded.
+ */
+static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
+
+ __ClearPageSlubFrozen(page);
+ if (page->inuse) {
+
+ if (page->freelist) {
+ add_partial(n, page, tail);
+ stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ } else {
+ stat(c, DEACTIVATE_FULL);
+ if (SLABDEBUG && PageSlubDebug(page) &&
+ (s->flags & SLAB_STORE_USER))
+ add_full(n, page);
+ }
+ slab_unlock(page);
+ } else {
+ stat(c, DEACTIVATE_EMPTY);
+ if (n->nr_partial < n->min_partial) {
+ /*
+ * Adding an empty slab to the partial slabs in order
+ * to avoid page allocator overhead. This slab needs
+ * to come after the other slabs with objects in
+ * so that the others get filled first. That way the
+ * size of the partial list stays small.
+ *
+ * kmem_cache_shrink can reclaim any empty slabs from
+ * the partial list.
+ */
+ add_partial(n, page, 1);
+ slab_unlock(page);
+ } else {
+ /* Drop the slab lock before the page is given back. */
+ slab_unlock(page);
+ stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
+ discard_slab(s, page);
+ }
+ }
+}
+
+/*
+ * Remove the cpu slab
+ *
+ * Every object still on the per cpu freelist is pushed back onto the page's
+ * own freelist (lowering page->inuse each time), c->page is cleared and the
+ * page is handed to unfreeze_slab(). If any objects were moved the slab is
+ * queued at the head of the partial list (tail == 0), otherwise at the tail.
+ */
+static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+{
+ struct page *page = c->page;
+ int tail = 1;
+
+ if (page->freelist)
+ stat(c, DEACTIVATE_REMOTE_FREES);
+ /*
+ * Merge cpu freelist into slab freelist. Typically we get here
+ * because both freelists are empty. So this is unlikely
+ * to occur.
+ */
+ while (unlikely(c->freelist)) {
+ void **object;
+
+ tail = 0; /* Hot objects. Put the slab first */
+
+ /* Retrieve object from cpu_freelist */
+ object = c->freelist;
+ c->freelist = c->freelist[c->offset];
+
+ /* And put onto the regular freelist */
+ object[c->offset] = page->freelist;
+ page->freelist = object;
+ page->inuse--;
+ }
+ c->page = NULL;
+ unfreeze_slab(s, page, tail);
+}
+
+/*
+ * Lock the current cpu slab and deactivate it. The caller must make sure
+ * c->page is set; it is dereferenced without a check.
+ */
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+{
+ stat(c, CPUSLAB_FLUSH);
+ slab_lock(c->page);
+ deactivate_slab(s, c);
+}
+
+/*
+ * Flush cpu slab.
+ *
+ * Called from IPI handler with interrupts disabled.
+ *
+ * Does nothing if @cpu has no per cpu structure or no active slab.
+ */
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+{
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+ if (likely(c && c->page))
+ flush_slab(s, c);
+}
+
+/*
+ * Callback for on_each_cpu(): @d is the kmem_cache whose slab on the
+ * executing cpu is to be flushed.
+ */
+static void flush_cpu_slab(void *d)
+{
+ struct kmem_cache *s = d;
+
+ __flush_cpu_slab(s, smp_processor_id());
+}
+
+/* Run flush_cpu_slab() for cache @s on every cpu via on_each_cpu(). */
+static void flush_all(struct kmem_cache *s)
+{
+ on_each_cpu(flush_cpu_slab, s, 1);
+}
+
+/*
+ * Check if the objects in a per cpu structure fit numa
+ * locality expectations.
+ *
+ * Returns 0 only on CONFIG_NUMA builds when a specific @node was requested
+ * (not -1) and it differs from c->node; returns 1 in every other case.
+ */
+static inline int node_match(struct kmem_cache_cpu *c, int node)
+{
+#ifdef CONFIG_NUMA
+ if (node != -1 && c->node != node)
+ return 0;
+#endif
+ return 1;
+}
+
+/*
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
+ *
+ * Interrupts are disabled.
+ *
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
+ *
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
+ *
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is the slowest path since it involves
+ * a call to the page allocator and the setup of a new slab.
+ *
+ * Returns the allocated object, or NULL if no new slab could be obtained.
+ */
+static void *__slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+{
+ void **object;
+ struct page *new;
+
+ /* We handle __GFP_ZERO in the caller */
+ gfpflags &= ~__GFP_ZERO;
+
+ if (!c->page)
+ goto new_slab;
+
+ slab_lock(c->page);
+ if (unlikely(!node_match(c, node)))
+ goto another_slab;
+
+ stat(c, ALLOC_REFILL);
+
+load_freelist:
+ /* Reached with c->page locked. */
+ object = c->page->freelist;
+ if (unlikely(!object))
+ goto another_slab;
+ if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
+ goto debug;
+
+ /*
+ * Take over the whole page freelist: the first object is returned,
+ * the rest becomes the lockless per cpu freelist, and the page is
+ * marked as having every object in use.
+ */
+ c->freelist = object[c->offset];
+ c->page->inuse = c->page->objects;
+ c->page->freelist = NULL;
+ c->node = page_to_nid(c->page);
+unlock_out:
+ slab_unlock(c->page);
+ stat(c, ALLOC_SLOWPATH);
+ return object;
+
+another_slab:
+ deactivate_slab(s, c);
+
+new_slab:
+ new = get_partial(s, gfpflags, node);
+ if (new) {
+ c->page = new;
+ stat(c, ALLOC_FROM_PARTIAL);
+ goto load_freelist;
+ }
+
+ /* Interrupts are enabled around new_slab() for __GFP_WAIT callers. */
+ if (gfpflags & __GFP_WAIT)
+ local_irq_enable();
+
+ new = new_slab(s, gfpflags, node);
+
+ if (gfpflags & __GFP_WAIT)
+ local_irq_disable();
+
+ if (new) {
+ /*
+ * The per cpu structure is looked up again here, and a cpu
+ * slab found installed at this point is flushed first.
+ */
+ c = get_cpu_slab(s, smp_processor_id());
+ stat(c, ALLOC_SLAB);
+ if (c->page)
+ flush_slab(s, c);
+ slab_lock(new);
+ __SetPageSlubFrozen(new);
+ c->page = new;
+ goto load_freelist;
+ }
+ return NULL;
+debug:
+ if (!alloc_debug_processing(s, c->page, object, addr))
+ goto another_slab;
+
+ /*
+ * Debug slabs hand out one object at a time and leave the rest on
+ * the page freelist. c->node is set to -1, which makes the
+ * c->node >= 0 fastpath test in slab_free() fail for this cpu slab.
+ */
+ c->page->inuse++;
+ c->page->freelist = object[c->offset];
+ c->node = -1;
+ goto unlock_out;
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ *
+ * Interrupts are disabled around the freelist manipulation. Zeroing for
+ * __GFP_ZERO happens afterwards, over c->objsize bytes read while
+ * interrupts were still off.
+ */
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node, void *addr)
+{
+ void **object;
+ struct kmem_cache_cpu *c;
+ unsigned long flags;
+ unsigned int objsize;
+
+ local_irq_save(flags);
+ c = get_cpu_slab(s, smp_processor_id());
+ objsize = c->objsize;
+ if (unlikely(!c->freelist || !node_match(c, node)))
+
+ object = __slab_alloc(s, gfpflags, node, addr, c);
+
+ else {
+ /* Pop the head of the lockless per cpu freelist. */
+ object = c->freelist;
+ c->freelist = object[c->offset];
+ stat(c, ALLOC_FASTPATH);
+ }
+ local_irq_restore(flags);
+
+ if (unlikely((gfpflags & __GFP_ZERO) && object))
+ memset(object, 0, objsize);
+
+ return object;
+}
+
+/*
+ * Allocate one object from cache @s with no node preference (node -1).
+ * The caller's return address is passed along as the allocation site.
+ */
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+ return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_NUMA
+/*
+ * Allocate one object from cache @s, requesting memory from @node.
+ * The caller's return address is passed along as the allocation site.
+ */
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+{
+ return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
+
+/*
+ * Slow path handling. This may still be called frequently since objects
+ * have a longer lifetime than the cpu slabs in most processing loads.
+ *
+ * So we still attempt to reduce cache line usage. Just take the slab
+ * lock and free the item. If there is no additional partial page
+ * handling required then we can return immediately.
+ *
+ * @offset is the index, in pointer-sized words, of the free pointer within
+ * the object.
+ */
+static void __slab_free(struct kmem_cache *s, struct page *page,
+ void *x, void *addr, unsigned int offset)
+{
+ void *prior;
+ void **object = (void *)x;
+ struct kmem_cache_cpu *c;
+
+ c = get_cpu_slab(s, raw_smp_processor_id());
+ stat(c, FREE_SLOWPATH);
+ slab_lock(page);
+
+ if (unlikely(SLABDEBUG && PageSlubDebug(page)))
+ goto debug;
+
+checks_ok:
+ /* Push the object onto the page freelist; remember the old head. */
+ prior = object[offset] = page->freelist;
+ page->freelist = object;
+ page->inuse--;
+
+ /* Frozen slabs are not moved between lists here. */
+ if (unlikely(PageSlubFrozen(page))) {
+ stat(c, FREE_FROZEN);
+ goto out_unlock;
+ }
+
+ if (unlikely(!page->inuse))
+ goto slab_empty;
+
+ /*
+ * Objects left in the slab. If it was not on the partial list before
+ * then add it.
+ */
+ if (unlikely(!prior)) {
+ add_partial(get_node(s, page_to_nid(page)), page, 1);
+ stat(c, FREE_ADD_PARTIAL);
+ }
+
+out_unlock:
+ slab_unlock(page);
+ return;
+
+slab_empty:
+ if (prior) {
+ /*
+ * Slab still on the partial list.
+ */
+ remove_partial(s, page);
+ stat(c, FREE_REMOVE_PARTIAL);
+ }
+ slab_unlock(page);
+ stat(c, FREE_SLAB);
+ discard_slab(s, page);
+ return;
+
+debug:
+ /* A failed debug check leaves the object unfreed. */
+ if (!free_debug_processing(s, page, x, addr))
+ goto out_unlock;
+ goto checks_ok;
+}
+
+/*
+ * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
+ * can perform fastpath freeing without additional function calls.
+ *
+ * The fastpath is only possible if we are freeing to the current cpu slab
+ * of this processor. This is typically the case if we have just allocated
+ * the item before.
+ *
+ * If fastpath is not possible then fall back to __slab_free where we deal
+ * with all sorts of special processing.
+ */
+static __always_inline void slab_free(struct kmem_cache *s,
+ struct page *page, void *x, void *addr)
+{
+ void **object = (void *)x;
+ struct kmem_cache_cpu *c;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ c = get_cpu_slab(s, smp_processor_id());
+ debug_check_no_locks_freed(object, c->objsize);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(object, s->objsize);
+ /*
+ * c->node is set to -1 by the debug path of __slab_alloc(), so the
+ * c->node >= 0 test keeps frees to such a cpu slab off the fastpath.
+ */
+ if (likely(page == c->page && c->node >= 0)) {
+ object[c->offset] = c->freelist;
+ c->freelist = object;
+ stat(c, FREE_FASTPATH);
+ } else
+ __slab_free(s, page, x, addr, c->offset);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Free object @x back to cache @s. The slab page is looked up from the
+ * object address with virt_to_head_page(); the caller's return address is
+ * passed along as the free site.
+ */
+void kmem_cache_free(struct kmem_cache *s, void *x)
+{
+ struct page *page;
+
+ page = virt_to_head_page(x);
+
+ slab_free(s, page, x, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+/*
+ * Figure out which slab page an object resides on.
+ * Returns the head page of @x, or NULL if that page is not a slab page.
+ */
+static struct page *get_object_page(const void *x)
+{
+	struct page *head = virt_to_head_page(x);
+
+	return PageSlab(head) ? head : NULL;
+}
+
+/*
+ * Object placement in a slab is made very easy because we always start at
+ * offset 0. If we tune the size of the object to the alignment then we can
+ * get the required alignment by putting one properly sized object after
+ * another.
+ *
+ * Notice that the allocation order determines the sizes of the per cpu
+ * caches. Each processor has always one slab available for allocations.
+ * Increasing the allocation order reduces the number of times that slabs
+ * must be moved on and off the partial lists and is therefore a factor in
+ * locking overhead.
+ */
+
+/*
+ * Minimum / Maximum order of slab pages. This influences locking overhead
+ * and slab fragmentation. A higher order reduces the number of partial slabs
+ * and increases the number of allocations possible without having to
+ * take the list_lock.
+ */
+static int slub_min_order;
+static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static int slub_min_objects;
+
+/*
+ * Merge control. If this is set then no merging of slab caches will occur.
+ * (Could be removed. This was introduced to pacify the merge skeptics.)
+ */
+static int slub_nomerge;
+
+/*
+ * Calculate the order of allocation given an slab object size.
+ *
+ * The order of allocation has significant impact on performance and other
+ * system components. Generally order 0 allocations should be preferred since
+ * order 0 does not cause fragmentation in the page allocator. Larger objects
+ * be problematic to put into order 0 slabs because there may be too much
+ * unused space left. We go to a higher order if more than 1/16th of the slab
+ * would be wasted.
+ *
+ * In order to reach satisfactory performance we must ensure that a minimum
+ * number of objects is in one slab. Otherwise we may generate too much
+ * activity on the partial lists which requires taking the list_lock. This is
+ * less a concern for large slabs though which are rarely used.
+ *
+ * slub_max_order specifies the order where we begin to stop considering the
+ * number of objects in a slab as critical. If we reach slub_max_order then
+ * we try to keep the page order as low as possible. So we accept more waste
+ * of space in favor of a small page order.
+ *
+ * Higher order allocations also allow the placement of more objects in a
+ * slab and thereby reduce object handling overhead. If the user has
+ * requested a higher minimum order then we start with that one instead of
+ * the smallest order which will fit the object.
+ */
+static inline int slab_order(int size, int min_objects,
+				int max_order, int fract_leftover)
+{
+	int lowest = slub_min_order;
+	int order;
+
+	/*
+	 * More than 65535 objects would fit at the minimum order: derive
+	 * the order from size * 65535 instead of searching.
+	 */
+	if ((PAGE_SIZE << lowest) / size > 65535)
+		return get_order(size * 65535) - 1;
+
+	/* Begin the search at the first order that can hold min_objects. */
+	order = max(lowest, fls(min_objects * size - 1) - PAGE_SHIFT);
+
+	while (order <= max_order) {
+		unsigned long slab_size = PAGE_SIZE << order;
+
+		if (slab_size >= min_objects * size) {
+			int rem = slab_size % size;
+
+			/* Stop once the leftover is within the allowed fraction. */
+			if (rem <= slab_size / fract_leftover)
+				break;
+		}
+		order++;
+	}
+
+	/* order > max_order here tells the caller that nothing fitted. */
+	return order;
+}
+
+/*
+ * Choose the page order for slabs that hold objects of @size bytes.
+ *
+ * The search starts with strict requirements (many objects per slab, at
+ * most 1/16th of the slab wasted) and relaxes them step by step. Returns
+ * the chosen order, or -ENOSYS if a single object does not fit into any
+ * order below MAX_ORDER.
+ */
+static inline int calculate_order(int size)
+{
+	int order;
+	int min_objects;
+	int fraction;
+
+	/*
+	 * Attempt to find best configuration for a slab. This
+	 * works by first attempting to generate a layout with
+	 * the best configuration and backing off gradually.
+	 *
+	 * First we reduce the acceptable waste in a slab. Then
+	 * we reduce the minimum objects required in a slab.
+	 */
+	min_objects = slub_min_objects;
+	if (!min_objects)
+		min_objects = 4 * (fls(nr_cpu_ids) + 1);
+	while (min_objects > 1) {
+		fraction = 16;
+		while (fraction >= 4) {
+			order = slab_order(size, min_objects,
+						slub_max_order, fraction);
+			if (order <= slub_max_order)
+				return order;
+			fraction /= 2;
+		}
+		min_objects /= 2;
+	}
+
+	/*
+	 * We were unable to place multiple objects in a slab. Now
+	 * lets see if we can place a single object there.
+	 */
+	order = slab_order(size, 1, slub_max_order, 1);
+	if (order <= slub_max_order)
+		return order;
+
+	/*
+	 * Doh this slab cannot be placed using slub_max_order.
+	 *
+	 * The page allocator only serves orders below MAX_ORDER, so an
+	 * order equal to MAX_ORDER has to be rejected too. Accepting it
+	 * would create a cache whose slabs can never be allocated.
+	 */
+	order = slab_order(size, 1, MAX_ORDER, 1);
+	if (order < MAX_ORDER)
+		return order;
+	return -ENOSYS;
+}
+
+/*
+ * Work out the alignment that objects of a cache will get from the
+ * creation flags, the requested alignment and the object size.
+ */
+static unsigned long calculate_alignment(unsigned long flags,
+		unsigned long align, unsigned long size)
+{
+	/*
+	 * SLAB_HWCACHE_ALIGN is only a suggestion: start from the cache
+	 * line size and halve it for as long as the object still fits
+	 * into half of it. An explicitly requested alignment that is
+	 * larger always wins.
+	 */
+	if (flags & SLAB_HWCACHE_ALIGN) {
+		unsigned long line = cache_line_size();
+
+		for (; size <= line / 2; line /= 2)
+			;
+		align = max(align, line);
+	}
+
+	/* Never go below what the architecture demands. */
+	if (align < ARCH_SLAB_MINALIGN)
+		align = ARCH_SLAB_MINALIGN;
+
+	/* The result is always a multiple of the pointer size. */
+	return ALIGN(align, sizeof(void *));
+}
+
+/*
+ * Put a per cpu structure into its initial state: no slab page, empty
+ * freelist, node 0, and the object size and free pointer offset (counted
+ * in pointer-sized words) taken over from @s.
+ */
+static void init_kmem_cache_cpu(struct kmem_cache *s,
+			struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_SLUB_STATS
+	memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
+#endif
+	c->objsize = s->objsize;
+	c->offset = s->offset / sizeof(void *);
+	c->node = 0;
+	c->freelist = NULL;
+	c->page = NULL;
+}
+
+/*
+ * Set up an empty per node structure for cache @s: lock, list heads and
+ * counters are initialized and min_partial is derived from the object size.
+ */
+static void
+init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
+{
+	spin_lock_init(&n->list_lock);
+	INIT_LIST_HEAD(&n->partial);
+	n->nr_partial = 0;
+#ifdef CONFIG_SLUB_DEBUG
+	INIT_LIST_HEAD(&n->full);
+	atomic_long_set(&n->nr_slabs, 0);
+	atomic_long_set(&n->total_objects, 0);
+#endif
+
+	/*
+	 * Bigger objects get a longer partial list so that the page
+	 * allocator is not hit too often. The value is log2 of the object
+	 * size, limited to the range MIN_PARTIAL..MAX_PARTIAL.
+	 */
+	n->min_partial = ilog2(s->size);
+	if (n->min_partial < MIN_PARTIAL)
+		n->min_partial = MIN_PARTIAL;
+	else if (n->min_partial > MAX_PARTIAL)
+		n->min_partial = MAX_PARTIAL;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Per cpu array for per cpu structures.
+ *
+ * The per cpu array places all kmem_cache_cpu structures from one processor
+ * close together meaning that it becomes possible that multiple per cpu
+ * structures are contained in one cacheline. This may be particularly
+ * beneficial for the kmalloc caches.
+ *
+ * A desktop system typically has around 60-80 slabs. With 100 here we are
+ * likely able to get per cpu structures for all caches from the array defined
+ * here. We must be able to cover all kmalloc caches during bootstrap.
+ *
+ * If the per cpu array is exhausted then fall back to kmalloc
+ * of individual cachelines. No sharing is possible then.
+ */
+#define NR_KMEM_CACHE_CPU 100
+
+static DEFINE_PER_CPU(struct kmem_cache_cpu,
+ kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
+
+static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
+static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
+
+/*
+ * Get an initialized kmem_cache_cpu structure for @cpu. The static per cpu
+ * array is used first; once its free list is empty a cacheline sized chunk
+ * is allocated on the node of @cpu. Returns NULL if that allocation fails.
+ */
+static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
+							int cpu, gfp_t flags)
+{
+	struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
+
+	if (!c) {
+		/* Table overflow: So allocate ourselves */
+		c = kmalloc_node(
+			ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
+			flags, cpu_to_node(cpu));
+		if (!c)
+			return NULL;
+	} else {
+		/* Pop the head; free entries are chained through ->freelist. */
+		per_cpu(kmem_cache_cpu_free, cpu) =
+				(void *)c->freelist;
+	}
+
+	init_kmem_cache_cpu(s, c);
+	return c;
+}
+
+/*
+ * Give a kmem_cache_cpu structure of @cpu back to where it came from.
+ *
+ * Elements of the static per cpu array are pushed onto that cpu's free
+ * list (chained through ->freelist). Everything else was obtained with
+ * kmalloc_node() by alloc_kmem_cache_cpu() after the array ran out and is
+ * released with kfree().
+ */
+static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
+{
+	/*
+	 * Valid array elements are [0, NR_KMEM_CACHE_CPU). The address one
+	 * past the last element is not part of the array, so the upper
+	 * bound check has to be >= and not >.
+	 */
+	if (c < per_cpu(kmem_cache_cpu, cpu) ||
+			c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
+		kfree(c);
+		return;
+	}
+	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
+	per_cpu(kmem_cache_cpu_free, cpu) = c;
+}
+
+/*
+ * Detach and release the per cpu structures of @s for all online cpus.
+ */
+static void free_kmem_cache_cpus(struct kmem_cache *s)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+		if (!c)
+			continue;
+
+		s->cpu_slab[cpu] = NULL;
+		free_kmem_cache_cpu(c, cpu);
+	}
+}
+
+/*
+ * Make sure every online cpu has a per cpu structure for @s. Returns 1 on
+ * success. On failure all per cpu structures of @s are released again and
+ * 0 is returned.
+ */
+static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct kmem_cache_cpu *c;
+
+		/* Already populated, nothing to do for this cpu. */
+		if (get_cpu_slab(s, cpu))
+			continue;
+
+		c = alloc_kmem_cache_cpu(s, cpu, flags);
+		if (c) {
+			s->cpu_slab[cpu] = c;
+			continue;
+		}
+
+		free_kmem_cache_cpus(s);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Initialize the per cpu array of @cpu by putting all of its elements on
+ * the cpu's free list. This is done only once per cpu; later calls for the
+ * same cpu return immediately.
+ */
+static void init_alloc_cpu_cpu(int cpu)
+{
+	int i = NR_KMEM_CACHE_CPU;
+
+	if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
+		return;
+
+	/* Walk backwards so that element 0 ends up at the list head. */
+	while (i-- > 0)
+		free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
+
+	cpu_set(cpu, kmem_cach_cpu_free_init_once);
+}
+
+/*
+ * Boot time setup of the per cpu arrays of all cpus that are online.
+ */
+static void __init init_alloc_cpu(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		init_alloc_cpu_cpu(cpu);
+	}
+}
+
+#else
+static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} /* !SMP: nothing was allocated, nothing to free */
+static inline void init_alloc_cpu(void) {} /* !SMP: there is no per cpu array */
+
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+{
+ init_kmem_cache_cpu(s, &s->cpu_slab); /* !SMP: the single structure is embedded in the cache itself */
+ return 1; /* cannot fail */
+}
+#endif
+
+#ifdef CONFIG_NUMA
+/*
+ * No kmalloc_node yet so do it by hand. We know that this is the first
+ * slab on the node for this slabcache. There are no concurrent accesses
+ * possible.
+ *
+ * Note that this function only works on the kmalloc_node_cache
+ * when allocating for the kmalloc_node_cache. This is used for bootstrapping
+ * memory on a fresh node that has no slab structures yet.
+ *
+ * A fresh slab is obtained with new_slab() for kmalloc_caches (the first
+ * element of the kmalloc array), its first free object is taken by hand
+ * to serve as the struct kmem_cache_node, and that object is installed as
+ * kmalloc_caches->node[node]. The slab is then put on the partial list of
+ * the new node structure. Not getting a page or an object is a BUG.
+ * If the page came from a different node than requested, this is only
+ * reported and the structure is used anyway.
+ */
+static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
+ int node)
+{
+ struct page *page;
+ struct kmem_cache_node *n;
+ unsigned long flags;
+
+ BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+
+ page = new_slab(kmalloc_caches, gfpflags, node);
+
+ BUG_ON(!page);
+ if (page_to_nid(page) != node) {
+ printk(KERN_ERR "SLUB: Unable to allocate memory from "
+ "node %d\n", node);
+ printk(KERN_ERR "SLUB: Allocating a useless per node structure "
+ "in order to be able to continue\n");
+ }
+
+ n = page->freelist; /* first free object of the new slab */
+ BUG_ON(!n);
+ page->freelist = get_freepointer(kmalloc_caches, n); /* take it off the freelist */
+ page->inuse++; /* and account for it as allocated */
+ kmalloc_caches->node[node] = n;
+#ifdef CONFIG_SLUB_DEBUG
+ init_object(kmalloc_caches, n, 1);
+ init_tracking(kmalloc_caches, n);
+#endif
+ init_kmem_cache_node(n, kmalloc_caches);
+ inc_slabs_node(kmalloc_caches, node, page->objects);
+
+ /*
+ * lockdep requires consistent irq usage for each lock
+ * so even though there cannot be a race this early in
+ * the boot sequence, we still disable irqs.
+ */
+ local_irq_save(flags);
+ add_partial(n, page, 0);
+ local_irq_restore(flags);
+ return n;
+}
+
+/*
+ * Release the per node structures of @s and clear the node pointers. The
+ * node structure embedded in the cache itself (local_node) is not freed.
+ */
+static void free_kmem_cache_nodes(struct kmem_cache *s)
+{
+	int node;
+
+	for_each_node_state(node, N_NORMAL_MEMORY) {
+		struct kmem_cache_node *n = s->node[node];
+		int embedded = (n == &s->local_node);
+
+		if (n && !embedded)
+			kmem_cache_free(kmalloc_caches, n);
+
+		s->node[node] = NULL;
+	}
+}
+
+/*
+ * Set up the per node structures of @s for all nodes with normal memory.
+ *
+ * The node the cache structure itself lives on (node 0 while the
+ * allocator is not fully up) uses the embedded local_node. Other nodes get
+ * a structure from kmalloc_caches, or from the early bootstrap allocator
+ * while slab_state is DOWN. Returns 1 on success. On allocation failure
+ * everything is released again and 0 is returned.
+ */
+static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+{
+	int node;
+	int local_node = 0;
+
+	if (slab_state >= UP)
+		local_node = page_to_nid(virt_to_page(s));
+
+	for_each_node_state(node, N_NORMAL_MEMORY) {
+		struct kmem_cache_node *n;
+
+		if (node == local_node) {
+			n = &s->local_node;
+		} else if (slab_state == DOWN) {
+			/*
+			 * The early allocator installs and initializes
+			 * the node structure on its own.
+			 */
+			early_kmem_cache_node_alloc(gfpflags, node);
+			continue;
+		} else {
+			n = kmem_cache_alloc_node(kmalloc_caches,
+							gfpflags, node);
+			if (!n) {
+				free_kmem_cache_nodes(s);
+				return 0;
+			}
+		}
+
+		s->node[node] = n;
+		init_kmem_cache_node(n, s);
+	}
+	return 1;
+}
+#else
+static void free_kmem_cache_nodes(struct kmem_cache *s)
+{ /* !NUMA: the only node structure is embedded in the cache, nothing to free */
+}
+
+static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+{
+ init_kmem_cache_node(&s->local_node, s); /* !NUMA: only the embedded node structure exists */
+ return 1; /* cannot fail, gfpflags is unused */
+}
+#endif
+
+/*
+ * calculate_sizes() determines the order and the distribution of data within
+ * a slab object.
+ *
+ * Reads s->flags, s->objsize, s->align and s->ctor. Sets s->inuse,
+ * s->offset (only if the free pointer has to live behind the object),
+ * s->size, s->allocflags, s->oo, s->min and, if the new layout holds more
+ * objects than recorded so far, s->max. With CONFIG_SLUB_DEBUG the
+ * __OBJECT_POISON bit in s->flags is updated as well.
+ *
+ * A forced_order >= 0 is used as the slab order as is; a negative value
+ * lets calculate_order() choose one. Returns 0 if no order could be found
+ * or if not even one object fits into a slab, and 1 otherwise.
+ */
+static int calculate_sizes(struct kmem_cache *s, int forced_order)
+{
+ unsigned long flags = s->flags;
+ unsigned long size = s->objsize;
+ unsigned long align = s->align;
+ int order;
+
+ /*
+ * Round up object size to the next word boundary. We can only
+ * place the free pointer at word boundaries and this determines
+ * the possible location of the free pointer.
+ */
+ size = ALIGN(size, sizeof(void *));
+
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Determine if we can poison the object itself. If the user of
+ * the slab may touch the object after free or before allocation
+ * then we should never poison the object itself.
+ */
+ if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
+ !s->ctor)
+ s->flags |= __OBJECT_POISON;
+ else
+ s->flags &= ~__OBJECT_POISON;
+
+
+ /*
+ * If we are Redzoning then check if there is some space between the
+ * end of the object and the free pointer. If not then add an
+ * additional word to have some bytes to store Redzone information.
+ */
+ if ((flags & SLAB_RED_ZONE) && size == s->objsize)
+ size += sizeof(void *);
+#endif
+
+ /*
+ * With that we have determined the number of bytes in actual use
+ * by the object. This is the potential offset to the free pointer.
+ */
+ s->inuse = size;
+
+ if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
+ s->ctor)) {
+ /*
+ * Relocate free pointer after the object if it is not
+ * permitted to overwrite the first word of the object on
+ * kmem_cache_free.
+ *
+ * This is the case if we do RCU, have a constructor or
+ * are poisoning the objects.
+ */
+ s->offset = size;
+ size += sizeof(void *);
+ }
+
+#ifdef CONFIG_SLUB_DEBUG
+ if (flags & SLAB_STORE_USER)
+ /*
+ * Need to store information about allocs and frees after
+ * the object.
+ */
+ size += 2 * sizeof(struct track);
+
+ if (flags & SLAB_RED_ZONE)
+ /*
+ * Add some empty padding so that we can catch
+ * overwrites from earlier objects rather than let
+ * tracking information or the free pointer be
+ * corrupted if an user writes before the start
+ * of the object.
+ */
+ size += sizeof(void *);
+#endif
+
+ /*
+ * Determine the alignment based on various parameters that the
+ * user specified and the dynamic determination of cache line size
+ * on bootup.
+ */
+ align = calculate_alignment(flags, align, s->objsize);
+
+ /*
+ * SLUB stores one object immediately after another beginning from
+ * offset 0. In order to align the objects we have to simply size
+ * each object to conform to the alignment.
+ */
+ size = ALIGN(size, align);
+ s->size = size;
+ if (forced_order >= 0)
+ order = forced_order;
+ else
+ order = calculate_order(size);
+
+ if (order < 0) /* calculate_order() found no usable order */
+ return 0;
+
+ s->allocflags = 0;
+ if (order) /* slabs spanning several pages are compound pages */
+ s->allocflags |= __GFP_COMP;
+
+ if (s->flags & SLAB_CACHE_DMA)
+ s->allocflags |= SLUB_DMA;
+
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ s->allocflags |= __GFP_RECLAIMABLE;
+
+ /*
+ * Determine the number of objects per slab
+ */
+ s->oo = oo_make(order, size);
+ s->min = oo_make(get_order(size), size); /* smallest order that holds one object */
+ if (oo_objects(s->oo) > oo_objects(s->max))
+ s->max = s->oo;
+
+ return !!oo_objects(s->oo);
+
+}
+
+/*
+ * Initialize the cache structure @s: record the creation parameters,
+ * compute the object layout and set up the per node and per cpu
+ * structures. Returns 1 on success and 0 on failure; with SLAB_PANIC in
+ * @flags a failure panics instead.
+ */
+static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
+		const char *name, size_t size,
+		size_t align, unsigned long flags,
+		void (*ctor)(void *))
+{
+	memset(s, 0, kmem_size);
+	s->name = name;
+	s->ctor = ctor;
+	s->objsize = size;
+	s->align = align;
+	s->flags = kmem_cache_flags(size, flags, name, ctor);
+
+	if (!calculate_sizes(s, -1))
+		goto error;
+
+	s->refcount = 1;
+#ifdef CONFIG_NUMA
+	s->remote_node_defrag_ratio = 1000;
+#endif
+	/* The management structures themselves never come from DMA memory. */
+	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+		goto error;
+
+	if (!alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) {
+		free_kmem_cache_nodes(s);
+		goto error;
+	}
+	return 1;
+
+error:
+	if (flags & SLAB_PANIC)
+		panic("Cannot create slab %s size=%lu realsize=%u "
+			"order=%u offset=%u flags=%lx\n",
+			s->name, (unsigned long)size, s->size, oo_order(s->oo),
+			s->offset, flags);
+	return 0;
+}
+
+/*
+ * Check if a given pointer is valid, i.e. whether @object lies in a slab
+ * page that belongs to cache @s and passes check_valid_pointer() for that
+ * slab. Returns 1 if so and 0 otherwise.
+ *
+ * Whether the object is currently on the slab's freelist is deliberately
+ * not checked: that would be too expensive for what is mainly a "does this
+ * object belong to this cache" test.
+ */
+int kmem_ptr_validate(struct kmem_cache *s, const void *object)
+{
+	struct page *page = get_object_page(object);
+
+	/* No slab or wrong slab */
+	if (!page || s != page->slab)
+		return 0;
+
+	return check_valid_pointer(s, page, object) ? 1 : 0;
+}
+EXPORT_SYMBOL(kmem_ptr_validate);
+
+/*
+ * Determine the size of a slab object
+ *
+ * This is s->objsize, the object size recorded when the cache was opened,
+ * and not s->size, which also covers metadata and alignment padding.
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+ return s->objsize;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+const char *kmem_cache_name(struct kmem_cache *s)
+{
+ return s->name; /* the pointer stored in the cache, not a copy */
+}
+EXPORT_SYMBOL(kmem_cache_name);
+
+/*
+ * Debug aid: report @text as a slab error for @page and then print every
+ * object of the slab that is not on its freelist, together with its
+ * tracking information. Does nothing without CONFIG_SLUB_DEBUG.
+ */
+static void list_slab_objects(struct kmem_cache *s, struct page *page,
+							const char *text)
+{
+#ifdef CONFIG_SLUB_DEBUG
+	void *addr = page_address(page);
+	void *p;
+	DECLARE_BITMAP(map, page->objects);
+
+	bitmap_zero(map, page->objects);
+	slab_err(s, page, "%s", text);
+	slab_lock(page);
+
+	/* First mark all free objects ... */
+	for_each_free_object(p, s, page->freelist)
+		set_bit(slab_index(p, s, addr), map);
+
+	/* ... then everything left unmarked is still allocated. */
+	for_each_object(p, s, addr, page->objects) {
+		if (test_bit(slab_index(p, s, addr), map))
+			continue;
+
+		printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
+						p, p - addr);
+		print_tracking(s, p);
+	}
+	slab_unlock(page);
+#endif
+}
+
+/*
+ * Attempt to free all partial slabs on a node. Slabs without objects in
+ * use are discarded; slabs that still hold objects stay on the list and
+ * their remaining objects are reported.
+ */
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
+{
+	struct page *page, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry_safe(page, next, &n->partial, lru) {
+		if (page->inuse) {
+			list_slab_objects(s, page,
+				"Objects remaining on kmem_cache_close()");
+			continue;
+		}
+
+		list_del(&page->lru);
+		discard_slab(s, page);
+		n->nr_partial--;
+	}
+	spin_unlock_irqrestore(&n->list_lock, flags);
+}
+
+/*
+ * Release all resources used by a slab cache. Returns 0 on success and 1
+ * if a node still has slabs afterwards; in that case the per node
+ * structures are left in place.
+ */
+static inline int kmem_cache_close(struct kmem_cache *s)
+{
+	int node;
+
+	flush_all(s);
+
+	/* Attempt to free all objects */
+	free_kmem_cache_cpus(s);
+	for_each_node_state(node, N_NORMAL_MEMORY) {
+		struct kmem_cache_node *n = get_node(s, node);
+		int busy;
+
+		free_partial(s, n);
+		busy = n->nr_partial || slabs_node(s, node);
+		if (busy)
+			return 1;
+	}
+
+	free_kmem_cache_nodes(s);
+	return 0;
+}
+
+/*
+ * Close a cache and release the kmem_cache structure
+ * (must be used for caches created using kmem_cache_create)
+ *
+ * Only dropping the last reference tears the cache down; while other
+ * references remain, just the count is decremented.
+ */
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+	down_write(&slub_lock);
+	if (--s->refcount) {
+		/* Still in use through another reference. */
+		up_write(&slub_lock);
+		return;
+	}
+
+	list_del(&s->list);
+	up_write(&slub_lock);
+
+	if (kmem_cache_close(s)) {
+		printk(KERN_ERR "SLUB %s: %s called for cache that "
+			"still has objects.\n", s->name, __func__);
+		dump_stack();
+	}
+	sysfs_slab_remove(s);
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+/********************************************************************
+ * Kmalloc subsystem
+ *******************************************************************/
+
+struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
+EXPORT_SYMBOL(kmalloc_caches);
+
+static int __init setup_slub_min_order(char *str)
+{
+ get_option(&str, &slub_min_order); /* parse the "slub_min_order=" value */
+
+ return 1;
+}
+
+__setup("slub_min_order=", setup_slub_min_order);
+
+static int __init setup_slub_max_order(char *str)
+{
+ get_option(&str, &slub_max_order); /* parse the "slub_max_order=" value */
+
+ return 1;
+}
+
+__setup("slub_max_order=", setup_slub_max_order);
+
+static int __init setup_slub_min_objects(char *str)
+{
+ get_option(&str, &slub_min_objects); /* parse the "slub_min_objects=" value */
+
+ return 1;
+}
+
+__setup("slub_min_objects=", setup_slub_min_objects);
+
+static int __init setup_slub_nomerge(char *str)
+{
+ slub_nomerge = 1; /* "slub_nomerge" takes no value, str is ignored */
+ return 1;
+}
+
+__setup("slub_nomerge", setup_slub_nomerge);
+
+/*
+ * Open the statically allocated kmalloc cache @s for objects of @size
+ * bytes, put it on the list of slab caches and register it with sysfs.
+ * Any failure is fatal and ends in a panic, so the function only ever
+ * returns @s.
+ */
+static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
+		const char *name, int size, gfp_t gfp_flags)
+{
+	unsigned int flags = 0;
+
+	if (gfp_flags & SLUB_DMA)
+		flags = SLAB_CACHE_DMA;
+
+	down_write(&slub_lock);
+	if (kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
+								flags, NULL)) {
+		list_add(&s->list, &slab_caches);
+		up_write(&slub_lock);
+
+		if (!sysfs_slab_add(s))
+			return s;
+	}
+
+	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
+}
+
+#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
+
+/*
+ * Work function: register all caches with sysfs whose registration was
+ * deferred (marked with __SYSFS_ADD_DEFERRED) and clear that mark.
+ */
+static void sysfs_add_func(struct work_struct *w)
+{
+	struct kmem_cache *s;
+
+	down_write(&slub_lock);
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!(s->flags & __SYSFS_ADD_DEFERRED))
+			continue;
+
+		s->flags &= ~__SYSFS_ADD_DEFERRED;
+		sysfs_slab_add(s);
+	}
+	up_write(&slub_lock);
+}
+
+static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
+
+static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
+{
+ struct kmem_cache *s;
+ char *text;
+ size_t realsize;
+
+ s = kmalloc_caches_dma[index];
+ if (s) /* fast path: the DMA cache for this index already exists */
+ return s;
+
+ /*
+ * Dynamically create dma cache. The cache mirrors the object size of
+ * kmalloc_caches[index]. Its sysfs registration is deferred to a work
+ * item. Returns NULL if the cache could not be created (or, without
+ * __GFP_WAIT, if slub_lock could not be taken).
+ */
+ if (flags & __GFP_WAIT)
+ down_write(&slub_lock);
+ else {
+ if (!down_write_trylock(&slub_lock)) /* do not block without __GFP_WAIT */
+ goto out;
+ }
+
+ if (kmalloc_caches_dma[index]) /* recheck under the lock: someone else created it meanwhile */
+ goto unlock_out;
+
+ realsize = kmalloc_caches[index].objsize;
+ text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
+ (unsigned int)realsize);
+ s = kmalloc(kmem_size, flags & ~SLUB_DMA); /* the cache structure itself is not DMA memory */
+
+ if (!s || !text || !kmem_cache_open(s, flags, text,
+ realsize, ARCH_KMALLOC_MINALIGN,
+ SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
+ kfree(s);
+ kfree(text);
+ goto unlock_out;
+ }
+
+ list_add(&s->list, &slab_caches);
+ kmalloc_caches_dma[index] = s;
+
+ schedule_work(&sysfs_add_work); /* sysfs_add_func() does the sysfs registration later */
+
+unlock_out:
+ up_write(&slub_lock);
+out:
+ return kmalloc_caches_dma[index]; /* still NULL if creation failed */
+}
+#endif
+
+/*
+ * Conversion table from (size - 1) / 8 to the index in the kmalloc array
+ * for small allocation sizes. This is necessary for sizes <= 192 since we
+ * have non power of two cache sizes there. The index for larger sizes can
+ * be determined using fls.
+ */
+static s8 size_index[24] = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+/*
+ * Find the kmalloc cache that serves allocations of @size bytes.
+ *
+ * Returns ZERO_SIZE_PTR for a size of zero. With SLUB_DMA in @flags the
+ * matching DMA cache is returned instead; it is created on demand, so the
+ * result may be NULL.
+ */
+static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+{
+	int index;
+
+	if (!size)
+		return ZERO_SIZE_PTR;
+
+	/* Up to 192 bytes the table decides, beyond that it is a power of two. */
+	if (size > 192)
+		index = fls(size - 1);
+	else
+		index = size_index[(size - 1) / 8];
+
+#ifdef CONFIG_ZONE_DMA
+	if (unlikely((flags & SLUB_DMA)))
+		return dma_kmalloc_cache(index, flags);
+
+#endif
+	return &kmalloc_caches[index];
+}
+
+/*
+ * Allocate @size bytes. Requests larger than a page are passed to
+ * kmalloc_large(), everything else is served from the matching kmalloc
+ * cache. A zero size yields ZERO_SIZE_PTR, a missing cache NULL.
+ */
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	struct kmem_cache *s;
+
+	if (likely(size <= PAGE_SIZE)) {
+		s = get_slab(size, flags);
+		if (likely(!ZERO_OR_NULL_PTR(s)))
+			return slab_alloc(s, flags, -1,
+					__builtin_return_address(0));
+		return s;
+	}
+
+	return kmalloc_large(size, flags);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+/*
+ * Serve a large allocation directly from the page allocator on @node. The
+ * pages are requested as a compound page. Returns the address of the
+ * first page, or NULL if no pages could be allocated.
+ */
+static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+	struct page *page;
+
+	page = alloc_pages_node(node, flags | __GFP_COMP, get_order(size));
+
+	return page ? page_address(page) : NULL;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Node aware variant of __kmalloc(): allocate @size bytes, preferably from
+ * @node. Requests larger than a page go to kmalloc_large_node().
+ */
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	struct kmem_cache *s;
+
+	if (likely(size <= PAGE_SIZE)) {
+		s = get_slab(size, flags);
+		if (likely(!ZERO_OR_NULL_PTR(s)))
+			return slab_alloc(s, flags, node,
+					__builtin_return_address(0));
+		return s;
+	}
+
+	return kmalloc_large_node(size, flags, node);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif
+
+/*
+ * Report how many bytes of the allocation at @object may be used.
+ *
+ * ZERO_SIZE_PTR has size 0. Objects that are not in a slab page are
+ * expected to be compound pages and report the size of the whole compound
+ * page. For slab objects the answer depends on which parts of the object
+ * slot are needed for debugging or metadata.
+ */
+size_t ksize(const void *object)
+{
+	struct kmem_cache *s;
+	struct page *page;
+
+	if (unlikely(object == ZERO_SIZE_PTR))
+		return 0;
+
+	page = virt_to_head_page(object);
+	if (unlikely(!PageSlab(page))) {
+		WARN_ON(!PageCompound(page));
+		return PAGE_SIZE << compound_order(page);
+	}
+
+	s = page->slab;
+
+#ifdef CONFIG_SLUB_DEBUG
+	/*
+	 * Red zoning and poisoning use the padding behind the object, so
+	 * only the object itself is available.
+	 */
+	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+		return s->objsize;
+
+#endif
+	/*
+	 * The free pointer or the user tracking data are stored behind
+	 * the object: only the space in front of them is usable.
+	 */
+	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+		return s->inuse;
+
+	/* Otherwise the whole slot including its padding can be used. */
+	return s->size;
+}
+
+/*
+ * Free memory obtained from kmalloc. NULL and ZERO_SIZE_PTR are ignored.
+ * Objects from a slab page go back to their cache; anything else must be
+ * a compound page and is released with put_page().
+ */
+void kfree(const void *x)
+{
+	struct page *page;
+
+	if (unlikely(ZERO_OR_NULL_PTR(x)))
+		return;
+
+	page = virt_to_head_page(x);
+	if (likely(PageSlab(page))) {
+		slab_free(page->slab, page, (void *)x,
+					__builtin_return_address(0));
+		return;
+	}
+
+	BUG_ON(!PageCompound(page));
+	put_page(page);
+}
+EXPORT_SYMBOL(kfree);
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists and sorts
+ * the remaining slabs by the number of items in use. The slabs with the
+ * most items in use come first. New allocations will then fill those up
+ * and thus they can be removed from the partial lists.
+ *
+ * The slabs with the least items are placed last. This results in them
+ * being allocated from last increasing the chance that the last objects
+ * are freed in them.
+ *
+ * The sorting uses a temporary array with one list head per possible
+ * inuse count (oo_objects(s->max) entries). Returns 0 on success or
+ * -ENOMEM if that array cannot be allocated.
+ */
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+ int node;
+ int i;
+ struct kmem_cache_node *n;
+ struct page *page;
+ struct page *t;
+ int objects = oo_objects(s->max);
+ struct list_head *slabs_by_inuse =
+ kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ unsigned long flags;
+
+ if (!slabs_by_inuse)
+ return -ENOMEM;
+
+ flush_all(s);
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ n = get_node(s, node);
+
+ if (!n->nr_partial) /* nothing to sort on this node */
+ continue;
+
+ for (i = 0; i < objects; i++) /* start with empty buckets for every node */
+ INIT_LIST_HEAD(slabs_by_inuse + i);
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ /*
+ * Build lists indexed by the items in use in each slab.
+ *
+ * Note that concurrent frees may occur while we hold the
+ * list_lock. page->inuse here is the upper limit.
+ */
+ list_for_each_entry_safe(page, t, &n->partial, lru) {
+ if (!page->inuse && slab_trylock(page)) {
+ /*
+ * Must hold slab lock here because slab_free
+ * may have freed the last object and be
+ * waiting to release the slab.
+ */
+ list_del(&page->lru);
+ n->nr_partial--;
+ slab_unlock(page);
+ discard_slab(s, page);
+ } else {
+ list_move(&page->lru,
+ slabs_by_inuse + page->inuse); /* bucket chosen by the current inuse count */
+ }
+ }
+
+ /*
+ * Rebuild the partial list with the slabs filled up most
+ * first and the least used slabs at the end.
+ */
+ for (i = objects - 1; i >= 0; i--)
+ list_splice(slabs_by_inuse + i, n->partial.prev);
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ kfree(slabs_by_inuse);
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Memory is about to go offline: shrink every cache so that as many slabs
+ * as possible are given back beforehand. Always returns 0; the result of
+ * the individual shrink operations is ignored.
+ */
+static int slab_mem_going_offline_callback(void *arg)
+{
+	struct kmem_cache *s;
+
+	down_read(&slub_lock);
+	list_for_each_entry(s, &slab_caches, list) {
+		kmem_cache_shrink(s);
+	}
+	up_read(&slub_lock);
+
+	return 0;
+}
+
+/*
+ * A node has lost its memory (or bringing it online was cancelled):
+ * release the per node structure of every cache for that node.
+ */
+static void slab_mem_offline_callback(void *arg)
+{
+	struct memory_notify *marg = arg;
+	int offline_node = marg->status_change_nid;
+	struct kmem_cache *s;
+
+	/*
+	 * A negative node id means the node still has memory available,
+	 * so its kmem_cache_node structures are still needed.
+	 */
+	if (offline_node < 0)
+		return;
+
+	down_read(&slub_lock);
+	list_for_each_entry(s, &slab_caches, list) {
+		struct kmem_cache_node *n = get_node(s, offline_node);
+
+		if (!n)
+			continue;
+
+		/*
+		 * Slabs still present on the node mean that they could
+		 * not be freed. offline_pages() should not have called
+		 * this callback in that case, so this must fail hard.
+		 */
+		BUG_ON(slabs_node(s, offline_node));
+
+		s->node[offline_node] = NULL;
+		kmem_cache_free(kmalloc_caches, n);
+	}
+	up_read(&slub_lock);
+}
+
+/*
+ * A node is about to get its first memory: give every cache a per node
+ * structure for it. Returns 0 on success or -ENOMEM if one of the
+ * structures cannot be allocated; structures set up before the failure
+ * are left in place.
+ */
+static int slab_mem_going_online_callback(void *arg)
+{
+	struct memory_notify *marg = arg;
+	int nid = marg->status_change_nid;
+	struct kmem_cache *s;
+	int ret = 0;
+
+	/*
+	 * A negative node id means the node's memory is already available
+	 * and its kmem_cache_node structures exist. Nothing to do.
+	 */
+	if (nid < 0)
+		return 0;
+
+	/*
+	 * We are bringing a node online. No memory is available yet. We must
+	 * allocate a kmem_cache_node structure in order to bring the node
+	 * online.
+	 */
+	down_read(&slub_lock);
+	list_for_each_entry(s, &slab_caches, list) {
+		struct kmem_cache_node *n;
+
+		/*
+		 * XXX: the allocation is not tied to the new node because
+		 * no memory is available from the node that is brought up;
+		 * kmem_cache_alloc_node would fall back to other nodes.
+		 */
+		n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
+		if (!n) {
+			ret = -ENOMEM;
+			break;
+		}
+		init_kmem_cache_node(n, s);
+		s->node[nid] = n;
+	}
+	up_read(&slub_lock);
+	return ret;
+}
+
+/*
+ * Memory hotplug notifier: dispatch the hotplug action to the matching
+ * handler and translate its result into a notifier return value.
+ */
+static int slab_memory_callback(struct notifier_block *self,
+				unsigned long action, void *arg)
+{
+	int err = 0;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		err = slab_mem_going_online_callback(arg);
+		break;
+	case MEM_GOING_OFFLINE:
+		err = slab_mem_going_offline_callback(arg);
+		break;
+	case MEM_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+		slab_mem_offline_callback(arg);
+		break;
+	case MEM_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		/* Nothing to do for these. */
+		break;
+	}
+
+	return err ? notifier_from_errno(err) : NOTIFY_OK;
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+/********************************************************************
+ * Basic setup of slabs
+ *
+ * kmem_cache_init() brings up the allocator in stages: the per cpu
+ * arrays, then (on NUMA) the cache for struct kmem_cache_node in
+ * kmalloc_caches[0], then the kmalloc caches, and finally the cache
+ * names, the cpu notifier and kmem_size. slab_state is advanced to
+ * PARTIAL and then UP along the way.
+ *******************************************************************/
+
+void __init kmem_cache_init(void)
+{
+ int i;
+ int caches = 0;
+
+ init_alloc_cpu();
+
+#ifdef CONFIG_NUMA
+ /*
+ * Must first have the slab cache available for the allocations of the
+ * struct kmem_cache_node's. There is special bootstrap code in
+ * kmem_cache_open for slab_state == DOWN.
+ */
+ create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
+ sizeof(struct kmem_cache_node), GFP_KERNEL);
+ kmalloc_caches[0].refcount = -1; /* negative refcount: slab_unmergeable() will never merge this cache */
+ caches++;
+
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
+ /* Able to allocate the per node structures */
+ slab_state = PARTIAL;
+
+ /* Caches that are not of the two-to-the-power-of size */
+ if (KMALLOC_MIN_SIZE <= 64) {
+ create_kmalloc_cache(&kmalloc_caches[1],
+ "kmalloc-96", 96, GFP_KERNEL);
+ caches++;
+ create_kmalloc_cache(&kmalloc_caches[2],
+ "kmalloc-192", 192, GFP_KERNEL);
+ caches++;
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
+ create_kmalloc_cache(&kmalloc_caches[i],
+ "kmalloc", 1 << i, GFP_KERNEL); /* placeholder name, replaced further down */
+ caches++;
+ }
+
+
+ /*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) /* sizes below the minimum all use the smallest cache */
+ size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+
+ if (KMALLOC_MIN_SIZE == 128) {
+ /*
+ * The 192 byte sized cache is not used if the alignment
+ * is 128 byte. Redirect kmalloc to use the 256 byte cache
+ * instead.
+ */
+ for (i = 128 + 8; i <= 192; i += 8)
+ size_index[(i - 1) / 8] = 8;
+ }
+
+ slab_state = UP;
+
+ /* Provide the correct kmalloc names now that the caches are up */
+ for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
+ kmalloc_caches[i]. name =
+ kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); /* NOTE(review): result is not checked for NULL */
+
+#ifdef CONFIG_SMP
+ register_cpu_notifier(&slab_notifier);
+ kmem_size = offsetof(struct kmem_cache, cpu_slab) +
+ nr_cpu_ids * sizeof(struct kmem_cache_cpu *); /* only as many cpu_slab pointers as there can be cpus */
+#else
+ kmem_size = sizeof(struct kmem_cache);
+#endif
+
+ printk(KERN_INFO
+ "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
+ " CPUs=%d, Nodes=%d\n",
+ caches, cache_line_size(),
+ slub_min_order, slub_max_order, slub_min_objects,
+ nr_cpu_ids, nr_node_ids);
+}
+
+/*
+ * Tell whether cache @s must be kept out of merging. Returns 1 if merging
+ * is disabled globally, if the cache has a flag that forbids merging, if
+ * it has a constructor, or if it was marked unmergeable during bootstrap
+ * by a negative refcount. Returns 0 otherwise.
+ */
+static int slab_unmergeable(struct kmem_cache *s)
+{
+	return slub_nomerge ||
+		(s->flags & SLUB_NEVER_MERGE) ||
+		s->ctor ||
+		s->refcount < 0;
+}
+
+/*
+ * Find a mergeable slab cache: look for an existing cache that a new cache
+ * with the given parameters could share. Returns the first match on the
+ * list of slab caches or NULL if there is none.
+ */
+static struct kmem_cache *find_mergeable(size_t size,
+		size_t align, unsigned long flags, const char *name,
+		void (*ctor)(void *))
+{
+	struct kmem_cache *s;
+
+	/* Caches with a constructor or a no-merge flag are never shared. */
+	if (slub_nomerge || (flags & SLUB_NEVER_MERGE) || ctor)
+		return NULL;
+
+	/* Bring the request into the form the new cache would have. */
+	size = ALIGN(size, sizeof(void *));
+	align = calculate_alignment(flags, align, size);
+	size = ALIGN(size, align);
+	flags = kmem_cache_flags(size, flags, name, NULL);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (slab_unmergeable(s))
+			continue;
+
+		/* The candidate's objects must be large enough ... */
+		if (size > s->size)
+			continue;
+
+		/* ... the flags that matter for merging must agree ... */
+		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
+			continue;
+
+		/*
+		 * ... the candidate's size must be a multiple of the
+		 * requested alignment (courtesy of Adrian Drzewiecki) ...
+		 */
+		if ((s->size & ~(align - 1)) != s->size)
+			continue;
+
+		/* ... and less than a word may be wasted per object. */
+		if (s->size - size >= sizeof(void *))
+			continue;
+
+		return s;
+	}
+	return NULL;
+}
+
+/*
+ * Create a slab cache for objects of @size bytes.
+ *
+ * If a compatible cache already exists it is reused: its reference count
+ * is raised, its object size is enlarged if necessary and @name is added
+ * as a sysfs alias. Otherwise a new cache is allocated, opened, put on the
+ * list of slab caches and registered with sysfs.
+ *
+ * Returns the cache, or NULL on failure. With SLAB_PANIC in @flags a
+ * failure panics instead of returning. A failed sysfs registration is
+ * undone completely: the extra reference is dropped again, or the new
+ * cache is taken off the list and its structure freed.
+ */
+struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+		size_t align, unsigned long flags, void (*ctor)(void *))
+{
+	struct kmem_cache *s;
+
+	down_write(&slub_lock);
+	s = find_mergeable(size, align, flags, name, ctor);
+	if (s) {
+		int cpu;
+
+		s->refcount++;
+		/*
+		 * Adjust the object sizes so that we clear
+		 * the complete object on kzalloc.
+		 */
+		s->objsize = max(s->objsize, (int)size);
+
+		/*
+		 * And then we need to update the object size in the
+		 * per cpu structures
+		 */
+		for_each_online_cpu(cpu)
+			get_cpu_slab(s, cpu)->objsize = s->objsize;
+
+		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+		up_write(&slub_lock);
+
+		if (sysfs_slab_alias(s, name)) {
+			/*
+			 * The caller gets no cache, so the reference
+			 * taken above must not stay behind.
+			 */
+			down_write(&slub_lock);
+			s->refcount--;
+			up_write(&slub_lock);
+			goto err;
+		}
+		return s;
+	}
+
+	s = kmalloc(kmem_size, GFP_KERNEL);
+	if (s) {
+		if (kmem_cache_open(s, GFP_KERNEL, name,
+				size, align, flags, ctor)) {
+			list_add(&s->list, &slab_caches);
+			up_write(&slub_lock);
+			if (sysfs_slab_add(s)) {
+				/*
+				 * Do not leave a cache on the list
+				 * that nobody holds a pointer to.
+				 */
+				down_write(&slub_lock);
+				list_del(&s->list);
+				up_write(&slub_lock);
+				kfree(s);
+				goto err;
+			}
+			return s;
+		}
+		kfree(s);
+	}
+	up_write(&slub_lock);
+
+err:
+	if (flags & SLAB_PANIC)
+		panic("Cannot create slabcache %s\n", name);
+	else
+		s = NULL;
+	return s;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to ensure that the cpu slabs are flushed when
+ * necessary.
+ *
+ * When a cpu is being prepared, its per cpu array is initialized and
+ * every cache gets a per cpu structure for it. When a cpu is dead or its
+ * bringup was cancelled, the cpu slab of every cache is flushed and the
+ * per cpu structure released. Always returns NOTIFY_OK.
+ */
+static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct kmem_cache *s;
+ unsigned long flags;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ init_alloc_cpu_cpu(cpu);
+ down_read(&slub_lock);
+ list_for_each_entry(s, &slab_caches, list)
+ s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
+ GFP_KERNEL); /* NOTE(review): a NULL result is stored without being reported */
+ up_read(&slub_lock);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ down_read(&slub_lock);
+ list_for_each_entry(s, &slab_caches, list) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+ local_irq_save(flags);
+ __flush_cpu_slab(s, cpu);
+ local_irq_restore(flags);
+ free_kmem_cache_cpu(c, cpu);
+ s->cpu_slab[cpu] = NULL;
+ }
+ up_read(&slub_lock);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata slab_notifier = {
+ .notifier_call = slab_cpuup_callback /* notifier events are handled by slab_cpuup_callback() */
+};
+
+#endif
+
+/*
+ * kmalloc variant that passes an explicit @caller address on to slab_alloc().
+ * Requests larger than PAGE_SIZE are handed to kmalloc_large().
+ */
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
+{
+	struct kmem_cache *cache;
+
+	if (unlikely(size > PAGE_SIZE))
+		return kmalloc_large(size, gfpflags);
+
+	cache = get_slab(size, gfpflags);
+	if (likely(!ZERO_OR_NULL_PTR(cache)))
+		return slab_alloc(cache, gfpflags, -1, caller);
+
+	/* get_slab() gave a zero-or-NULL pointer: return it unchanged. */
+	return cache;
+}
+
+/*
+ * Node-aware counterpart of __kmalloc_track_caller(): @node is passed on to
+ * slab_alloc(), and requests larger than PAGE_SIZE go to kmalloc_large_node().
+ */
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+		int node, void *caller)
+{
+	struct kmem_cache *cache;
+
+	if (unlikely(size > PAGE_SIZE))
+		return kmalloc_large_node(size, gfpflags, node);
+
+	cache = get_slab(size, gfpflags);
+	if (likely(!ZERO_OR_NULL_PTR(cache)))
+		return slab_alloc(cache, gfpflags, node, caller);
+
+	/* get_slab() gave a zero-or-NULL pointer: return it unchanged. */
+	return cache;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+/*
+ * Sum get_count() over every slab on @n's partial list.  The list is walked
+ * under n->list_lock with interrupts disabled.
+ */
+static unsigned long count_partial(struct kmem_cache_node *n,
+		int (*get_count)(struct page *))
+{
+	unsigned long irqflags;
+	unsigned long sum = 0;
+	struct page *slab;
+
+	spin_lock_irqsave(&n->list_lock, irqflags);
+	list_for_each_entry(slab, &n->partial, lru)
+		sum += get_count(slab);
+	spin_unlock_irqrestore(&n->list_lock, irqflags);
+
+	return sum;
+}
+
+/* count_partial() callback: the slab's inuse counter. */
+static int count_inuse(struct page *page)
+{
+	int used = page->inuse;
+
+	return used;
+}
+
+/* count_partial() callback: the slab's objects counter. */
+static int count_total(struct page *page)
+{
+	int capacity = page->objects;
+
+	return capacity;
+}
+
+/* count_partial() callback: objects minus inuse, i.e. the free objects. */
+static int count_free(struct page *page)
+{
+	int capacity = page->objects;
+	int used = page->inuse;
+
+	return capacity - used;
+}
+
+/*
+ * Check one slab: first the slab itself and its freelist, then every object.
+ * Objects found on the freelist are marked in @map and checked with
+ * check_object(..., 0); the remaining objects are checked with
+ * check_object(..., 1).  Returns 1 if all checks pass, 0 at the first failure.
+ */
+static int validate_slab(struct kmem_cache *s, struct page *page,
+		unsigned long *map)
+{
+	void *base = page_address(page);
+	void *obj;
+
+	if (!check_slab(s, page))
+		return 0;
+	if (!on_freelist(s, page, NULL))
+		return 0;
+
+	/* Now we know that a valid freelist exists */
+	bitmap_zero(map, page->objects);
+
+	for_each_free_object(obj, s, page->freelist) {
+		set_bit(slab_index(obj, s, base), map);
+		if (!check_object(s, page, obj, 0))
+			return 0;
+	}
+
+	for_each_object(obj, s, base, page->objects) {
+		if (test_bit(slab_index(obj, s, base), map))
+			continue;
+		if (!check_object(s, page, obj, 1))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Validate @page if its slab lock can be taken without waiting (a busy slab
+ * is only reported), then report a mismatch between the cache's debug flags
+ * and the page's SlubDebug state.
+ */
+static void validate_slab_slab(struct kmem_cache *s, struct page *page,
+		unsigned long *map)
+{
+	int debug_expected;
+
+	if (!slab_trylock(page)) {
+		printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
+			s->name, page);
+	} else {
+		validate_slab(s, page, map);
+		slab_unlock(page);
+	}
+
+	debug_expected = (s->flags & DEBUG_DEFAULT_FLAGS) != 0;
+
+	if (debug_expected && !PageSlubDebug(page))
+		printk(KERN_ERR "SLUB %s: SlubDebug not set "
+			"on slab 0x%p\n", s->name, page);
+	else if (!debug_expected && PageSlubDebug(page))
+		printk(KERN_ERR "SLUB %s: SlubDebug set on "
+			"slab 0x%p\n", s->name, page);
+}
+
+/*
+ * Validate the slabs of one node under n->list_lock.  The partial list is
+ * always walked; the full list only when SLAB_STORE_USER is set.  The number
+ * of slabs visited is compared against the node's counters and returned.
+ */
+static int validate_slab_node(struct kmem_cache *s,
+		struct kmem_cache_node *n, unsigned long *map)
+{
+	unsigned long irqflags;
+	unsigned long seen = 0;
+	struct page *slab;
+
+	spin_lock_irqsave(&n->list_lock, irqflags);
+
+	list_for_each_entry(slab, &n->partial, lru) {
+		validate_slab_slab(s, slab, map);
+		seen++;
+	}
+	if (seen != n->nr_partial)
+		printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
+			"counter=%ld\n", s->name, seen, n->nr_partial);
+
+	if (s->flags & SLAB_STORE_USER) {
+		list_for_each_entry(slab, &n->full, lru) {
+			validate_slab_slab(s, slab, map);
+			seen++;
+		}
+		if (seen != atomic_long_read(&n->nr_slabs))
+			printk(KERN_ERR "SLUB: %s %ld slabs counted but "
+				"counter=%ld\n", s->name, seen,
+				atomic_long_read(&n->nr_slabs));
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, irqflags);
+	return seen;
+}
+
+/*
+ * Validate every slab of @s on all N_NORMAL_MEMORY nodes, after flushing the
+ * cache with flush_all().  Returns the number of slabs visited, or -ENOMEM
+ * if the scratch bitmap cannot be allocated.
+ */
+static long validate_slab_cache(struct kmem_cache *s)
+{
+	unsigned long *map;
+	unsigned long visited = 0;
+	int node;
+
+	/* One bit per object, sized via oo_objects(s->max). */
+	map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	flush_all(s);
+	for_each_node_state(node, N_NORMAL_MEMORY)
+		visited += validate_slab_node(s, get_node(s, node), map);
+
+	kfree(map);
+	return visited;
+}
+
+#ifdef SLUB_RESILIENCY_TEST /* self-test: deliberately corrupts kmalloc objects, then runs validate_slab_cache() */
+static void resiliency_test(void)
+{ /* NOTE(review): the kzalloc() results below are written through without a NULL check */
+ u8 *p;
+
+ printk(KERN_ERR "SLUB resiliency testing\n");
+ printk(KERN_ERR "-----------------------\n");
+ printk(KERN_ERR "A. Corruption after allocation\n");
+
+ p = kzalloc(16, GFP_KERNEL);
+ p[16] = 0x12; /* first byte past the 16-byte request */
+ printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
+ " 0x12->0x%p\n\n", p + 16);
+
+ validate_slab_cache(kmalloc_caches + 4); /* index 4: the cache the message above calls kmalloc-16 */
+
+ /* Hmmm... The next two are dangerous */
+ p = kzalloc(32, GFP_KERNEL);
+ p[32 + sizeof(void *)] = 0x34; /* byte at offset 32 + sizeof(void *), beyond the 32-byte request */
+ printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
+ " 0x34 -> -0x%p\n", p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+
+ validate_slab_cache(kmalloc_caches + 5);
+ p = kzalloc(64, GFP_KERNEL);
+ p += 64 + (get_cycles() & 0xff) * sizeof(void *); /* offset at or past the end of the request, varied by get_cycles() */
+ *p = 0x56;
+ printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+ validate_slab_cache(kmalloc_caches + 6);
+
+ printk(KERN_ERR "\nB. Corruption after free\n");
+ p = kzalloc(128, GFP_KERNEL);
+ kfree(p);
+ *p = 0x78; /* intentional write to the object after kfree() */
+ printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches + 7);
+
+ p = kzalloc(256, GFP_KERNEL);
+ kfree(p);
+ p[50] = 0x9a; /* intentional write into the middle of the freed object */
+ printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+ p);
+ validate_slab_cache(kmalloc_caches + 8);
+
+ p = kzalloc(512, GFP_KERNEL);
+ kfree(p);
+ p[512] = 0xab; /* first byte past the freed 512-byte request */
+ printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches + 9);
+}
+#else
+static void resiliency_test(void) {}; /* no-op unless SLUB_RESILIENCY_TEST is defined */
+#endif
+
+/*
+ * Generate lists of code addresses where slabcache objects are allocated
+ * and freed.
+ */
+
+struct location { /* one entry per distinct code address, filled in by add_location() */
+ unsigned long count; /* number of tracks folded into this entry */
+ void *addr; /* code address copied from track->addr */
+ long long sum_time; /* sum of the track ages (jiffies - track->when) */
+ long min_time; /* smallest age seen */
+ long max_time; /* largest age seen */
+ long min_pid; /* lowest track->pid seen */
+ long max_pid; /* highest track->pid seen */
+ cpumask_t cpus; /* cpus taken from track->cpu */
+ nodemask_t nodes; /* nodes of the pages that hold the track records */
+};
+
+struct loc_track { /* growable table of struct location, kept ordered by addr by add_location() */
+ unsigned long max; /* capacity of loc[] in entries; 0 means nothing is allocated */
+ unsigned long count; /* entries of loc[] currently in use */
+ struct location *loc; /* page-allocated array, see alloc_loc_track() */
+};
+
+/* Release the pages backing t->loc; a table with max == 0 owns nothing. */
+static void free_loc_track(struct loc_track *t)
+{
+	if (!t->max)
+		return;
+
+	free_pages((unsigned long)t->loc,
+		get_order(sizeof(struct location) * t->max));
+}
+
+/*
+ * Give @t room for @max entries.  Existing entries are copied into the new
+ * pages and the old pages are released.  Returns 1 on success and 0 if the
+ * page allocation fails, in which case @t is left untouched.
+ */
+static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
+{
+	int order = get_order(sizeof(struct location) * max);
+	struct location *fresh;
+
+	fresh = (void *)__get_free_pages(flags, order);
+	if (!fresh)
+		return 0;
+
+	if (t->count) {
+		memcpy(fresh, t->loc, sizeof(struct location) * t->count);
+		free_loc_track(t);
+	}
+
+	t->loc = fresh;
+	t->max = max;
+	return 1;
+}
+
+static int add_location(struct loc_track *t, struct kmem_cache *s,
+ const struct track *track)
+{ /*
+   * Account @track in table @t: fold it into the entry with the same code
+   * address, or insert a new entry at its ordered position.  Returns 1 on
+   * success and 0 if the table is full and cannot be grown.  @s is not
+   * used in this function.
+   */
+ long start, end, pos;
+ struct location *l;
+ void *caddr;
+ unsigned long age = jiffies - track->when; /* age of the track in jiffies */
+
+ start = -1; /* binary search over t->loc[0 .. count) */
+ end = t->count;
+
+ for ( ; ; ) {
+ pos = start + (end - start + 1) / 2;
+
+ /*
+ * There is nothing at "end". If we end up there
+ * we need to add something to before end.
+ */
+ if (pos == end)
+ break;
+
+ caddr = t->loc[pos].addr;
+ if (track->addr == caddr) { /* known address: update the existing entry */
+
+ l = &t->loc[pos];
+ l->count++;
+ if (track->when) { /* time, pid and cpu statistics are skipped when track->when is 0 */
+ l->sum_time += age;
+ if (age < l->min_time)
+ l->min_time = age;
+ if (age > l->max_time)
+ l->max_time = age;
+
+ if (track->pid < l->min_pid)
+ l->min_pid = track->pid;
+ if (track->pid > l->max_pid)
+ l->max_pid = track->pid;
+
+ cpu_set(track->cpu, l->cpus);
+ }
+ node_set(page_to_nid(virt_to_page(track)), l->nodes); /* node of the page holding the track record */
+ return 1;
+ }
+
+ if (track->addr < caddr)
+ end = pos;
+ else
+ start = pos;
+ }
+
+ /*
+ * Not found. Insert new tracking element.
+ */
+ if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) /* table full: try to double it */
+ return 0;
+
+ l = t->loc + pos;
+ if (pos < t->count) /* shift the later entries up to open slot pos */
+ memmove(l + 1, l,
+ (t->count - pos) * sizeof(struct location));
+ t->count++;
+ l->count = 1;
+ l->addr = track->addr;
+ l->sum_time = age;
+ l->min_time = age;
+ l->max_time = age;
+ l->min_pid = track->pid;
+ l->max_pid = track->pid;
+ cpus_clear(l->cpus);
+ cpu_set(track->cpu, l->cpus);
+ nodes_clear(l->nodes);
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
+ return 1;
+}
+
+/*
+ * Feed the @alloc track of every allocated object of @page into @t.
+ *
+ * @map is scratch space supplied by the caller and must hold at least
+ * page->objects bits.  It is not declared on the stack here because its size
+ * depends on the number of objects per slab and this function runs with
+ * n->list_lock held and interrupts off.
+ */
+static void process_slab(struct loc_track *t, struct kmem_cache *s,
+		struct page *page, enum track_item alloc,
+		unsigned long *map)
+{
+	void *addr = page_address(page);
+	void *p;
+
+	/* Mark the free objects, then visit everything that is not marked. */
+	bitmap_zero(map, page->objects);
+	for_each_free_object(p, s, page->freelist)
+		set_bit(slab_index(p, s, addr), map);
+
+	for_each_object(p, s, addr, page->objects)
+		if (!test_bit(slab_index(p, s, addr), map))
+			add_location(t, s, get_track(s, p, alloc));
+}
+
+/*
+ * Write a summary of the code addresses recorded in the @alloc tracks of @s
+ * into @buf: one line per address with count, age range, pid range and,
+ * when more than one is online, the cpus and nodes involved.
+ *
+ * Returns the number of bytes written.  Allocation failures are reported as
+ * the text "Out of memory\n" rather than as an error code.
+ */
+static int list_locations(struct kmem_cache *s, char *buf,
+		enum track_item alloc)
+{
+	int len = 0;
+	unsigned long i;
+	struct loc_track t = { 0, 0, NULL };
+	int node;
+	/* Object bitmap shared by all process_slab() calls (same sizing as
+	 * validate_slab_cache()). */
+	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+			sizeof(unsigned long), GFP_KERNEL);
+
+	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+			GFP_TEMPORARY)) {
+		kfree(map);
+		return sprintf(buf, "Out of memory\n");
+	}
+
+	/* Push back cpu slabs */
+	flush_all(s);
+
+	for_each_node_state(node, N_NORMAL_MEMORY) {
+		struct kmem_cache_node *n = get_node(s, node);
+		unsigned long flags;
+		struct page *page;
+
+		if (!atomic_long_read(&n->nr_slabs))
+			continue;
+
+		spin_lock_irqsave(&n->list_lock, flags);
+		list_for_each_entry(page, &n->partial, lru)
+			process_slab(&t, s, page, alloc, map);
+		list_for_each_entry(page, &n->full, lru)
+			process_slab(&t, s, page, alloc, map);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	for (i = 0; i < t.count; i++) {
+		struct location *l = &t.loc[i];
+
+		/* Stop while there is still room for one complete line. */
+		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
+			break;
+		len += sprintf(buf + len, "%7ld ", l->count);
+
+		if (l->addr)
+			len += sprint_symbol(buf + len, (unsigned long)l->addr);
+		else
+			len += sprintf(buf + len, "<not-available>");
+
+		if (l->sum_time != l->min_time) {
+			len += sprintf(buf + len, " age=%ld/%ld/%ld",
+				l->min_time,
+				(long)div_u64(l->sum_time, l->count),
+				l->max_time);
+		} else
+			len += sprintf(buf + len, " age=%ld",
+				l->min_time);
+
+		if (l->min_pid != l->max_pid)
+			len += sprintf(buf + len, " pid=%ld-%ld",
+				l->min_pid, l->max_pid);
+		else
+			len += sprintf(buf + len, " pid=%ld",
+				l->min_pid);
+
+		if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
+				len < PAGE_SIZE - 60) {
+			len += sprintf(buf + len, " cpus=");
+			len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
+					l->cpus);
+		}
+
+		if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
+				len < PAGE_SIZE - 60) {
+			len += sprintf(buf + len, " nodes=");
+			len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
+					l->nodes);
+		}
+
+		len += sprintf(buf + len, "\n");
+	}
+
+	free_loc_track(&t);
+	kfree(map);
+	if (!t.count)
+		len += sprintf(buf, "No data\n");
+	return len;
+}
+
+enum slab_stat_type { /* bit positions for the SO_* flags below */
+ SL_ALL, /* All slabs */
+ SL_PARTIAL, /* Only partially allocated slabs */
+ SL_CPU, /* Only slabs used for cpu caches */
+ SL_OBJECTS, /* Determine allocated objects not slabs */
+ SL_TOTAL /* Determine object capacity not slabs */
+};
+
+#define SO_ALL (1 << SL_ALL) /* SO_*: flag bits OR-ed together for show_slab_objects() */
+#define SO_PARTIAL (1 << SL_PARTIAL) /* only looked at when SO_ALL is not set */
+#define SO_CPU (1 << SL_CPU)
+#define SO_OBJECTS (1 << SL_OBJECTS) /* only looked at when SO_TOTAL is not set */
+#define SO_TOTAL (1 << SL_TOTAL)
+
+/*
+ * Common backend of the slab/object counting attributes.
+ *
+ * @flags selects what is counted: SO_CPU adds the per-cpu slabs, then either
+ * SO_ALL (all slabs of each node) or SO_PARTIAL (partial lists only) is
+ * added.  SO_TOTAL counts object capacity, SO_OBJECTS counts allocated
+ * objects, and with neither of them slabs are counted.
+ *
+ * Writes the grand total into @buf, followed on NUMA by " N<node>=<count>"
+ * for every node with a non-zero count.  Returns the number of bytes written
+ * or -ENOMEM.
+ */
+static ssize_t show_slab_objects(struct kmem_cache *s,
+		char *buf, unsigned long flags)
+{
+	unsigned long total = 0;
+	unsigned long *nodes;
+	unsigned long x;	/* wide enough for the atomic_long_t counters */
+	int node;
+	int len;
+
+	nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
+	if (!nodes)
+		return -ENOMEM;
+
+	if (flags & SO_CPU) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+			if (!c || c->node < 0 || !c->page)
+				continue;
+
+			if (flags & SO_TOTAL)
+				x = c->page->objects;
+			else if (flags & SO_OBJECTS)
+				x = c->page->inuse;
+			else
+				x = 1;
+
+			total += x;
+			nodes[c->node] += x;
+		}
+	}
+
+	if (flags & SO_ALL) {
+		for_each_node_state(node, N_NORMAL_MEMORY) {
+			struct kmem_cache_node *n = get_node(s, node);
+
+			if (flags & SO_TOTAL)
+				x = atomic_long_read(&n->total_objects);
+			else if (flags & SO_OBJECTS)
+				x = atomic_long_read(&n->total_objects) -
+					count_partial(n, count_free);
+			else
+				x = atomic_long_read(&n->nr_slabs);
+
+			total += x;
+			nodes[node] += x;
+		}
+	} else if (flags & SO_PARTIAL) {
+		for_each_node_state(node, N_NORMAL_MEMORY) {
+			struct kmem_cache_node *n = get_node(s, node);
+
+			if (flags & SO_TOTAL)
+				x = count_partial(n, count_total);
+			else if (flags & SO_OBJECTS)
+				x = count_partial(n, count_inuse);
+			else
+				x = n->nr_partial;
+
+			total += x;
+			nodes[node] += x;
+		}
+	}
+
+	len = sprintf(buf, "%lu", total);
+#ifdef CONFIG_NUMA
+	for_each_node_state(node, N_NORMAL_MEMORY)
+		if (nodes[node])
+			len += sprintf(buf + len, " N%d=%lu",
+					node, nodes[node]);
+#endif
+	kfree(nodes);
+	return len + sprintf(buf + len, "\n");
+}
+
+/*
+ * Returns 1 if any online node reports a non-zero total_objects count for
+ * @s, else 0.  Nodes for which get_node() returns NULL are skipped.
+ */
+static int any_slab_objects(struct kmem_cache *s)
+{
+	int node;
+
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		if (n && atomic_long_read(&n->total_objects))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Map an embedded struct attribute / struct kobject back to its container.
+ * Neither macro ends in a semicolon, so both can be used inside expressions.
+ */
+#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
+#define to_slab(n) container_of(n, struct kmem_cache, kobj)
+
+struct slab_attribute { /* a sysfs attribute plus per-cache handlers */
+ struct attribute attr; /* embedded attribute; to_slab_attr() maps it back to this struct */
+ ssize_t (*show)(struct kmem_cache *s, char *buf); /* read handler; slab_attr_show() returns -EIO when NULL */
+ ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); /* write handler; slab_attr_store() returns -EIO when NULL */
+};
+
+#define SLAB_ATTR_RO(_name) \
+ static struct slab_attribute _name##_attr = __ATTR_RO(_name) /* defines <name>_attr through __ATTR_RO() */
+
+#define SLAB_ATTR(_name) \
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store) /* defines <name>_attr with mode 0644 and the <name>_show / <name>_store handlers */
+
+/* sysfs "slab_size": prints s->size. */
+static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
+{
+	int value = s->size;
+
+	return sprintf(buf, "%d\n", value);
+}
+SLAB_ATTR_RO(slab_size);
+
+/* sysfs "align": prints s->align. */
+static ssize_t align_show(struct kmem_cache *s, char *buf)
+{
+	int value = s->align;
+
+	return sprintf(buf, "%d\n", value);
+}
+SLAB_ATTR_RO(align);
+
+/* sysfs "object_size": prints s->objsize. */
+static ssize_t object_size_show(struct kmem_cache *s, char *buf)
+{
+	int value = s->objsize;
+
+	return sprintf(buf, "%d\n", value);
+}
+SLAB_ATTR_RO(object_size);
+
+/* sysfs "objs_per_slab": prints oo_objects(s->oo). */
+static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
+{
+	int value = oo_objects(s->oo);
+
+	return sprintf(buf, "%d\n", value);
+}
+SLAB_ATTR_RO(objs_per_slab);
+
+/*
+ * sysfs "order" write: parse a decimal order and, when it lies within
+ * [slub_min_order, slub_max_order], recompute the cache sizes for it.
+ * Returns @length, the parse error, or -EINVAL when out of range.
+ */
+static ssize_t order_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	unsigned long order;
+	int err = strict_strtoul(buf, 10, &order);
+
+	if (err)
+		return err;
+
+	if (order < slub_min_order || order > slub_max_order)
+		return -EINVAL;
+
+	calculate_sizes(s, order);
+	return length;
+}
+
+/* sysfs "order" read: prints oo_order(s->oo). */
+static ssize_t order_show(struct kmem_cache *s, char *buf)
+{
+	int value = oo_order(s->oo);
+
+	return sprintf(buf, "%d\n", value);
+}
+SLAB_ATTR(order);
+
+/*
+ * sysfs "ctor": prints the constructor's address via sprint_symbol(), plus a
+ * newline.  Produces no output when the cache has no constructor.
+ */
+static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+{
+	int n;
+
+	if (!s->ctor)
+		return 0;
+
+	n = sprint_symbol(buf, (unsigned long)s->ctor);
+	return n + sprintf(buf + n, "\n");
+}
+SLAB_ATTR_RO(ctor);
+
+/* sysfs "aliases": prints the reference count minus one. */
+static ssize_t aliases_show(struct kmem_cache *s, char *buf)
+{
+	int aliases = s->refcount - 1;
+
+	return sprintf(buf, "%d\n", aliases);
+}
+SLAB_ATTR_RO(aliases);
+
+/* sysfs "slabs": show_slab_objects() with SO_ALL. */
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
+{
+	const unsigned long what = SO_ALL;
+
+	return show_slab_objects(s, buf, what);
+}
+SLAB_ATTR_RO(slabs);
+
+/* sysfs "partial": show_slab_objects() with SO_PARTIAL. */
+static ssize_t partial_show(struct kmem_cache *s, char *buf)
+{
+	const unsigned long what = SO_PARTIAL;
+
+	return show_slab_objects(s, buf, what);
+}
+SLAB_ATTR_RO(partial);
+
+/* sysfs "cpu_slabs": show_slab_objects() with SO_CPU. */
+static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
+{
+	const unsigned long what = SO_CPU;
+
+	return show_slab_objects(s, buf, what);
+}
+SLAB_ATTR_RO(cpu_slabs);
+
+/* sysfs "objects": show_slab_objects() with SO_ALL | SO_OBJECTS. */
+static ssize_t objects_show(struct kmem_cache *s, char *buf)
+{
+	const unsigned long what = SO_ALL | SO_OBJECTS;
+
+	return show_slab_objects(s, buf, what);
+}
+SLAB_ATTR_RO(objects);
+
+/* sysfs "objects_partial": show_slab_objects() with SO_PARTIAL | SO_OBJECTS. */
+static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
+{
+	const unsigned long what = SO_PARTIAL | SO_OBJECTS;
+
+	return show_slab_objects(s, buf, what);
+}
+SLAB_ATTR_RO(objects_partial);
+
+/* sysfs "total_objects": show_slab_objects() with SO_ALL | SO_TOTAL. */
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+	const unsigned long what = SO_ALL | SO_TOTAL;
+
+	return show_slab_objects(s, buf, what);
+}
+SLAB_ATTR_RO(total_objects);
+
+/* sysfs "sanity_checks" read: 1 if SLAB_DEBUG_FREE is set, else 0. */
+static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_DEBUG_FREE) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+/*
+ * sysfs "sanity_checks" write: a leading '1' sets SLAB_DEBUG_FREE, anything
+ * else clears it.  Always returns @length.
+ */
+static ssize_t sanity_checks_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	const int enable = (buf[0] == '1');
+
+	s->flags &= ~SLAB_DEBUG_FREE;
+	if (enable)
+		s->flags |= SLAB_DEBUG_FREE;
+
+	return length;
+}
+SLAB_ATTR(sanity_checks);
+
+/* sysfs "trace" read: 1 if SLAB_TRACE is set, else 0. */
+static ssize_t trace_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_TRACE) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+/*
+ * sysfs "trace" write: a leading '1' sets SLAB_TRACE, anything else clears
+ * it.  Always returns @length.
+ */
+static ssize_t trace_store(struct kmem_cache *s, const char *buf,
+		size_t length)
+{
+	const int enable = (buf[0] == '1');
+
+	s->flags &= ~SLAB_TRACE;
+	if (enable)
+		s->flags |= SLAB_TRACE;
+
+	return length;
+}
+SLAB_ATTR(trace);
+
+/* sysfs "reclaim_account" read: 1 if SLAB_RECLAIM_ACCOUNT is set, else 0. */
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+/*
+ * sysfs "reclaim_account" write: a leading '1' sets SLAB_RECLAIM_ACCOUNT,
+ * anything else clears it.  Always returns @length.
+ */
+static ssize_t reclaim_account_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	const int enable = (buf[0] == '1');
+
+	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
+	if (enable)
+		s->flags |= SLAB_RECLAIM_ACCOUNT;
+
+	return length;
+}
+SLAB_ATTR(reclaim_account);
+
+/* sysfs "hwcache_align": 1 if SLAB_HWCACHE_ALIGN is set, else 0. */
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_HWCACHE_ALIGN) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+SLAB_ATTR_RO(hwcache_align);
+
+#ifdef CONFIG_ZONE_DMA
+/* sysfs "cache_dma": 1 if SLAB_CACHE_DMA is set, else 0. */
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_CACHE_DMA) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+SLAB_ATTR_RO(cache_dma);
+#endif
+
+/* sysfs "destroy_by_rcu": 1 if SLAB_DESTROY_BY_RCU is set, else 0. */
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_DESTROY_BY_RCU) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+SLAB_ATTR_RO(destroy_by_rcu);
+
+/* sysfs "red_zone" read: 1 if SLAB_RED_ZONE is set, else 0. */
+static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_RED_ZONE) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+/*
+ * sysfs "red_zone" write: refused with -EBUSY while the cache holds any
+ * objects.  Otherwise a leading '1' sets SLAB_RED_ZONE (anything else clears
+ * it) and the cache sizes are recalculated.  Returns @length on success.
+ */
+static ssize_t red_zone_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	int enable;
+
+	if (any_slab_objects(s))
+		return -EBUSY;
+
+	enable = (buf[0] == '1');
+	s->flags &= ~SLAB_RED_ZONE;
+	if (enable)
+		s->flags |= SLAB_RED_ZONE;
+
+	calculate_sizes(s, -1);
+	return length;
+}
+SLAB_ATTR(red_zone);
+
+/* sysfs "poison" read: 1 if SLAB_POISON is set, else 0. */
+static ssize_t poison_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_POISON) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+/*
+ * sysfs "poison" write: refused with -EBUSY while the cache holds any
+ * objects.  Otherwise a leading '1' sets SLAB_POISON (anything else clears
+ * it) and the cache sizes are recalculated.  Returns @length on success.
+ */
+static ssize_t poison_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	int enable;
+
+	if (any_slab_objects(s))
+		return -EBUSY;
+
+	enable = (buf[0] == '1');
+	s->flags &= ~SLAB_POISON;
+	if (enable)
+		s->flags |= SLAB_POISON;
+
+	calculate_sizes(s, -1);
+	return length;
+}
+SLAB_ATTR(poison);
+
+/* sysfs "store_user" read: 1 if SLAB_STORE_USER is set, else 0. */
+static ssize_t store_user_show(struct kmem_cache *s, char *buf)
+{
+	int enabled = (s->flags & SLAB_STORE_USER) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+/*
+ * sysfs "store_user" write: refused with -EBUSY while the cache holds any
+ * objects.  Otherwise a leading '1' sets SLAB_STORE_USER (anything else
+ * clears it) and the cache sizes are recalculated.  Returns @length on
+ * success.
+ */
+static ssize_t store_user_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	int enable;
+
+	if (any_slab_objects(s))
+		return -EBUSY;
+
+	enable = (buf[0] == '1');
+	s->flags &= ~SLAB_STORE_USER;
+	if (enable)
+		s->flags |= SLAB_STORE_USER;
+
+	calculate_sizes(s, -1);
+	return length;
+}
+SLAB_ATTR(store_user);
+
+/* sysfs "validate" read: produces no output. */
+static ssize_t validate_show(struct kmem_cache *s, char *buf)
+{
+	const ssize_t nothing = 0;
+
+	return nothing;
+}
+
+/*
+ * sysfs "validate" write: a leading '1' runs validate_slab_cache() on @s.
+ * Returns @length when validation ran, its negative error code if it failed,
+ * and -EINVAL for any other input.
+ */
+static ssize_t validate_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	int ret;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	ret = validate_slab_cache(s);
+	if (ret < 0)
+		return ret;
+
+	ret = length;
+	return ret;
+}
+SLAB_ATTR(validate);
+
+/* sysfs "shrink" read: produces no output. */
+static ssize_t shrink_show(struct kmem_cache *s, char *buf)
+{
+	const ssize_t nothing = 0;
+
+	return nothing;
+}
+
+/*
+ * sysfs "shrink" write: a leading '1' calls kmem_cache_shrink() on @s.
+ * Returns @length on success, the non-zero result of kmem_cache_shrink()
+ * on failure, and -EINVAL for any other input.
+ */
+static ssize_t shrink_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	int rc;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	rc = kmem_cache_shrink(s);
+	if (rc)
+		return rc;
+
+	return length;
+}
+SLAB_ATTR(shrink);
+
+/*
+ * sysfs "alloc_calls": lists the TRACK_ALLOC locations of @s.  Returns
+ * -ENOSYS unless the cache has SLAB_STORE_USER set.
+ */
+static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+{
+	if (s->flags & SLAB_STORE_USER)
+		return list_locations(s, buf, TRACK_ALLOC);
+
+	return -ENOSYS;
+}
+SLAB_ATTR_RO(alloc_calls);
+
+/*
+ * sysfs "free_calls": lists the TRACK_FREE locations of @s.  Returns
+ * -ENOSYS unless the cache has SLAB_STORE_USER set.
+ */
+static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+{
+	if (s->flags & SLAB_STORE_USER)
+		return list_locations(s, buf, TRACK_FREE);
+
+	return -ENOSYS;
+}
+SLAB_ATTR_RO(free_calls);
+
+#ifdef CONFIG_NUMA
+/* sysfs "remote_node_defrag_ratio" read: prints the stored value divided by 10. */
+static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+	int ratio = s->remote_node_defrag_ratio / 10;
+
+	return sprintf(buf, "%d\n", ratio);
+}
+
+/*
+ * sysfs "remote_node_defrag_ratio" write: parse a decimal ratio and store it
+ * multiplied by 10.  Values above 100 are ignored without an error.
+ * Returns @length, or the parse error.
+ */
+static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
+		const char *buf, size_t length)
+{
+	unsigned long ratio;
+	int err = strict_strtoul(buf, 10, &ratio);
+
+	if (err)
+		return err;
+
+	if (ratio <= 100)
+		s->remote_node_defrag_ratio = ratio * 10;
+
+	return length;
+}
+SLAB_ATTR(remote_node_defrag_ratio);
+#endif
+
+#ifdef CONFIG_SLUB_STATS
+/*
+ * Print statistic @si of @s: the sum over all online cpus, followed on SMP
+ * by " C<cpu>=<value>" for every online cpu with a non-zero value, for as
+ * long as the output stays below PAGE_SIZE - 20 bytes.
+ * Returns the number of bytes written, or -ENOMEM.
+ */
+static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
+{
+	unsigned long total = 0;
+	int *values;
+	int cpu;
+	int len;
+
+	values = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
+	if (!values)
+		return -ENOMEM;
+
+	for_each_online_cpu(cpu) {
+		unsigned x = get_cpu_slab(s, cpu)->stat[si];
+
+		values[cpu] = x;
+		total += x;
+	}
+
+	len = sprintf(buf, "%lu", total);
+
+#ifdef CONFIG_SMP
+	for_each_online_cpu(cpu) {
+		if (!values[cpu] || len >= PAGE_SIZE - 20)
+			continue;
+		len += sprintf(buf + len, " C%d=%u", cpu, values[cpu]);
+	}
+#endif
+	kfree(values);
+	return len + sprintf(buf + len, "\n");
+}
+
+#define STAT_ATTR(si, text) /* defines text##_show(), which prints statistic si through show_stat(), and its read-only attribute */ \
+static ssize_t text##_show(struct kmem_cache *s, char *buf) \
+{ \
+ return show_stat(s, buf, si); \
+} \
+SLAB_ATTR_RO(text); \
+
+STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); /* one attribute per stat item; each is also listed in slab_attrs[] */
+STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_FASTPATH, free_fastpath);
+STAT_ATTR(FREE_SLOWPATH, free_slowpath);
+STAT_ATTR(FREE_FROZEN, free_frozen);
+STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
+STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
+STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
+STAT_ATTR(ALLOC_SLAB, alloc_slab);
+STAT_ATTR(ALLOC_REFILL, alloc_refill);
+STAT_ATTR(FREE_SLAB, free_slab);
+STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
+STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
+STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
+STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
+STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
+STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
+STAT_ATTR(ORDER_FALLBACK, order_fallback);
+#endif
+
+static struct attribute *slab_attrs[] = { /* every per-cache attribute; referenced by slab_attr_group */
+ &slab_size_attr.attr,
+ &object_size_attr.attr,
+ &objs_per_slab_attr.attr,
+ &order_attr.attr,
+ &objects_attr.attr,
+ &objects_partial_attr.attr,
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &partial_attr.attr,
+ &cpu_slabs_attr.attr,
+ &ctor_attr.attr,
+ &aliases_attr.attr,
+ &align_attr.attr,
+ &sanity_checks_attr.attr,
+ &trace_attr.attr,
+ &hwcache_align_attr.attr,
+ &reclaim_account_attr.attr,
+ &destroy_by_rcu_attr.attr,
+ &red_zone_attr.attr,
+ &poison_attr.attr,
+ &store_user_attr.attr,
+ &validate_attr.attr,
+ &shrink_attr.attr,
+ &alloc_calls_attr.attr,
+ &free_calls_attr.attr,
+#ifdef CONFIG_ZONE_DMA
+ &cache_dma_attr.attr,
+#endif
+#ifdef CONFIG_NUMA
+ &remote_node_defrag_ratio_attr.attr,
+#endif
+#ifdef CONFIG_SLUB_STATS /* the STAT_ATTR() generated attributes */
+ &alloc_fastpath_attr.attr,
+ &alloc_slowpath_attr.attr,
+ &free_fastpath_attr.attr,
+ &free_slowpath_attr.attr,
+ &free_frozen_attr.attr,
+ &free_add_partial_attr.attr,
+ &free_remove_partial_attr.attr,
+ &alloc_from_partial_attr.attr,
+ &alloc_slab_attr.attr,
+ &alloc_refill_attr.attr,
+ &free_slab_attr.attr,
+ &cpuslab_flush_attr.attr,
+ &deactivate_full_attr.attr,
+ &deactivate_empty_attr.attr,
+ &deactivate_to_head_attr.attr,
+ &deactivate_to_tail_attr.attr,
+ &deactivate_remote_frees_attr.attr,
+ &order_fallback_attr.attr,
+#endif
+ NULL /* list terminator */
+};
+
+static struct attribute_group slab_attr_group = { /* passed to sysfs_create_group() in sysfs_slab_add() */
+ .attrs = slab_attrs,
+};
+
+/*
+ * sysfs read entry point for all slab attributes: recover the slab_attribute
+ * and the kmem_cache from @attr and @kobj and call the attribute's show
+ * handler.  Returns -EIO when the attribute has no show handler.
+ */
+static ssize_t slab_attr_show(struct kobject *kobj,
+		struct attribute *attr,
+		char *buf)
+{
+	struct slab_attribute *sattr;
+	struct kmem_cache *cache;
+	int ret;
+
+	sattr = to_slab_attr(attr);
+	cache = to_slab(kobj);
+
+	if (!sattr->show)
+		return -EIO;
+
+	ret = sattr->show(cache, buf);
+	return ret;
+}
+
+/*
+ * sysfs write entry point for all slab attributes: recover the
+ * slab_attribute and the kmem_cache from @attr and @kobj and call the
+ * attribute's store handler.  Returns -EIO when there is no store handler.
+ */
+static ssize_t slab_attr_store(struct kobject *kobj,
+		struct attribute *attr,
+		const char *buf, size_t len)
+{
+	struct slab_attribute *sattr;
+	struct kmem_cache *cache;
+	int ret;
+
+	sattr = to_slab_attr(attr);
+	cache = to_slab(kobj);
+
+	if (!sattr->store)
+		return -EIO;
+
+	ret = sattr->store(cache, buf, len);
+	return ret;
+}
+
+/* kobject release callback: frees the kmem_cache that embeds @kobj. */
+static void kmem_cache_release(struct kobject *kobj)
+{
+	struct kmem_cache *cache = to_slab(kobj);
+
+	kfree(cache);
+}
+
+static struct sysfs_ops slab_sysfs_ops = { /* generic entry points that dispatch to the per-attribute handlers */
+ .show = slab_attr_show,
+ .store = slab_attr_store,
+};
+
+static struct kobj_type slab_ktype = { /* kobject type used for every cache in sysfs_slab_add() */
+ .sysfs_ops = &slab_sysfs_ops,
+ .release = kmem_cache_release /* frees the kmem_cache itself */
+};
+
+/* kset uevent filter: returns 1 only for kobjects whose type is slab_ktype. */
+static int uevent_filter(struct kset *kset, struct kobject *kobj)
+{
+	return get_ktype(kobj) == &slab_ktype;
+}
+
+static struct kset_uevent_ops slab_uevent_ops = { /* handed to kset_create_and_add() in slab_sysfs_init() */
+ .filter = uevent_filter,
+};
+
+static struct kset *slab_kset; /* the "slab" kset created in slab_sysfs_init(); parent of all cache kobjects and alias links */
+
+#define ID_STR_LENGTH 64 /* size of the buffer allocated by create_unique_id() */
+
+/* Create a unique string id for a slab cache:
+ *
+ * Format :[flags-]size
+ */
+static char *create_unique_id(struct kmem_cache *s)
+{
+	char *id = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
+	char *cur = id;
+
+	/* Allocation failure is treated as fatal. */
+	BUG_ON(!id);
+
+	*cur++ = ':';
+	/*
+	 * One letter per flag that matters for merging: d (DMA),
+	 * a (reclaim accounting), F (debug free).  These must cover every
+	 * flag compared during merging so that the id stays unique.  A '-'
+	 * separates them from the size, but only if at least one letter
+	 * was emitted.
+	 */
+	if (s->flags & SLAB_CACHE_DMA)
+		*cur++ = 'd';
+	if (s->flags & SLAB_RECLAIM_ACCOUNT)
+		*cur++ = 'a';
+	if (s->flags & SLAB_DEBUG_FREE)
+		*cur++ = 'F';
+	if (cur != id + 1)
+		*cur++ = '-';
+
+	cur += sprintf(cur, "%07d", s->size);
+
+	/* The id, including its terminator, must fit the buffer. */
+	BUG_ON(cur > id + ID_STR_LENGTH - 1);
+	return id;
+}
+
+/*
+ * Register @s in sysfs.
+ *
+ * Before sysfs is available (slab_state < SYSFS) this does nothing and
+ * returns 0; slab_sysfs_init() adds the cache later.  Unmergeable caches are
+ * registered under their own name.  Mergeable ones are registered under a
+ * generated unique id, with the real name added as an alias link.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add() /
+ * sysfs_create_group().
+ */
+static int sysfs_slab_add(struct kmem_cache *s)
+{
+	int err;
+	const char *name;
+	int unmergeable;
+
+	if (slab_state < SYSFS)
+		/* Defer until later */
+		return 0;
+
+	unmergeable = slab_unmergeable(s);
+	if (unmergeable) {
+		/*
+		 * Slabcache can never be merged so we can use the name proper.
+		 * This is typically the case for debug situations. In that
+		 * case we can catch duplicate names easily.
+		 */
+		sysfs_remove_link(&slab_kset->kobj, s->name);
+		name = s->name;
+	} else {
+		/*
+		 * Create a unique name for the slab as a target
+		 * for the symlinks.
+		 */
+		name = create_unique_id(s);
+	}
+
+	s->kobj.kset = slab_kset;
+	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
+	if (err) {
+		kobject_put(&s->kobj);
+		/* @s must not be touched after the put; only locals below. */
+		goto out;
+	}
+
+	err = sysfs_create_group(&s->kobj, &slab_attr_group);
+	if (err)
+		goto out;
+
+	kobject_uevent(&s->kobj, KOBJ_ADD);
+	if (!unmergeable) {
+		/* Setup first alias */
+		sysfs_slab_alias(s, s->name);
+	}
+out:
+	/*
+	 * The generated id is only needed while the kobject is being named,
+	 * so free it on the error paths as well as on success.
+	 */
+	if (!unmergeable)
+		kfree(name);
+	return err;
+}
+
+/*
+ * Take @s out of sysfs: send KOBJ_REMOVE, delete the kobject and drop a
+ * reference on it.
+ */
+static void sysfs_slab_remove(struct kmem_cache *s)
+{
+	struct kobject *kobj = &s->kobj;
+
+	kobject_uevent(kobj, KOBJ_REMOVE);
+	kobject_del(kobj);
+	kobject_put(kobj);
+}
+
+/*
+ * Need to buffer aliases during bootup until sysfs becomes
+ * available lest we lose that information.
+ */
+struct saved_alias { /* an alias requested before slab_state reached SYSFS */
+ struct kmem_cache *s; /* cache the alias points to */
+ const char *name; /* alias name; the pointer is stored, the string is not copied */
+ struct saved_alias *next; /* next entry of the singly linked alias_list */
+};
+
+static struct saved_alias *alias_list; /* head of the pending aliases; drained by slab_sysfs_init() */
+
+/*
+ * Make @name an alias of @s.  Once slab_state is SYSFS this (re)creates a
+ * symlink in the slab kset.  Before that the request is queued on alias_list.
+ * Returns 0 or a negative error code.
+ */
+static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+{
+	struct saved_alias *pending;
+
+	if (slab_state != SYSFS) {
+		pending = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
+		if (!pending)
+			return -ENOMEM;
+
+		pending->s = s;
+		pending->name = name;
+		pending->next = alias_list;
+		alias_list = pending;
+		return 0;
+	}
+
+	/*
+	 * If we have a leftover link then remove it.
+	 */
+	sysfs_remove_link(&slab_kset->kobj, name);
+	return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
+}
+
+/*
+ * Create the "slab" kset under kernel_kobj, switch slab_state to SYSFS and
+ * publish every cache and every alias that was created before sysfs became
+ * usable.  Finally run resiliency_test().
+ *
+ * Returns 0, or -ENOSYS if the kset cannot be created.  Failures to add an
+ * individual cache or alias are only logged.
+ */
+static int __init slab_sysfs_init(void)
+{
+	struct kmem_cache *s;
+	int err;
+
+	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+	if (!slab_kset) {
+		printk(KERN_ERR "Cannot register slab subsystem.\n");
+		return -ENOSYS;
+	}
+
+	slab_state = SYSFS;
+
+	list_for_each_entry(s, &slab_caches, list) {
+		err = sysfs_slab_add(s);
+		if (err)
+			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
+					" to sysfs\n", s->name);
+	}
+
+	while (alias_list) {
+		struct saved_alias *al = alias_list;
+
+		alias_list = alias_list->next;
+		err = sysfs_slab_alias(al->s, al->name);
+		/*
+		 * Report the alias that failed.  "s" is only the cursor
+		 * left over from the loop above and no longer refers to a
+		 * cache at this point.
+		 */
+		if (err)
+			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
+					" %s to sysfs\n", al->name);
+		kfree(al);
+	}
+
+	resiliency_test();
+	return 0;
+}
+
+__initcall(slab_sysfs_init);
+#endif
+
+/*
+ * The /proc/slabinfo ABI
+ */
+#ifdef CONFIG_SLABINFO
+/* Emit the version line and the column header of /proc/slabinfo. */
+static void print_slabinfo_header(struct seq_file *m)
+{
+	static const char * const parts[] = {
+		"slabinfo - version: 2.1\n",
+		"# name <active_objs> <num_objs> <objsize> "
+			"<objperslab> <pagesperslab>",
+		" : tunables <limit> <batchcount> <sharedfactor>",
+		" : slabdata <active_slabs> <num_slabs> <sharedavail>",
+		NULL
+	};
+	int i;
+
+	for (i = 0; parts[i]; i++)
+		seq_puts(m, parts[i]);
+	seq_putc(m, '\n');
+}
+
+/*
+ * seq_file start: take slub_lock for reading (released in s_stop()), print
+ * the header when starting at position 0, and position the iterator on the
+ * cache list.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	down_read(&slub_lock);
+
+	if (*pos == 0)
+		print_slabinfo_header(m);
+
+	return seq_list_start(&slab_caches, *pos);
+}
+
+/* seq_file next: advance to the following entry of slab_caches. */
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	void *next = seq_list_next(p, &slab_caches, pos);
+
+	return next;
+}
+
+/* seq_file stop: release the slub_lock read lock taken in s_start(). */
+static void s_stop(struct seq_file *m, void *p)
+{
+	up_read(&slub_lock);
+}
+
+/*
+ * seq_file show: print the /proc/slabinfo line of one cache.  Counters are
+ * summed over all online nodes that have a kmem_cache_node.  The tunables
+ * and sharedavail columns are always printed as 0.
+ */
+static int s_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+	unsigned long nr_partials = 0;
+	unsigned long slab_count = 0;
+	unsigned long obj_count = 0;
+	unsigned long free_count = 0;
+	unsigned long used_count;
+	int node;
+
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		if (!n)
+			continue;
+
+		nr_partials += n->nr_partial;
+		slab_count += atomic_long_read(&n->nr_slabs);
+		obj_count += atomic_long_read(&n->total_objects);
+		free_count += count_partial(n, count_free);
+	}
+
+	used_count = obj_count - free_count;
+
+	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, used_count,
+			obj_count, s->size, oo_objects(s->oo),
+			(1 << oo_order(s->oo)));
+	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
+	seq_printf(m, " : slabdata %6lu %6lu %6lu", slab_count, slab_count,
+			0UL);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static const struct seq_operations slabinfo_op = { /* iterator used by slabinfo_open() */
+ .start = s_start, /* takes slub_lock for reading */
+ .next = s_next,
+ .stop = s_stop, /* releases slub_lock */
+ .show = s_show,
+};
+
+/* open() handler of /proc/slabinfo: attach the slabinfo_op iterator. */
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &slabinfo_op);
+
+	return ret;
+}
+
+static const struct file_operations proc_slabinfo_operations = { /* registered for "slabinfo" in slab_proc_init() */
+ .open = slabinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Create the "slabinfo" proc entry.  The result of proc_create() is not
+ * checked; this always returns 0.
+ */
+static int __init slab_proc_init(void)
+{
+	proc_create("slabinfo", S_IWUSR | S_IRUGO, NULL,
+			&proc_slabinfo_operations);
+	return 0;
+}
+module_init(slab_proc_init);
+#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 0000000..a13ea64
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,159 @@
+/*
+ * Virtual Memory Map support
+ *
+ * (C) 2007 sgi. Christoph Lameter.
+ *
+ * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
+ * virt_to_page, page_address() to be implemented as a base offset
+ * calculation without memory access.
+ *
+ * However, virtual mappings need a page table and TLBs. Many Linux
+ * architectures already map their physical space using 1-1 mappings
+ * via TLBs. For those arches the virtual memory map is essentially
+ * for free if we use the same page size as the 1-1 mappings. In that
+ * case the overhead consists of a few additional pages that are
+ * allocated to create a view of memory for vmemmap.
+ *
+ * The architecture is expected to provide a vmemmap_populate() function
+ * to instantiate the mapping.
+ */
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <asm/dma.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+
+/*
+ * Allocate a block of memory to be used to back the virtual memory map
+ * or to back the page tables that are used to create the mapping.
+ * Uses the main allocators if they are available, else bootmem.
+ */
+
+static void * __init_refok __earlyonly_bootmem_alloc(int node,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal)
+{
+ return __alloc_bootmem_node(NODE_DATA(node), size, align, goal);
+}
+
+
+/*
+ * Allocate @size bytes for the virtual memory map (or for the page tables
+ * that map it) on behalf of @node.
+ *
+ * Returns the kernel address of the block. On the page-allocator path NULL
+ * is returned when no pages could be obtained.
+ */
+void * __meminit vmemmap_alloc_block(unsigned long size, int node)
+{
+	struct page *pg;
+
+	/*
+	 * Main allocator not up yet: fall back to bootmem, using @size as
+	 * the alignment as well and __pa(MAX_DMA_ADDRESS) as the goal.
+	 */
+	if (!slab_is_available())
+		return __earlyonly_bootmem_alloc(node, size, size,
+						 __pa(MAX_DMA_ADDRESS));
+
+	/* Main allocator is up: take zeroed pages from it. */
+	pg = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size));
+	return pg ? page_address(pg) : NULL;
+}
+
+void __meminit vmemmap_verify(pte_t *pte, int node,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pfn = pte_pfn(*pte);
+ int actual_node = early_pfn_to_nid(pfn);
+
+ if (node_distance(actual_node, node) > LOCAL_DISTANCE)
+ printk(KERN_WARNING "[%lx-%lx] potential offnode "
+ "page_structs\n", start, end - 1);
+}
+
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
+ return pte;
+}
+
+pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
+{
+ pmd_t *pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+ return pmd;
+}
+
+pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
+{
+ pud_t *pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ pud_populate(&init_mm, pud, p);
+ }
+ return pud;
+}
+
+pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ pgd_populate(&init_mm, pgd, p);
+ }
+ return pgd;
+}
+
+/*
+ * Populate the vmemmap page tables, one PAGE_SIZE step at a time, for the
+ * virtual range that holds @size struct pages starting at @start_page.
+ *
+ * Each level (pgd, pud, pmd, pte) is filled in on demand for @node and every
+ * resulting pte is handed to vmemmap_verify(). Returns 0 on success, or
+ * -ENOMEM as soon as any level cannot be populated; entries populated before
+ * the failure are not undone here.
+ */
+int __meminit vmemmap_populate_basepages(struct page *start_page,
+						unsigned long size, int node)
+{
+	unsigned long va = (unsigned long)start_page;
+	unsigned long limit = (unsigned long)(start_page + size);
+
+	while (va < limit) {
+		pgd_t *pgd;
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		pgd = vmemmap_pgd_populate(va, node);
+		if (!pgd)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(pgd, va, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = vmemmap_pmd_populate(pud, va, node);
+		if (!pmd)
+			return -ENOMEM;
+
+		pte = vmemmap_pte_populate(pmd, va, node);
+		if (!pte)
+			return -ENOMEM;
+
+		vmemmap_verify(pte, node, va, va + PAGE_SIZE);
+		va += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the vmemmap-backed mem_map for section @pnum on node @nid.
+ *
+ * Returns the struct page array for the section's first pfn, or NULL when
+ * vmemmap_populate() reports an error.
+ */
+struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
+{
+	struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
+
+	/* Any non-zero result from vmemmap_populate() means failure. */
+	return vmemmap_populate(map, PAGES_PER_SECTION, nid) ? NULL : map;
+}
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 0000000..083f5b6
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,636 @@
+/*
+ * sparse memory mappings.
+ */
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+#include <asm/dma.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section - memory sections, mem_map's for valid memory
+ */
+#ifdef CONFIG_SPARSEMEM_EXTREME
+struct mem_section *mem_section[NR_SECTION_ROOTS]
+ ____cacheline_internodealigned_in_smp;
+#else
+struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
+ ____cacheline_internodealigned_in_smp;
+#endif
+EXPORT_SYMBOL(mem_section);
+
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+int page_to_nid(struct page *page)
+{
+ return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+
+static void set_section_nid(unsigned long section_nr, int nid)
+{
+ section_to_node_table[section_nr] = nid;
+}
+#else /* !NODE_NOT_IN_PAGE_FLAGS */
+static inline void set_section_nid(unsigned long section_nr, int nid)
+{
+}
+#endif
+
+#ifdef CONFIG_SPARSEMEM_EXTREME
+static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
+{
+ struct mem_section *section = NULL;
+ unsigned long array_size = SECTIONS_PER_ROOT *
+ sizeof(struct mem_section);
+
+ if (slab_is_available())
+ section = kmalloc_node(array_size, GFP_KERNEL, nid);
+ else
+ section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+
+ if (section)
+ memset(section, 0, array_size);
+
+ return section;
+}
+
+/*
+ * Make sure the root that covers @section_nr has a mem_section array.
+ *
+ * The array is allocated before index_init_lock is taken, so two callers can
+ * both allocate one for the same root; the lock only decides which of them
+ * installs its array. The caller that loses now hands its unused array back
+ * to the slab allocator instead of leaking it.
+ *
+ * Returns 0 when this call installed a new array, -EEXIST when the root was
+ * already populated, or -ENOMEM when the allocation failed.
+ */
+static int __meminit sparse_index_init(unsigned long section_nr, int nid)
+{
+	static DEFINE_SPINLOCK(index_init_lock);
+	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
+	struct mem_section *section;
+	int ret = 0;
+
+	if (mem_section[root])
+		return -EEXIST;
+
+	section = sparse_index_alloc(nid);
+	if (!section)
+		return -ENOMEM;
+	/*
+	 * This lock keeps two different sections from
+	 * reallocating for the same index
+	 */
+	spin_lock(&index_init_lock);
+
+	if (mem_section[root]) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	mem_section[root] = section;
+out:
+	spin_unlock(&index_init_lock);
+
+	/*
+	 * Lost the race: our array was never published, so release it.
+	 * Only a slab allocation can be given back with kfree(); a bootmem
+	 * allocation is left alone.
+	 * NOTE(review): assumes slab_is_available() answers the same here as
+	 * it did inside sparse_index_alloc() - confirm.
+	 */
+	if (ret && slab_is_available())
+		kfree(section);
+
+	return ret;
+}
+#else /* !SPARSEMEM_EXTREME */
+static inline int sparse_index_init(unsigned long section_nr, int nid)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Although written for the SPARSEMEM_EXTREME case, this happens
+ * to also work for the flat array case because
+ * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ *
+ * Translate a mem_section pointer back into its section number by scanning
+ * the roots for the array that contains @ms.
+ *
+ * @ms must point into one of the root arrays. If it does not, the scan ends
+ * with root_nr == NR_SECTION_ROOTS and the value computed below would be
+ * meaningless; that case is now caught with VM_BUG_ON() rather than silently
+ * returning a bogus section number.
+ */
+int __section_nr(struct mem_section* ms)
+{
+	unsigned long root_nr;
+	struct mem_section *root = NULL;
+
+	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
+		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
+		if (!root)
+			continue;
+
+		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+			break;
+	}
+
+	/* No root contains @ms: the caller passed a bad pointer. */
+	VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
+
+	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
+}
+
+/*
+ * During early boot, before section_mem_map is used for an actual
+ * mem_map, we use section_mem_map to store the section's NUMA
+ * node. This keeps us from having to use another data structure. The
+ * node information is cleared just before we store the real mem_map.
+ */
+static inline unsigned long sparse_encode_early_nid(int nid)
+{
+ return (nid << SECTION_NID_SHIFT);
+}
+
+static inline int sparse_early_nid(struct mem_section *section)
+{
+ return (section->section_mem_map >> SECTION_NID_SHIFT);
+}
+
+/* Validate the physical addressing limitations of the model */
+void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+
+ /*
+ * Sanity checks - do not allow an architecture to pass
+ * in larger pfns than the maximum scope of sparsemem:
+ */
+ if (*start_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *start_pfn = max_sparsemem_pfn;
+ *end_pfn = max_sparsemem_pfn;
+ }
+
+ if (*end_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *end_pfn = max_sparsemem_pfn;
+ }
+}
+
+/* Record a memory area against a node. */
+void __init memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+
+ start &= PAGE_SECTION_MASK;
+ mminit_validate_memmodel_limits(&start, &end);
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ unsigned long section = pfn_to_section_nr(pfn);
+ struct mem_section *ms;
+
+ sparse_index_init(section, nid);
+ set_section_nid(section, nid);
+
+ ms = __nr_to_section(section);
+ if (!ms->section_mem_map)
+ ms->section_mem_map = sparse_encode_early_nid(nid) |
+ SECTION_MARKED_PRESENT;
+ }
+}
+
+/*
+ * Only used by the i386 NUMA architectures, but relatively
+ * generic code.
+ */
+unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+ unsigned long nr_pages = 0;
+
+ mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ if (nid != early_pfn_to_nid(pfn))
+ continue;
+
+ if (pfn_present(pfn))
+ nr_pages += PAGES_PER_SECTION;
+ }
+
+ return nr_pages * sizeof(struct page);
+}
+
+/*
+ * Subtle, we encode the real pfn into the mem_map such that
+ * the identity pfn - section_mem_map will return the actual
+ * physical page frame number.
+ */
+static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
+{
+ return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+}
+
+/*
+ * Decode mem_map from the coded memmap
+ */
+struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
+{
+ /* mask off the extra low bits of information */
+ coded_mem_map &= SECTION_MAP_MASK;
+ return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
+}
+
+static int __meminit sparse_init_one_section(struct mem_section *ms,
+ unsigned long pnum, struct page *mem_map,
+ unsigned long *pageblock_bitmap)
+{
+ if (!present_section(ms))
+ return -EINVAL;
+
+ ms->section_mem_map &= ~SECTION_MAP_MASK;
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
+ SECTION_HAS_MEM_MAP;
+ ms->pageblock_flags = pageblock_bitmap;
+
+ return 1;
+}
+
+/*
+ * Size in bytes of one section's usemap: SECTION_BLOCKFLAGS_BITS bits,
+ * rounded up to whole bytes and then to a multiple of sizeof(unsigned long).
+ */
+unsigned long usemap_size(void)
+{
+	/* Bits -> whole bytes. */
+	unsigned long nbytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
+
+	/* Whole bytes -> whole unsigned longs. */
+	nbytes = roundup(nbytes, sizeof(unsigned long));
+
+	return nbytes;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static unsigned long *__kmalloc_section_usemap(void)
+{
+ return kmalloc(usemap_size(), GFP_KERNEL);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static unsigned long * __init
+sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+{
+ unsigned long section_nr;
+
+ /*
+ * A page may contain usemaps for other sections preventing the
+ * page being freed and making a section unremovable while
+ * other sections referencing the usemap remain active. Similarly,
+ * a pgdat can prevent a section being removed. If section A
+ * contains a pgdat and section B contains the usemap, both
+ * sections become inter-dependent. This allocates usemaps
+ * from the same section as the pgdat where possible to avoid
+ * this problem.
+ */
+ section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); /* section that holds the pgdat itself */
+ return alloc_bootmem_section(usemap_size(), section_nr); /* one usemap, requested from that section */
+}
+
+static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+{
+ unsigned long usemap_snr, pgdat_snr;
+ static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
+ static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int usemap_nid;
+
+ usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ if (usemap_snr == pgdat_snr)
+ return;
+
+ if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
+ /* skip redundant message */
+ return;
+
+ old_usemap_snr = usemap_snr;
+ old_pgdat_snr = pgdat_snr;
+
+ usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
+ if (usemap_nid != nid) {
+ printk(KERN_INFO
+ "node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
+ return;
+ }
+ /*
+ * There is a circular dependency.
+ * Some platforms allow un-removable section because they will just
+ * gather other removable sections for dynamic partitioning.
+ * Just notify un-removable section's number here.
+ */
+ printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
+ pgdat_snr, nid);
+ printk(KERN_CONT
+ " have a circular dependency on usemap and pgdat allocations\n");
+}
+#else
+static unsigned long * __init
+sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+{
+ return NULL;
+}
+
+static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
+{
+ unsigned long *usemap;
+ struct mem_section *ms = __nr_to_section(pnum);
+ int nid = sparse_early_nid(ms);
+
+ usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
+ if (usemap)
+ return usemap;
+
+ usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+ if (usemap) {
+ check_usemap_section_nr(nid, usemap);
+ return usemap;
+ }
+
+ /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
+ nid = 0;
+
+ printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ return NULL;
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
+{
+ struct page *map;
+
+ map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
+ map = alloc_bootmem_pages_node(NODE_DATA(nid),
+ PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
+ return map;
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * Boot-time mem_map allocation for section @pnum, on the node recorded in
+ * the section by sparse_early_nid().
+ *
+ * On failure an error is logged, the section's section_mem_map is cleared
+ * and NULL is returned.
+ */
+static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
+{
+	struct mem_section *ms = __nr_to_section(pnum);
+	struct page *map = sparse_mem_map_populate(pnum, sparse_early_nid(ms));
+
+	if (!map) {
+		printk(KERN_ERR "%s: sparsemem memory map backing failed "
+				"some memory will not be available.\n", __func__);
+		ms->section_mem_map = 0;
+	}
+
+	return map;
+}
+
+void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
+{
+}
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void __init sparse_init(void)
+{
+ unsigned long pnum;
+ struct page *map;
+ unsigned long *usemap;
+ unsigned long **usemap_map;
+ int size;
+
+ /*
+ * The mem_map uses big pages (e.g. 2M on 64-bit x86) while a
+ * usemap is much less than one page (e.g. 24 bytes). Allocating
+ * a 2M map (2M aligned) and a 24 byte usemap in turn would push
+ * each following 2M allocation out to the next 2M boundary, so
+ * a big system would end up with a lot of holes in memory.
+ * Allocate all usemaps first so the 2M maps can be contiguous.
+ *
+ * powerpc needs to call sparse_init_one_section right after each
+ * sparse_early_mem_map_alloc, so allocate usemap_map first.
+ */
+ size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
+ usemap_map = alloc_bootmem(size); /* temporary per-section table, freed at the end */
+ if (!usemap_map)
+ panic("can not allocate usemap_map\n");
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { /* pass 1: usemaps only */
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
+ }
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { /* pass 2: mem_maps */
+ if (!present_section_nr(pnum))
+ continue;
+
+ usemap = usemap_map[pnum];
+ if (!usemap) /* no usemap was obtained in pass 1: skip the section */
+ continue;
+
+ map = sparse_early_mem_map_alloc(pnum);
+ if (!map)
+ continue;
+
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ usemap);
+ }
+
+ vmemmap_populate_print_last();
+
+ free_bootmem(__pa(usemap_map), size); /* frees the table only, not the usemaps it points to */
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ unsigned long nr_pages)
+{
+ /* This will make the necessary allocations eventually. */
+ return sparse_mem_map_populate(pnum, nid);
+}
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+ return; /* XXX: Not implemented yet */
+}
+static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+{
+}
+#else
+/*
+ * Allocate and zero a mem_map large enough for @nr_pages struct pages.
+ *
+ * The page allocator is tried first (with __GFP_NOWARN); if that fails the
+ * memory is taken from vmalloc() instead. Returns NULL when both fail.
+ */
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+	unsigned long memmap_size = sizeof(struct page) * nr_pages;
+	struct page *page, *ret;
+
+	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
+	if (page) {
+		ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+	} else {
+		ret = vmalloc(memmap_size);
+		if (!ret)
+			return NULL;
+	}
+
+	memset(ret, 0, memmap_size);
+
+	return ret;
+}
+
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ unsigned long nr_pages)
+{
+ return __kmalloc_section_memmap(nr_pages);
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+ if (is_vmalloc_addr(memmap))
+ vfree(memmap);
+ else
+ free_pages((unsigned long)memmap,
+ get_order(sizeof(struct page) * nr_pages));
+}
+
+static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+{
+ unsigned long maps_section_nr, removing_section_nr, i;
+ int magic;
+
+ for (i = 0; i < nr_pages; i++, page++) {
+ magic = atomic_read(&page->_mapcount);
+
+ BUG_ON(magic == NODE_INFO);
+
+ maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
+ removing_section_nr = page->private;
+
+ /*
+ * When this function is called, the removing section is
+ * logical offlined state. This means all pages are isolated
+ * from page allocator. If removing section's memmap is placed
+ * on the same section, it must not be freed.
+ * If it is freed, page allocator may allocate it which will
+ * be removed physically soon.
+ */
+ if (maps_section_nr != removing_section_nr)
+ put_page_bootmem(page);
+ }
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+{
+ struct page *usemap_page;
+ unsigned long nr_pages;
+
+ if (!usemap)
+ return;
+
+ usemap_page = virt_to_page(usemap);
+ /*
+ * Check to see if allocation came from hot-plug-add
+ */
+ if (PageSlab(usemap_page)) {
+ kfree(usemap);
+ if (memmap)
+ __kfree_section_memmap(memmap, PAGES_PER_SECTION);
+ return;
+ }
+
+ /*
+ * The usemap came from bootmem. This is packed with other usemaps
+ * on the section which has pgdat at boot time. Just keep it as is now.
+ */
+
+ if (memmap) {
+ struct page *memmap_page;
+ memmap_page = virt_to_page(memmap);
+
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ free_map_bootmem(memmap_page, nr_pages);
+ }
+}
+
+/*
+ * returns the number of sections whose mem_maps were properly
+ * set. If this is <=0, then that means that the passed-in
+ * map was not consumed and must be freed.
+ */
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages)
+{
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct mem_section *ms;
+ struct page *memmap;
+ unsigned long *usemap;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * no locking for this, because it does its own
+ * plus, it does a kmalloc
+ */
+ ret = sparse_index_init(section_nr, pgdat->node_id);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
+ if (!memmap)
+ return -ENOMEM;
+ usemap = __kmalloc_section_usemap();
+ if (!usemap) {
+ __kfree_section_memmap(memmap, nr_pages);
+ return -ENOMEM;
+ }
+
+ pgdat_resize_lock(pgdat, &flags);
+
+ ms = __pfn_to_section(start_pfn);
+ if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ms->section_mem_map |= SECTION_MARKED_PRESENT;
+
+ ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
+
+out:
+ pgdat_resize_unlock(pgdat, &flags);
+ if (ret <= 0) {
+ kfree(usemap);
+ __kfree_section_memmap(memmap, nr_pages);
+ }
+ return ret;
+}
+
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+{
+ struct page *memmap = NULL;
+ unsigned long *usemap = NULL;
+
+ if (ms->section_mem_map) {
+ usemap = ms->pageblock_flags;
+ memmap = sparse_decode_mem_map(ms->section_mem_map,
+ __section_nr(ms));
+ ms->section_mem_map = 0;
+ ms->pageblock_flags = NULL;
+ }
+
+ free_section_usemap(memmap, usemap);
+}
+#endif
diff --git a/mm/swap.c b/mm/swap.c
new file mode 100644
index 0000000..b135ec9
--- /dev/null
+++ b/mm/swap.c
@@ -0,0 +1,606 @@
+/*
+ * linux/mm/swap.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+/*
+ * This file contains the default values for the operation of the
+ * Linux VM subsystem. Fine-tuning documentation can be found in
+ * Documentation/sysctl/vm.txt.
+ * Started 18.12.91
+ * Swap aging added 23.2.95, Stephen Tweedie.
+ * Buffermem limits added 12.3.98, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm_inline.h>
+#include <linux/buffer_head.h> /* for try_to_release_page() */
+#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/backing-dev.h>
+#include <linux/memcontrol.h>
+
+#include "internal.h"
+
+/* How many pages do we try to swap or page in/out together? */
+int page_cluster;
+
+static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs. But it gets used by networking.
+ */
+static void __page_cache_release(struct page *page)
+{
+ if (PageLRU(page)) {
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ VM_BUG_ON(!PageLRU(page));
+ __ClearPageLRU(page);
+ del_page_from_lru(zone, page);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ }
+ free_hot_page(page);
+}
+
+static void put_compound_page(struct page *page)
+{
+ page = compound_head(page);
+ if (put_page_testzero(page)) {
+ compound_page_dtor *dtor;
+
+ dtor = get_compound_page_dtor(page);
+ (*dtor)(page);
+ }
+}
+
+/*
+ * Drop one reference on @page.
+ *
+ * Compound pages are handed to put_compound_page(). For any other page,
+ * __page_cache_release() is called when put_page_testzero() returns true.
+ */
+void put_page(struct page *page)
+{
+	if (unlikely(PageCompound(page))) {
+		put_compound_page(page);
+		return;
+	}
+
+	if (put_page_testzero(page))
+		__page_cache_release(page);
+}
+EXPORT_SYMBOL(put_page);
+
+/**
+ * put_pages_list() - release a list of pages
+ * @pages: list of pages threaded on page->lru
+ *
+ * Release a list of pages which are strung together on page.lru. Currently
+ * used by read_cache_pages() and related error recovery code.
+ */
+void put_pages_list(struct list_head *pages)
+{
+ while (!list_empty(pages)) {
+ struct page *victim;
+
+ victim = list_entry(pages->prev, struct page, lru);
+ list_del(&victim->lru);
+ page_cache_release(victim);
+ }
+}
+EXPORT_SYMBOL(put_pages_list);
+
+/*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+ int i;
+ int pgmoved = 0;
+ struct zone *zone = NULL;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ struct zone *pagezone = page_zone(page);
+
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock(&zone->lru_lock);
+ }
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ int lru = page_is_file_cache(page);
+ list_move_tail(&page->lru, &zone->lru[lru].list);
+ pgmoved++;
+ }
+ }
+ if (zone)
+ spin_unlock(&zone->lru_lock);
+ __count_vm_events(PGROTATED, pgmoved);
+ release_pages(pvec->pages, pvec->nr, pvec->cold);
+ pagevec_reinit(pvec);
+}
+
+/*
+ * Writeback is about to end against a page which has been marked for immediate
+ * reclaim. If it still appears to be reclaimable, move it to the tail of the
+ * inactive list.
+ */
+void rotate_reclaimable_page(struct page *page)
+{
+ if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
+ !PageUnevictable(page) && PageLRU(page)) {
+ struct pagevec *pvec;
+ unsigned long flags;
+
+ page_cache_get(page);
+ local_irq_save(flags);
+ pvec = &__get_cpu_var(lru_rotate_pvecs);
+ if (!pagevec_add(pvec, page))
+ pagevec_move_tail(pvec);
+ local_irq_restore(flags);
+ }
+}
+
+/*
+ * FIXME: speed this up?
+ */
+void activate_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = LRU_BASE + file;
+ del_page_from_lru_list(zone, page, lru);
+
+ SetPageActive(page);
+ lru += LRU_ACTIVE;
+ add_page_to_lru_list(zone, page, lru);
+ __count_vm_event(PGACTIVATE);
+ mem_cgroup_move_lists(page, lru);
+
+ zone->recent_rotated[!!file]++;
+ zone->recent_scanned[!!file]++;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+}
+
+/*
+ * Mark a page as having seen activity.
+ *
+ * inactive,unreferenced -> inactive,referenced
+ * inactive,referenced -> active,unreferenced
+ * active,unreferenced -> active,referenced
+ */
+void mark_page_accessed(struct page *page)
+{
+ if (!PageActive(page) && !PageUnevictable(page) &&
+ PageReferenced(page) && PageLRU(page)) {
+ activate_page(page);
+ ClearPageReferenced(page);
+ } else if (!PageReferenced(page)) {
+ SetPageReferenced(page);
+ }
+}
+
+EXPORT_SYMBOL(mark_page_accessed);
+
+void __lru_cache_add(struct page *page, enum lru_list lru)
+{
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ ____pagevec_lru_add(pvec, lru);
+ put_cpu_var(lru_add_pvecs);
+}
+
+/**
+ * lru_cache_add_lru - add a page to a page list
+ * @page: the page to be added to the LRU.
+ * @lru: the LRU list to which the page is added.
+ */
+void lru_cache_add_lru(struct page *page, enum lru_list lru)
+{
+ if (PageActive(page)) {
+ VM_BUG_ON(PageUnevictable(page));
+ ClearPageActive(page);
+ } else if (PageUnevictable(page)) {
+ VM_BUG_ON(PageActive(page));
+ ClearPageUnevictable(page);
+ }
+
+ VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
+ __lru_cache_add(page, lru);
+}
+
+/**
+ * add_page_to_unevictable_list - add a page to the unevictable list
+ * @page: the page to be added to the unevictable list
+ *
+ * Add page directly to its zone's unevictable list. To avoid races with
+ * tasks that might be making the page evictable, through eg. munlock,
+ * munmap or exit, while it's not on the lru, we want to add the page
+ * while it's locked or otherwise "invisible" to other tasks. This is
+ * difficult to do when using the pagevec cache, so bypass that.
+ */
+void add_page_to_unevictable_list(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ SetPageUnevictable(page);
+ SetPageLRU(page);
+ add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+ spin_unlock_irq(&zone->lru_lock);
+}
+
+/**
+ * lru_cache_add_active_or_unevictable
+ * @page: the page to be added to LRU
+ * @vma: vma in which page is mapped for determining reclaimability
+ *
+ * place @page on active or unevictable LRU list, depending on
+ * page_evictable(). Note that if the page is not evictable,
+ * it goes directly back onto its zone's unevictable list. It does
+ * NOT use a per cpu pagevec.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
+{
+ if (page_evictable(page, vma))
+ lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
+ else
+ add_page_to_unevictable_list(page);
+}
+
+/*
+ * Drain pages out of the cpu's pagevecs.
+ * Either "cpu" is the current CPU, and preemption has already been
+ * disabled; or "cpu" is being hot-unplugged, and is already dead.
+ */
+static void drain_cpu_pagevecs(int cpu)
+{
+ struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
+ struct pagevec *pvec;
+ int lru;
+
+ for_each_lru(lru) {
+ pvec = &pvecs[lru - LRU_BASE];
+ if (pagevec_count(pvec))
+ ____pagevec_lru_add(pvec, lru);
+ }
+
+ pvec = &per_cpu(lru_rotate_pvecs, cpu);
+ if (pagevec_count(pvec)) {
+ unsigned long flags;
+
+ /* No harm done if a racing interrupt already did this */
+ local_irq_save(flags);
+ pagevec_move_tail(pvec);
+ local_irq_restore(flags);
+ }
+}
+
+void lru_add_drain(void)
+{
+ drain_cpu_pagevecs(get_cpu());
+ put_cpu();
+}
+
+static void lru_add_drain_per_cpu(struct work_struct *dummy)
+{
+ lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+ return schedule_on_each_cpu(lru_add_drain_per_cpu);
+}
+
+/*
+ * Batched page_cache_release(). Decrement the reference count on all the
+ * passed pages. If it fell to zero then remove the page from the LRU and
+ * free it.
+ *
+ * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
+ * for the remainder of the operation.
+ *
+ * The locking in this function is against shrink_inactive_list(): we recheck
+ * the page count inside the lock to see whether shrink_inactive_list()
+ * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
+ * will free it.
+ */
+void release_pages(struct page **pages, int nr, int cold)
+{
+ int i;
+ struct pagevec pages_to_free;
+ struct zone *zone = NULL; /* zone whose lru_lock is currently held, or NULL */
+ unsigned long uninitialized_var(flags);
+
+ pagevec_init(&pages_to_free, cold);
+ for (i = 0; i < nr; i++) {
+ struct page *page = pages[i];
+
+ if (unlikely(PageCompound(page))) {
+ if (zone) { /* drop lru_lock before the compound put */
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ put_compound_page(page);
+ continue;
+ }
+
+ if (!put_page_testzero(page)) /* count did not reach zero: nothing to free */
+ continue;
+
+ if (PageLRU(page)) {
+ struct zone *pagezone = page_zone(page);
+
+ if (pagezone != zone) { /* change locks only when the zone changes */
+ if (zone)
+ spin_unlock_irqrestore(&zone->lru_lock,
+ flags);
+ zone = pagezone;
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ }
+ VM_BUG_ON(!PageLRU(page));
+ __ClearPageLRU(page);
+ del_page_from_lru(zone, page);
+ }
+
+ if (!pagevec_add(&pages_to_free, page)) { /* flush the batch with lru_lock dropped */
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ __pagevec_free(&pages_to_free);
+ pagevec_reinit(&pages_to_free);
+ }
+ }
+ if (zone)
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+ pagevec_free(&pages_to_free); /* free whatever is left in the final batch */
+}
+
+/*
+ * The pages which we're about to release may be in the deferred lru-addition
+ * queues. That would prevent them from really being freed right now. That's
+ * OK from a correctness point of view but is inefficient - those pages may be
+ * cache-warm and we want to give them back to the page allocator ASAP.
+ *
+ * So __pagevec_release() will drain those queues here. ____pagevec_lru_add()
+ * calls release_pages() directly (rather than __pagevec_release()) to avoid
+ * mutual recursion.
+ */
+void __pagevec_release(struct pagevec *pvec)
+{
+ lru_add_drain();
+ release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
+ pagevec_reinit(pvec);
+}
+
+EXPORT_SYMBOL(__pagevec_release);
+
+/*
+ * Variant of pagevec_release() for pages that are known not to be on
+ * the LRU (checked with VM_BUG_ON).
+ *
+ * One reference is dropped from every page; pages whose count reaches
+ * zero are collected and freed together.  The caller's pagevec is
+ * reinitialised before returning.
+ */
+void __pagevec_release_nonlru(struct pagevec *pvec)
+{
+	struct pagevec pages_to_free;
+	int idx;
+
+	pagevec_init(&pages_to_free, pvec->cold);
+
+	for (idx = 0; idx < pagevec_count(pvec); idx++) {
+		struct page *page = pvec->pages[idx];
+
+		VM_BUG_ON(PageLRU(page));
+		if (!put_page_testzero(page))
+			continue;
+		pagevec_add(&pages_to_free, page);
+	}
+
+	pagevec_free(&pages_to_free);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * Add the passed pages to the LRU, then drop the caller's refcount
+ * on them. Reinitialises the caller's pagevec.
+ *
+ * @lru selects the target list and must not be an unevictable list.
+ * Each page must arrive with PageActive, PageUnevictable and PageLRU all
+ * clear. zone->lru_lock is taken with spin_lock_irq() and is only dropped
+ * and re-taken when consecutive pages belong to different zones.
+ */
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
+{
+ int i;
+ struct zone *zone = NULL; /* zone whose lru_lock is currently held */
+ VM_BUG_ON(is_unevictable_lru(lru));
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ struct zone *pagezone = page_zone(page);
+ int file;
+
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
+ VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ file = is_file_lru(lru); /* index into the zone's recent_* counters */
+ zone->recent_scanned[file]++;
+ if (is_active_lru(lru)) {
+ SetPageActive(page);
+ zone->recent_rotated[file]++;
+ }
+ add_page_to_lru_list(zone, page, lru);
+ }
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ release_pages(pvec->pages, pvec->nr, pvec->cold); /* drop the caller's refs */
+ pagevec_reinit(pvec);
+}
+
+EXPORT_SYMBOL(____pagevec_lru_add);
+
+/*
+ * Best-effort attempt to release the private data (buffers) attached to
+ * each page in @pvec.  Pages that cannot be locked immediately are left
+ * alone; PagePrivate is tested again once the page lock is held.
+ */
+void pagevec_strip(struct pagevec *pvec)
+{
+	int idx;
+
+	for (idx = 0; idx < pagevec_count(pvec); idx++) {
+		struct page *page = pvec->pages[idx];
+
+		if (!PagePrivate(page))
+			continue;
+		if (!trylock_page(page))
+			continue;
+
+		if (PagePrivate(page))
+			try_to_release_page(page, 0);
+		unlock_page(page);
+	}
+}
+
+/**
+ * pagevec_swap_free - try to free swap space from the pages in a pagevec
+ * @pvec: pagevec with swapcache pages to free the swap space of
+ *
+ * The caller must hold an extra reference on every page and must not
+ * hold any of the page locks.  Each page is only trylocked, so the swap
+ * space of a page that is locked by someone else is left in place.
+ * PageSwapCache is tested again once the page lock is held.
+ */
+void pagevec_swap_free(struct pagevec *pvec)
+{
+	int idx;
+
+	for (idx = 0; idx < pagevec_count(pvec); idx++) {
+		struct page *page = pvec->pages[idx];
+
+		if (!PageSwapCache(page))
+			continue;
+		if (!trylock_page(page))
+			continue;
+
+		if (PageSwapCache(page))
+			remove_exclusive_swap_page_ref(page);
+		unlock_page(page);
+	}
+}
+
+/**
+ * pagevec_lookup - gang pagecache lookup
+ * @pvec:	Where the resulting pages are placed
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ *
+ * Searches @mapping for up to @nr_pages pages starting at index @start
+ * and stores them in @pvec, taking a reference on each page stored.
+ *
+ * The pages come back in ascending index order; indices of pages that
+ * are not present are simply skipped, so the result may have holes.
+ *
+ * Returns the number of pages which were found.
+ */
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
+{
+	unsigned found;
+
+	found = find_get_pages(mapping, start, nr_pages, pvec->pages);
+	pvec->nr = found;
+
+	return pagevec_count(pvec);
+}
+
+EXPORT_SYMBOL(pagevec_lookup);
+
+/*
+ * Tagged counterpart of pagevec_lookup(): fills @pvec through
+ * find_get_pages_tag(), passing @index, @tag and @nr_pages straight
+ * through, and returns the number of pages now held in @pvec.
+ */
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	unsigned found;
+
+	found = find_get_pages_tag(mapping, index, tag, nr_pages, pvec->pages);
+	pvec->nr = found;
+
+	return pagevec_count(pvec);
+}
+
+EXPORT_SYMBOL(pagevec_lookup_tag);
+
+#ifdef CONFIG_SMP
+/*
+ * We tolerate a little inaccuracy to avoid ping-ponging the counter between
+ * CPUs
+ */
+#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
+
+static DEFINE_PER_CPU(long, committed_space);
+
+/*
+ * Account @pages (which may be negative) of committed address space.
+ *
+ * The delta is accumulated in this CPU's committed_space counter and is
+ * only folded into the global vm_committed_space once the local value
+ * leaves the range [-ACCT_THRESHOLD, ACCT_THRESHOLD].  Preemption is
+ * disabled while the per-CPU counter is being updated.
+ */
+void vm_acct_memory(long pages)
+{
+	long *counter;
+
+	preempt_disable();
+	counter = &__get_cpu_var(committed_space);
+	*counter += pages;
+	if (*counter >= -ACCT_THRESHOLD && *counter <= ACCT_THRESHOLD)
+		goto out;
+
+	atomic_long_add(*counter, &vm_committed_space);
+	*counter = 0;
+out:
+	preempt_enable();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * CPU hotplug notifier.  When a CPU has died (CPU_DEAD or
+ * CPU_DEAD_FROZEN), fold its cached committed space back into
+ * vm_committed_space and drain its pagevecs.  Every other action is
+ * ignored.  Always returns NOTIFY_OK.
+ */
+static int cpu_swap_callback(struct notifier_block *nfb,
+			     unsigned long action,
+			     void *hcpu)
+{
+	long cpu = (long)hcpu;
+	long *committed;
+
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+		return NOTIFY_OK;
+
+	committed = &per_cpu(committed_space, cpu);
+	atomic_long_add(*committed, &vm_committed_space);
+	*committed = 0;
+	drain_cpu_pagevecs(cpu);
+
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_SMP */
+
+/*
+ * One-time initialisation of the swap system: sets up the swap
+ * backing_dev_info (CONFIG_SWAP), chooses page_cluster from the amount
+ * of physical memory, and registers the CPU hotplug callback
+ * (CONFIG_HOTPLUG_CPU).
+ */
+void __init swap_setup(void)
+{
+	const unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+
+#ifdef CONFIG_SWAP
+	bdi_init(swapper_space.backing_dev_info);
+#endif
+
+	/*
+	 * Machines with less than 16MB get the smaller cluster.  Right now
+	 * other parts of the system mean that we _really_ don't want to
+	 * cluster much more than this.
+	 */
+	page_cluster = (megs < 16) ? 2 : 3;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	hotcpu_notifier(cpu_swap_callback, 0);
+#endif
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
new file mode 100644
index 0000000..3353c90
--- /dev/null
+++ b/mm/swap_state.c
@@ -0,0 +1,372 @@
+/*
+ * linux/mm/swap_state.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ *
+ * Rewritten to use page cache, (C) 1998 Stephen Tweedie
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/migrate.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
+ * future use of radix_tree tags in the swap cache.
+ *
+ * swap_aops and swap_backing_dev_info below are the operations table and
+ * the backing_dev_info that swapper_space points at.
+ */
+static const struct address_space_operations swap_aops = {
+ .writepage = swap_writepage,
+ .sync_page = block_sync_page,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+ .migratepage = migrate_page,
+};
+
+static struct backing_dev_info swap_backing_dev_info = {
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
+ .unplug_io_fn = swap_unplug_io_fn,
+};
+
+struct address_space swapper_space = {
+ .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), /* keyed by swp_entry_t.val */
+ .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
+ .a_ops = &swap_aops,
+ .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
+ .backing_dev_info = &swap_backing_dev_info,
+};
+
+#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) /* plain, unlocked increment */
+
+static struct {
+ unsigned long add_total; /* bumped by add_to_swap_cache() on success */
+ unsigned long del_total; /* bumped by __delete_from_swap_cache() */
+ unsigned long find_success; /* lookup_swap_cache() calls that found a page */
+ unsigned long find_total; /* all lookup_swap_cache() calls */
+} swap_cache_info;
+
+void show_swap_cache_info(void)
+{
+ printk("%lu pages in swap cache\n", total_swapcache_pages);
+ printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
+ swap_cache_info.add_total, swap_cache_info.del_total,
+ swap_cache_info.find_success, swap_cache_info.find_total);
+ printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); /* pages -> kB */
+ printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); /* pages -> kB */
+}
+
+/*
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * but sets SwapCache flag and private instead of mapping and index.
+ *
+ * The page must be locked and swap-backed, and must not already be in
+ * the swap cache or have PagePrivate set (enforced by the BUG_ONs).
+ * Returns 0 on success, otherwise the error from radix_tree_preload()
+ * or radix_tree_insert(); on insert failure the extra page reference,
+ * the SwapCache flag and page_private are all undone.
+ */
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+{
+ int error;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageSwapCache(page));
+ BUG_ON(PagePrivate(page));
+ BUG_ON(!PageSwapBacked(page));
+ error = radix_tree_preload(gfp_mask);
+ if (!error) {
+ page_cache_get(page); /* reference owned by the swap cache */
+ SetPageSwapCache(page);
+ set_page_private(page, entry.val);
+
+ spin_lock_irq(&swapper_space.tree_lock);
+ error = radix_tree_insert(&swapper_space.page_tree,
+ entry.val, page);
+ if (likely(!error)) {
+ total_swapcache_pages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ INC_CACHE_INFO(add_total);
+ }
+ spin_unlock_irq(&swapper_space.tree_lock);
+ radix_tree_preload_end();
+
+ if (unlikely(error)) { /* roll back the three steps taken before the insert */
+ set_page_private(page, 0UL);
+ ClearPageSwapCache(page);
+ page_cache_release(page);
+ }
+ }
+ return error;
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ *
+ * The page must also be locked, not under writeback and without
+ * PagePrivate (enforced by the BUG_ONs). swapper_space.tree_lock is not
+ * taken here: the callers in this file take it around the call. Neither
+ * the swap entry nor the swap cache's page reference is released here.
+ */
+void __delete_from_swap_cache(struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!PageSwapCache(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(PagePrivate(page));
+
+ radix_tree_delete(&swapper_space.page_tree, page_private(page)); /* page_private holds entry.val */
+ set_page_private(page, 0);
+ ClearPageSwapCache(page);
+ total_swapcache_pages--;
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ INC_CACHE_INFO(del_total);
+}
+
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ * @gfp_mask: memory allocation flags
+ *
+ * Allocates a swap entry for @page, inserts the page into the swap cache
+ * and marks it dirty.  The caller must hold the page lock and the page
+ * must be uptodate.
+ *
+ * Returns 1 on success.  Returns 0 if no swap entry could be allocated
+ * or if the swap cache insertion failed for any reason other than
+ * -EEXIST; on -EEXIST the entry is released and a fresh one is tried.
+ */
+int add_to_swap(struct page * page, gfp_t gfp_mask)
+{
+	/*
+	 * Radix-tree node allocations from PF_MEMALLOC contexts could
+	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
+	 * stops emergency reserves from being allocated.
+	 *
+	 * TODO: this could cause a theoretical memory reclaim
+	 * deadlock in the swap out path.
+	 */
+	const gfp_t cache_gfp = gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN;
+	swp_entry_t entry;
+	int err;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageUptodate(page));
+
+	for (;;) {
+		entry = get_swap_page();
+		if (!entry.val)
+			return 0;
+
+		/* Add it to the swap cache and mark it dirty */
+		err = add_to_swap_cache(page, entry, cache_gfp);
+		if (!err) {
+			SetPageDirty(page);
+			return 1;
+		}
+
+		/* Every failure gives the entry back. */
+		swap_free(entry);
+
+		/*
+		 * -EEXIST: raced with "speculative" read_swap_cache_async,
+		 * so go round again.  Anything else (-ENOMEM radix-tree
+		 * allocation failure) is final.
+		 */
+		if (err != -EEXIST)
+			return 0;
+	}
+}
+
+/*
+ * Remove @page from the swap cache, then release its swap entry and the
+ * swap cache's reference on the page.
+ *
+ * Only call this on a locked page that has been verified to be in the
+ * swap cache.  The page is never put on the free list here, because the
+ * caller still holds its own reference.
+ */
+void delete_from_swap_cache(struct page *page)
+{
+	/* Read the entry first: __delete_from_swap_cache() clears it. */
+	swp_entry_t entry = { .val = page_private(page) };
+
+	spin_lock_irq(&swapper_space.tree_lock);
+	__delete_from_swap_cache(page);
+	spin_unlock_irq(&swapper_space.tree_lock);
+
+	swap_free(entry);
+	page_cache_release(page);
+}
+
+/*
+ * Best-effort removal of @page from the swap cache when we are its only
+ * user.
+ *
+ * PageSwapCache is tested here without the page lock; that is fine
+ * because remove_exclusive_swap_page() tests it again with the lock
+ * held.  If the page lock cannot be taken immediately, nothing is done.
+ */
+static inline void free_swap_cache(struct page *page)
+{
+	if (!PageSwapCache(page))
+		return;
+	if (!trylock_page(page))
+		return;
+
+	remove_exclusive_swap_page(page);
+	unlock_page(page);
+}
+
+/*
+ * Perform a free_page(), also freeing any swap cache associated with
+ * this page if it is the last user of the page.
+ *
+ * The swap cache removal is best-effort: free_swap_cache() only acts if
+ * it can trylock the page. The caller's reference is dropped either way.
+ */
+void free_page_and_swap_cache(struct page *page)
+{
+ free_swap_cache(page);
+ page_cache_release(page);
+}
+
+/*
+ * Array form of free_page_and_swap_cache(): try to drop every page in
+ * @pages from the swap cache, then release them.  Pages are removed
+ * from the LRU and freed if this was their last use.
+ *
+ * The array is processed in chunks of at most PAGEVEC_SIZE pages.
+ */
+void free_pages_and_swap_cache(struct page **pages, int nr)
+{
+	struct page **chunk = pages;
+	int todo;
+	int i;
+
+	lru_add_drain();
+	for (; nr; chunk += todo, nr -= todo) {
+		todo = min(nr, PAGEVEC_SIZE);
+
+		for (i = 0; i < todo; i++)
+			free_swap_cache(chunk[i]);
+		release_pages(chunk, todo, 0);
+	}
+}
+
+/*
+ * Look @entry up in the swap cache.
+ *
+ * A page that is found is returned unlocked, carrying the reference
+ * taken by find_get_page(); NULL is returned when there is none.  Every
+ * call is counted in find_total, and successful ones in find_success.
+ */
+struct page * lookup_swap_cache(swp_entry_t entry)
+{
+	struct page *found = find_get_page(&swapper_space, entry.val);
+
+	INC_CACHE_INFO(find_total);
+	if (found)
+		INC_CACHE_INFO(find_success);
+
+	return found;
+}
+
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ *
+ * If the entry is already in the swap cache, that page is returned with
+ * the reference taken by find_get_page(). Otherwise a new page is
+ * allocated, added to the swap cache and returned still locked, with the
+ * read started by swap_readpage(). NULL is returned if the allocation or
+ * swap_duplicate() fails, or if add_to_swap_cache() returns -ENOMEM; any
+ * other add_to_swap_cache() error causes a retry.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *found_page, *new_page = NULL;
+ int err;
+
+ do {
+ /*
+ * First check the swap cache. Since this is normally
+ * called after lookup_swap_cache() failed, re-calling
+ * that would confuse statistics.
+ */
+ found_page = find_get_page(&swapper_space, entry.val);
+ if (found_page)
+ break;
+
+ /*
+ * Get a new page to read into from swap.
+ */
+ if (!new_page) { /* kept across retries, so allocated at most once */
+ new_page = alloc_page_vma(gfp_mask, vma, addr);
+ if (!new_page)
+ break; /* Out of memory */
+ }
+
+ /*
+ * Swap entry may have been freed since our caller observed it.
+ */
+ if (!swap_duplicate(entry)) /* on success, undone by swap_free() below */
+ break;
+
+ /*
+ * Associate the page with swap entry in the swap cache.
+ * May fail (-EEXIST) if there is already a page associated
+ * with this entry in the swap cache: added by a racing
+ * read_swap_cache_async, or add_to_swap or shmem_writepage
+ * re-using the just freed swap entry for an existing page.
+ * May fail (-ENOMEM) if radix-tree node allocation failed.
+ */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+ err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
+ if (likely(!err)) {
+ /*
+ * Initiate read into locked page and return.
+ */
+ lru_cache_add_anon(new_page);
+ swap_readpage(NULL, new_page);
+ return new_page;
+ }
+ ClearPageSwapBacked(new_page); /* undo the setup done before the insert */
+ __clear_page_locked(new_page);
+ swap_free(entry);
+ } while (err != -ENOMEM);
+
+ if (new_page) /* allocated but never inserted: drop it */
+ page_cache_release(new_page);
+ return found_page;
+}
+
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vma: user vma this address belongs to
+ * @addr: target address for mempolicy
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * The NUMA policy of the mm triggering the readahead is applied to all
+ * of the reads: the same @vma and @addr are passed for every page.
+ *
+ * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ */
+struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long offset;
+	unsigned long end_offset;
+	struct page *page;
+	int nr_pages;
+
+	/*
+	 * Get starting offset for readaround, and number of pages to read.
+	 * Adjust starting address by readbehind (for NUMA interleave case)?
+	 * No, it's very unlikely that swap layout would follow vma layout,
+	 * more likely that neighbouring swap pages came from the same node:
+	 * so use the same "addr" to choose the same node for each swap read.
+	 */
+	nr_pages = valid_swaphandles(entry, &offset);
+	end_offset = offset + nr_pages;
+
+	/* Ok, do the async read-ahead now; stop at the first failure */
+	while (offset < end_offset) {
+		swp_entry_t ahead = swp_entry(swp_type(entry), offset);
+
+		page = read_swap_cache_async(ahead, gfp_mask, vma, addr);
+		if (!page)
+			break;
+		page_cache_release(page);
+		offset++;
+	}
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+	return read_swap_cache_async(entry, gfp_mask, vma, addr);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
new file mode 100644
index 0000000..d06896b
--- /dev/null
+++ b/mm/swapfile.c
@@ -0,0 +1,1870 @@
+/*
+ * linux/mm/swapfile.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/shm.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/capability.h>
+#include <linux/syscalls.h>
+#include <linux/memcontrol.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <linux/swapops.h>
+
+static DEFINE_SPINLOCK(swap_lock); /* held around the swap_map, swap_list and nr_swap_pages updates below */
+static unsigned int nr_swapfiles; /* swap types >= this are rejected by swap_info_get() */
+long total_swap_pages;
+static int swap_overflow;
+static int least_priority;
+
+static const char Bad_file[] = "Bad swap file entry "; /* these four prefix the swap_info_get() error messages */
+static const char Unused_file[] = "Unused swap file entry ";
+static const char Bad_offset[] = "Bad swap offset entry ";
+static const char Unused_offset[] = "Unused swap offset entry ";
+
+static struct swap_list_t swap_list = {-1, -1};
+
+static struct swap_info_struct swap_info[MAX_SWAPFILES]; /* indexed by swap type */
+
+static DEFINE_MUTEX(swapon_mutex);
+
+/*
+ * We need this because the bdev->unplug_fn can sleep and we cannot
+ * hold swap_lock while calling the unplug_fn. And swap_lock
+ * cannot be turned into a mutex.
+ */
+static DECLARE_RWSEM(swap_unplug_sem);
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
+{
+ swp_entry_t entry;
+
+ down_read(&swap_unplug_sem); /* held for read across the whole lookup and unplug */
+ entry.val = page_private(page);
+ if (PageSwapCache(page)) { /* entry is only meaningful for swap cache pages */
+ struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+ struct backing_dev_info *bdi;
+
+ /*
+ * If the page is removed from swapcache from under us (with a
+ * racy try_to_unuse/swapoff) we need an additional reference
+ * count to avoid reading garbage from page_private(page) above.
+ * If the WARN_ON triggers during a swapoff it may be the race
+ * condition and it's harmless. However if it triggers without
+ * swapoff it signals a problem.
+ */
+ WARN_ON(page_count(page) <= 1);
+
+ bdi = bdev->bd_inode->i_mapping->backing_dev_info; /* bdi of the underlying swap device */
+ blk_run_backing_dev(bdi, page);
+ }
+ up_read(&swap_unplug_sem);
+}
+
+#define SWAPFILE_CLUSTER 256 /* pages allocated sequentially before starting a new cluster */
+#define LATENCY_LIMIT 256 /* unlocked scan steps between cond_resched() calls */
+
+static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+{
+ unsigned long offset, last_in_cluster;
+ int latency_ration = LATENCY_LIMIT;
+
+ /*
+ * We try to cluster swap pages by allocating them sequentially
+ * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
+ * way, however, we resort to first-free allocation, starting
+ * a new cluster. This prevents us from scattering swap pages
+ * all over the entire swap partition, so that we reduce
+ * overall disk seek times between swap pages. -- sct
+ * But we do now try to find an empty cluster. -Andrea
+ *
+ * The callers in this file hold swap_lock; it is dropped while
+ * swap_map is scanned and re-taken before returning. Returns the
+ * offset that was allocated (its swap_map count set to 1), or 0
+ * if no page could be allocated.
+ */
+
+ si->flags += SWP_SCANNING; /* undone by the -= on both return paths */
+ if (unlikely(!si->cluster_nr)) { /* current cluster is used up */
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) /* too few free pages for a whole cluster */
+ goto lowest;
+ spin_unlock(&swap_lock); /* look for an empty cluster without the lock */
+
+ offset = si->lowest_bit;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+ /* Locate the first empty (unaligned) cluster */
+ for (; last_in_cluster <= si->highest_bit; offset++) {
+ if (si->swap_map[offset]) /* in use: the free run must restart after it */
+ last_in_cluster = offset + SWAPFILE_CLUSTER;
+ else if (offset == last_in_cluster) {
+ spin_lock(&swap_lock);
+ si->cluster_next = offset-SWAPFILE_CLUSTER+1; /* first slot of the free run */
+ goto cluster;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
+ spin_lock(&swap_lock);
+ goto lowest;
+ }
+
+ si->cluster_nr--;
+cluster:
+ offset = si->cluster_next;
+ if (offset > si->highest_bit)
+lowest: offset = si->lowest_bit;
+checks: if (!(si->flags & SWP_WRITEOK)) /* everything from here is checked under swap_lock */
+ goto no_page;
+ if (!si->highest_bit)
+ goto no_page;
+ if (!si->swap_map[offset]) { /* free slot: claim it */
+ if (offset == si->lowest_bit)
+ si->lowest_bit++;
+ if (offset == si->highest_bit)
+ si->highest_bit--;
+ si->inuse_pages++;
+ if (si->inuse_pages == si->pages) { /* area is now full: record an empty free range */
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
+ }
+ si->swap_map[offset] = 1;
+ si->cluster_next = offset + 1;
+ si->flags -= SWP_SCANNING;
+ return offset;
+ }
+
+ spin_unlock(&swap_lock); /* slot was taken: search forward without the lock */
+ while (++offset <= si->highest_bit) {
+ if (!si->swap_map[offset]) {
+ spin_lock(&swap_lock);
+ goto checks; /* re-validate the candidate under the lock */
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
+ spin_lock(&swap_lock);
+ goto lowest;
+
+no_page:
+ si->flags -= SWP_SCANNING;
+ return 0;
+}
+
+swp_entry_t get_swap_page(void)
+{
+ struct swap_info_struct *si;
+ pgoff_t offset;
+ int type, next;
+ int wrapped = 0; /* number of times the walk restarted at swap_list.head */
+
+ spin_lock(&swap_lock);
+ if (nr_swap_pages <= 0)
+ goto noswap;
+ nr_swap_pages--; /* claim a page up front; given back below on failure */
+
+ for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+ si = swap_info + type;
+ next = si->next;
+ if (next < 0 ||
+ (!wrapped && si->prio != swap_info[next].prio)) {
+ next = swap_list.head;
+ wrapped++;
+ }
+
+ if (!si->highest_bit) /* no free range recorded for this area */
+ continue;
+ if (!(si->flags & SWP_WRITEOK))
+ continue;
+
+ swap_list.next = next;
+ offset = scan_swap_map(si);
+ if (offset) {
+ spin_unlock(&swap_lock);
+ return swp_entry(type, offset);
+ }
+ next = swap_list.next; /* scan_swap_map() may have dropped swap_lock: re-read */
+ }
+
+ nr_swap_pages++;
+noswap:
+ spin_unlock(&swap_lock);
+ return (swp_entry_t) {0}; /* zero entry: nothing was allocated */
+}
+
+/*
+ * Allocate one swap page from the swap area @type.
+ *
+ * Returns the new swap entry, or a zero entry if @type is out of range,
+ * the area is not writable (SWP_WRITEOK clear) or scan_swap_map() finds
+ * no free slot.  nr_swap_pages is decremented up front and restored on
+ * failure.
+ */
+swp_entry_t get_swap_page_of_type(int type)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+
+	spin_lock(&swap_lock);
+	/*
+	 * Validate @type before it is used to index swap_info[], as
+	 * count_swap_pages() and swap_info_get() do.  The unsigned compare
+	 * rejects negative values as well.
+	 */
+	if ((unsigned int)type >= nr_swapfiles)
+		goto fail;
+
+	si = swap_info + type;
+	if (si->flags & SWP_WRITEOK) {
+		nr_swap_pages--;
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		nr_swap_pages++;
+	}
+fail:
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
+}
+
+/*
+ * Validate @entry and return its swap_info_struct with swap_lock held.
+ *
+ * Returns NULL, without taking the lock, if @entry is zero (silently)
+ * or if its type or offset is out of range or unused (after logging an
+ * error).  All of the checks are made before swap_lock is taken.
+ */
+static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	unsigned long type, offset;
+	const char *reason;
+
+	if (!entry.val)
+		return NULL;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles) {
+		reason = Bad_file;
+		goto report;
+	}
+
+	si = &swap_info[type];
+	if (!(si->flags & SWP_USED)) {
+		reason = Unused_file;
+		goto report;
+	}
+
+	offset = swp_offset(entry);
+	if (offset >= si->max) {
+		reason = Bad_offset;
+		goto report;
+	}
+	if (!si->swap_map[offset]) {
+		reason = Unused_offset;
+		goto report;
+	}
+
+	spin_lock(&swap_lock);
+	return si;
+
+report:
+	printk(KERN_ERR "swap_free: %s%08lx\n", reason, entry.val);
+	return NULL;
+}
+
+/*
+ * Drop one reference from the swap_map count at @offset in @p and
+ * return the new count.  Counts at or above SWAP_MAP_MAX are left
+ * untouched and returned as they are.
+ *
+ * When the count reaches zero the slot becomes free: the area's free
+ * range is widened to include it, swap_list.next is moved to this area
+ * if it has a higher priority, and the page counters are adjusted.
+ * The callers in this file hold swap_lock.
+ */
+static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+{
+	int count = p->swap_map[offset];
+
+	if (count >= SWAP_MAP_MAX)
+		return count;
+
+	count--;
+	p->swap_map[offset] = count;
+	if (count)
+		return count;
+
+	/* That was the last reference: the slot is free again. */
+	if (offset < p->lowest_bit)
+		p->lowest_bit = offset;
+	if (offset > p->highest_bit)
+		p->highest_bit = offset;
+	if (p->prio > swap_info[swap_list.next].prio)
+		swap_list.next = p - swap_info;
+	nr_swap_pages++;
+	p->inuse_pages--;
+
+	return count;
+}
+
+/*
+ * Release one reference on the swap entry @entry.
+ *
+ * The caller has made sure that the swap device corresponding to the
+ * entry is still around and has not been recycled.  An entry rejected
+ * by swap_info_get() is ignored.
+ */
+void swap_free(swp_entry_t entry)
+{
+	struct swap_info_struct *si = swap_info_get(entry);
+
+	if (!si)
+		return;
+
+	/* swap_info_get() returned with swap_lock held. */
+	swap_entry_free(si, swp_offset(entry));
+	spin_unlock(&swap_lock);
+}
+
+/*
+ * How many references to @page are currently swapped out?
+ *
+ * This is the swap_map count of the page's swap entry minus the one
+ * held by the swap cache itself.  Returns 0 if swap_info_get() rejects
+ * the entry.
+ */
+static inline int page_swapcount(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page) };
+	struct swap_info_struct *si;
+	int swapped;
+
+	si = swap_info_get(entry);
+	if (!si)
+		return 0;
+
+	/* Subtract the 1 for the swap cache itself */
+	swapped = si->swap_map[swp_offset(entry)] - 1;
+	spin_unlock(&swap_lock);
+
+	return swapped;
+}
+
+/*
+ * Tell whether this swap cache entry can be used directly, i.e. whether
+ * the map count plus the swapped-out references add up to exactly one
+ * user.  The page must be locked.
+ */
+int can_share_swap_page(struct page *page)
+{
+	int users;
+
+	BUG_ON(!PageLocked(page));
+
+	users = page_mapcount(page);
+	if (users > 1)
+		return 0;
+
+	if (PageSwapCache(page))
+		users += page_swapcount(page);
+
+	return users == 1;
+}
+
+/*
+ * Work out if there are any other processes sharing this
+ * swap cache page. Free it if you can. Return success.
+ *
+ * @count is the page_count() value the caller expects when nobody else
+ * holds the page. The page must be locked and must not have PagePrivate
+ * set. Returns 1 if the page was taken out of the swap cache (its swap
+ * entry and the swap cache's page reference are then released too),
+ * 0 otherwise.
+ */
+static int remove_exclusive_swap_page_count(struct page *page, int count)
+{
+ int retval;
+ struct swap_info_struct * p;
+ swp_entry_t entry;
+
+ BUG_ON(PagePrivate(page));
+ BUG_ON(!PageLocked(page));
+
+ if (!PageSwapCache(page))
+ return 0;
+ if (PageWriteback(page))
+ return 0;
+ if (page_count(page) != count) /* us + cache + ptes */
+ return 0;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry); /* on success returns with swap_lock held */
+ if (!p)
+ return 0;
+
+ /* Is the only swap cache user the cache itself? */
+ retval = 0;
+ if (p->swap_map[swp_offset(entry)] == 1) {
+ /* Recheck the page count with the swapcache lock held.. */
+ spin_lock_irq(&swapper_space.tree_lock);
+ if ((page_count(page) == count) && !PageWriteback(page)) {
+ __delete_from_swap_cache(page);
+ SetPageDirty(page);
+ retval = 1;
+ }
+ spin_unlock_irq(&swapper_space.tree_lock);
+ }
+ spin_unlock(&swap_lock);
+
+ if (retval) { /* done after swap_lock is dropped: swap_free() takes it again */
+ swap_free(entry);
+ page_cache_release(page);
+ }
+
+ return retval;
+}
+
+/*
+ * Try to free the swap cache of a page that is expected to have two
+ * references: one for the process and one for the swap cache.
+ * Returns 1 if the page was removed from the swap cache, 0 otherwise.
+ */
+int remove_exclusive_swap_page(struct page *page)
+{
+	const int expected_refs = 2;
+
+	return remove_exclusive_swap_page_count(page, expected_refs);
+}
+
+/*
+ * Variant for the pageout code, which holds an extra reference to the
+ * page.  The count to test for is therefore 2 for a page that is only
+ * in the swap cache, plus 1 for each process that maps the page.
+ * Returns 1 if the page was removed from the swap cache, 0 otherwise.
+ */
+int remove_exclusive_swap_page_ref(struct page *page)
+{
+	int expected_refs = 2 + page_mapcount(page);
+
+	return remove_exclusive_swap_page_count(page, expected_refs);
+}
+
+/*
+ * Free the swap entry like above, but also try to
+ * free the page cache entry if it is the last user.
+ *
+ * Migration entries are ignored. The swap cache page is only looked up
+ * when the entry's count drops to 1, and it is only trylocked: if the
+ * page lock is busy the page is left in the swap cache.
+ */
+void free_swap_and_cache(swp_entry_t entry)
+{
+ struct swap_info_struct * p;
+ struct page *page = NULL;
+
+ if (is_migration_entry(entry))
+ return;
+
+ p = swap_info_get(entry); /* on success returns with swap_lock held */
+ if (p) {
+ if (swap_entry_free(p, swp_offset(entry)) == 1) { /* one reference left on the entry */
+ page = find_get_page(&swapper_space, entry.val);
+ if (page && !trylock_page(page)) { /* lock busy: give up on the page */
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
+ spin_unlock(&swap_lock);
+ }
+ if (page) { /* here the page is locked and we hold a reference */
+ int one_user;
+
+ BUG_ON(PagePrivate(page));
+ one_user = (page_count(page) == 2);
+ /* Only cache user (+us), or swap space full? Free it! */
+ /* Also recheck PageSwapCache after page is locked (above) */
+ if (PageSwapCache(page) && !PageWriteback(page) &&
+ (one_user || vm_swap_full())) {
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ }
+ unlock_page(page);
+ page_cache_release(page); /* drop the find_get_page() reference */
+ }
+}
+
+#ifdef CONFIG_HIBERNATION
+/*
+ * Find the swap type that corresponds to given device (if any).
+ *
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * @device - device number. If it is zero (or bdget() returns NULL), the
+ * first SWP_WRITEOK swap area is returned and @offset is not compared.
+ *
+ * @bdev_p - if non-NULL, receives the block device of the area found.
+ *
+ * Returns the index of the area in swap_info[], or -ENODEV if nothing
+ * matches.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
+ */
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
+{
+ struct block_device *bdev = NULL;
+ int i;
+
+ if (device)
+ bdev = bdget(device); /* released with bdput() on every return path */
+
+ spin_lock(&swap_lock);
+ for (i = 0; i < nr_swapfiles; i++) {
+ struct swap_info_struct *sis = swap_info + i;
+
+ if (!(sis->flags & SWP_WRITEOK))
+ continue;
+
+ if (!bdev) { /* no device to match: take the first writable area */
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
+ spin_unlock(&swap_lock);
+ return i;
+ }
+ if (bdev == sis->bdev) {
+ struct swap_extent *se;
+
+ se = list_entry(sis->extent_list.next, /* first entry on the area's extent list */
+ struct swap_extent, list);
+ if (se->start_block == offset) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
+ spin_unlock(&swap_lock);
+ bdput(bdev);
+ return i;
+ }
+ }
+ }
+ spin_unlock(&swap_lock);
+ if (bdev)
+ bdput(bdev);
+
+ return -ENODEV;
+}
+
+/*
+ * Return the total number of swap pages of the given type, or, when
+ * @free is non-zero, the number of those pages that are not in use.
+ * Returns 0 if @type is not below nr_swapfiles or the area is not
+ * SWP_WRITEOK.
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+	struct swap_info_struct *sis;
+	unsigned int n = 0;
+
+	if (type >= nr_swapfiles)
+		return n;
+
+	spin_lock(&swap_lock);
+	sis = swap_info + type;
+	if (sis->flags & SWP_WRITEOK) {
+		n = sis->pages;
+		if (free)
+			n -= sis->inuse_pages;
+	}
+	spin_unlock(&swap_lock);
+
+	return n;
+}
+#endif
+
+/*
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
+ *
+ * Returns 1 if the pte was replaced by a mapping of @page, 0 if the pte
+ * no longer held @entry when rechecked under the pte lock, and -ENOMEM
+ * if mem_cgroup_charge() failed. NOTE(review): in the -ENOMEM case the
+ * pte is still installed when it matches - confirm this is intended.
+ */
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, swp_entry_t entry, struct page *page)
+{
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret = 1;
+
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ ret = -ENOMEM;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { /* pte changed since the unlocked scan */
+ if (ret > 0) /* only uncharge if the charge above succeeded */
+ mem_cgroup_uncharge_page(page);
+ ret = 0;
+ goto out;
+ }
+
+ inc_mm_counter(vma->vm_mm, anon_rss);
+ get_page(page); /* reference for the new mapping */
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ page_add_anon_rmap(page, vma, addr);
+ swap_free(entry); /* the pte no longer references the swap entry */
+ /*
+ * Move the page to the active list so it is not
+ * immediately swapped out again after swapon.
+ */
+ activate_page(page);
+out:
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ swp_entry_t entry, struct page *page)
+{
+ pte_t swp_pte = swp_entry_to_pte(entry); /* the pte value being searched for */
+ pte_t *pte;
+ int ret = 0; /* stays 0 unless unuse_pte() returns non-zero */
+
+ /*
+ * We don't actually need pte lock while scanning for swp_pte: since
+ * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+ * page table while we're scanning; though it could get zapped, and on
+ * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+ * of unmatched parts which look like swp_pte, so unuse_pte must
+ * recheck under pte lock. Scanning without pte lock lets it be
+ * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+ */
+ pte = pte_offset_map(pmd, addr);
+ do {
+ /*
+ * swapoff spends a _lot_ of time in this loop!
+ * Test inline before going to call unuse_pte.
+ */
+ if (unlikely(pte_same(*pte, swp_pte))) {
+ pte_unmap(pte); /* unuse_pte() maps and locks the pte itself */
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret) /* stop at the first non-zero result */
+ goto out;
+ pte = pte_offset_map(pmd, addr); /* remap and carry on scanning */
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(pte - 1); /* pte was advanced one past the last entry visited */
+out:
+ return ret;
+}
+
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ swp_entry_t entry, struct page *page)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int ret;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end); /* end of the range handled through this pmd */
+ if (pmd_none_or_clear_bad(pmd)) /* skip this pmd */
+ continue;
+ ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ if (ret) /* pass the first non-zero result straight up */
+ return ret;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ swp_entry_t entry, struct page *page)
+{
+ pud_t *pud;
+ unsigned long next;
+ int ret;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end); /* end of the range handled through this pud */
+ if (pud_none_or_clear_bad(pud)) /* skip this pud */
+ continue;
+ ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ if (ret) /* pass the first non-zero result straight up */
+ return ret;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static int unuse_vma(struct vm_area_struct *vma,
+ swp_entry_t entry, struct page *page)
+{
+ pgd_t *pgd;
+ unsigned long addr, end, next;
+ int ret;
+
+ if (page->mapping) { /* search just the one address the page has in this vma */
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT) /* presumably: page has no address in this vma */
+ return 0;
+ else
+ end = addr + PAGE_SIZE;
+ } else { /* otherwise scan the whole vma */
+ addr = vma->vm_start;
+ end = vma->vm_end;
+ }
+
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+ if (ret) /* pass the first non-zero result straight up */
+ return ret;
+ } while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+/*
+ * Search every vma of @mm that has an anon_vma for @entry, holding
+ * mm->mmap_sem for read.  If mmap_sem cannot be taken at once, @page is
+ * unlocked while sleeping for it and locked again afterwards.
+ * A negative result from unuse_vma() is returned unchanged; any other
+ * result (a positive one merely ends the search early) is reported as 0.
+ */
+static int unuse_mm(struct mm_struct *mm,
+				swp_entry_t entry, struct page *page)
+{
+	struct vm_area_struct *vma;
+	int found = 0;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		/*
+		 * Activate page so shrink_inactive_list is unlikely to unmap
+		 * its ptes while lock is dropped, so swapoff can make progress.
+		 */
+		activate_page(page);
+		unlock_page(page);
+		down_read(&mm->mmap_sem);
+		lock_page(page);
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->anon_vma)
+			continue;
+		found = unuse_vma(vma, entry, page);
+		if (found)
+			break;
+	}
+	up_read(&mm->mmap_sem);
+
+	if (found < 0)
+		return found;
+	return 0;
+}
+
+/*
+ * Return the offset of the next swap_map slot after @prev that is in use
+ * (non-zero and not SWAP_MAP_BAD).  The scan wraps round to offset 1 when
+ * it runs off the end, and 0 is returned once a full pass finds nothing.
+ */
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+					unsigned int prev)
+{
+	unsigned int limit = si->max;
+	unsigned int pos = prev;
+	int count;
+
+	/*
+	 * swap_lock is not taken: we only look at whether an entry is in
+	 * use and never modify it; false hits are okay, and sys_swapoff()
+	 * has already prevented new allocations from this area (while
+	 * holding swap_lock).
+	 */
+	for (;;) {
+		pos++;
+		if (pos >= limit) {
+			/* Already wrapped (or started at 0): map is empty. */
+			if (!prev)
+				return 0;
+			/*
+			 * Nothing in use at the top of swap_map: go back
+			 * to the start and recheck up to where we began.
+			 */
+			limit = prev + 1;
+			prev = 0;
+			pos = 1;
+		}
+		count = si->swap_map[pos];
+		if (count && count != SWAP_MAP_BAD)
+			return pos;
+	}
+}
+
+/*
+ * We completely avoid races by reading each swap page in advance,
+ * and then search for the process using it. All the necessary
+ * page table adjustments can then be made atomically.
+ *
+ * @type indexes swap_info[]. Every in-use slot that find_next_to_unuse()
+ * reports is brought into a page, searched for in shmem and in the mms on
+ * the mmlist, and finally deleted from the swap cache.
+ *
+ * Returns 0 once find_next_to_unuse() finds nothing left in use, -EINTR
+ * if a signal is pending, -ENOMEM if no page could be obtained for an
+ * entry that is still in use, or the error reported by unuse_mm() or
+ * shmem_unuse().
+ */
+static int try_to_unuse(unsigned int type)
+{
+ struct swap_info_struct * si = &swap_info[type];
+ struct mm_struct *start_mm;
+ unsigned short *swap_map;
+ unsigned short swcount;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned int i = 0; /* swap_map offset being processed; 0 starts the scan */
+ int retval = 0;
+ int reset_overflow = 0; /* set once a SWAP_MAP_MAX count has been reset to 1 */
+ int shmem; /* shmem_unuse() result: >0 entry handled, <0 error */
+
+ /*
+ * When searching mms for an entry, a good strategy is to
+ * start at the first mm we freed the previous entry from
+ * (though actually we don't notice whether we or coincidence
+ * freed the entry). Initialize this start_mm with a hold.
+ *
+ * A simpler strategy would be to start at the last mm we
+ * freed the previous entry from; but that would take less
+ * advantage of mmlist ordering, which clusters forked mms
+ * together, child after parent. If we race with dup_mmap(), we
+ * prefer to resolve parent before child, lest we miss entries
+ * duplicated after we scanned child: using last mm would invert
+ * that. Though it's only a serious concern when an overflowed
+ * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+ */
+ start_mm = &init_mm;
+ atomic_inc(&init_mm.mm_users);
+
+ /*
+ * Keep on scanning until all entries have gone. Usually,
+ * one pass through swap_map is enough, but not necessarily:
+ * there are races when an instance of an entry might be missed.
+ */
+ while ((i = find_next_to_unuse(si, i)) != 0) {
+ if (signal_pending(current)) {
+ retval = -EINTR;
+ break;
+ }
+
+ /*
+ * Get a page for the entry, using the existing swap
+ * cache page if there is one. Otherwise, get a clean
+ * page and read the swap into it.
+ */
+ swap_map = &si->swap_map[i];
+ entry = swp_entry(type, i);
+ page = read_swap_cache_async(entry,
+ GFP_HIGHUSER_MOVABLE, NULL, 0);
+ if (!page) {
+ /*
+ * Either swap_duplicate() failed because entry
+ * has been freed independently, and will not be
+ * reused since sys_swapoff() already disabled
+ * allocation from here, or alloc_page() failed.
+ */
+ if (!*swap_map)
+ continue;
+ retval = -ENOMEM;
+ break;
+ }
+
+ /*
+ * Don't hold on to start_mm if it looks like exiting.
+ */
+ if (atomic_read(&start_mm->mm_users) == 1) {
+ mmput(start_mm);
+ start_mm = &init_mm;
+ atomic_inc(&init_mm.mm_users);
+ }
+
+ /*
+ * Wait for and lock page. When do_swap_page races with
+ * try_to_unuse, do_swap_page can handle the fault much
+ * faster than try_to_unuse can locate the entry. This
+ * apparently redundant "wait_on_page_locked" lets try_to_unuse
+ * defer to do_swap_page in such a case - in some tests,
+ * do_swap_page and try_to_unuse repeatedly compete.
+ */
+ wait_on_page_locked(page);
+ wait_on_page_writeback(page);
+ lock_page(page);
+ wait_on_page_writeback(page);
+
+ /*
+ * Remove all references to entry.
+ * Whenever we reach init_mm, there's no address space
+ * to search, but use it as a reminder to search shmem.
+ */
+ shmem = 0;
+ swcount = *swap_map;
+ if (swcount > 1) {
+ if (start_mm == &init_mm)
+ shmem = shmem_unuse(entry, page);
+ else
+ retval = unuse_mm(start_mm, entry, page);
+ }
+ if (*swap_map > 1) {
+ int set_start_mm = (*swap_map >= swcount); /* start_mm freed nothing: adopt the first mm that does */
+ struct list_head *p = &start_mm->mmlist;
+ struct mm_struct *new_start_mm = start_mm;
+ struct mm_struct *prev_mm = start_mm;
+ struct mm_struct *mm;
+
+ atomic_inc(&new_start_mm->mm_users);
+ atomic_inc(&prev_mm->mm_users);
+ spin_lock(&mmlist_lock);
+ while (*swap_map > 1 && !retval && !shmem &&
+ (p = p->next) != &start_mm->mmlist) {
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ continue;
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+
+ cond_resched();
+
+ swcount = *swap_map;
+ if (swcount <= 1)
+ ;
+ else if (mm == &init_mm) {
+ set_start_mm = 1;
+ shmem = shmem_unuse(entry, page);
+ } else
+ retval = unuse_mm(mm, entry, page);
+ if (set_start_mm && *swap_map < swcount) {
+ mmput(new_start_mm);
+ atomic_inc(&mm->mm_users);
+ new_start_mm = mm;
+ set_start_mm = 0;
+ }
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ mmput(start_mm);
+ start_mm = new_start_mm;
+ }
+ if (shmem) {
+ /* page has already been unlocked and released */
+ if (shmem > 0)
+ continue;
+ retval = shmem;
+ break;
+ }
+ if (retval) {
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+
+ /*
+ * How could swap count reach 0x7fff when the maximum
+ * pid is 0x7fff, and there's no way to repeat a swap
+ * page within an mm (except in shmem, where it's the
+ * shared object which takes the reference count)?
+ * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+ *
+ * If that's wrong, then we should worry more about
+ * exit_mmap() and do_munmap() cases described above:
+ * we might be resetting SWAP_MAP_MAX too early here.
+ * We know "Undead"s can happen, they're okay, so don't
+ * report them; but do report if we reset SWAP_MAP_MAX.
+ */
+ if (*swap_map == SWAP_MAP_MAX) {
+ spin_lock(&swap_lock);
+ *swap_map = 1;
+ spin_unlock(&swap_lock);
+ reset_overflow = 1;
+ }
+
+ /*
+ * If a reference remains (rare), we would like to leave
+ * the page in the swap cache; but try_to_unmap could
+ * then re-duplicate the entry once we drop page lock,
+ * so we might loop indefinitely; also, that page could
+ * not be swapped out to other storage meanwhile. So:
+ * delete from cache even if there's another reference,
+ * after ensuring that the data has been saved to disk -
+ * since if the reference remains (rarer), it will be
+ * read from disk into another page. Splitting into two
+ * pages would be incorrect if swap supported "shared
+ * private" pages, but they are handled by tmpfs files.
+ */
+ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ };
+
+ swap_writepage(page, &wbc);
+ lock_page(page);
+ wait_on_page_writeback(page);
+ }
+ if (PageSwapCache(page))
+ delete_from_swap_cache(page);
+
+ /*
+ * So we could skip searching mms once swap count went
+ * to 1, we did not mark any present ptes as dirty: must
+ * mark page dirty so shrink_page_list will preserve it.
+ */
+ SetPageDirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Make sure that we aren't completely killing
+ * interactive performance.
+ */
+ cond_resched();
+ }
+
+ mmput(start_mm);
+ if (reset_overflow) {
+ printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
+ swap_overflow = 0;
+ }
+ return retval;
+}
+
+/*
+ * Empty the mmlist, but only when no swap area has any page in use any
+ * more - the state a successful try_to_unuse leaves behind when it ran on
+ * the last busy area.  swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *pos, *tmp;
+	unsigned int type;
+
+	/* Any area still holding pages means the mmlist is still needed. */
+	for (type = 0; type < nr_swapfiles; type++) {
+		if (swap_info[type].inuse_pages)
+			return;
+	}
+
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(pos, tmp, &init_mm.mmlist)
+		list_del_init(pos);
+	spin_unlock(&mmlist_lock);
+}
+
+/*
+ * Translate page offset `offset' within this swapdev into the (PAGE_SIZE)
+ * block that holds it, using the swapdev's extent list.  The search starts
+ * at the cached curr_swap_extent, which is moved along as the list is
+ * walked; walking all the way round without a hit is a BUG.
+ */
+sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+{
+	struct swap_extent *first = sis->curr_swap_extent;
+	struct swap_extent *se = first;
+	struct list_head *next;
+
+	while (offset < se->start_page ||
+	       offset >= (se->start_page + se->nr_pages)) {
+		next = se->list.next;
+		/* Step over the list head itself when wrapping round. */
+		if (next == &sis->extent_list)
+			next = next->next;
+		se = list_entry(next, struct swap_extent, list);
+		sis->curr_swap_extent = se;
+		BUG_ON(se == first);	/* It *must* be present */
+	}
+
+	return se->start_block + (offset - se->start_page);
+}
+
+#ifdef CONFIG_HIBERNATION
+/*
+ * Look up the (PAGE_SIZE) block for `offset' on the swapdev selected by
+ * swap_type (an index into swap_info).  Returns 0 when swap_type is out of
+ * range or the swapdev does not have SWP_WRITEOK set.
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+	struct swap_info_struct *sis;
+
+	if (swap_type >= nr_swapfiles)
+		return 0;
+
+	sis = &swap_info[swap_type];
+	if (!(sis->flags & SWP_WRITEOK))
+		return 0;
+
+	return map_swap_page(sis, offset);
+}
+#endif /* CONFIG_HIBERNATION */
+
+/*
+ * Unlink and kfree every swap_extent on this swapdev's extent list,
+ * leaving the list empty.
+ */
+static void destroy_swap_extents(struct swap_info_struct *sis)
+{
+	struct list_head *head = &sis->extent_list;
+	struct swap_extent *se;
+
+	while (!list_empty(head)) {
+		se = list_entry(head->next, struct swap_extent, list);
+		list_del(&se->list);
+		kfree(se);
+	}
+}
+
+/*
+ * Append the page range [start_page, start_page + nr_pages), stored on disk
+ * from start_block onwards, to this swapdev's extent list.  The list is
+ * kept sorted in page order, and callers are expected to add ranges in
+ * ascending page order: a range that does not start where the last extent
+ * ends is a BUG.
+ *
+ * Returns 0 if the range was merged into the last extent, 1 if a new
+ * extent was allocated for it, or -ENOMEM if that allocation failed.
+ */
+static int
+add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
+		unsigned long nr_pages, sector_t start_block)
+{
+	struct list_head *tail = sis->extent_list.prev;	/* highest page extent */
+	struct swap_extent *last;
+	struct swap_extent *fresh;
+
+	if (tail != &sis->extent_list) {
+		last = list_entry(tail, struct swap_extent, list);
+		BUG_ON(last->start_page + last->nr_pages != start_page);
+		/* Contiguous on disk as well: just grow the last extent. */
+		if (last->start_block + last->nr_pages == start_block) {
+			last->nr_pages += nr_pages;
+			return 0;
+		}
+	}
+
+	/* No merge possible: the new extent goes at the tail to keep order. */
+	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+	if (!fresh)
+		return -ENOMEM;
+
+	fresh->start_page = start_page;
+	fresh->nr_pages = nr_pages;
+	fresh->start_block = start_block;
+	list_add_tail(&fresh->list, &sis->extent_list);
+
+	return 1;
+}
+
+/*
+ * A `swap extent' is a simple thing which maps a contiguous range of pages
+ * onto a contiguous range of disk blocks. An ordered list of swap extents
+ * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * time for locating where on disk a page belongs.
+ *
+ * If the swapfile is an S_ISBLK block device, a single extent is installed.
+ * This is done so that the main operating code can treat S_ISBLK and S_ISREG
+ * swap files identically.
+ *
+ * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
+ * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * swapfiles are handled *identically* after swapon time.
+ *
+ * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
+ * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
+ * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * requirements, they are simply tossed out - we will never use those blocks
+ * for swapping.
+ *
+ * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
+ * prevents root from shooting her foot off by ftruncating an in-use swapfile,
+ * which will scribble on the fs.
+ *
+ * The amount of disk space which a single swap extent represents varies.
+ * Typically it is in the 1-4 megabyte range. So we can have hundreds of
+ * extents in the list. To avoid much list walking, we cache the previous
+ * search location in `curr_swap_extent', and start new searches from there.
+ * This is extremely effective. The average number of iterations in
+ * map_swap_page() has been measured at about 0.3 per page. - akpm.
+ *
+ * Returns the number of extents newly allocated, -EINVAL if bmap() reports
+ * a hole in the file, or the negative result of add_swap_extent(). For a
+ * block device *span is set to sis->pages; for a regular file it is set to
+ * the spread between the lowest and highest page-sized disk block used by
+ * any page other than the header page, and sis->max, sis->pages and
+ * sis->highest_bit are rewritten from the number of pages actually mapped.
+ */
+static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
+{
+ struct inode *inode;
+ unsigned blocks_per_page;
+ unsigned long page_no;
+ unsigned blkbits;
+ sector_t probe_block;
+ sector_t last_block;
+ sector_t lowest_block = -1;
+ sector_t highest_block = 0;
+ int nr_extents = 0;
+ int ret;
+
+ inode = sis->swap_file->f_mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+ ret = add_swap_extent(sis, 0, sis->max, 0); /* one extent covering pages 0..max-1 */
+ *span = sis->pages;
+ goto done;
+ }
+
+ blkbits = inode->i_blkbits;
+ blocks_per_page = PAGE_SIZE >> blkbits;
+
+ /*
+ * Map all the blocks into the extent list. This code doesn't try
+ * to be very smart.
+ */
+ probe_block = 0;
+ page_no = 0;
+ last_block = i_size_read(inode) >> blkbits;
+ while ((probe_block + blocks_per_page) <= last_block &&
+ page_no < sis->max) {
+ unsigned block_in_page;
+ sector_t first_block;
+
+ first_block = bmap(inode, probe_block);
+ if (first_block == 0)
+ goto bad_bmap;
+
+ /*
+ * It must be PAGE_SIZE aligned on-disk
+ */
+ if (first_block & (blocks_per_page - 1)) {
+ probe_block++; /* slide forward one block and try again */
+ goto reprobe;
+ }
+
+ for (block_in_page = 1; block_in_page < blocks_per_page;
+ block_in_page++) {
+ sector_t block;
+
+ block = bmap(inode, probe_block + block_in_page);
+ if (block == 0)
+ goto bad_bmap;
+ if (block != first_block + block_in_page) {
+ /* Discontiguity */
+ probe_block++;
+ goto reprobe;
+ }
+ }
+
+ first_block >>= (PAGE_SHIFT - blkbits); /* from filesystem blocks to PAGE_SIZE blocks */
+ if (page_no) { /* exclude the header page */
+ if (first_block < lowest_block)
+ lowest_block = first_block;
+ if (first_block > highest_block)
+ highest_block = first_block;
+ }
+
+ /*
+ * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+ */
+ ret = add_swap_extent(sis, page_no, 1, first_block);
+ if (ret < 0)
+ goto out;
+ nr_extents += ret;
+ page_no++;
+ probe_block += blocks_per_page;
+reprobe:
+ continue;
+ }
+ ret = nr_extents;
+ *span = 1 + highest_block - lowest_block;
+ if (page_no == 0)
+ page_no = 1; /* force Empty message */
+ sis->max = page_no;
+ sis->pages = page_no - 1;
+ sis->highest_bit = page_no - 1;
+done:
+ sis->curr_swap_extent = list_entry(sis->extent_list.prev,
+ struct swap_extent, list); /* start searches at the highest extent */
+ goto out;
+bad_bmap:
+ printk(KERN_ERR "swapon: swapfile has holes\n");
+ ret = -EINVAL;
+out:
+ return ret;
+}
+
+#if 0 /* We don't need this yet */
+#include <linux/backing-dev.h>
+/*
+ * Report bdi_write_congested() for the backing device behind @page: the
+ * swap device's when the page is in the swap cache, otherwise that of
+ * page->mapping.  The page must be locked.
+ */
+int page_queue_congested(struct page *page)
+{
+	struct swap_info_struct *sis;
+	swp_entry_t entry;
+
+	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
+
+	if (!PageSwapCache(page))
+		return bdi_write_congested(page->mapping->backing_dev_info);
+
+	entry.val = page_private(page);
+	sis = get_swap_info_struct(swp_type(entry));
+	return bdi_write_congested(
+			sis->bdev->bd_inode->i_mapping->backing_dev_info);
+}
+#endif
+/*
+ * The swapoff system call: stop swapping to the area backed by @specialfile.
+ *
+ * The matching area is unlinked from swap_list and has SWP_WRITEOK cleared,
+ * then try_to_unuse() is run on it. If that fails, the area is put back on
+ * swap_list and the error is returned; otherwise its extent list and
+ * swap_map are freed and its swap file is closed.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL if no active
+ * swap area uses @specialfile's mapping, -ENOMEM if
+ * security_vm_enough_memory() refuses, or the error from getname(),
+ * filp_open() or try_to_unuse().
+ */
+SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+{
+ struct swap_info_struct * p = NULL;
+ unsigned short *swap_map;
+ struct file *swap_file, *victim;
+ struct address_space *mapping;
+ struct inode *inode;
+ char * pathname;
+ int i, type, prev;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ pathname = getname(specialfile);
+ err = PTR_ERR(pathname);
+ if (IS_ERR(pathname))
+ goto out;
+
+ victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
+ putname(pathname);
+ err = PTR_ERR(victim);
+ if (IS_ERR(victim))
+ goto out;
+
+ mapping = victim->f_mapping;
+ prev = -1; /* swap_list entry before the match, -1 if the match is the head */
+ spin_lock(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
+ p = swap_info + type;
+ if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
+ if (p->swap_file->f_mapping == mapping)
+ break;
+ }
+ prev = type;
+ }
+ if (type < 0) {
+ err = -EINVAL;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+ if (!security_vm_enough_memory(p->pages))
+ vm_unacct_memory(p->pages);
+ else {
+ err = -ENOMEM;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+ if (prev < 0) {
+ swap_list.head = p->next;
+ } else {
+ swap_info[prev].next = p->next;
+ }
+ if (type == swap_list.next) {
+ /* just pick something that's safe... */
+ swap_list.next = swap_list.head;
+ }
+ if (p->prio < 0) {
+ for (i = p->next; i >= 0; i = swap_info[i].next)
+ swap_info[i].prio = p->prio--;
+ least_priority++;
+ }
+ nr_swap_pages -= p->pages;
+ total_swap_pages -= p->pages;
+ p->flags &= ~SWP_WRITEOK;
+ spin_unlock(&swap_lock);
+
+ current->flags |= PF_SWAPOFF;
+ err = try_to_unuse(type);
+ current->flags &= ~PF_SWAPOFF;
+
+ if (err) {
+ /* re-insert swap space back into swap_list */
+ spin_lock(&swap_lock);
+ if (p->prio < 0)
+ p->prio = --least_priority;
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+ if (p->prio >= swap_info[i].prio)
+ break;
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0)
+ swap_list.head = swap_list.next = p - swap_info;
+ else
+ swap_info[prev].next = p - swap_info;
+ nr_swap_pages += p->pages;
+ total_swap_pages += p->pages;
+ p->flags |= SWP_WRITEOK;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+
+ /* wait for any unplug function to finish */
+ down_write(&swap_unplug_sem);
+ up_write(&swap_unplug_sem);
+
+ destroy_swap_extents(p);
+ mutex_lock(&swapon_mutex);
+ spin_lock(&swap_lock);
+ drain_mmlist();
+
+ /* wait for anyone still in scan_swap_map */
+ p->highest_bit = 0; /* cuts scans short */
+ while (p->flags >= SWP_SCANNING) {
+ spin_unlock(&swap_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&swap_lock);
+ }
+
+ swap_file = p->swap_file;
+ p->swap_file = NULL;
+ p->max = 0;
+ swap_map = p->swap_map;
+ p->swap_map = NULL;
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+ mutex_unlock(&swapon_mutex);
+ vfree(swap_map); /* freed only after both locks have been dropped */
+ inode = mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+ struct block_device *bdev = I_BDEV(inode);
+ set_blocksize(bdev, p->old_block_size);
+ bd_release(bdev);
+ } else {
+ mutex_lock(&inode->i_mutex);
+ inode->i_flags &= ~S_SWAPFILE;
+ mutex_unlock(&inode->i_mutex);
+ }
+ filp_close(swap_file, NULL);
+ err = 0;
+
+out_dput:
+ filp_close(victim, NULL);
+out:
+ return err;
+}
+
+#ifdef CONFIG_PROC_FS
+/* iterator */
+/*
+ * seq_file start hook: takes swapon_mutex, then returns SEQ_START_TOKEN
+ * for position 0, the *pos'th swap area that is SWP_USED and has a
+ * swap_map for any other position, or NULL if there are not that many.
+ */
+static void *swap_start(struct seq_file *swap, loff_t *pos)
+{
+	loff_t remaining = *pos;
+	int idx;
+
+	mutex_lock(&swapon_mutex);
+
+	if (!remaining)
+		return SEQ_START_TOKEN;
+
+	for (idx = 0; idx < nr_swapfiles; idx++) {
+		struct swap_info_struct *si = &swap_info[idx];
+
+		if (!(si->flags & SWP_USED) || !si->swap_map)
+			continue;
+		if (!--remaining)
+			return si;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file next hook: returns the first swap area after @v (or the first
+ * one at all when @v is SEQ_START_TOKEN) that is SWP_USED and has a
+ * swap_map, bumping *pos; returns NULL, leaving *pos alone, if none is left.
+ */
+static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
+{
+	struct swap_info_struct *limit = swap_info + nr_swapfiles;
+	struct swap_info_struct *si;
+
+	if (v == SEQ_START_TOKEN) {
+		si = swap_info;
+	} else {
+		si = v;
+		si++;
+	}
+
+	while (si < limit) {
+		if ((si->flags & SWP_USED) && si->swap_map) {
+			++*pos;
+			return si;
+		}
+		si++;
+	}
+
+	return NULL;
+}
+
+/* seq_file stop hook: releases the swapon_mutex taken in swap_start(). */
+static void swap_stop(struct seq_file *swap, void *v)
+{
+	mutex_unlock(&swapon_mutex);
+}
+
+/*
+ * seq_file show hook: prints the column header for SEQ_START_TOKEN, or one
+ * line for the swap area @v - its path padded out to 40 columns, whether it
+ * is a partition or a file, its size and used space (page counts shifted
+ * by PAGE_SHIFT - 10), and its priority.  Always returns 0.
+ */
+static int swap_show(struct seq_file *swap, void *v)
+{
+	struct swap_info_struct *si = v;
+	struct file *file;
+	const char *kind;
+	int len;
+	int pad;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+		return 0;
+	}
+
+	file = si->swap_file;
+	len = seq_path(swap, &file->f_path, " \t\n\\");
+	pad = (len < 40) ? 40 - len : 1;
+	if (S_ISBLK(file->f_path.dentry->d_inode->i_mode))
+		kind = "partition";
+	else
+		kind = "file\t";
+
+	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+			pad, " ",
+			kind,
+			si->pages << (PAGE_SHIFT - 10),
+			si->inuse_pages << (PAGE_SHIFT - 10),
+			si->prio);
+	return 0;
+}
+
+/* Iterator callbacks behind the "swaps" proc file. */
+static const struct seq_operations swaps_op = {
+	.start	= swap_start,	/* takes swapon_mutex */
+	.next	= swap_next,
+	.stop	= swap_stop,	/* drops swapon_mutex */
+	.show	= swap_show,
+};
+
+/* open() for the "swaps" proc file: attach the swaps_op iterator to @file. */
+static int swaps_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &swaps_op);
+
+	return ret;
+}
+
+/* File operations of the "swaps" proc file; all but open are seq_file stock. */
+static const struct file_operations proc_swaps_operations = {
+	.open		= swaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Init-time hook creating the "swaps" proc entry.  The proc_create()
+ * result is not checked; 0 is returned regardless.
+ */
+static int __init procswaps_init(void)
+{
+	proc_create("swaps", 0, NULL, &proc_swaps_operations);
+
+	return 0;
+}
+__initcall(procswaps_init);
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
+ *
+ * The swapon system call
+ *
+ * Claims a free swap_info[] slot, opens @specialfile, validates its swap
+ * header, builds the swap_map and the extent list, and finally links the
+ * area into swap_list in priority order.  With SWAP_FLAG_PREFER set in
+ * @swap_flags the priority is taken from the flags; otherwise the area
+ * gets the next lower default priority.
+ *
+ * Returns 0 on success.  Failures are -EPERM (no CAP_SYS_ADMIN, or no free
+ * slot), -EBUSY (the file is already in use as swap), -EINVAL (unsupported
+ * file type, bad or inconsistent header, empty swap area), -ENOMEM (no
+ * memory for swap_map), or an error passed on from getname(), filp_open(),
+ * set_blocksize(), read_mapping_page() or setup_swap_extents().
+ */
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+	struct swap_info_struct * p;
+	char *name = NULL;
+	struct block_device *bdev = NULL;
+	struct file *swap_file = NULL;
+	struct address_space *mapping;
+	unsigned int type;
+	int i, prev;
+	int error;
+	union swap_header *swap_header = NULL;
+	int swap_header_version;
+	unsigned int nr_good_pages = 0;
+	int nr_extents = 0;
+	sector_t span;
+	unsigned long maxpages = 1;
+	/* unsigned long, not int: a large swap area must not truncate */
+	unsigned long swapfilesize;
+	unsigned short *swap_map = NULL;
+	struct page *page = NULL;
+	struct inode *inode = NULL;
+	int did_down = 0;	/* set while we hold inode->i_mutex */
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	spin_lock(&swap_lock);
+	p = swap_info;
+	for (type = 0 ; type < nr_swapfiles ; type++,p++)
+		if (!(p->flags & SWP_USED))
+			break;
+	error = -EPERM;
+	if (type >= MAX_SWAPFILES) {
+		spin_unlock(&swap_lock);
+		goto out;
+	}
+	if (type >= nr_swapfiles)
+		nr_swapfiles = type+1;
+	memset(p, 0, sizeof(*p));
+	INIT_LIST_HEAD(&p->extent_list);
+	p->flags = SWP_USED;
+	p->next = -1;
+	spin_unlock(&swap_lock);
+	name = getname(specialfile);
+	error = PTR_ERR(name);
+	if (IS_ERR(name)) {
+		name = NULL;
+		goto bad_swap_2;
+	}
+	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+	error = PTR_ERR(swap_file);
+	if (IS_ERR(swap_file)) {
+		swap_file = NULL;
+		goto bad_swap_2;
+	}
+
+	p->swap_file = swap_file;
+	mapping = swap_file->f_mapping;
+	inode = mapping->host;
+
+	/* Refuse a file whose mapping already backs another swap area. */
+	error = -EBUSY;
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct swap_info_struct *q = &swap_info[i];
+
+		if (i == type || !q->swap_file)
+			continue;
+		if (mapping == q->swap_file->f_mapping)
+			goto bad_swap;
+	}
+
+	error = -EINVAL;
+	if (S_ISBLK(inode->i_mode)) {
+		bdev = I_BDEV(inode);
+		error = bd_claim(bdev, sys_swapon);
+		if (error < 0) {
+			bdev = NULL;
+			error = -EINVAL;
+			goto bad_swap;
+		}
+		p->old_block_size = block_size(bdev);
+		error = set_blocksize(bdev, PAGE_SIZE);
+		if (error < 0)
+			goto bad_swap;
+		p->bdev = bdev;
+	} else if (S_ISREG(inode->i_mode)) {
+		p->bdev = inode->i_sb->s_bdev;
+		mutex_lock(&inode->i_mutex);
+		did_down = 1;
+		if (IS_SWAPFILE(inode)) {
+			error = -EBUSY;
+			goto bad_swap;
+		}
+	} else {
+		goto bad_swap;
+	}
+
+	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+
+	/*
+	 * Read the swap header.
+	 */
+	if (!mapping->a_ops->readpage) {
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	page = read_mapping_page(mapping, 0, swap_file);
+	if (IS_ERR(page)) {
+		error = PTR_ERR(page);
+		goto bad_swap;
+	}
+	kmap(page);
+	swap_header = page_address(page);
+
+	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
+		swap_header_version = 1;
+	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
+		swap_header_version = 2;
+	else {
+		printk(KERN_ERR "Unable to find swap-space signature\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+
+	switch (swap_header_version) {
+	case 1:
+		printk(KERN_ERR "version 0 swap is no longer supported. "
+			"Use mkswap -v1 %s\n", name);
+		error = -EINVAL;
+		goto bad_swap;
+	case 2:
+		/* swap partition endianness hack... */
+		if (swab32(swap_header->info.version) == 1) {
+			swab32s(&swap_header->info.version);
+			swab32s(&swap_header->info.last_page);
+			swab32s(&swap_header->info.nr_badpages);
+			/*
+			 * nr_badpages comes straight from disk: bound it
+			 * before using it as the limit of the swab loop,
+			 * or we would byte-swap past the badpages[] array.
+			 */
+			if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) {
+				error = -EINVAL;
+				goto bad_swap;
+			}
+			for (i = 0; i < swap_header->info.nr_badpages; i++)
+				swab32s(&swap_header->info.badpages[i]);
+		}
+		/* Check the swap header's sub-version and the size of
+		   the swap file and bad block lists */
+		if (swap_header->info.version != 1) {
+			printk(KERN_WARNING
+			       "Unable to handle swap header version %d\n",
+			       swap_header->info.version);
+			error = -EINVAL;
+			goto bad_swap;
+		}
+
+		p->lowest_bit = 1;
+		p->cluster_next = 1;
+
+		/*
+		 * Find out how many pages are allowed for a single swap
+		 * device. There are two limiting factors: 1) the number of
+		 * bits for the swap offset in the swp_entry_t type and
+		 * 2) the number of bits in a swap pte as defined by
+		 * the different architectures. In order to find the
+		 * largest possible bit mask a swap entry with swap type 0
+		 * and swap offset ~0UL is created, encoded to a swap pte,
+		 * decoded to a swp_entry_t again and finally the swap
+		 * offset is extracted. This will mask all the bits from
+		 * the initial ~0UL mask that can't be encoded in either
+		 * the swp_entry_t or the architecture definition of a
+		 * swap pte.
+		 */
+		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
+		if (maxpages > swap_header->info.last_page)
+			maxpages = swap_header->info.last_page;
+		p->highest_bit = maxpages - 1;
+
+		error = -EINVAL;
+		if (!maxpages)
+			goto bad_swap;
+		if (swapfilesize && maxpages > swapfilesize) {
+			printk(KERN_WARNING
+			       "Swap area shorter than signature indicates\n");
+			goto bad_swap;
+		}
+		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+			goto bad_swap;
+		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+			goto bad_swap;
+
+		/* OK, set up the swap map and apply the bad block list */
+		swap_map = vmalloc(maxpages * sizeof(short));
+		if (!swap_map) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+
+		error = 0;
+		memset(swap_map, 0, maxpages * sizeof(short));
+		/* Every slot we can address, minus the header page. */
+		nr_good_pages = maxpages - 1;
+		for (i = 0; i < swap_header->info.nr_badpages; i++) {
+			int page_nr = swap_header->info.badpages[i];
+
+			if (page_nr <= 0 ||
+			    page_nr >= swap_header->info.last_page) {
+				error = -EINVAL;
+			} else if (page_nr < maxpages &&
+				   swap_map[page_nr] != SWAP_MAP_BAD) {
+				/*
+				 * swap_map only has maxpages slots, which
+				 * may be fewer than last_page: a bad page
+				 * beyond them is never used anyway and must
+				 * not be written.  Count each bad slot once.
+				 */
+				swap_map[page_nr] = SWAP_MAP_BAD;
+				nr_good_pages--;
+			}
+		}
+		if (error)
+			goto bad_swap;
+	}
+
+	if (nr_good_pages) {
+		swap_map[0] = SWAP_MAP_BAD;	/* never hand out the header page */
+		p->max = maxpages;
+		p->pages = nr_good_pages;
+		nr_extents = setup_swap_extents(p, &span);
+		if (nr_extents < 0) {
+			error = nr_extents;
+			goto bad_swap;
+		}
+		/* setup_swap_extents() may have shrunk p->pages */
+		nr_good_pages = p->pages;
+	}
+	if (!nr_good_pages) {
+		printk(KERN_WARNING "Empty swap-file\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+
+	mutex_lock(&swapon_mutex);
+	spin_lock(&swap_lock);
+	if (swap_flags & SWAP_FLAG_PREFER)
+		p->prio =
+		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+	else
+		p->prio = --least_priority;
+	p->swap_map = swap_map;
+	p->flags = SWP_ACTIVE;
+	nr_swap_pages += nr_good_pages;
+	total_swap_pages += nr_good_pages;
+
+	printk(KERN_INFO "Adding %uk swap on %s. "
+			"Priority:%d extents:%d across:%lluk\n",
+		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
+		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
+
+	/* insert swap space into swap_list: */
+	prev = -1;
+	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+		if (p->prio >= swap_info[i].prio) {
+			break;
+		}
+		prev = i;
+	}
+	p->next = i;
+	if (prev < 0) {
+		swap_list.head = swap_list.next = p - swap_info;
+	} else {
+		swap_info[prev].next = p - swap_info;
+	}
+	spin_unlock(&swap_lock);
+	mutex_unlock(&swapon_mutex);
+	error = 0;
+	goto out;
+bad_swap:
+	if (bdev) {
+		set_blocksize(bdev, p->old_block_size);
+		bd_release(bdev);
+	}
+	destroy_swap_extents(p);
+bad_swap_2:
+	spin_lock(&swap_lock);
+	p->swap_file = NULL;
+	p->flags = 0;
+	spin_unlock(&swap_lock);
+	vfree(swap_map);
+	if (swap_file)
+		filp_close(swap_file, NULL);
+out:
+	if (page && !IS_ERR(page)) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+	if (name)
+		putname(name);
+	if (did_down) {
+		/* Mark the regular file as a swapfile only on success. */
+		if (!error)
+			inode->i_flags |= S_SWAPFILE;
+		mutex_unlock(&inode->i_mutex);
+	}
+	return error;
+}
+
+/*
+ * Fill in val->freeswap and val->totalswap under swap_lock.  Pages still
+ * in use on areas that are SWP_USED but no longer SWP_WRITEOK are added
+ * to both figures.
+ */
+void si_swapinfo(struct sysinfo *val)
+{
+	unsigned long draining = 0;
+	unsigned int type;
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *si = &swap_info[type];
+
+		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+			draining += si->inuse_pages;
+	}
+	val->freeswap = nr_swap_pages + draining;
+	val->totalswap = total_swap_pages + draining;
+	spin_unlock(&swap_lock);
+}
+
+/*
+ * Check that a swap entry is valid and take another reference on it by
+ * incrementing its swap map count.  Returns 1 on success (and always for a
+ * migration entry), 0 if the entry is invalid or not in use.
+ *
+ * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
+ * "permanent", but will be reclaimed by the next swapoff.
+ */
+int swap_duplicate(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	unsigned long offset, type;
+	int result = 0;
+
+	if (is_migration_entry(entry))
+		return 1;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles) {
+		printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
+		return result;
+	}
+	si = &swap_info[type];
+	offset = swp_offset(entry);
+
+	spin_lock(&swap_lock);
+	if (offset < si->max && si->swap_map[offset]) {
+		unsigned short *count = &si->swap_map[offset];
+
+		if (*count < SWAP_MAP_MAX - 1) {
+			(*count)++;
+			result = 1;
+		} else if (*count <= SWAP_MAP_MAX) {
+			/* Saturate at SWAP_MAP_MAX, warning the first few times. */
+			if (swap_overflow++ < 5)
+				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
+			*count = SWAP_MAP_MAX;
+			result = 1;
+		}
+	}
+	spin_unlock(&swap_lock);
+
+	return result;
+}
+
+/* Return the swap_info[] slot for @type; @type is not range-checked here. */
+struct swap_info_struct *
+get_swap_info_struct(unsigned type)
+{
+	return swap_info + type;
+}
+
+/*
+ * Work out how much of the page_cluster-aligned group of swap slots around
+ * @entry is worth reading ahead: the contiguous run of allocated slots on
+ * either side of the target.  *offset is set to the start of the run and
+ * the number of pages to get is returned, or 0 when there is no readahead
+ * to be done (page_cluster is 0, or no neighbouring slot is allocated).
+ *
+ * swap_lock prevents swap_map being freed. Don't grab an extra
+ * reference on the swaphandle, it doesn't matter if it becomes unused.
+ */
+int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
+{
+	int cluster_shift = page_cluster;
+	struct swap_info_struct *si;
+	pgoff_t target, scan;
+	pgoff_t base, end;
+	int neighbours = 0;
+
+	if (!cluster_shift)	/* no readahead */
+		return 0;
+
+	si = &swap_info[swp_type(entry)];
+	target = swp_offset(entry);
+	base = (target >> cluster_shift) << cluster_shift;
+	end = base + (1 << cluster_shift);
+	if (!base)		/* first page is swap header */
+		base++;
+
+	spin_lock(&swap_lock);
+	if (end > si->max)	/* don't go beyond end of map */
+		end = si->max;
+
+	/* Count contiguous allocated slots above our target */
+	scan = target;
+	while (++scan < end) {
+		/* Don't read in free or bad pages */
+		if (!si->swap_map[scan] || si->swap_map[scan] == SWAP_MAP_BAD)
+			break;
+		neighbours++;
+	}
+
+	/* Count contiguous allocated slots below our target */
+	scan = target;
+	while (--scan >= base) {
+		/* Don't read in free or bad pages */
+		if (!si->swap_map[scan] || si->swap_map[scan] == SWAP_MAP_BAD)
+			break;
+		neighbours++;
+	}
+	spin_unlock(&swap_lock);
+
+	/*
+	 * scan stopped one slot below the run: step back onto its first
+	 * slot.  The target itself is only counted when it has company.
+	 */
+	*offset = ++scan;
+	if (!neighbours)
+		return 0;
+	return neighbours + 1;
+}
diff --git a/mm/thrash.c b/mm/thrash.c
new file mode 100644
index 0000000..c4c5205
--- /dev/null
+++ b/mm/thrash.c
@@ -0,0 +1,79 @@
+/*
+ * mm/thrash.c
+ *
+ * Copyright (C) 2004, Red Hat, Inc.
+ * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * Simple token based thrashing protection, using the algorithm
+ * described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ *
+ * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
+ * Improved algorithm to pass token:
+ * Each task has a priority which is incremented if it contended
+ * for the token in an interval less than its previous attempt.
+ * If the token is acquired, that task's priority is boosted to prevent
+ * the token from bouncing around too often and to let the task make
+ * some progress in its execution.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+
+static DEFINE_SPINLOCK(swap_token_lock);
+struct mm_struct *swap_token_mm;
+static unsigned int global_faults;
+
+/*
+ * Contend for the swap token on behalf of current->mm.
+ *
+ * The global fault counter is bumped unconditionally.  The rest only runs
+ * if swap_token_lock can be taken without waiting: current->mm's token
+ * priority is adjusted and the token is handed over if it now outranks
+ * the holder.
+ */
+void grab_swap_token(void)
+{
+	struct mm_struct *mm;
+	int current_interval;
+
+	global_faults++;
+
+	mm = current->mm;
+	current_interval = global_faults - mm->faultstamp;
+
+	if (!spin_trylock(&swap_token_lock))
+		return;
+
+	if (swap_token_mm == NULL) {
+		/* First come first served */
+		mm->token_priority += 2;
+		swap_token_mm = mm;
+	} else if (mm == swap_token_mm) {
+		/* Token holder came in again! */
+		mm->token_priority += 2;
+	} else {
+		/* Faulting at a shorter interval than last time raises priority */
+		if (current_interval < mm->last_interval)
+			mm->token_priority++;
+		else if (likely(mm->token_priority > 0))
+			mm->token_priority--;
+
+		/* Check if we deserve the token */
+		if (mm->token_priority > swap_token_mm->token_priority) {
+			mm->token_priority += 2;
+			swap_token_mm = mm;
+		}
+	}
+
+	mm->faultstamp = global_faults;
+	mm->last_interval = current_interval;
+	spin_unlock(&swap_token_lock);
+}
+
+/* Called on process exit: give up the swap token if @mm is the holder. */
+void __put_swap_token(struct mm_struct *mm)
+{
+	spin_lock(&swap_token_lock);
+	if (unlikely(mm != swap_token_mm))
+		goto unlock;
+	swap_token_mm = NULL;
+unlock:
+	spin_unlock(&swap_token_lock);
+}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
new file mode 100644
index 0000000..3e67d57
--- /dev/null
+++ b/mm/tiny-shmem.c
@@ -0,0 +1,134 @@
+/*
+ * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
+ *
+ * Matt Mackall <mpm@selenic.com> January, 2004
+ * derived from mm/shmem.c and fs/ramfs/inode.c
+ *
+ * This is intended for small systems where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/ramfs.h>
+
+/* "tmpfs" filesystem type: superblock setup is delegated to ramfs_get_sb. */
+static struct file_system_type tmpfs_fs_type = {
+ .name = "tmpfs",
+ .get_sb = ramfs_get_sb,
+ .kill_sb = kill_litter_super,
+};
+
+static struct vfsmount *shm_mnt;
+
+/*
+ * Register the "tmpfs" filesystem type and create the internal kernel mount
+ * kept in shm_mnt.  A failure of either step triggers BUG_ON().
+ */
+static int __init init_tmpfs(void)
+{
+	int err = register_filesystem(&tmpfs_fs_type);
+
+	BUG_ON(err != 0);
+
+	shm_mnt = kern_mount(&tmpfs_fs_type);
+	BUG_ON(IS_ERR(shm_mnt));
+
+	return 0;
+}
+module_init(init_tmpfs)
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: vm_flags (not used by this implementation)
+ *
+ * Returns the new file, or an ERR_PTR: the shm_mnt error if the internal
+ * mount is unavailable, -ENOMEM if no dentry could be allocated, -ENFILE if
+ * no struct file was available, -ENOSPC if no inode could be obtained, or
+ * (!CONFIG_MMU only) the error from ramfs_nommu_expand_for_mapping().
+ */
+struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+{
+	struct qstr qname;
+	struct dentry *root, *dentry;
+	struct inode *inode;
+	struct file *file;
+	int error;
+
+	if (IS_ERR(shm_mnt))
+		return (void *)shm_mnt;
+
+	qname.name = name;
+	qname.len = strlen(name);
+	qname.hash = 0; /* will go */
+	root = shm_mnt->mnt_root;
+
+	dentry = d_alloc(root, &qname);
+	if (!dentry) {
+		error = -ENOMEM;
+		goto fail;
+	}
+
+	file = get_empty_filp();
+	if (!file) {
+		error = -ENFILE;
+		goto fail_dentry;
+	}
+
+	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode) {
+		error = -ENOSPC;
+		goto fail_file;
+	}
+
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0;	/* It is unlinked */
+	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+			&ramfs_file_operations);
+
+#ifndef CONFIG_MMU
+	error = ramfs_nommu_expand_for_mapping(inode, size);
+	if (error)
+		goto fail_file;
+#endif
+	return file;
+
+fail_file:
+	put_filp(file);
+fail_dentry:
+	dput(dentry);
+fail:
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup);
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ *
+ * Creates a "dev/zero" file sized to the vma, drops any file the vma already
+ * referenced and installs the new one together with generic_file_vm_ops.
+ * Returns 0, or the error from shmem_file_setup().
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	loff_t size = vma->vm_end - vma->vm_start;
+	struct file *new_file;
+	struct file *old_file;
+
+	new_file = shmem_file_setup("dev/zero", size, vma->vm_flags);
+	if (IS_ERR(new_file))
+		return PTR_ERR(new_file);
+
+	old_file = vma->vm_file;
+	if (old_file)
+		fput(old_file);
+	vma->vm_file = new_file;
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+/* Stub: nothing is done with @entry or @page; always returns 0. */
+int shmem_unuse(swp_entry_t entry, struct page *page)
+{
+ return 0;
+}
+
+#ifndef CONFIG_MMU
+/*
+ * Built only when CONFIG_MMU is not set: forwards all arguments unchanged to
+ * ramfs_nommu_get_unmapped_area() and returns its result.
+ */
+unsigned long shmem_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
diff --git a/mm/truncate.c b/mm/truncate.c
new file mode 100644
index 0000000..1229211
--- /dev/null
+++ b/mm/truncate.c
@@ -0,0 +1,473 @@
+/*
+ * mm/truncate.c - code for taking down pages from address_spaces
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 10Sep2002 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/buffer_head.h> /* grr. try_to_release_page,
+ do_invalidatepage */
+#include "internal.h"
+
+
+/**
+ * do_invalidatepage - invalidate part or all of a page
+ * @page: the page which is affected
+ * @offset: the index of the truncation point
+ *
+ * do_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * do_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point. Because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+void do_invalidatepage(struct page *page, unsigned long offset)
+{
+	void (*op)(struct page *, unsigned long) =
+		page->mapping->a_ops->invalidatepage;
+
+#ifdef CONFIG_BLOCK
+	/* The mapping supplies no hook: fall back to block_invalidatepage */
+	if (op == NULL)
+		op = block_invalidatepage;
+#endif
+	if (op == NULL)
+		return;
+
+	op(page, offset);
+}
+
+/*
+ * Zero @page from byte @partial up to PAGE_CACHE_SIZE and, if PagePrivate()
+ * is set on it, call do_invalidatepage() with @partial as the offset.
+ */
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+	if (!PagePrivate(page))
+		return;
+	do_invalidatepage(page, partial);
+}
+
+/*
+ * This cancels just the dirty bit on the kernel page itself, it
+ * does NOT actually remove dirty bits on any mmap's that may be
+ * around. It also leaves the page tagged dirty, so any sync
+ * activity will still find it on the dirty lists, and in particular,
+ * clear_page_dirty_for_io() will still look at the dirty bits in
+ * the VM.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page, unsigned int account_size)
+{
+	struct address_space *mapping;
+
+	/* Nothing to undo unless the dirty bit was actually set */
+	if (!TestClearPageDirty(page))
+		return;
+
+	mapping = page->mapping;
+	if (!mapping || !mapping_cap_account_dirty(mapping))
+		return;
+
+	dec_zone_page_state(page, NR_FILE_DIRTY);
+	dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+	if (account_size)
+		task_io_account_cancelled_write(account_size);
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
+/*
+ * If truncate cannot remove the fs-private metadata from the page, the page
+ * becomes orphaned. It will be left on the LRU and may even be mapped into
+ * user pagetables if we're racing with filemap_fault().
+ *
+ * We need to bale out if page->mapping is no longer equal to the original
+ * mapping. This happens a) when the VM reclaimed the page while we waited on
+ * its lock, b) when a concurrent invalidate_mapping_pages got there first and
+ * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+ */
+static void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+ /* The page no longer belongs to @mapping: leave it alone */
+ if (page->mapping != mapping)
+ return;
+
+ /* The whole page is going away, so invalidate from offset 0 */
+ if (PagePrivate(page))
+ do_invalidatepage(page, 0);
+
+ /* Clear the dirty bit, accounting a full page as cancelled write */
+ cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
+ clear_page_mlock(page);
+ remove_from_page_cache(page);
+ ClearPageMappedToDisk(page);
+ page_cache_release(page); /* pagecache ref */
+}
+
+/*
+ * This is for invalidate_mapping_pages(). That function can be called at
+ * any time, and is not supposed to throw away dirty pages. But pages can
+ * be marked dirty at any time too, so use remove_mapping which safely
+ * discards clean, unused pages.
+ *
+ * Returns non-zero if the page was successfully invalidated.
+ */
+static int
+invalidate_complete_page(struct address_space *mapping, struct page *page)
+{
+	/* The page has moved to another mapping (or none) */
+	if (page->mapping != mapping)
+		return 0;
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, 0))
+			return 0;
+	}
+
+	clear_page_mlock(page);
+	return remove_mapping(mapping, page);
+}
+
+/**
+ * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate; must be the last byte of a page
+ * (enforced with BUG_ON)
+ *
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
+ *
+ * Truncate takes two passes - the first pass is nonblocking. It will not
+ * block on page locks and it will not block on writeback. The second pass
+ * will wait. This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * When looking at page->index outside the page lock we need to be careful to
+ * copy it into a local to avoid races (it could change at any time).
+ *
+ * We pass down the cache-hot hint to the page freeing code. Even if the
+ * mapping is large, it is probably the case that the final pages are the most
+ * recently touched, and freeing happens in ascending file offset order.
+ */
+void truncate_inode_pages_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ /* Index of the first page that lies wholly at or after lstart */
+ const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ pgoff_t end;
+ /* Byte offset of lstart within its page; 0 when lstart is page aligned */
+ const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+ struct pagevec pvec;
+ pgoff_t next;
+ int i;
+
+ if (mapping->nrpages == 0)
+ return;
+
+ BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+ end = (lend >> PAGE_CACHE_SHIFT);
+
+ /* Pass 1: non-blocking; pages that are locked or under writeback are skipped */
+ pagevec_init(&pvec, 0);
+ next = start;
+ while (next <= end &&
+ pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ pgoff_t page_index = page->index;
+
+ if (page_index > end) {
+ next = page_index;
+ break;
+ }
+
+ if (page_index > next)
+ next = page_index;
+ next++;
+ if (!trylock_page(page))
+ continue;
+ if (PageWriteback(page)) {
+ unlock_page(page);
+ continue;
+ }
+ if (page_mapped(page)) {
+ unmap_mapping_range(mapping,
+ (loff_t)page_index<<PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
+ truncate_complete_page(mapping, page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ /* lstart is not page aligned: zero the tail of the page just before start */
+ if (partial) {
+ struct page *page = find_lock_page(mapping, start - 1);
+ if (page) {
+ wait_on_page_writeback(page);
+ truncate_partial_page(page, partial);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+
+ /*
+ * Pass 2: blocking. Rescan from start each time a lookup comes back
+ * empty, and stop only when a lookup from start itself finds nothing
+ * or the first page found lies beyond end.
+ */
+ next = start;
+ for ( ; ; ) {
+ cond_resched();
+ if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ if (next == start)
+ break;
+ next = start;
+ continue;
+ }
+ if (pvec.pages[0]->index > end) {
+ pagevec_release(&pvec);
+ break;
+ }
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ lock_page(page);
+ wait_on_page_writeback(page);
+ if (page_mapped(page)) {
+ unmap_mapping_range(mapping,
+ (loff_t)page->index<<PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
+ if (page->index > next)
+ next = page->index;
+ next++;
+ truncate_complete_page(mapping, page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ }
+}
+EXPORT_SYMBOL(truncate_inode_pages_range);
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_mutex.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	/* (loff_t)-1 as the end offset: truncate everything from lstart on */
+	const loff_t lend = (loff_t)-1;
+
+	truncate_inode_pages_range(mapping, lstart, lend);
+}
+EXPORT_SYMBOL(truncate_inode_pages);
+
+/*
+ * Scan @mapping from page index @start and try to drop each page found via
+ * invalidate_complete_page(). Pages that cannot be locked without blocking,
+ * or that are dirty, under writeback or mapped, are left alone. The scan
+ * stops once the position has moved past @end. cond_resched() is called
+ * between batches unless @be_atomic is set.
+ *
+ * Returns the sum of the invalidate_complete_page() return values.
+ *
+ * NOTE(review): the "next > end" check runs after a page has been handled,
+ * so the first page found beyond @end appears to be processed as well -
+ * confirm whether that is intended.
+ */
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, bool be_atomic)
+{
+ struct pagevec pvec;
+ pgoff_t next = start;
+ unsigned long ret = 0;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ while (next <= end &&
+ pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ pgoff_t index;
+ int lock_failed;
+
+ lock_failed = !trylock_page(page);
+
+ /*
+ * We really shouldn't be looking at the ->index of an
+ * unlocked page. But we're not allowed to lock these
+ * pages. So we rely upon nobody altering the ->index
+ * of this (pinned-by-us) page.
+ */
+ index = page->index;
+ if (index > next)
+ next = index;
+ next++;
+ /* Still advance the scan position past a page we could not lock */
+ if (lock_failed)
+ continue;
+
+ if (PageDirty(page) || PageWriteback(page))
+ goto unlock;
+ if (page_mapped(page))
+ goto unlock;
+ ret += invalidate_complete_page(mapping, page);
+unlock:
+ unlock_page(page);
+ if (next > end)
+ break;
+ }
+ pagevec_release(&pvec);
+ if (likely(!be_atomic))
+ cond_resched();
+ }
+ return ret;
+}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ /* be_atomic = false: the helper may call cond_resched() between batches */
+ return __invalidate_mapping_pages(mapping, start, end, false);
+}
+EXPORT_SYMBOL(invalidate_mapping_pages);
+
+/*
+ * This is like invalidate_complete_page(), except it ignores the page's
+ * refcount. We do this because invalidate_inode_pages2() needs stronger
+ * invalidation guarantees, and cannot afford to leave pages behind because
+ * shrink_page_list() has a temp ref on them, or because they're transiently
+ * sitting in the lru_cache_add() pagevecs.
+ */
+static int
+invalidate_complete_page2(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping != mapping)
+		return 0;
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL))
+			return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+	if (PageDirty(page)) {
+		/* Dirty page: give up and leave it in the page cache */
+		spin_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+
+	clear_page_mlock(page);
+	BUG_ON(PagePrivate(page));
+	__remove_from_page_cache(page);
+	spin_unlock_irq(&mapping->tree_lock);
+	page_cache_release(page);	/* pagecache ref */
+	return 1;
+}
+
+/*
+ * Call the mapping's ->launder_page() on @page if the page is dirty, still
+ * belongs to @mapping and the operation is provided.  Returns the hook's
+ * result, or 0 when it was not called.
+ */
+static int do_launder_page(struct address_space *mapping, struct page *page)
+{
+	if (PageDirty(page) && page->mapping == mapping &&
+	    mapping->a_ops->launder_page != NULL)
+		return mapping->a_ops->launder_page(page);
+
+	return 0;
+}
+
+/**
+ * invalidate_inode_pages2_range - remove range of pages from an address_space
+ * @mapping: the address_space
+ * @start: the page offset 'from' which to invalidate
+ * @end: the page offset 'to' which to invalidate (inclusive)
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Returns -EBUSY if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ struct pagevec pvec;
+ pgoff_t next;
+ int i;
+ int ret = 0; /* last error seen; 0 if every page was invalidated */
+ int ret2 = 0; /* per-page result */
+ int did_range_unmap = 0; /* set once the bulk unmap has been done */
+ int wrapped = 0; /* set when page_index + 1 wrapped round to 0 */
+
+ pagevec_init(&pvec, 0);
+ next = start;
+ /* Each lookup asks for at most the number of pages left in [next, end] */
+ while (next <= end && !wrapped &&
+ pagevec_lookup(&pvec, mapping, next,
+ min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ pgoff_t page_index;
+
+ lock_page(page);
+ /* The page left this mapping while we waited for the lock */
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+ page_index = page->index;
+ next = page_index + 1;
+ if (next == 0)
+ wrapped = 1;
+ if (page_index > end) {
+ unlock_page(page);
+ break;
+ }
+ wait_on_page_writeback(page);
+ if (page_mapped(page)) {
+ if (!did_range_unmap) {
+ /*
+ * Zap the rest of the file in one hit.
+ */
+ unmap_mapping_range(mapping,
+ (loff_t)page_index<<PAGE_CACHE_SHIFT,
+ (loff_t)(end - page_index + 1)
+ << PAGE_CACHE_SHIFT,
+ 0);
+ did_range_unmap = 1;
+ } else {
+ /*
+ * Just zap this page
+ */
+ unmap_mapping_range(mapping,
+ (loff_t)page_index<<PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
+ }
+ BUG_ON(page_mapped(page));
+ ret2 = do_launder_page(mapping, page);
+ if (ret2 == 0) {
+ if (!invalidate_complete_page2(mapping, page))
+ ret2 = -EBUSY;
+ }
+ /* Remember the failure but keep going with the remaining pages */
+ if (ret2 < 0)
+ ret = ret2;
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
+
+/**
+ * invalidate_inode_pages2 - remove all pages from an address_space
+ * @mapping: the address_space
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Returns -EBUSY if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2(struct address_space *mapping)
+{
+ /* Start at index 0 and pass -1 as the end index to cover every page */
+ return invalidate_inode_pages2_range(mapping, 0, -1);
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 0000000..cb00b74
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,188 @@
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+/**
+ * kstrdup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns the copy, or NULL if @s is NULL or the allocation fails.
+ */
+char *kstrdup(const char *s, gfp_t gfp)
+{
+	size_t size;
+	char *copy;
+
+	if (s == NULL)
+		return NULL;
+
+	size = strlen(s) + 1;	/* include the terminating NUL */
+	copy = kmalloc_track_caller(size, gfp);
+	if (copy == NULL)
+		return NULL;
+
+	memcpy(copy, s, size);
+	return copy;
+}
+EXPORT_SYMBOL(kstrdup);
+
+/**
+ * kstrndup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @max: read at most @max chars from @s
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * The copy is always NUL-terminated.  Returns the copy, or NULL if @s is
+ * NULL or the allocation fails.
+ */
+char *kstrndup(const char *s, size_t max, gfp_t gfp)
+{
+	size_t len;
+	char *copy;
+
+	if (s == NULL)
+		return NULL;
+
+	len = strnlen(s, max);
+	copy = kmalloc_track_caller(len + 1, gfp);
+	if (copy == NULL)
+		return NULL;
+
+	memcpy(copy, s, len);
+	copy[len] = '\0';
+	return copy;
+}
+EXPORT_SYMBOL(kstrndup);
+
+/**
+ * kmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Returns the newly allocated copy, or NULL if the allocation fails.
+ */
+void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+	void *copy = kmalloc_track_caller(len, gfp);
+
+	if (copy == NULL)
+		return NULL;
+
+	memcpy(copy, src, len);
+	return copy;
+}
+EXPORT_SYMBOL(kmemdup);
+
+/**
+ * __krealloc - like krealloc() but don't free @p.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately
+ * like, for example, with RCU.
+ *
+ * Returns ZERO_SIZE_PTR when @new_size is 0, @p itself when ksize(@p) is
+ * already at least @new_size, and otherwise a new allocation (NULL on
+ * failure) holding a copy of the old contents.
+ */
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	size_t old_size;
+	void *ret;
+
+	if (unlikely(new_size == 0))
+		return ZERO_SIZE_PTR;
+
+	old_size = p ? ksize(p) : 0;
+	if (old_size >= new_size)
+		return (void *)p;
+
+	ret = kmalloc_track_caller(new_size, flags);
+	if (ret && p)
+		memcpy(ret, p, old_size);
+
+	return ret;
+}
+EXPORT_SYMBOL(__krealloc);
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ *
+ * If the reallocation fails, %NULL is returned and @p is not freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	void *new_buf;
+
+	if (unlikely(new_size == 0)) {
+		kfree(p);
+		return ZERO_SIZE_PTR;
+	}
+
+	new_buf = __krealloc(p, new_size, flags);
+	/* Only drop the old buffer once a different one has replaced it */
+	if (new_buf != NULL && new_buf != p)
+		kfree(p);
+
+	return new_buf;
+}
+EXPORT_SYMBOL(krealloc);
+
+/**
+ * strndup_user - duplicate an existing string from user space
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Returns the kernel copy, or an ERR_PTR: -EFAULT if strnlen_user() reports
+ * 0 or the copy from user space fails, -EINVAL if the reported length
+ * exceeds @n, -ENOMEM if the allocation fails.
+ */
+char *strndup_user(const char __user *s, long n)
+{
+	long len = strnlen_user(s, n);
+	char *copy;
+
+	if (len == 0)
+		return ERR_PTR(-EFAULT);
+	if (len > n)
+		return ERR_PTR(-EINVAL);
+
+	copy = kmalloc(len, GFP_KERNEL);
+	if (copy == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(copy, s, len)) {
+		kfree(copy);
+		return ERR_PTR(-EFAULT);
+	}
+
+	/* Force termination of the copied bytes */
+	copy[len - 1] = '\0';
+	return copy;
+}
+EXPORT_SYMBOL(strndup_user);
+
+#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+/*
+ * Default mmap layout, built only when HAVE_ARCH_PICK_MMAP_LAYOUT is not
+ * defined: mmap_base is set to TASK_UNMAPPED_BASE and arch_get_unmapped_area /
+ * arch_unmap_area are installed as @mm's area hooks.
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
+}
+#endif
+
+/*
+ * Weak default for get_user_pages_fast(): calls get_user_pages() on
+ * current->mm with mmap_sem held for read and returns its result.
+ */
+int __attribute__((weak)) get_user_pages_fast(unsigned long start,
+				int nr_pages, int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	int nr;
+
+	down_read(&mm->mmap_sem);
+	nr = get_user_pages(current, mm, start, nr_pages, write,
+			    0, pages, NULL);
+	up_read(&mm->mmap_sem);
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
new file mode 100644
index 0000000..4172ce4
--- /dev/null
+++ b/mm/vmalloc.c
@@ -0,0 +1,1812 @@
+/*
+ * linux/mm/vmalloc.c
+ *
+ * Copyright (C) 1993 Linus Torvalds
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
+ * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ * Numa awareness, Christoph Lameter, SGI, June 2005
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugobjects.h>
+#include <linux/kallsyms.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
+#include <linux/bootmem.h>
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+
+
+/*** Page table manipulation functions ***/
+
+/*
+ * Clear every pte of init_mm under @pmd for addresses in [addr, end),
+ * warning about any entry that was neither empty nor present.
+ */
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	do {
+		pte_t old = ptep_get_and_clear(&init_mm, addr, pte);
+
+		WARN_ON(!pte_none(old) && !pte_present(old));
+		pte++;
+		addr += PAGE_SIZE;
+	} while (addr != end);
+}
+
+/*
+ * Walk the pmd entries under @pud covering [addr, end) and clear the ptes
+ * below each one that pmd_none_or_clear_bad() does not reject.
+ */
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	unsigned long next;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_none_or_clear_bad(pmd))
+			vunmap_pte_range(pmd, addr, next);
+		pmd++;
+		addr = next;
+	} while (addr != end);
+}
+
+/*
+ * Walk the pud entries under @pgd covering [addr, end) and descend into each
+ * one that pud_none_or_clear_bad() does not reject.
+ */
+static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	unsigned long next;
+
+	do {
+		next = pud_addr_end(addr, end);
+		if (!pud_none_or_clear_bad(pud))
+			vunmap_pmd_range(pud, addr, next);
+		pud++;
+		addr = next;
+	} while (addr != end);
+}
+
+/*
+ * Clear the kernel page table entries covering [addr, end).
+ * @addr must be below @end (BUG_ON otherwise).
+ */
+static void vunmap_page_range(unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pgd_t *pgd;
+
+	BUG_ON(addr >= end);
+
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none_or_clear_bad(pgd))
+			vunmap_pud_range(pgd, addr, next);
+		pgd++;
+		addr = next;
+	} while (addr != end);
+}
+
+/*
+ * Install ptes for [addr, end) under @pmd, taking pages from @pages.
+ *
+ * *@nr is a running index into @pages which helps higher level callers keep
+ * track of where we're up to; it is advanced once per pte written.
+ *
+ * Returns 0, -ENOMEM if the pte table cannot be allocated or a page pointer
+ * is NULL, or -EBUSY if a pte is already populated.
+ */
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+{
+	pte_t *pte = pte_alloc_kernel(pmd, addr);
+
+	if (!pte)
+		return -ENOMEM;
+
+	do {
+		struct page *page = pages[*nr];
+
+		if (WARN_ON(!pte_none(*pte)))
+			return -EBUSY;
+		if (WARN_ON(!page))
+			return -ENOMEM;
+
+		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
+		(*nr)++;
+		pte++;
+		addr += PAGE_SIZE;
+	} while (addr != end);
+
+	return 0;
+}
+
+/*
+ * Map [addr, end) under @pud one pmd at a time.  Returns 0 on success;
+ * any failure, including one from vmap_pte_range(), is reported as -ENOMEM.
+ */
+static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+{
+	pmd_t *pmd = pmd_alloc(&init_mm, pud, addr);
+	unsigned long next;
+
+	if (!pmd)
+		return -ENOMEM;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		if (vmap_pte_range(pmd, addr, next, prot, pages, nr) != 0)
+			return -ENOMEM;
+		pmd++;
+		addr = next;
+	} while (addr != end);
+
+	return 0;
+}
+
+/*
+ * Map [addr, end) under @pgd one pud at a time.  Returns 0 on success;
+ * any failure, including one from vmap_pmd_range(), is reported as -ENOMEM.
+ */
+static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+{
+	pud_t *pud = pud_alloc(&init_mm, pgd, addr);
+	unsigned long next;
+
+	if (!pud)
+		return -ENOMEM;
+
+	do {
+		next = pud_addr_end(addr, end);
+		if (vmap_pmd_range(pud, addr, next, prot, pages, nr) != 0)
+			return -ENOMEM;
+		pud++;
+		addr = next;
+	} while (addr != end);
+
+	return 0;
+}
+
+/*
+ * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
+ * will have pfns corresponding to the "pages" array.
+ *
+ * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ *
+ * Returns the number of ptes installed, or a negative errno.  The cache
+ * flush for the range is issued in both cases.
+ */
+static int vmap_page_range(unsigned long start, unsigned long end,
+				pgprot_t prot, struct page **pages)
+{
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgd;
+	int nr = 0;
+	int err = 0;
+
+	BUG_ON(addr >= end);
+
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
+		if (err)
+			break;
+		pgd++;
+		addr = next;
+	} while (addr != end);
+	flush_cache_vmap(start, end);
+
+	return unlikely(err) ? err : nr;
+}
+
+/*
+ * Non-zero if @x lies in the module area (where one is defined) or is
+ * accepted by is_vmalloc_addr().
+ */
+static inline int is_vmalloc_or_module_addr(const void *x)
+{
+	/*
+	 * ARM, x86-64 and sparc64 put modules in a special place,
+	 * and fall back on vmalloc() if that fails. Others
+	 * just put it in the vmalloc space.
+	 */
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+	const unsigned long addr = (unsigned long)x;
+
+	if (MODULES_VADDR <= addr && addr < MODULES_END)
+		return 1;
+#endif
+	return is_vmalloc_addr(x);
+}
+
+/*
+ * Walk a vmap address to the struct page it maps.
+ *
+ * Returns NULL if any level of the kernel page table is empty for the
+ * address or the final pte is not present.
+ */
+struct page *vmalloc_to_page(const void *vmalloc_addr)
+{
+	unsigned long addr = (unsigned long) vmalloc_addr;
+	struct page *page = NULL;
+	pgd_t *pgd = pgd_offset_k(addr);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+
+	/*
+	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
+	 * architectures that do not vmalloc module space
+	 */
+	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
+
+	if (pgd_none(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	ptep = pte_offset_map(pmd, addr);
+	pte = *ptep;
+	if (pte_present(pte))
+		page = pte_page(pte);
+	pte_unmap(ptep);
+
+	return page;
+}
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page frame number.
+ * The lookup goes through vmalloc_to_page(); its result is passed to
+ * page_to_pfn() without a NULL check.
+ */
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+{
+	struct page *page = vmalloc_to_page(vmalloc_addr);
+
+	return page_to_pfn(page);
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
+
+/*** Global kva allocator ***/
+
+#define VM_LAZY_FREE 0x01
+#define VM_LAZY_FREEING 0x02
+#define VM_VM_AREA 0x04
+
+/*
+ * One allocated range of kernel virtual address space. alloc_vmap_area()
+ * sets va_start to the chosen address and va_end to address + size.
+ */
+struct vmap_area {
+ unsigned long va_start;
+ unsigned long va_end;
+ unsigned long flags; /* presumably VM_LAZY_FREE etc. - TODO confirm */
+ struct rb_node rb_node; /* address sorted rbtree */
+ struct list_head list; /* address sorted list */
+ struct list_head purge_list; /* "lazy purge" list */
+ void *private;
+ struct rcu_head rcu_head; /* used by call_rcu() in __free_vmap_area() */
+};
+
+static DEFINE_SPINLOCK(vmap_area_lock);
+static struct rb_root vmap_area_root = RB_ROOT;
+static LIST_HEAD(vmap_area_list);
+
+/*
+ * Look up the vmap_area whose va_start is exactly @addr in the rbtree.
+ * Returns NULL if there is none.
+ * NOTE(review): presumably called with vmap_area_lock held - confirm
+ * against callers.
+ */
+static struct vmap_area *__find_vmap_area(unsigned long addr)
+{
+	struct rb_node *node = vmap_area_root.rb_node;
+
+	while (node != NULL) {
+		struct vmap_area *va = rb_entry(node, struct vmap_area, rb_node);
+
+		if (addr == va->va_start)
+			return va;
+
+		node = (addr < va->va_start) ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Link @va into the address-sorted rbtree and into vmap_area_list right
+ * after its rbtree predecessor.  BUG()s if a node is met for which @va is
+ * neither below its va_end nor above its va_start.
+ */
+static void __insert_vmap_area(struct vmap_area *va)
+{
+	struct rb_node **link = &vmap_area_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node *prev_node;
+
+	while (*link) {
+		struct vmap_area *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct vmap_area, rb_node);
+		if (va->va_start < cur->va_end)
+			link = &parent->rb_left;
+		else if (va->va_end > cur->va_start)
+			link = &parent->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&va->rb_node, parent, link);
+	rb_insert_color(&va->rb_node, &vmap_area_root);
+
+	/* address-sort this list so it is usable like the vmlist */
+	prev_node = rb_prev(&va->rb_node);
+	if (prev_node) {
+		struct vmap_area *prev;
+
+		prev = rb_entry(prev_node, struct vmap_area, rb_node);
+		list_add_rcu(&va->list, &prev->list);
+	} else {
+		list_add_rcu(&va->list, &vmap_area_list);
+	}
+}
+
+static void purge_vmap_area_lazy(void);
+
+/*
+ * Allocate a region of KVA of the specified size and alignment, within the
+ * vstart and vend.
+ *
+ * @size must be non-zero and a multiple of PAGE_SIZE (both enforced with
+ * BUG_ON).  The vmap_area descriptor is allocated with kmalloc_node() on
+ * @node using @gfp_mask & GFP_RECLAIM_MASK.
+ *
+ * If no hole is found, the lazily freed areas are purged once and the search
+ * is retried.  Returns the new vmap_area, ERR_PTR(-ENOMEM) if the descriptor
+ * cannot be allocated, or ERR_PTR(-EBUSY) if no suitable hole exists; in the
+ * -EBUSY case the descriptor is freed before returning.
+ */
+static struct vmap_area *alloc_vmap_area(unsigned long size,
+				unsigned long align,
+				unsigned long vstart, unsigned long vend,
+				int node, gfp_t gfp_mask)
+{
+	struct vmap_area *va;
+	struct rb_node *n;
+	unsigned long addr;
+	int purged = 0;
+
+	BUG_ON(!size);
+	BUG_ON(size & ~PAGE_MASK);
+
+	va = kmalloc_node(sizeof(struct vmap_area),
+			gfp_mask & GFP_RECLAIM_MASK, node);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+retry:
+	addr = ALIGN(vstart, align);
+
+	spin_lock(&vmap_area_lock);
+	/* addr + size must not wrap around the address space */
+	if (addr + size - 1 < addr)
+		goto overflow;
+
+	/* XXX: could have a last_hole cache */
+	n = vmap_area_root.rb_node;
+	if (n) {
+		struct vmap_area *first = NULL;
+
+		do {
+			struct vmap_area *tmp;
+			tmp = rb_entry(n, struct vmap_area, rb_node);
+			if (tmp->va_end >= addr) {
+				if (!first && tmp->va_start < addr + size)
+					first = tmp;
+				n = n->rb_left;
+			} else {
+				first = tmp;
+				n = n->rb_right;
+			}
+		} while (n);
+
+		if (!first)
+			goto found;
+
+		if (first->va_end < addr) {
+			n = rb_next(&first->rb_node);
+			if (n)
+				first = rb_entry(n, struct vmap_area, rb_node);
+			else
+				goto found;
+		}
+
+		/* Step past each area that collides with the candidate range */
+		while (addr + size > first->va_start && addr + size <= vend) {
+			addr = ALIGN(first->va_end + PAGE_SIZE, align);
+			if (addr + size - 1 < addr)
+				goto overflow;
+
+			n = rb_next(&first->rb_node);
+			if (n)
+				first = rb_entry(n, struct vmap_area, rb_node);
+			else
+				goto found;
+		}
+	}
+found:
+	if (addr + size > vend) {
+overflow:
+		spin_unlock(&vmap_area_lock);
+		if (!purged) {
+			purge_vmap_area_lazy();
+			purged = 1;
+			goto retry;
+		}
+		if (printk_ratelimit())
+			printk(KERN_WARNING "vmap allocation failed: "
+				 "use vmalloc=<size> to increase size.\n");
+		/* The descriptor was never inserted: release it, don't leak it */
+		kfree(va);
+		return ERR_PTR(-EBUSY);
+	}
+
+	BUG_ON(addr & (align-1));
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->flags = 0;
+	__insert_vmap_area(va);
+	spin_unlock(&vmap_area_lock);
+
+	return va;
+}
+
+ /* RCU callback queued by __free_vmap_area(): release the descriptor. */
+ static void rcu_free_va(struct rcu_head *head)
+ {
+ 	kfree(container_of(head, struct vmap_area, rcu_head));
+ }
+
+ /*
+  * Unlink @va from the rbtree and from the RCU list, then hand the descriptor
+  * to call_rcu() for freeing.  BUG()s if @va is not currently in the tree.
+  * Both visible callers hold vmap_area_lock around this.
+  */
+ static void __free_vmap_area(struct vmap_area *va)
+ {
+ 	struct rb_node *node = &va->rb_node;
+
+ 	BUG_ON(RB_EMPTY_NODE(node));
+ 	rb_erase(node, &vmap_area_root);
+ 	RB_CLEAR_NODE(node);
+ 	list_del_rcu(&va->list);
+
+ 	call_rcu(&va->rcu_head, rcu_free_va);
+ }
+
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ __free_vmap_area(va);
+ spin_unlock(&vmap_area_lock);
+}
+
+/*
+ * Clear the pagetable entries of a given vmap_area
+ */
+static void unmap_vmap_area(struct vmap_area *va)
+{
+ vunmap_page_range(va->va_start, va->va_end);
+}
+
+/*
+ * lazy_max_pages is the maximum amount of virtual address space we gather up
+ * before attempting to purge with a TLB flush.
+ *
+ * There is a tradeoff here: a larger number will cover more kernel page tables
+ * and take slightly longer to purge, but it will linearly reduce the number of
+ * global TLB flushes that must be performed. It would seem natural to scale
+ * this number up linearly with the number of CPUs (because vmapping activity
+ * could also scale linearly with the number of CPUs), however it is likely
+ * that in practice, workloads might be constrained in other ways that mean
+ * vmap activity will not scale linearly with CPUs. Also, I want to be
+ * conservative and not introduce a big latency on huge systems, so go with
+ * a less aggressive log scale. It will still be an improvement over the old
+ * code, and it will be simple to change the scale factor if we find that it
+ * becomes a problem on bigger systems.
+ */
+static unsigned long lazy_max_pages(void)
+{
+ unsigned int log;
+
+ log = fls(num_online_cpus());
+
+ return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+}
+
+ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); /* pages marked VM_LAZY_FREE and not yet purged */
+
+/*
+ * Purges all lazily-freed vmap areas.
+ *
+ * If sync is 0 then don't purge if there is already a purge in progress.
+ * If force_flush is 1, then flush kernel TLBs between *start and *end even
+ * if we found no lazy vmap areas to unmap (callers can use this to optimise
+ * their own TLB flushing).
+ * Returns with *start = min(*start, lowest purged address)
+ * *end = max(*end, highest purged address)
+ */
+static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
+ int sync, int force_flush)
+{
+ static DEFINE_SPINLOCK(purge_lock);
+ LIST_HEAD(valist);
+ struct vmap_area *va;
+ struct vmap_area *n_va;
+ int nr = 0;
+
+ /*
+ * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
+ * should not expect such behaviour. This just simplifies locking for
+ * the case that isn't actually used at the moment anyway.
+ */
+ if (!sync && !force_flush) {
+ if (!spin_trylock(&purge_lock))
+ return;
+ } else
+ spin_lock(&purge_lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(va, &vmap_area_list, list) {
+ if (va->flags & VM_LAZY_FREE) {
+ if (va->va_start < *start)
+ *start = va->va_start;
+ if (va->va_end > *end)
+ *end = va->va_end;
+ nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unmap_vmap_area(va);
+ list_add_tail(&va->purge_list, &valist);
+ va->flags |= VM_LAZY_FREEING;
+ va->flags &= ~VM_LAZY_FREE;
+ }
+ }
+ rcu_read_unlock();
+
+ if (nr) {
+ BUG_ON(nr > atomic_read(&vmap_lazy_nr));
+ atomic_sub(nr, &vmap_lazy_nr);
+ }
+
+ if (nr || force_flush)
+ flush_tlb_kernel_range(*start, *end);
+
+ if (nr) {
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry_safe(va, n_va, &valist, purge_list)
+ __free_vmap_area(va);
+ spin_unlock(&vmap_area_lock);
+ }
+ spin_unlock(&purge_lock);
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
+ * is already purging.
+ */
+static void try_purge_vmap_area_lazy(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+
+ __purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas.
+ */
+static void purge_vmap_area_lazy(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+
+ __purge_vmap_area_lazy(&start, &end, 1, 0);
+}
+
+/*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+ va->flags |= VM_LAZY_FREE;
+ atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
+ if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+ try_purge_vmap_area_lazy();
+}
+
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+ flush_cache_vunmap(va->va_start, va->va_end);
+ free_unmap_vmap_area_noflush(va);
+}
+
+ /*
+  * Locked lookup: run __find_vmap_area() for @addr under vmap_area_lock and
+  * return whatever it found (NULL if nothing).  The lock is dropped before
+  * returning.
+  */
+ static struct vmap_area *find_vmap_area(unsigned long addr)
+ {
+ 	struct vmap_area *found;
+
+ 	spin_lock(&vmap_area_lock);
+ 	found = __find_vmap_area(addr);
+ 	spin_unlock(&vmap_area_lock);
+
+ 	return found;
+ }
+
+ /*
+  * Look up the vmap area for @addr and free/unmap it.  It is a BUG() if no
+  * area is found.
+  */
+ static void free_unmap_vmap_area_addr(unsigned long addr)
+ {
+ 	struct vmap_area *va = find_vmap_area(addr);
+
+ 	BUG_ON(!va);
+ 	free_unmap_vmap_area(va);
+ }
+
+
+/*** Per cpu kva allocator ***/
+
+/*
+ * vmap space is limited especially on 32 bit architectures. Ensure there is
+ * room for at least 16 percpu vmap blocks per CPU.
+ */
+/*
+ * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
+ * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
+ * instead (we just need a rough idea)
+ */
+#if BITS_PER_LONG == 32
+#define VMALLOC_SPACE (128UL*1024*1024)
+#else
+#define VMALLOC_SPACE (128UL*1024*1024*1024)
+#endif
+
+#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
+#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
+#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
+#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
+#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
+#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
+#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
+ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
+ VMALLOC_PAGES / NR_CPUS / 16))
+
+#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+
+static bool vmap_initialized __read_mostly = false;
+
+ struct vmap_block_queue {
+ spinlock_t lock; /* taken around changes to the free and dirty lists */
+ struct list_head free; /* blocks vb_alloc() may still allocate from */
+ struct list_head dirty; /* blocks that have had pages freed by vb_free() */
+ unsigned int nr_dirty; /* zeroed in vmalloc_init(); not otherwise used in this part of the file */
+ };
+
+ struct vmap_block {
+ spinlock_t lock; /* held while the maps and counters below are updated */
+ struct vmap_area *va; /* the VMAP_BLOCK_SIZE region backing this block */
+ struct vmap_block_queue *vbq; /* per-cpu queue the block was created on */
+ unsigned long free, dirty; /* page counts: not yet allocated / already freed */
+ DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); /* one bit per page handed out by vb_alloc() */
+ DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); /* one bit per page returned through vb_free() */
+ union {
+ struct {
+ struct list_head free_list; /* link on vbq->free */
+ struct list_head dirty_list; /* link on vbq->dirty */
+ };
+ struct rcu_head rcu_head; /* used by free_vmap_block() once the block is off both lists */
+ };
+ };
+
+/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
+static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
+
+/*
+ * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * in the free path. Could get rid of this if we change the API to return a
+ * "cookie" from alloc, to be passed to free. But no big deal yet.
+ */
+static DEFINE_SPINLOCK(vmap_block_tree_lock);
+static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+
+/*
+ * We should probably have a fallback mechanism to allocate virtual memory
+ * out of partially filled vmap blocks. However vmap block sizing should be
+ * fairly reasonable according to the vmalloc size, so it shouldn't be a
+ * big problem.
+ */
+
+ /*
+  * Convert an address inside vmalloc space to the index of the vmap block
+  * containing it.  Indices count VMAP_BLOCK_SIZE units from VMALLOC_START
+  * rounded down to a block boundary; this is the key used in
+  * vmap_block_tree.
+  */
+ static unsigned long addr_to_vb_idx(unsigned long addr)
+ {
+ 	unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
+
+ 	return (addr - base) / VMAP_BLOCK_SIZE;
+ }
+
+ /*
+  * Create a new, completely free vmap block and queue it on the current CPU.
+  *
+  * A VMAP_BLOCK_SIZE sized and aligned region of vmalloc space is reserved,
+  * the block is published in vmap_block_tree under its block index and then
+  * added to the head of this CPU's free list.
+  *
+  * Returns the block, or an ERR_PTR(): -ENOMEM if the block structure cannot
+  * be allocated, otherwise the error reported by alloc_vmap_area() or
+  * radix_tree_preload().  Nothing stays allocated on failure.
+  */
+ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+ {
+ 	struct vmap_block_queue *vbq;
+ 	struct vmap_block *vb;
+ 	struct vmap_area *va;
+ 	unsigned long vb_idx;
+ 	int node, err;
+
+ 	node = numa_node_id();
+
+ 	vb = kmalloc_node(sizeof(struct vmap_block),
+ 			gfp_mask & GFP_RECLAIM_MASK, node);
+ 	if (unlikely(!vb))
+ 		return ERR_PTR(-ENOMEM);
+
+ 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
+ 					VMALLOC_START, VMALLOC_END,
+ 					node, gfp_mask);
+ 	if (unlikely(IS_ERR(va))) {
+ 		kfree(vb);
+ 		return ERR_PTR(PTR_ERR(va));
+ 	}
+
+ 	err = radix_tree_preload(gfp_mask);
+ 	if (unlikely(err)) {
+ 		kfree(vb);
+ 		free_vmap_area(va);
+ 		return ERR_PTR(err);
+ 	}
+
+ 	spin_lock_init(&vb->lock);
+ 	vb->va = va;
+ 	vb->free = VMAP_BBMAP_BITS;
+ 	vb->dirty = 0;
+ 	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
+ 	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+ 	INIT_LIST_HEAD(&vb->free_list);
+ 	INIT_LIST_HEAD(&vb->dirty_list);
+
+ 	/* The insert cannot fail for lack of memory after the preload. */
+ 	vb_idx = addr_to_vb_idx(va->va_start);
+ 	spin_lock(&vmap_block_tree_lock);
+ 	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
+ 	spin_unlock(&vmap_block_tree_lock);
+ 	BUG_ON(err);
+ 	radix_tree_preload_end();
+
+ 	vbq = &get_cpu_var(vmap_block_queue);
+ 	vb->vbq = vbq;
+ 	spin_lock(&vbq->lock);
+ 	list_add(&vb->free_list, &vbq->free);
+ 	spin_unlock(&vbq->lock);
+ 	/* Must name the same per-cpu variable as the get_cpu_var() above. */
+ 	put_cpu_var(vmap_block_queue);
+
+ 	return vb;
+ }
+
+ /* RCU callback queued by free_vmap_block(): release the block structure. */
+ static void rcu_free_vb(struct rcu_head *head)
+ {
+ 	kfree(container_of(head, struct vmap_block, rcu_head));
+ }
+
+ /*
+  * Retire a vmap block: take it off its queue's free and dirty lists, drop
+  * it from vmap_block_tree, queue its address range for lazy freeing and
+  * free the block structure after an RCU grace period.
+  *
+  * It is a BUG() if the radix tree slot does not hold @vb.
+  */
+ static void free_vmap_block(struct vmap_block *vb)
+ {
+ 	struct vmap_block_queue *vbq = vb->vbq;
+ 	unsigned long idx = addr_to_vb_idx(vb->va->va_start);
+ 	struct vmap_block *removed;
+
+ 	spin_lock(&vbq->lock);
+ 	if (!list_empty(&vb->free_list))
+ 		list_del(&vb->free_list);
+ 	if (!list_empty(&vb->dirty_list))
+ 		list_del(&vb->dirty_list);
+ 	spin_unlock(&vbq->lock);
+
+ 	spin_lock(&vmap_block_tree_lock);
+ 	removed = radix_tree_delete(&vmap_block_tree, idx);
+ 	spin_unlock(&vmap_block_tree_lock);
+ 	BUG_ON(removed != vb);
+
+ 	free_unmap_vmap_area_noflush(vb->va);
+ 	call_rcu(&vb->rcu_head, rcu_free_vb);
+ }
+
+ /*
+  * Allocate @size bytes of virtual address space from the per-cpu vmap
+  * blocks.
+  *
+  * @size must be page aligned and at most VMAP_MAX_ALLOC pages (both are
+  * BUG_ON checks).  The request is rounded up to a power-of-two number of
+  * pages.  Blocks on this CPU's free list are tried in order; if none has
+  * room a new block is created and the search is repeated.
+  *
+  * Returns the start address, or the ERR_PTR() from new_vmap_block().
+  */
+ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
+ {
+ 	struct vmap_block_queue *vbq;
+ 	struct vmap_block *vb;
+ 	unsigned long addr = 0;
+ 	unsigned int order;
+
+ 	BUG_ON(size & ~PAGE_MASK);
+ 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+ 	order = get_order(size);
+
+ again:
+ 	rcu_read_lock();
+ 	vbq = &get_cpu_var(vmap_block_queue);
+ 	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ 		int i;
+
+ 		spin_lock(&vb->lock);
+ 		i = bitmap_find_free_region(vb->alloc_map,
+ 						VMAP_BBMAP_BITS, order);
+
+ 		if (i >= 0) {
+ 			addr = vb->va->va_start + (i << PAGE_SHIFT);
+ 			BUG_ON(addr_to_vb_idx(addr) !=
+ 					addr_to_vb_idx(vb->va->va_start));
+ 			vb->free -= 1UL << order;
+ 			if (vb->free == 0) {
+ 				/* Fully allocated: stop offering this block. */
+ 				spin_lock(&vbq->lock);
+ 				list_del_init(&vb->free_list);
+ 				spin_unlock(&vbq->lock);
+ 			}
+ 			spin_unlock(&vb->lock);
+ 			break;
+ 		}
+ 		spin_unlock(&vb->lock);
+ 	}
+ 	/* Must name the same per-cpu variable as the get_cpu_var() above. */
+ 	put_cpu_var(vmap_block_queue);
+ 	rcu_read_unlock();
+
+ 	if (!addr) {
+ 		vb = new_vmap_block(gfp_mask);
+ 		if (IS_ERR(vb))
+ 			return vb;
+ 		goto again;
+ 	}
+
+ 	return (void *)addr;
+ }
+
+ /*
+  * Give back a range obtained from vb_alloc().
+  *
+  * The pages are only marked dirty in the owning block; the block itself is
+  * torn down once every one of its pages is dirty.  @size must be page
+  * aligned and at most VMAP_MAX_ALLOC pages, and @addr must belong to a
+  * known vmap block (all BUG_ON checks).
+  */
+ static void vb_free(const void *addr, unsigned long size)
+ {
+ 	unsigned long start = (unsigned long)addr;
+ 	unsigned long page_off;
+ 	unsigned int order;
+ 	struct vmap_block *vb;
+
+ 	BUG_ON(size & ~PAGE_MASK);
+ 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+
+ 	flush_cache_vunmap(start, start + size);
+
+ 	order = get_order(size);
+ 	page_off = (start & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
+
+ 	rcu_read_lock();
+ 	vb = radix_tree_lookup(&vmap_block_tree, addr_to_vb_idx(start));
+ 	rcu_read_unlock();
+ 	BUG_ON(!vb);
+
+ 	spin_lock(&vb->lock);
+ 	bitmap_allocate_region(vb->dirty_map, page_off, order);
+ 	if (!vb->dirty) {
+ 		/* First dirty page: put the block on the dirty list. */
+ 		spin_lock(&vb->vbq->lock);
+ 		list_add(&vb->dirty_list, &vb->vbq->dirty);
+ 		spin_unlock(&vb->vbq->lock);
+ 	}
+ 	vb->dirty += 1UL << order;
+ 	if (vb->dirty != VMAP_BBMAP_BITS) {
+ 		spin_unlock(&vb->lock);
+ 		return;
+ 	}
+
+ 	/* Every page has been freed: the whole block can go. */
+ 	BUG_ON(vb->free || !list_empty(&vb->free_list));
+ 	spin_unlock(&vb->lock);
+ 	free_vmap_block(vb);
+ }
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ int cpu;
+ int flush = 0;
+
+ if (unlikely(!vmap_initialized))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+ struct vmap_block *vb;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ int i;
+
+ spin_lock(&vb->lock);
+ i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
+ while (i < VMAP_BBMAP_BITS) {
+ unsigned long s, e;
+ int j;
+ j = find_next_zero_bit(vb->dirty_map,
+ VMAP_BBMAP_BITS, i);
+
+ s = vb->va->va_start + (i << PAGE_SHIFT);
+ e = vb->va->va_start + (j << PAGE_SHIFT);
+ vunmap_page_range(s, e);
+ flush = 1;
+
+ if (s < start)
+ start = s;
+ if (e > end)
+ end = e;
+
+ i = j;
+ i = find_next_bit(vb->dirty_map,
+ VMAP_BBMAP_BITS, i);
+ }
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+ }
+
+ __purge_vmap_area_lazy(&start, &end, 1, flush);
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
+/**
+ * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
+ * @mem: the pointer returned by vm_map_ram
+ * @count: the count passed to that vm_map_ram call (cannot unmap partial)
+ */
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+ unsigned long size = count << PAGE_SHIFT;
+ unsigned long addr = (unsigned long)mem;
+
+ BUG_ON(!addr);
+ BUG_ON(addr < VMALLOC_START);
+ BUG_ON(addr > VMALLOC_END);
+ BUG_ON(addr & (PAGE_SIZE-1));
+
+ debug_check_no_locks_freed(mem, size);
+
+ if (likely(count <= VMAP_MAX_ALLOC))
+ vb_free(mem, size);
+ else
+ free_unmap_vmap_area_addr(addr);
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+/**
+ * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
+ * @pages: an array of pointers to the pages to be mapped
+ * @count: number of pages
+ * @node: prefer to allocate data structures on this node
+ * @prot: memory protection to use. PAGE_KERNEL for regular RAM
+ *
+ * Returns: a pointer to the address that has been mapped, or %NULL on failure
+ */
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+{
+ unsigned long size = count << PAGE_SHIFT;
+ unsigned long addr;
+ void *mem;
+
+ if (likely(count <= VMAP_MAX_ALLOC)) {
+ mem = vb_alloc(size, GFP_KERNEL);
+ if (IS_ERR(mem))
+ return NULL;
+ addr = (unsigned long)mem;
+ } else {
+ struct vmap_area *va;
+ va = alloc_vmap_area(size, PAGE_SIZE,
+ VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ if (IS_ERR(va))
+ return NULL;
+
+ addr = va->va_start;
+ mem = (void *)addr;
+ }
+ if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+ vm_unmap_ram(mem, count);
+ return NULL;
+ }
+ return mem;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+ /*
+  * Boot-time setup of the vmap layer: initialise every possible CPU's block
+  * queue, create a vmap_area (from bootmem) for each region already on
+  * vmlist, and finally mark the layer as initialised.
+  */
+ void __init vmalloc_init(void)
+ {
+ 	struct vm_struct *vm;
+ 	int cpu;
+
+ 	for_each_possible_cpu(cpu) {
+ 		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+
+ 		spin_lock_init(&vbq->lock);
+ 		INIT_LIST_HEAD(&vbq->free);
+ 		INIT_LIST_HEAD(&vbq->dirty);
+ 		vbq->nr_dirty = 0;
+ 	}
+
+ 	/* Import existing vmlist entries. */
+ 	for (vm = vmlist; vm != NULL; vm = vm->next) {
+ 		struct vmap_area *va = alloc_bootmem(sizeof(struct vmap_area));
+
+ 		va->flags = vm->flags | VM_VM_AREA;
+ 		va->va_start = (unsigned long)vm->addr;
+ 		va->va_end = va->va_start + vm->size;
+ 		__insert_vmap_area(va);
+ 	}
+
+ 	vmap_initialized = true;
+ }
+
+ /*
+  * Immediately unmap @size bytes of kernel virtual space starting at @addr:
+  * cache flush first, then page table teardown, then a TLB flush for the
+  * same range.  Nothing is deferred.
+  */
+ void unmap_kernel_range(unsigned long addr, unsigned long size)
+ {
+ 	const unsigned long stop = addr + size;
+
+ 	flush_cache_vunmap(addr, stop);
+ 	vunmap_page_range(addr, stop);
+ 	flush_tlb_kernel_range(addr, stop);
+ }
+
+ /*
+  * Map the pages in *@pages over @area with protection @prot.  The last page
+  * of the area (the guard page) is left unmapped.
+  *
+  * A positive result from vmap_page_range() is taken as the number of page
+  * pointers consumed: *@pages is advanced by it and 0 is returned.  Any other
+  * result is returned unchanged.
+  */
+ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+ {
+ 	unsigned long start = (unsigned long)area->addr;
+ 	unsigned long stop = start + area->size - PAGE_SIZE;
+ 	int nr = vmap_page_range(start, stop, prot, *pages);
+
+ 	if (nr <= 0)
+ 		return nr;
+
+ 	*pages += nr;
+ 	return 0;
+ }
+EXPORT_SYMBOL_GPL(map_vm_area);
+
+/*** Old vmalloc interfaces ***/
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+ /*
+  * Reserve a vm_struct and matching vmap area of @size bytes (rounded up to
+  * whole pages, plus one guard page) within [@start, @end).
+  *
+  * With VM_IOREMAP in @flags the start address is aligned to a power of two
+  * derived from @size, clamped between PAGE_SHIFT and IOREMAP_MAX_ORDER bits.
+  * The new area is recorded with @caller as its owner and linked into the
+  * address-sorted vmlist.
+  *
+  * Must not be called from interrupt context (BUG_ON).  Returns NULL if
+  * @size rounds to zero or an allocation fails.
+  */
+ static struct vm_struct *__get_vm_area_node(unsigned long size,
+ 		unsigned long flags, unsigned long start, unsigned long end,
+ 		int node, gfp_t gfp_mask, void *caller)
+ {
+ 	/*
+ 	 * Plain automatic variable: a static here would be shared by all
+ 	 * concurrent callers and could be overwritten between the allocation
+ 	 * and the uses below.
+ 	 */
+ 	struct vmap_area *va;
+ 	struct vm_struct *area;
+ 	struct vm_struct *tmp, **p;
+ 	unsigned long align = 1;
+
+ 	BUG_ON(in_interrupt());
+ 	if (flags & VM_IOREMAP) {
+ 		int bit = fls(size);
+
+ 		if (bit > IOREMAP_MAX_ORDER)
+ 			bit = IOREMAP_MAX_ORDER;
+ 		else if (bit < PAGE_SHIFT)
+ 			bit = PAGE_SHIFT;
+
+ 		align = 1ul << bit;
+ 	}
+
+ 	size = PAGE_ALIGN(size);
+ 	if (unlikely(!size))
+ 		return NULL;
+
+ 	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
+ 	if (unlikely(!area))
+ 		return NULL;
+
+ 	/*
+ 	 * We always allocate a guard page.
+ 	 */
+ 	size += PAGE_SIZE;
+
+ 	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ 	if (IS_ERR(va)) {
+ 		kfree(area);
+ 		return NULL;
+ 	}
+
+ 	area->flags = flags;
+ 	area->addr = (void *)va->va_start;
+ 	area->size = size;
+ 	area->pages = NULL;
+ 	area->nr_pages = 0;
+ 	area->phys_addr = 0;
+ 	area->caller = caller;
+ 	va->private = area;
+ 	va->flags |= VM_VM_AREA;
+
+ 	/* Keep vmlist sorted by address. */
+ 	write_lock(&vmlist_lock);
+ 	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ 		if (tmp->addr >= area->addr)
+ 			break;
+ 	}
+ 	area->next = *p;
+ 	*p = area;
+ 	write_unlock(&vmlist_lock);
+
+ 	return area;
+ }
+
+ /*
+  * Reserve a kernel virtual area of @size bytes inside [@start, @end).  Uses
+  * GFP_KERNEL with no node preference and records the immediate caller as
+  * the owner.  Returns NULL on failure.
+  */
+ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+ 				unsigned long start, unsigned long end)
+ {
+ 	void *caller = __builtin_return_address(0);
+
+ 	return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+ 					caller);
+ }
+EXPORT_SYMBOL_GPL(__get_vm_area);
+
+/**
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ *
+ * Search an area of @size in the kernel virtual mapping area,
+ * and reserved it for out purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ */
+struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+{
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ -1, GFP_KERNEL, __builtin_return_address(0));
+}
+
+ /*
+  * Like get_vm_area(), except that the owner recorded for the area is the
+  * @caller passed in rather than this function's own return address.
+  */
+ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
+ 				void *caller)
+ {
+ 	const int any_node = -1;
+
+ 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ 					any_node, GFP_KERNEL, caller);
+ }
+
+ /*
+  * Reserve a kernel virtual area in the vmalloc range, allocating the
+  * descriptors on @node with @gfp_mask.  The immediate caller is recorded as
+  * the owner.  Returns NULL on failure.
+  */
+ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
+ 				int node, gfp_t gfp_mask)
+ {
+ 	void *caller = __builtin_return_address(0);
+
+ 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
+ 					gfp_mask, caller);
+ }
+
+ /*
+  * Return the vm_struct attached to the vmap area found for @addr, or NULL
+  * if there is no such area or it is not flagged VM_VM_AREA.
+  */
+ static struct vm_struct *find_vm_area(const void *addr)
+ {
+ 	struct vmap_area *va = find_vmap_area((unsigned long)addr);
+
+ 	if (!va || !(va->flags & VM_VM_AREA))
+ 		return NULL;
+
+ 	return va->private;
+ }
+
+/**
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ */
+struct vm_struct *remove_vm_area(const void *addr)
+{
+ struct vmap_area *va;
+
+ va = find_vmap_area((unsigned long)addr);
+ if (va && va->flags & VM_VM_AREA) {
+ struct vm_struct *vm = va->private;
+ struct vm_struct *tmp, **p;
+ free_unmap_vmap_area(va);
+ vm->size -= PAGE_SIZE;
+
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
+ ;
+ *p = tmp->next;
+ write_unlock(&vmlist_lock);
+
+ return vm;
+ }
+ return NULL;
+}
+
+ /*
+  * Common backend of vfree() and vunmap().
+  *
+  * Removes the area starting at @addr and frees its vm_struct.  When
+  * @deallocate_pages is set, every page recorded in the area and the page
+  * array itself are freed too.  A NULL @addr is ignored; an unaligned or
+  * unknown @addr only produces a warning.
+  */
+ static void __vunmap(const void *addr, int deallocate_pages)
+ {
+ 	struct vm_struct *area;
+ 	int idx;
+
+ 	if (!addr)
+ 		return;
+
+ 	if ((unsigned long)addr & (PAGE_SIZE-1)) {
+ 		WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+ 		return;
+ 	}
+
+ 	area = remove_vm_area(addr);
+ 	if (unlikely(!area)) {
+ 		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ 				addr);
+ 		return;
+ 	}
+
+ 	debug_check_no_locks_freed(addr, area->size);
+ 	debug_check_no_obj_freed(addr, area->size);
+
+ 	if (!deallocate_pages) {
+ 		kfree(area);
+ 		return;
+ 	}
+
+ 	for (idx = 0; idx < area->nr_pages; idx++) {
+ 		struct page *page = area->pages[idx];
+
+ 		BUG_ON(!page);
+ 		__free_page(page);
+ 	}
+
+ 	/* The page array came from vmalloc if VM_VPAGES is set, else kmalloc. */
+ 	if (area->flags & VM_VPAGES)
+ 		vfree(area->pages);
+ 	else
+ 		kfree(area->pages);
+
+ 	kfree(area);
+ }
+
+/**
+ * vfree - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * Free the virtually continuous memory area starting at @addr, as
+ * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
+ * NULL, no operation is performed.
+ *
+ * Must not be called in interrupt context.
+ */
+void vfree(const void *addr)
+{
+ BUG_ON(in_interrupt());
+ __vunmap(addr, 1);
+}
+EXPORT_SYMBOL(vfree);
+
+/**
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr: memory base address
+ *
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
+ *
+ * Must not be called in interrupt context.
+ */
+void vunmap(const void *addr)
+{
+ BUG_ON(in_interrupt());
+ __vunmap(addr, 0);
+}
+EXPORT_SYMBOL(vunmap);
+
+/**
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
+ *
+ * Maps @count pages from @pages into contiguous kernel virtual
+ * space.
+ */
+void *vmap(struct page **pages, unsigned int count,
+ unsigned long flags, pgprot_t prot)
+{
+ struct vm_struct *area;
+
+ if (count > num_physpages)
+ return NULL;
+
+ area = get_vm_area_caller((count << PAGE_SHIFT), flags,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+
+ if (map_vm_area(area, prot, &pages)) {
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ return area->addr;
+}
+EXPORT_SYMBOL(vmap);
+
+static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller);
+ /*
+  * Populate @area with freshly allocated pages and map them with @prot.
+  *
+  * The page pointer array comes from kmalloc when it fits in one page and
+  * from a nested __vmalloc_node() call otherwise (VM_VPAGES is then set on
+  * the area).  Pages are taken from @node, or from any node when @node is
+  * negative.
+  *
+  * Returns the area's address, or NULL on failure.  On failure the area and
+  * everything allocated for it so far have been released.
+  */
+ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ 				 pgprot_t prot, int node, void *caller)
+ {
+ 	struct page **table;
+ 	unsigned int nr_pages, table_bytes, idx;
+
+ 	/* area->size includes the guard page, which gets no backing page. */
+ 	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
+ 	table_bytes = (nr_pages * sizeof(struct page *));
+
+ 	area->nr_pages = nr_pages;
+ 	/* Please note that the recursion is strictly bounded. */
+ 	if (table_bytes > PAGE_SIZE) {
+ 		table = __vmalloc_node(table_bytes, gfp_mask | __GFP_ZERO,
+ 					PAGE_KERNEL, node, caller);
+ 		area->flags |= VM_VPAGES;
+ 	} else {
+ 		table = kmalloc_node(table_bytes,
+ 				(gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
+ 				node);
+ 	}
+ 	area->pages = table;
+ 	area->caller = caller;
+ 	if (!area->pages) {
+ 		remove_vm_area(area->addr);
+ 		kfree(area);
+ 		return NULL;
+ 	}
+
+ 	for (idx = 0; idx < area->nr_pages; idx++) {
+ 		struct page *page = (node < 0) ?
+ 				alloc_page(gfp_mask) :
+ 				alloc_pages_node(node, gfp_mask, 0);
+
+ 		if (unlikely(!page)) {
+ 			/* Successfully allocated idx pages, free them in __vunmap() */
+ 			area->nr_pages = idx;
+ 			goto fail;
+ 		}
+ 		area->pages[idx] = page;
+ 	}
+
+ 	if (map_vm_area(area, prot, &table))
+ 		goto fail;
+ 	return area->addr;
+
+ fail:
+ 	vfree(area->addr);
+ 	return NULL;
+ }
+
+ /*
+  * Back an already reserved @area with pages and map them.  No node
+  * preference; the immediate caller is recorded as the owner.
+  */
+ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+ {
+ 	void *caller = __builtin_return_address(0);
+
+ 	return __vmalloc_area_node(area, gfp_mask, prot, -1, caller);
+ }
+
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
+{
+ struct vm_struct *area;
+
+ size = PAGE_ALIGN(size);
+ if (!size || (size >> PAGE_SHIFT) > num_physpages)
+ return NULL;
+
+ area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
+ node, gfp_mask, caller);
+
+ if (!area)
+ return NULL;
+
+ return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+}
+
+ /*
+  * vmalloc with caller-chosen page allocator flags and page protection.  No
+  * node preference; the immediate caller is recorded as the owner.
+  */
+ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+ {
+ 	void *caller = __builtin_return_address(0);
+
+ 	return __vmalloc_node(size, gfp_mask, prot, -1, caller);
+ }
+EXPORT_SYMBOL(__vmalloc);
+
+/**
+ * vmalloc - allocate virtually contiguous memory
+ * @size: allocation size
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc(unsigned long size)
+{
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ -1, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc);
+
+/**
+ * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
+ * @size: allocation size
+ *
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
+ */
+void *vmalloc_user(unsigned long size)
+{
+ struct vm_struct *area;
+ void *ret;
+
+ ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+ if (ret) {
+ area = find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_node);
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+/**
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
+ *
+ * Kernel-internal function to allocate enough pages to cover @size
+ * the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+
+void *vmalloc_exec(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+}
+
+ /*
+  * Page allocator flags for vmalloc_32() and vmalloc_32_user().  The selected
+  * DMA zone flag depends on the configuration.  Each expansion is
+  * parenthesised so the macro can be combined with any operator safely.
+  */
+ #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
+ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
+ #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
+ #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
+ #else
+ #define GFP_VMALLOC32 (GFP_KERNEL)
+ #endif
+
+/**
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ */
+void *vmalloc_32(unsigned long size)
+{
+ return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vmalloc_32);
+
+/**
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
+ * @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+ struct vm_struct *area;
+ void *ret;
+
+ ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
+ if (ret) {
+ area = find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
+ /*
+  * Copy up to @count bytes starting at vmalloc address @addr into @buf.
+  *
+  * The areas on vmlist are walked in order under vmlist_lock (read).  Gaps in
+  * front of an area are reported as zero bytes; the guard page at the end of
+  * each area is not read.  @count is clamped so that @addr + @count cannot
+  * wrap.  Returns the number of bytes stored in @buf.
+  */
+ long vread(char *buf, char *addr, unsigned long count)
+ {
+ 	char *const out_base = buf;
+ 	struct vm_struct *area;
+
+ 	/* Don't allow overflow */
+ 	if ((unsigned long) addr + count < count)
+ 		count = -(unsigned long) addr;
+
+ 	read_lock(&vmlist_lock);
+ 	for (area = vmlist; area != NULL; area = area->next) {
+ 		char *area_start = (char *) area->addr;
+ 		char *area_end = area_start + area->size - PAGE_SIZE;
+ 		unsigned long left;
+
+ 		if (addr >= area_end)
+ 			continue;
+
+ 		/* Zero-fill the hole in front of this area. */
+ 		for (; addr < area_start; addr++, count--) {
+ 			if (count == 0)
+ 				goto finished;
+ 			*buf++ = '\0';
+ 		}
+
+ 		left = area_end - addr;
+ 		do {
+ 			if (count == 0)
+ 				goto finished;
+ 			*buf++ = *addr++;
+ 			count--;
+ 		} while (--left > 0);
+ 	}
+ finished:
+ 	read_unlock(&vmlist_lock);
+ 	return buf - out_base;
+ }
+
+ /*
+  * Copy up to @count bytes from @buf to vmalloc address @addr.
+  *
+  * The areas on vmlist are walked in order under vmlist_lock (read).  Source
+  * bytes that fall into a gap in front of an area are skipped without being
+  * written; the guard page at the end of each area is never written.  @count
+  * is clamped so that @addr + @count cannot wrap.  Returns the number of
+  * bytes of @buf consumed, skipped ones included.
+  */
+ long vwrite(char *buf, char *addr, unsigned long count)
+ {
+ 	char *const in_base = buf;
+ 	struct vm_struct *area;
+
+ 	/* Don't allow overflow */
+ 	if ((unsigned long) addr + count < count)
+ 		count = -(unsigned long) addr;
+
+ 	read_lock(&vmlist_lock);
+ 	for (area = vmlist; area != NULL; area = area->next) {
+ 		char *area_start = (char *) area->addr;
+ 		char *area_end = area_start + area->size - PAGE_SIZE;
+ 		unsigned long left;
+
+ 		if (addr >= area_end)
+ 			continue;
+
+ 		/* Skip over the hole in front of this area. */
+ 		for (; addr < area_start; addr++, buf++, count--) {
+ 			if (count == 0)
+ 				goto finished;
+ 		}
+
+ 		left = area_end - addr;
+ 		do {
+ 			if (count == 0)
+ 				goto finished;
+ 			*addr++ = *buf++;
+ 			count--;
+ 		} while (--left > 0);
+ 	}
+ finished:
+ 	read_unlock(&vmlist_lock);
+ 	return buf - in_base;
+ }
+
+/**
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
+{
+ struct vm_struct *area;
+ unsigned long uaddr = vma->vm_start;
+ unsigned long usize = vma->vm_end - vma->vm_start;
+
+ if ((PAGE_SIZE-1) & (unsigned long)addr)
+ return -EINVAL;
+
+ area = find_vm_area(addr);
+ if (!area)
+ return -EINVAL;
+
+ if (!(area->flags & VM_USERMAP))
+ return -EINVAL;
+
+ if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
+ return -EINVAL;
+
+ addr += pgoff << PAGE_SHIFT;
+ do {
+ struct page *page = vmalloc_to_page(addr);
+ int ret;
+
+ ret = vm_insert_page(vma, uaddr, page);
+ if (ret)
+ return ret;
+
+ uaddr += PAGE_SIZE;
+ addr += PAGE_SIZE;
+ usize -= PAGE_SIZE;
+ } while (usize > 0);
+
+ /* Prevent "things" like memory migration? VM_flags need a cleanup... */
+ vma->vm_flags |= VM_RESERVED;
+
+ return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
+/*
+ * Implement a stub for vmalloc_sync_all() if the architecture chose not to
+ * have one.
+ */
+void __attribute__((weak)) vmalloc_sync_all(void)
+{
+}
+
+
+ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
+ {
+ /*
+ * apply_to_page_range() does all the hard work. This is the callback
+ * alloc_vm_area() hands to it; it ignores its arguments and only
+ * reports success.
+ */
+ return 0;
+ }
+
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(size, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (area == NULL)
+ return NULL;
+
+ /*
+ * This ensures that page tables are constructed for this region
+ * of kernel virtual address space and mapped into init_mm.
+ */
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ area->size, f, NULL)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ /* Make sure the pagetables are constructed in process kernel
+ mappings */
+ vmalloc_sync_all();
+
+ return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+ /*
+  * Release an area obtained from alloc_vm_area().  It is a BUG() if the area
+  * found at area->addr is not @area itself.
+  */
+ void free_vm_area(struct vm_struct *area)
+ {
+ 	struct vm_struct *removed = remove_vm_area(area->addr);
+
+ 	BUG_ON(removed != area);
+ 	kfree(area);
+ }
+EXPORT_SYMBOL_GPL(free_vm_area);
+
+
+#ifdef CONFIG_PROC_FS
+ /*
+  * seq_file start: take vmlist_lock for reading and return the entry at
+  * position *@pos of vmlist, or NULL if the list is shorter than that.  The
+  * lock stays held until s_stop().
+  */
+ static void *s_start(struct seq_file *m, loff_t *pos)
+ {
+ 	loff_t skip = *pos;
+ 	struct vm_struct *v;
+
+ 	read_lock(&vmlist_lock);
+ 	for (v = vmlist; skip > 0 && v; v = v->next)
+ 		skip--;
+
+ 	return skip ? NULL : v;
+ }
+
+ /* seq_file next: bump the position and move to the following vmlist entry. */
+ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+ {
+ 	struct vm_struct *cur = p;
+
+ 	*pos += 1;
+ 	return cur->next;
+ }
+
+ static void s_stop(struct seq_file *m, void *p)
+ {
+ read_unlock(&vmlist_lock); /* pairs with the read_lock() taken in s_start() */
+ }
+
+ /*
+  * Append " N<node>=<pages>" for every node that backs at least one page of
+  * @v.  Only does anything on NUMA builds and when the per-open counter
+  * array in m->private was successfully allocated.
+  */
+ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+ {
+ 	unsigned int *counters;
+ 	unsigned int nr;
+
+ 	if (!NUMA_BUILD)
+ 		return;
+
+ 	counters = m->private;
+ 	if (!counters)
+ 		return;
+
+ 	memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+
+ 	for (nr = 0; nr < v->nr_pages; nr++)
+ 		counters[page_to_nid(v->pages[nr])]++;
+
+ 	for_each_node_state(nr, N_HIGH_MEMORY) {
+ 		if (counters[nr])
+ 			seq_printf(m, " N%u=%u", nr, counters[nr]);
+ 	}
+ }
+
+ /*
+  * seq_file show: print one line for vm_struct @p.  The line holds the
+  * address range and size, then (when set) the caller symbol, page count,
+  * physical address and a tag per flag, followed by the per-node page
+  * counts.  Always returns 0.
+  */
+ static int s_show(struct seq_file *m, void *p)
+ {
+ 	static const struct {
+ 		unsigned long flag;
+ 		const char *tag;
+ 	} flag_tags[] = {
+ 		{ VM_IOREMAP,	" ioremap" },
+ 		{ VM_ALLOC,	" vmalloc" },
+ 		{ VM_MAP,	" vmap" },
+ 		{ VM_USERMAP,	" user" },
+ 		{ VM_VPAGES,	" vpages" },
+ 	};
+ 	struct vm_struct *v = p;
+ 	unsigned int t;
+
+ 	seq_printf(m, "0x%p-0x%p %7ld",
+ 		v->addr, v->addr + v->size, v->size);
+
+ 	if (v->caller) {
+ 		char sym[KSYM_SYMBOL_LEN];
+
+ 		seq_putc(m, ' ');
+ 		sprint_symbol(sym, (unsigned long)v->caller);
+ 		seq_puts(m, sym);
+ 	}
+
+ 	if (v->nr_pages)
+ 		seq_printf(m, " pages=%d", v->nr_pages);
+
+ 	if (v->phys_addr)
+ 		seq_printf(m, " phys=%lx", v->phys_addr);
+
+ 	/* Tags appear in table order, one per flag that is set. */
+ 	for (t = 0; t < sizeof(flag_tags) / sizeof(flag_tags[0]); t++) {
+ 		if (v->flags & flag_tags[t].flag)
+ 			seq_puts(m, flag_tags[t].tag);
+ 	}
+
+ 	show_numa_info(m, v);
+ 	seq_putc(m, '\n');
+ 	return 0;
+ }
+
+ static const struct seq_operations vmalloc_op = {
+ .start = s_start, /* takes vmlist_lock for reading */
+ .next = s_next,
+ .stop = s_stop, /* drops vmlist_lock again */
+ .show = s_show, /* prints one vm_struct per line */
+ };
+
+ /*
+  * open() for the vmallocinfo file.  On NUMA builds a per-node counter array
+  * is allocated and stored in the seq_file's private pointer for
+  * show_numa_info(); a failed allocation simply leaves it NULL.  Returns the
+  * result of seq_open(), and frees the array again if that failed.
+  */
+ static int vmalloc_open(struct inode *inode, struct file *file)
+ {
+ 	unsigned int *counters = NULL;
+ 	struct seq_file *m;
+ 	int err;
+
+ 	if (NUMA_BUILD)
+ 		counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+
+ 	err = seq_open(file, &vmalloc_op);
+ 	if (err) {
+ 		kfree(counters);
+ 		return err;
+ 	}
+
+ 	m = file->private_data;
+ 	m->private = counters;
+ 	return 0;
+ }
+
+ static const struct file_operations proc_vmalloc_operations = {
+ .open = vmalloc_open, /* sets up the seq_file and, on NUMA builds, the counter array */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private, /* NOTE(review): presumably also frees m->private from vmalloc_open() - confirm */
+ };
+
+ static int __init proc_vmalloc_init(void)
+ {
+ proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); /* result is not checked; 0 is returned regardless */
+ return 0;
+ }
+module_init(proc_vmalloc_init);
+#endif
+
diff --git a/mm/vmscan.c b/mm/vmscan.c
new file mode 100644
index 0000000..62e7f62
--- /dev/null
+++ b/mm/vmscan.c
@@ -0,0 +1,2592 @@
+/*
+ * linux/mm/vmscan.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Swap reorganised 29.12.95, Stephen Tweedie.
+ * kswapd added: 7.1.96 sct
+ * Removed kswapd_ctl limits, and swap out as many pages as needed
+ * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
+ * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ * Multiqueue VM started 5.8.00, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/vmstat.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h> /* for try_to_release_page(),
+ buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/notifier.h>
+#include <linux/rwsem.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/memcontrol.h>
+#include <linux/delayacct.h>
+#include <linux/sysctl.h>
+
+#include <asm/tlbflush.h>
+#include <asm/div64.h>
+
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+struct scan_control { /* parameters and scan counter shared by the reclaim helpers in this file */
+ /* Incremented by the number of inactive pages that were scanned */
+ unsigned long nr_scanned;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
+ int may_writepage; /* when zero, shrink_page_list() keeps dirty pages instead of calling pageout() */
+
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
+ /* This context's SWAP_CLUSTER_MAX. If freeing memory for
+ * suspend, we effectively ignore SWAP_CLUSTER_MAX.
+ * In this context, it doesn't matter that we scan the
+ * whole list at once. */
+ int swap_cluster_max;
+
+ int swappiness;
+
+ int all_unreclaimable;
+
+ int order; /* allocation order; compared against PAGE_ALLOC_COSTLY_ORDER and passed to ->isolate_pages */
+
+ /* Which cgroup do we reclaim from */
+ struct mem_cgroup *mem_cgroup;
+
+ /* Pluggable isolate pages callback */
+ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+ unsigned long *scanned, int order, int mode,
+ struct zone *z, struct mem_cgroup *mem_cont,
+ int active, int file);
+};
+
+/* The struct page whose ->lru entry is the predecessor of list node @_head. */
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * prefetch_prev_lru_page() / prefetchw_prev_lru_page(): unless @_page is the
+ * first entry after list head @_base, prefetch (with prefetch() or
+ * prefetchw() respectively) member @_field of the page that precedes @_page
+ * on the list.  Each macro expands to an empty statement when the matching
+ * ARCH_HAS_PREFETCH / ARCH_HAS_PREFETCHW symbol is not defined.
+ *
+ * Every use of @_page and @_base is parenthesised so that callers may pass
+ * arbitrary expressions; @_field is a member name and cannot be.
+ */
+#ifdef ARCH_HAS_PREFETCH
+#define prefetch_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != (_base)) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&((_page)->lru));		\
+			prefetch(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != (_base)) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&((_page)->lru));		\
+			prefetchw(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * From 0 .. 100. Higher means more swappy.
+ */
+int vm_swappiness = 60;
+long vm_total_pages; /* The total number of pages which the VM controls */
+
+static LIST_HEAD(shrinker_list); /* shrinkers added by register_shrinker() */
+static DECLARE_RWSEM(shrinker_rwsem); /* taken around every shrinker_list access in this file */
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#define scan_global_lru(sc) (!(sc)->mem_cgroup) /* true when the scan has no mem_cgroup set */
+#else
+#define scan_global_lru(sc) (1) /* without the config option every scan counts as global */
+#endif
+
+/*
+ * Add a shrinker callback to be called from the vm
+ *
+ * Resets shrinker->nr to 0 and appends the shrinker to the tail of
+ * shrinker_list while holding shrinker_rwsem for writing.
+ */
+void register_shrinker(struct shrinker *shrinker)
+{
+ shrinker->nr = 0;
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
+ up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL(register_shrinker);
+
+/*
+ * Remove a shrinker from shrinker_list, holding shrinker_rwsem for writing.
+ */
+void unregister_shrinker(struct shrinker *shrinker)
+{
+ down_write(&shrinker_rwsem);
+ list_del(&shrinker->list);
+ up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL(unregister_shrinker);
+
+#define SHRINK_BATCH 128 /* scan count passed to each ->shrink call in the batch loop below */
+/*
+ * Call the shrink functions to age shrinkable caches
+ *
+ * Here we assume it costs one seek to replace a lru page and that it also
+ * takes a seek to recreate a cache object. With this in mind we age equal
+ * percentages of the lru and ageable caches. This should balance the seeks
+ * generated by these structures.
+ *
+ * If the vm encountered mapped pages on the LRU it increases the pressure on
+ * slab to avoid swapping.
+ *
+ * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ *
+ * `lru_pages' represents the number of on-LRU pages in all the zones which
+ * are eligible for the caller's allocation attempt. It is used for balancing
+ * slab reclaim versus page reclaim.
+ *
+ * A `scanned' value of 0 is replaced by SWAP_CLUSTER_MAX. If shrinker_rwsem
+ * cannot be taken for reading without blocking, nothing is shrunk and 1 is
+ * returned.
+ *
+ * Returns the number of slab objects which we shrunk.
+ */
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+ unsigned long lru_pages)
+{
+ struct shrinker *shrinker;
+ unsigned long ret = 0;
+
+ if (scanned == 0)
+ scanned = SWAP_CLUSTER_MAX;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 1; /* Assume we'll be able to shrink next time */
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ unsigned long long delta;
+ unsigned long total_scan;
+ unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); /* scan count 0: result is used as the freeable-entries estimate */
+
+ delta = (4 * scanned) / shrinker->seeks;
+ delta *= max_pass;
+ do_div(delta, lru_pages + 1); /* +1 keeps the divisor non-zero when lru_pages is 0 */
+ shrinker->nr += delta;
+ if (shrinker->nr < 0) {
+ printk(KERN_ERR "%s: nr=%ld\n",
+ __func__, shrinker->nr);
+ shrinker->nr = max_pass;
+ }
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (shrinker->nr > max_pass * 2)
+ shrinker->nr = max_pass * 2;
+
+ total_scan = shrinker->nr;
+ shrinker->nr = 0;
+
+ while (total_scan >= SHRINK_BATCH) {
+ long this_scan = SHRINK_BATCH;
+ int shrink_ret;
+ int nr_before;
+
+ nr_before = (*shrinker->shrink)(0, gfp_mask);
+ shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
+ if (shrink_ret == -1) /* a -1 result ends the batch loop for this shrinker */
+ break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
+ count_vm_events(SLABS_SCANNED, this_scan);
+ total_scan -= this_scan;
+
+ cond_resched();
+ }
+
+ shrinker->nr += total_scan; /* keep the unscanned remainder for the next call */
+ }
+ up_read(&shrinker_rwsem);
+ return ret;
+}
+
+/*
+ * Report whether @page looks like it is in use through a mapping.
+ *
+ * Returns 1 when the page is in somebody's page tables or is in the swap
+ * cache (we are more reluctant to reclaim swapcache than pagecache), 0 when
+ * it has no mapping at all, and otherwise whether its file is mmap'd by
+ * somebody.  No lock is held on the mapped state, so the answer is unstable.
+ */
+static inline int page_mapping_inuse(struct page *page)
+{
+	struct address_space *as;
+
+	if (page_mapped(page) || PageSwapCache(page))
+		return 1;
+
+	as = page_mapping(page);
+	return as ? mapping_mapped(as) : 0;
+}
+
+/*
+ * True when the page's reference count is exactly 2, or exactly 3 for a page
+ * with PagePrivate set.
+ * NOTE(review): the two base references are presumably the caller's and the
+ * page cache's, with one more held for private data - confirm.
+ */
+static inline int is_page_cache_freeable(struct page *page)
+{
+	int expected_refs = 2;
+
+	if (PagePrivate(page))
+		expected_refs++;
+
+	return page_count(page) == expected_refs;
+}
+
+/*
+ * Decide whether the current task may start writeback against @bdi.
+ *
+ * Returns 1 if the task has PF_SWAPWRITE set, if @bdi is not write-congested,
+ * or if @bdi is the task's own backing_dev_info; returns 0 otherwise.  The
+ * conditions are tested in that order.
+ */
+static int may_write_to_queue(struct backing_dev_info *bdi)
+{
+	return (current->flags & PF_SWAPWRITE) ||
+		!bdi_write_congested(bdi) ||
+		bdi == current->backing_dev_info;
+}
+
+/*
+ * We detected a synchronous write error writing a page out. Probably
+ * -ENOSPC. We need to propagate that into the address_space for a subsequent
+ * fsync(), msync() or close().
+ *
+ * The tricky part is that after writepage we cannot touch the mapping: nothing
+ * prevents it from being freed up. But we have a ref on the page and once
+ * that page is locked, the mapping is pinned.
+ *
+ * We're allowed to run sleeping lock_page() here because we know the caller has
+ * __GFP_FS.
+ *
+ * @mapping is the mapping the page was written through, @page the page that
+ * failed and @error the value passed on to mapping_set_error().
+ */
+static void handle_write_error(struct address_space *mapping,
+ struct page *page, int error)
+{
+ lock_page(page);
+ if (page_mapping(page) == mapping) /* record the error only if the page still belongs to @mapping */
+ mapping_set_error(mapping, error);
+ unlock_page(page);
+}
+
+/* Request for sync pageout. */
+enum pageout_io {
+ PAGEOUT_IO_ASYNC,
+ PAGEOUT_IO_SYNC,
+};
+
+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN,
+} pageout_t;
+
+/*
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
+ *
+ * @page:           the dirty page to write
+ * @mapping:        the page's mapping, may be NULL
+ * @sync_writeback: PAGEOUT_IO_SYNC makes us wait for writeback that is still
+ *                  in flight after ->writepage() returns
+ *
+ * Returns one of the pageout_t values:
+ *   PAGE_KEEP     - refcount check failed, no mapping (and buffers could not
+ *                   be dropped), or the queue may not be written to
+ *   PAGE_ACTIVATE - mapping has no ->writepage, or ->writepage returned
+ *                   AOP_WRITEPAGE_ACTIVATE
+ *   PAGE_SUCCESS  - ->writepage was called (a negative result is recorded
+ *                   via handle_write_error() and still reported as success)
+ *   PAGE_CLEAN    - the page turned out not to need writing
+ */
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+						enum pageout_io sync_writeback)
+{
+	/*
+	 * If the page is dirty, only perform writeback if that write
+	 * will be non-blocking.  To prevent this allocation from being
+	 * stalled by pagecache activity.  But note that there may be
+	 * stalls if we need to run get_block().  We could test
+	 * PagePrivate for that.
+	 *
+	 * If this process is currently in generic_file_write() against
+	 * this page's queue, we can perform writeback even if that
+	 * will block.
+	 *
+	 * If the page is swapcache, write it back even if that would
+	 * block, for some throttling. This happens by accident, because
+	 * swap_backing_dev_info is bust: it doesn't reflect the
+	 * congestion state of the swapdevs.  Easy to fix, if needed.
+	 * See swapfile.c:page_queue_congested().
+	 */
+	if (!is_page_cache_freeable(page))
+		return PAGE_KEEP;
+	if (!mapping) {
+		/*
+		 * Some data journaling orphaned pages can have
+		 * page->mapping == NULL while being dirty with clean buffers.
+		 */
+		if (PagePrivate(page)) {
+			if (try_to_free_buffers(page)) {
+				ClearPageDirty(page);
+				/* Informational only: tag it with an explicit log level. */
+				printk(KERN_INFO "%s: orphaned page\n",
+					__func__);
+				return PAGE_CLEAN;
+			}
+		}
+		return PAGE_KEEP;
+	}
+	if (mapping->a_ops->writepage == NULL)
+		return PAGE_ACTIVATE;
+	if (!may_write_to_queue(mapping->backing_dev_info))
+		return PAGE_KEEP;
+
+	if (clear_page_dirty_for_io(page)) {
+		int res;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
+			.nonblocking = 1,
+			.for_reclaim = 1,
+		};
+
+		SetPageReclaim(page);
+		res = mapping->a_ops->writepage(page, &wbc);
+		if (res < 0)
+			handle_write_error(mapping, page, res);
+		if (res == AOP_WRITEPAGE_ACTIVATE) {
+			ClearPageReclaim(page);
+			return PAGE_ACTIVATE;
+		}
+
+		/*
+		 * Wait on writeback if requested to. This happens when
+		 * direct reclaiming a large contiguous area and the
+		 * first attempt to free a range of pages fails.
+		 */
+		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+			wait_on_page_writeback(page);
+
+		if (!PageWriteback(page)) {
+			/* synchronous write or broken a_ops? */
+			ClearPageReclaim(page);
+		}
+		inc_zone_page_state(page, NR_VMSCAN_WRITE);
+		return PAGE_SUCCESS;
+	}
+
+	return PAGE_CLEAN;
+}
+
+/*
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
+ *
+ * The page must be locked and @mapping must be its current mapping; both
+ * conditions are enforced with BUG_ON(). Returns 1 if the page was removed
+ * from the swap cache or the page cache, 0 if its refcount could not be
+ * frozen at 2 or if it was found dirty.
+ */
+static int __remove_mapping(struct address_space *mapping, struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+ BUG_ON(mapping != page_mapping(page));
+
+ spin_lock_irq(&mapping->tree_lock);
+ /*
+ * The non racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_count.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under tree_lock, then this ordering is not required.
+ */
+ if (!page_freeze_refs(page, 2))
+ goto cannot_free;
+ /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ if (unlikely(PageDirty(page))) {
+ page_unfreeze_refs(page, 2); /* undo the freeze before backing out */
+ goto cannot_free;
+ }
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) }; /* read before the page leaves the swap cache */
+ __delete_from_swap_cache(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ swap_free(swap); /* called after tree_lock has been dropped */
+ } else {
+ __remove_from_page_cache(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ }
+
+ return 1;
+
+cannot_free:
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
+/*
+ * Try to detach a locked page from its ->mapping.
+ *
+ * Returns 0 without detaching if the page is dirty or somebody else holds a
+ * reference to it, 1 once it has been detached.  The caller is assumed to
+ * hold exactly one reference on the page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	if (!__remove_mapping(mapping, page))
+		return 0;
+
+	/*
+	 * The refcount is frozen at this point.  Unfreezing it to 1 instead
+	 * of 2 drops the pagecache reference for us without needing another
+	 * atomic operation.
+	 */
+	page_unfreeze_refs(page, 1);
+	return 1;
+}
+
+/**
+ * putback_lru_page - put previously isolated page onto appropriate LRU list
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be unevictable for other reasons.
+ *
+ * The reference taken when the page was isolated is dropped before returning.
+ * Without CONFIG_UNEVICTABLE_LRU the page always goes back through
+ * lru_cache_add_lru().
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ */
+#ifdef CONFIG_UNEVICTABLE_LRU
+void putback_lru_page(struct page *page)
+{
+ int lru;
+ int active = !!TestClearPageActive(page);
+ int was_unevictable = PageUnevictable(page); /* sampled before the flag is cleared; used for the event counters below */
+
+ VM_BUG_ON(PageLRU(page));
+
+redo:
+ ClearPageUnevictable(page);
+
+ if (page_evictable(page, NULL)) {
+ /*
+ * For evictable pages, we can use the cache.
+ * In event of a race, worst case is we end up with an
+ * unevictable page on [in]active list.
+ * We know how to handle that.
+ */
+ lru = active + page_is_file_cache(page);
+ lru_cache_add_lru(page, lru);
+ } else {
+ /*
+ * Put unevictable pages directly on zone's unevictable
+ * list.
+ */
+ lru = LRU_UNEVICTABLE;
+ add_page_to_unevictable_list(page);
+ }
+ mem_cgroup_move_lists(page, lru);
+
+ /*
+ * The page's status can change while we move it among the lru lists.
+ * If an evictable page sits on the unevictable list, it will never be
+ * freed. To avoid that, check again after we added it to the list.
+ */
+ if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+ if (!isolate_lru_page(page)) {
+ put_page(page);
+ goto redo;
+ }
+ /* This means someone else dropped this page from LRU
+ * So, it will be freed or putback to LRU again. There is
+ * nothing to do here.
+ */
+ }
+
+ if (was_unevictable && lru != LRU_UNEVICTABLE)
+ count_vm_event(UNEVICTABLE_PGRESCUED);
+ else if (!was_unevictable && lru == LRU_UNEVICTABLE)
+ count_vm_event(UNEVICTABLE_PGCULLED);
+
+ put_page(page); /* drop ref from isolate */
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+void putback_lru_page(struct page *page)
+{
+ int lru;
+ VM_BUG_ON(PageLRU(page));
+
+ lru = !!TestClearPageActive(page) + page_is_file_cache(page);
+ lru_cache_add_lru(page, lru);
+ mem_cgroup_move_lists(page, lru);
+ put_page(page);
+}
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+
+/*
+ * shrink_page_list() returns the number of reclaimed pages
+ */
+static unsigned long shrink_page_list(struct list_head *page_list,
+ struct scan_control *sc,
+ enum pageout_io sync_writeback)
+{
+ LIST_HEAD(ret_pages);
+ struct pagevec freed_pvec;
+ int pgactivate = 0;
+ unsigned long nr_reclaimed = 0;
+
+ cond_resched();
+
+ pagevec_init(&freed_pvec, 1);
+ while (!list_empty(page_list)) {
+ struct address_space *mapping;
+ struct page *page;
+ int may_enter_fs;
+ int referenced;
+
+ cond_resched();
+
+ page = lru_to_page(page_list);
+ list_del(&page->lru);
+
+ if (!trylock_page(page))
+ goto keep;
+
+ VM_BUG_ON(PageActive(page));
+
+ sc->nr_scanned++;
+
+ if (unlikely(!page_evictable(page, NULL)))
+ goto cull_mlocked;
+
+ if (!sc->may_swap && page_mapped(page))
+ goto keep_locked;
+
+ /* Double the slab pressure for mapped and swapcache pages */
+ if (page_mapped(page) || PageSwapCache(page))
+ sc->nr_scanned++;
+
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+ if (PageWriteback(page)) {
+ /*
+ * Synchronous reclaim is performed in two passes,
+ * first an asynchronous pass over the list to
+ * start parallel writeback, and a second synchronous
+ * pass to wait for the IO to complete. Wait here
+ * for any page for which writeback has already
+ * started.
+ */
+ if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ wait_on_page_writeback(page);
+ else
+ goto keep_locked;
+ }
+
+ referenced = page_referenced(page, 1, sc->mem_cgroup);
+ /* In active use or really unfreeable? Activate it. */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+ referenced && page_mapping_inuse(page))
+ goto activate_locked;
+
+#ifdef CONFIG_SWAP
+ /*
+ * Anonymous process memory has backing store?
+ * Try to allocate it some swap space here.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ switch (try_to_munlock(page)) {
+ case SWAP_FAIL: /* shouldn't happen */
+ case SWAP_AGAIN:
+ goto keep_locked;
+ case SWAP_MLOCK:
+ goto cull_mlocked;
+ case SWAP_SUCCESS:
+ ; /* fall thru'; add to swap cache */
+ }
+ if (!add_to_swap(page, GFP_ATOMIC))
+ goto activate_locked;
+ may_enter_fs = 1;
+ }
+#endif /* CONFIG_SWAP */
+
+ mapping = page_mapping(page);
+
+ /*
+ * The page is mapped into the page tables of one or more
+ * processes. Try to unmap it here.
+ */
+ if (page_mapped(page) && mapping) {
+ switch (try_to_unmap(page, 0)) {
+ case SWAP_FAIL:
+ goto activate_locked;
+ case SWAP_AGAIN:
+ goto keep_locked;
+ case SWAP_MLOCK:
+ goto cull_mlocked;
+ case SWAP_SUCCESS:
+ ; /* try to free the page below */
+ }
+ }
+
+ if (PageDirty(page)) {
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
+ goto keep_locked;
+ if (!may_enter_fs)
+ goto keep_locked;
+ if (!sc->may_writepage)
+ goto keep_locked;
+
+ /* Page is dirty, try to write it out here */
+ switch (pageout(page, mapping, sync_writeback)) {
+ case PAGE_KEEP:
+ goto keep_locked;
+ case PAGE_ACTIVATE:
+ goto activate_locked;
+ case PAGE_SUCCESS:
+ if (PageWriteback(page) || PageDirty(page))
+ goto keep;
+ /*
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the page.
+ */
+ if (!trylock_page(page))
+ goto keep;
+ if (PageDirty(page) || PageWriteback(page))
+ goto keep_locked;
+ mapping = page_mapping(page);
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
+ }
+ }
+
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we try to free
+ * the page as well.
+ *
+ * We do this even if the page is PageDirty().
+ * try_to_release_page() does not perform I/O, but it is
+ * possible for a page to have PageDirty set, but it is actually
+ * clean (all its buffers are clean). This happens if the
+ * buffers were written out directly, with submit_bh(). ext3
+ * will do this, as well as the blockdev mapping.
+ * try_to_release_page() will discover that cleanness and will
+ * drop the buffers and mark the page clean - it can be freed.
+ *
+ * Rarely, pages can have buffers and no ->mapping. These are
+ * the pages which were not successfully invalidated in
+ * truncate_complete_page(). We try to drop those buffers here
+ * and if that worked, and the page is no longer mapped into
+ * process address space (page_count == 1) it can be freed.
+ * Otherwise, leave the page on the LRU so it is swappable.
+ */
+ if (PagePrivate(page)) {
+ if (!try_to_release_page(page, sc->gfp_mask))
+ goto activate_locked;
+ if (!mapping && page_count(page) == 1) {
+ unlock_page(page);
+ if (put_page_testzero(page))
+ goto free_it;
+ else {
+ /*
+ * rare race with speculative reference.
+ * the speculative reference will free
+ * this page shortly, so we may
+ * increment nr_reclaimed here (and
+ * leave it off the LRU).
+ */
+ nr_reclaimed++;
+ continue;
+ }
+ }
+ }
+
+ if (!mapping || !__remove_mapping(mapping, page))
+ goto keep_locked;
+
+ /*
+ * At this point, we have no other references and there is
+ * no way to pick any more up (removed from LRU, removed
+ * from pagecache). Can use non-atomic bitops now (and
+ * we obviously don't have to worry about waking up a process
+ * waiting on the page lock, because there are no references.
+ */
+ __clear_page_locked(page);
+free_it:
+ nr_reclaimed++;
+ if (!pagevec_add(&freed_pvec, page)) {
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
+ continue;
+
+cull_mlocked:
+ unlock_page(page);
+ putback_lru_page(page);
+ continue;
+
+activate_locked:
+ /* Not a candidate for swapping, so reclaim swap space. */
+ if (PageSwapCache(page) && vm_swap_full())
+ remove_exclusive_swap_page_ref(page);
+ VM_BUG_ON(PageActive(page));
+ SetPageActive(page);
+ pgactivate++;
+keep_locked:
+ unlock_page(page);
+keep:
+ list_add(&page->lru, &ret_pages);
+ VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+ }
+ list_splice(&ret_pages, page_list);
+ if (pagevec_count(&freed_pvec))
+ __pagevec_free(&freed_pvec);
+ count_vm_events(PGACTIVATE, pgactivate);
+ return nr_reclaimed;
+}
+
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
+#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU. Only take this page
+ * if it is of the appropriate PageActive status. Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page: page to consider
+ * mode: one of the LRU isolation modes defined above
+ * file: compared, as a boolean, with page_is_file_cache(page); ignored
+ * when mode is ISOLATE_BOTH
+ *
+ * On success a reference has been taken on the page and PageLRU is cleared;
+ * the page is not unlinked from its list here.
+ *
+ * returns 0 on success, -EINVAL if the page is not on an LRU, does not match
+ * mode/file or is unevictable, -EBUSY if no reference could be taken.
+ */
+int __isolate_lru_page(struct page *page, int mode, int file)
+{
+ int ret = -EINVAL;
+
+ /* Only take pages on the LRU. */
+ if (!PageLRU(page))
+ return ret;
+
+ /*
+ * When checking the active state, we need to be sure we are
+ * dealing with comparable boolean values. Take the logical not
+ * of each.
+ */
+ if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ return ret;
+
+ if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
+ return ret;
+
+ /*
+ * When this function is being called for lumpy reclaim, we
+ * initially look into all LRU pages, active, inactive and
+ * unevictable; only give shrink_page_list evictable pages.
+ */
+ if (PageUnevictable(page))
+ return ret;
+
+ ret = -EBUSY;
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * zone->lru_lock is heavily contended.  Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan:	The number of pages to look through on the list.
+ * @src:	The LRU list to pull pages off.
+ * @dst:	The temp list to put pages on to.
+ * @scanned:	The number of pages that were scanned.
+ * @order:	The caller's attempted allocation order
+ * @mode:	One of the LRU isolation modes
+ * @file:	True [1] if isolating file [!anon] pages
+ *
+ * For a non-zero @order, the pages in the order-aligned pfn block around
+ * each isolated page are tried as well.  A neighbouring page that cannot be
+ * isolated is left exactly where it is.
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+		struct list_head *src, struct list_head *dst,
+		unsigned long *scanned, int order, int mode, int file)
+{
+	unsigned long nr_taken = 0;
+	unsigned long scan;
+
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+		struct page *page;
+		unsigned long pfn;
+		unsigned long end_pfn;
+		unsigned long page_pfn;
+		int zone_id;
+
+		page = lru_to_page(src);
+		prefetchw_prev_lru_page(page, src, flags);
+
+		VM_BUG_ON(!PageLRU(page));
+
+		switch (__isolate_lru_page(page, mode, file)) {
+		case 0:
+			list_move(&page->lru, dst);
+			nr_taken++;
+			break;
+
+		case -EBUSY:
+			/* else it is being freed elsewhere */
+			list_move(&page->lru, src);
+			continue;
+
+		default:
+			BUG();
+		}
+
+		if (!order)
+			continue;
+
+		/*
+		 * Attempt to take all pages in the order aligned region
+		 * surrounding the tag page.  Only take those pages of
+		 * the same active state as that tag page.  We may safely
+		 * round the target page pfn down to the requested order
+		 * as the mem_map is guaranteed valid out to MAX_ORDER,
+		 * where that page is in a different zone we will detect
+		 * it from its zone id and skip it.
+		 */
+		zone_id = page_zone_id(page);
+		page_pfn = page_to_pfn(page);
+		pfn = page_pfn & ~((1 << order) - 1);
+		end_pfn = pfn + (1 << order);
+		for (; pfn < end_pfn; pfn++) {
+			struct page *cursor_page;
+
+			/* The target page is in the block, ignore it. */
+			if (unlikely(pfn == page_pfn))
+				continue;
+
+			/* Avoid holes within the zone. */
+			if (unlikely(!pfn_valid_within(pfn)))
+				break;
+
+			cursor_page = pfn_to_page(pfn);
+
+			/* Check that we have not crossed a zone boundary. */
+			if (unlikely(page_zone_id(cursor_page) != zone_id))
+				continue;
+
+			/*
+			 * Only move the neighbour if it really was isolated.
+			 * On failure leave it alone: it is not necessarily
+			 * on @src (with ISOLATE_BOTH the active/file checks
+			 * in __isolate_lru_page() are skipped), so moving it
+			 * to @src could put it on the wrong LRU list.
+			 */
+			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
+				list_move(&cursor_page->lru, dst);
+				nr_taken++;
+				scan++;
+			}
+		}
+	}
+
+	*scanned = scan;
+	return nr_taken;
+}
+
+/*
+ * isolate_pages callback that works on a zone's own LRU lists.
+ *
+ * Selects the list of zone @z given by @active and @file and hands it to
+ * isolate_lru_pages() together with the remaining arguments.  @mem_cont is
+ * not used here.
+ */
+static unsigned long isolate_pages_global(unsigned long nr,
+					struct list_head *dst,
+					unsigned long *scanned, int order,
+					int mode, struct zone *z,
+					struct mem_cgroup *mem_cont,
+					int active, int file)
+{
+	struct list_head *src;
+	int idx;
+
+	idx = LRU_BASE + (active ? LRU_ACTIVE : 0) + (file ? LRU_FILE : 0);
+	src = &z->lru[idx].list;
+
+	return isolate_lru_pages(nr, src, dst, scanned, order, mode,
+				 file ? 1 : 0);
+}
+
+/*
+ * clear_active_flags() is a helper for shrink_inactive_list(): it clears the
+ * active bit of every page on @page_list that has it set.
+ *
+ * For each page the entry of @count matching the list the page came from
+ * (its file/anon value, plus LRU_ACTIVE if it was active) is incremented.
+ *
+ * Returns the number of pages whose active bit was cleared.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list,
+					unsigned int *count)
+{
+	struct page *pg;
+	int deactivated = 0;
+
+	list_for_each_entry(pg, page_list, lru) {
+		int idx = page_is_file_cache(pg);
+
+		if (PageActive(pg)) {
+			ClearPageActive(pg);
+			idx += LRU_ACTIVE;
+			deactivated++;
+		}
+		count[idx]++;
+	}
+
+	return deactivated;
+}
+
+/**
+ * isolate_lru_page - tries to isolate a page from its LRU list
+ * @page: page to isolate from its LRU list
+ *
+ * Isolates a @page from an LRU list, clears PageLRU and adjusts the
+ * vmstat statistic corresponding to whatever LRU list the page was on.
+ *
+ * Returns 0 if the page was removed from an LRU list.
+ * Returns -EBUSY if the page was not on an LRU list.
+ *
+ * The returned page will have PageLRU() cleared. If it was found on
+ * the active list, it will have PageActive set. If it was found on
+ * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
+ *
+ * The vmstat statistic corresponding to the list on which the page was
+ * found will be decremented.
+ *
+ * Restrictions:
+ * (1) Must be called with an elevated refcount on the page. This is a
+ * fundamental difference from isolate_lru_pages (which is called
+ * without a stable reference).
+ * (2) the lru_lock must not be held.
+ * (3) interrupts must be enabled.
+ */
+int isolate_lru_page(struct page *page)
+{
+ int ret = -EBUSY;
+
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page) && get_page_unless_zero(page)) { /* PageLRU is tested again now that lru_lock is held */
+ int lru = page_lru(page);
+ ret = 0;
+ ClearPageLRU(page);
+
+ del_page_from_lru_list(zone, page, lru);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+ return ret;
+}
+
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc,
+ int priority, int file)
+{
+ LIST_HEAD(page_list);
+ struct pagevec pvec;
+ unsigned long nr_scanned = 0;
+ unsigned long nr_reclaimed = 0;
+
+ pagevec_init(&pvec, 1);
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+ do {
+ struct page *page;
+ unsigned long nr_taken;
+ unsigned long nr_scan;
+ unsigned long nr_freed;
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ int mode = ISOLATE_INACTIVE;
+
+ /*
+ * If we need a large contiguous chunk of memory, or have
+ * trouble getting a small set of contiguous pages, we
+ * will reclaim both active and inactive pages.
+ *
+ * We use the same threshold as pageout congestion_wait below.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ mode = ISOLATE_BOTH;
+ else if (sc->order && priority < DEF_PRIORITY - 2)
+ mode = ISOLATE_BOTH;
+
+ nr_taken = sc->isolate_pages(sc->swap_cluster_max,
+ &page_list, &nr_scan, sc->order, mode,
+ zone, sc->mem_cgroup, 0, file);
+ nr_active = clear_active_flags(&page_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);
+
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);
+
+ if (scan_global_lru(sc)) {
+ zone->pages_scanned += nr_scan;
+ zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
+ zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+ zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
+ zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+ }
+ spin_unlock_irq(&zone->lru_lock);
+
+ nr_scanned += nr_scan;
+ nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /*
+ * If we are direct reclaiming for contiguous pages and we do
+ * not reclaim everything in the list, try again and wait
+ * for IO to complete. This will stall high-order allocations
+ * but that should be acceptable to the caller
+ */
+ if (nr_freed < nr_taken && !current_is_kswapd() &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+ congestion_wait(WRITE, HZ/10);
+
+ /*
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
+ */
+ nr_active = clear_active_flags(&page_list, count);
+ count_vm_events(PGDEACTIVATE, nr_active);
+
+ nr_freed += shrink_page_list(&page_list, sc,
+ PAGEOUT_IO_SYNC);
+ }
+
+ nr_reclaimed += nr_freed;
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+ __count_vm_events(KSWAPD_STEAL, nr_freed);
+ } else if (scan_global_lru(sc))
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
+ __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+
+ if (nr_taken == 0)
+ goto done;
+
+ spin_lock(&zone->lru_lock);
+ /*
+ * Put back any unfreeable pages.
+ */
+ while (!list_empty(&page_list)) {
+ int lru;
+ page = lru_to_page(&page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ mem_cgroup_move_lists(page, lru);
+ if (PageActive(page) && scan_global_lru(sc)) {
+ int file = !!page_is_file_cache(page);
+ zone->recent_rotated[file]++;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ } while (nr_scanned < max_scan);
+ spin_unlock(&zone->lru_lock);
+done:
+ local_irq_enable();
+ pagevec_release(&pvec);
+ return nr_reclaimed;
+}
+
+/*
+ * Called when this zone is about to be scanned at @priority.  If @priority
+ * is smaller (ie: more urgent) than the priority recorded in the zone, record
+ * it, so that the next process scanning this zone can start out at this
+ * priority level right away instead of building up its own.  The recorded
+ * priority affects only the reclaim-mapped threshold.
+ */
+static inline void note_zone_scanning_priority(struct zone *zone, int priority)
+{
+	if (priority >= zone->prev_priority)
+		return;
+
+	zone->prev_priority = priority;
+}
+
+ /*
+  * Non-zero once zone->pages_scanned has reached at least three times the
+  * page count that zone_lru_pages() reports for @zone.
+  */
+ static inline int zone_is_near_oom(struct zone *zone)
+ {
+ 	return zone_lru_pages(zone) * 3 <= zone->pages_scanned;
+ }
+
+/*
+ * This moves pages from the active list to the inactive list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone->lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone->lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_count against each page.
+ * But we had to alter page->flags anyway.
+ */
+
+
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+ struct scan_control *sc, int priority, int file)
+{
+ unsigned long pgmoved;
+ int pgdeactivate = 0;
+ unsigned long pgscanned;
+ LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_inactive);
+ struct page *page;
+ struct pagevec pvec;
+ enum lru_list lru;
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+ pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+ ISOLATE_ACTIVE, zone,
+ sc->mem_cgroup, 1, file);
+ /*
+ * zone->pages_scanned is used for detect zone's oom
+ * mem_cgroup remembers nr_scan by itself.
+ */
+ if (scan_global_lru(sc)) {
+ zone->pages_scanned += pgscanned;
+ zone->recent_scanned[!!file] += pgmoved;
+ }
+
+ if (file)
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
+ else
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
+ spin_unlock_irq(&zone->lru_lock);
+
+ pgmoved = 0;
+ while (!list_empty(&l_hold)) {
+ cond_resched();
+ page = lru_to_page(&l_hold);
+ list_del(&page->lru);
+
+ if (unlikely(!page_evictable(page, NULL))) {
+ putback_lru_page(page);
+ continue;
+ }
+
+ /* page_referenced clears PageReferenced */
+ if (page_mapping_inuse(page) &&
+ page_referenced(page, 0, sc->mem_cgroup))
+ pgmoved++;
+
+ list_add(&page->lru, &l_inactive);
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ /*
+ * Count referenced pages from currently used mappings as
+ * rotated, even though they are moved to the inactive list.
+ * This helps balance scan pressure between file and anonymous
+ * pages in get_scan_ratio.
+ */
+ zone->recent_rotated[!!file] += pgmoved;
+
+ /*
+ * Move the pages to the [file or anon] inactive list.
+ */
+ pagevec_init(&pvec, 1);
+
+ pgmoved = 0;
+ lru = LRU_BASE + file * LRU_FILE;
+ while (!list_empty(&l_inactive)) {
+ page = lru_to_page(&l_inactive);
+ prefetchw_prev_lru_page(page, &l_inactive, flags);
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ VM_BUG_ON(!PageActive(page));
+ ClearPageActive(page);
+
+ list_move(&page->lru, &zone->lru[lru].list);
+ mem_cgroup_move_lists(page, lru);
+ pgmoved++;
+ if (!pagevec_add(&pvec, page)) {
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+ spin_unlock_irq(&zone->lru_lock);
+ pgdeactivate += pgmoved;
+ pgmoved = 0;
+ if (buffer_heads_over_limit)
+ pagevec_strip(&pvec);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+ pgdeactivate += pgmoved;
+ if (buffer_heads_over_limit) {
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_strip(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ __count_zone_vm_events(PGREFILL, zone, pgscanned);
+ __count_vm_events(PGDEACTIVATE, pgdeactivate);
+ spin_unlock_irq(&zone->lru_lock);
+ if (vm_swap_full())
+ pagevec_swap_free(&pvec);
+
+ pagevec_release(&pvec);
+}
+
+ /*
+  * Scan @nr_to_scan pages of one LRU list of @zone.
+  *
+  * The active file list is always aged with shrink_active_list().  The active
+  * anon list is aged that way only when this is not a global LRU scan or while
+  * inactive_anon_is_low() holds for the zone.  Every other case, including an
+  * active anon list that did not qualify for aging, is handed to
+  * shrink_inactive_list() for the anon/file type that @lru belongs to.
+  *
+  * Returns 0 when an active list was aged, otherwise the value returned by
+  * shrink_inactive_list().
+  */
+ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+ 	struct zone *zone, struct scan_control *sc, int priority)
+ {
+ 	const int file = is_file_lru(lru);
+ 	int age_active;
+
+ 	age_active = (lru == LRU_ACTIVE_FILE) ||
+ 		     (lru == LRU_ACTIVE_ANON &&
+ 		      (!scan_global_lru(sc) || inactive_anon_is_low(zone)));
+
+ 	if (!age_active)
+ 		return shrink_inactive_list(nr_to_scan, zone, sc, priority,
+ 					    file);
+
+ 	shrink_active_list(nr_to_scan, zone, sc, priority, file);
+ 	return 0;
+ }
+
+/*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned. The relative value of each set of LRU lists is determined
+ * by looking at the fraction of the pages scanned we did rotate back
+ * onto the active list instead of evict.
+ *
+ * percent[0] specifies how much pressure to put on ram/swap backed
+ * memory, while percent[1] determines pressure on the file LRUs.
+ */
+static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
+ unsigned long *percent)
+{
+ unsigned long anon, file, free;
+ unsigned long anon_prio, file_prio;
+ unsigned long ap, fp;
+
+ anon = zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+ file = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+ free = zone_page_state(zone, NR_FREE_PAGES);
+
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (nr_swap_pages <= 0) {
+ percent[0] = 0;
+ percent[1] = 100;
+ return;
+ }
+
+ /* If we have very few page cache pages, force-scan anon pages. */
+ if (unlikely(file + free <= zone->pages_high)) {
+ percent[0] = 100;
+ percent[1] = 0;
+ return;
+ }
+
+ /*
+ * OK, so we have swap space and a fair amount of page cache
+ * pages. We use the recently rotated / recently scanned
+ * ratios to determine how valuable each cache is.
+ *
+ * Because workloads change over time (and to avoid overflow)
+ * we keep these statistics as a floating average, which ends
+ * up weighing recent references more than old ones.
+ *
+ * anon in [0], file in [1]
+ */
+ if (unlikely(zone->recent_scanned[0] > anon / 4)) {
+ spin_lock_irq(&zone->lru_lock);
+ zone->recent_scanned[0] /= 2;
+ zone->recent_rotated[0] /= 2;
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ if (unlikely(zone->recent_scanned[1] > file / 4)) {
+ spin_lock_irq(&zone->lru_lock);
+ zone->recent_scanned[1] /= 2;
+ zone->recent_rotated[1] /= 2;
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ /*
+ * With swappiness at 100, anonymous and file have the same priority.
+ * This scanning priority is essentially the inverse of IO cost.
+ */
+ anon_prio = sc->swappiness;
+ file_prio = 200 - sc->swappiness;
+
+ /*
+ * The amount of pressure on anon vs file pages is inversely
+ * proportional to the fraction of recently scanned pages on
+ * each list that were recently referenced and in active use.
+ */
+ ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
+ ap /= zone->recent_rotated[0] + 1;
+
+ fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
+ fp /= zone->recent_rotated[1] + 1;
+
+ /* Normalize to percentages */
+ percent[0] = 100 * ap / (ap + fp + 1);
+ percent[1] = 100 - percent[0];
+}
+
+
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static unsigned long shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
+{
+ unsigned long nr[NR_LRU_LISTS];
+ unsigned long nr_to_scan;
+ unsigned long nr_reclaimed = 0;
+ unsigned long percent[2]; /* anon @ 0; file @ 1 */
+ enum lru_list l;
+
+ get_scan_ratio(zone, sc, percent);
+
+ for_each_evictable_lru(l) {
+ if (scan_global_lru(sc)) {
+ int file = is_file_lru(l);
+ int scan;
+
+ scan = zone_page_state(zone, NR_LRU_BASE + l);
+ if (priority) {
+ scan >>= priority;
+ scan = (scan * percent[file]) / 100;
+ }
+ zone->lru[l].nr_scan += scan;
+ nr[l] = zone->lru[l].nr_scan;
+ if (nr[l] >= sc->swap_cluster_max)
+ zone->lru[l].nr_scan = 0;
+ else
+ nr[l] = 0;
+ } else {
+ /*
+ * This reclaim occurs not because zone memory shortage
+ * but because memory controller hits its limit.
+ * Don't modify zone reclaim related data.
+ */
+ nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
+ priority, l);
+ }
+ }
+
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ nr[LRU_INACTIVE_FILE]) {
+ for_each_evictable_lru(l) {
+ if (nr[l]) {
+ nr_to_scan = min(nr[l],
+ (unsigned long)sc->swap_cluster_max);
+ nr[l] -= nr_to_scan;
+
+ nr_reclaimed += shrink_list(l, nr_to_scan,
+ zone, sc, priority);
+ }
+ }
+ }
+
+ /*
+ * Even if we did not try to evict anon pages at all, we want to
+ * rebalance the anon lru active/inactive ratio.
+ */
+ if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
+ shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ else if (!scan_global_lru(sc))
+ shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+
+ throttle_vm_writeout(sc->gfp_mask);
+ return nr_reclaimed;
+}
+
+/*
+ * This is the direct reclaim path, for page-allocating processes. We only
+ * try to reclaim pages from zones which will satisfy the caller's allocation
+ * request.
+ *
+ * We reclaim from a zone even if that zone is over pages_high. Because:
+ * a) The caller may be trying to free *extra* pages to satisfy a higher-order
+ * allocation or
+ * b) The zones may be over pages_high but they must go *over* pages_high to
+ * satisfy the `incremental min' zone defense algorithm.
+ *
+ * Returns the number of reclaimed pages.
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
+ */
+static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+ unsigned long nr_reclaimed = 0;
+ struct zoneref *z;
+ struct zone *zone;
+
+ sc->all_unreclaimable = 1;
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ if (!populated_zone(zone))
+ continue;
+ /*
+ * Take care memory controller reclaiming has small influence
+ * to global LRU.
+ */
+ if (scan_global_lru(sc)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ note_zone_scanning_priority(zone, priority);
+
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
+ continue; /* Let kswapd poll it */
+ sc->all_unreclaimable = 0;
+ } else {
+ /*
+ * Ignore cpuset limitation here. We just want to reduce
+ * # of used pages by us regardless of memory shortage.
+ */
+ sc->all_unreclaimable = 0;
+ mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+ priority);
+ }
+
+ nr_reclaimed += shrink_zone(priority, zone, sc);
+ }
+
+ return nr_reclaimed;
+}
+
+/*
+ * This is the main entry point to direct page reclaim.
+ *
+ * If a full scan of the inactive list fails to free enough memory then we
+ * are "out of memory" and something needs to be killed.
+ *
+ * If the caller is !__GFP_FS then the probability of a failure is reasonably
+ * high - the zone may be full of dirty or under-writeback pages, which this
+ * caller can't do much about. We kick pdflush and take explicit naps in the
+ * hope that some of these pages can be written. But if the allocating task
+ * holds filesystem locks which prevent writeout this might not work, and the
+ * allocation attempt will fail.
+ *
+ * returns: 0, if no pages reclaimed
+ * else, the number of pages reclaimed
+ */
+static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ int priority;
+ unsigned long ret = 0;
+ unsigned long total_scanned = 0;
+ unsigned long nr_reclaimed = 0;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ unsigned long lru_pages = 0;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+
+ delayacct_freepages_start();
+
+ if (scan_global_lru(sc))
+ count_vm_event(ALLOCSTALL);
+ /*
+ * mem_cgroup will not do shrink_slab.
+ */
+ if (scan_global_lru(sc)) {
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_lru_pages(zone);
+ }
+ }
+
+ for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ sc->nr_scanned = 0;
+ if (!priority)
+ disable_swap_token();
+ nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ /*
+ * Don't shrink slabs when reclaiming memory from
+ * over limit cgroups
+ */
+ if (scan_global_lru(sc)) {
+ shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ if (reclaim_state) {
+ nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+ total_scanned += sc->nr_scanned;
+ if (nr_reclaimed >= sc->swap_cluster_max) {
+ ret = nr_reclaimed;
+ goto out;
+ }
+
+ /*
+ * Try to write back as many pages as we just scanned. This
+ * tends to cause slow streaming writers to write data to the
+ * disk smoothly, at the dirtying rate, which is nice. But
+ * that's undesirable in laptop mode, where we *want* lumpy
+ * writeout. So in laptop mode, write out the whole world.
+ */
+ if (total_scanned > sc->swap_cluster_max +
+ sc->swap_cluster_max / 2) {
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ sc->may_writepage = 1;
+ }
+
+ /* Take a nap, wait for some writeback to complete */
+ if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
+ congestion_wait(WRITE, HZ/10);
+ }
+ /* top priority shrink_zones still had more to do? don't OOM, then */
+ if (!sc->all_unreclaimable && scan_global_lru(sc))
+ ret = nr_reclaimed;
+out:
+ /*
+ * Now that we've scanned all the zones at this priority level, note
+ * that level within the zone so that the next thread which performs
+ * scanning of this zone will immediately start out at this priority
+ * level. This affects only the decision whether or not to bring
+ * mapped pages onto the inactive list.
+ */
+ if (priority < 0)
+ priority = 0;
+
+ if (scan_global_lru(sc)) {
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ zone->prev_priority = priority;
+ }
+ } else
+ mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+
+ delayacct_freepages_end();
+
+ return ret;
+}
+
+ /*
+  * Direct reclaim on behalf of an allocation of the given @order made with
+  * @gfp_mask: sets up a scan_control for a global (non memory-controller)
+  * scan and runs do_try_to_free_pages() over @zonelist, handing back its
+  * return value unchanged.
+  */
+ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+ 								gfp_t gfp_mask)
+ {
+ 	struct scan_control sc = {
+ 		.isolate_pages = isolate_pages_global,
+ 		.mem_cgroup = NULL,
+ 		.gfp_mask = gfp_mask,
+ 		.order = order,
+ 		.swap_cluster_max = SWAP_CLUSTER_MAX,
+ 		.swappiness = vm_swappiness,
+ 		.may_swap = 1,
+ 		/* page writeout starts disabled when laptop_mode is set */
+ 		.may_writepage = !laptop_mode,
+ 	};
+ 	unsigned long nr_reclaimed;
+
+ 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ 	return nr_reclaimed;
+ }
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+
+ /*
+  * Reclaim pages charged to the memory cgroup @mem_cont.
+  *
+  * Only the GFP_RECLAIM_MASK bits of the caller's @gfp_mask are honoured;
+  * all remaining bits are taken from GFP_HIGHUSER_MOVABLE.  The scan walks
+  * the zonelists of the node the caller is currently running on and uses
+  * mem_cgroup_isolate_pages() to pick pages.  Returns whatever
+  * do_try_to_free_pages() returns.
+  */
+ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+ 						gfp_t gfp_mask)
+ {
+ 	struct scan_control sc = {
+ 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ 			    (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ 		.isolate_pages = mem_cgroup_isolate_pages,
+ 		.mem_cgroup = mem_cont,
+ 		.order = 0,
+ 		.swap_cluster_max = SWAP_CLUSTER_MAX,
+ 		.swappiness = vm_swappiness,
+ 		.may_swap = 1,
+ 		.may_writepage = !laptop_mode,
+ 	};
+
+ 	return do_try_to_free_pages(
+ 			NODE_DATA(numa_node_id())->node_zonelists, &sc);
+ }
+#endif
+
+/*
+ * For kswapd, balance_pgdat() will work across all this node's zones until
+ * they are all at pages_high.
+ *
+ * Returns the number of pages which were actually freed.
+ *
+ * There is special handling here for zones which are full of pinned pages.
+ * This can happen if the pages are all mlocked, or if they are all used by
+ * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
+ * What we do is to detect the case where all pages in the zone have been
+ * scanned twice and there has been zero successful reclaim. Mark the zone as
+ * dead and from now on, only perform a short scan. Basically we're polling
+ * the zone for when the problem goes away.
+ *
+ * kswapd scans the zones in the highmem->normal->dma direction. It skips
+ * zones which have free_pages > pages_high, but once a zone is found to have
+ * free_pages <= pages_high, we scan that zone and the lower zones regardless
+ * of the number of free pages in the lower zones. This interoperates with
+ * the page allocator fallback scheme to ensure that aging of pages is balanced
+ * across the zones.
+ */
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+{
+ int all_zones_ok;
+ int priority;
+ int i;
+ unsigned long total_scanned;
+ unsigned long nr_reclaimed;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 1,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .swappiness = vm_swappiness,
+ .order = order,
+ .mem_cgroup = NULL,
+ .isolate_pages = isolate_pages_global,
+ };
+ /*
+ * temp_priority is used to remember the scanning priority at which
+ * this zone was successfully refilled to free_pages == pages_high.
+ */
+ int temp_priority[MAX_NR_ZONES];
+
+loop_again:
+ total_scanned = 0;
+ nr_reclaimed = 0;
+ sc.may_writepage = !laptop_mode;
+ count_vm_event(PAGEOUTRUN);
+
+ for (i = 0; i < pgdat->nr_zones; i++)
+ temp_priority[i] = DEF_PRIORITY;
+
+ for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
+ unsigned long lru_pages = 0;
+
+ /* The swap token gets in the way of swapout... */
+ if (!priority)
+ disable_swap_token();
+
+ all_zones_ok = 1;
+
+ /*
+ * Scan in the highmem->dma direction for the highest
+ * zone which needs scanning
+ */
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
+ continue;
+
+ /*
+ * Do some background aging of the anon list, to give
+ * pages a chance to be referenced before reclaiming.
+ */
+ if (inactive_anon_is_low(zone))
+ shrink_active_list(SWAP_CLUSTER_MAX, zone,
+ &sc, priority, 0);
+
+ if (!zone_watermark_ok(zone, order, zone->pages_high,
+ 0, 0)) {
+ end_zone = i;
+ break;
+ }
+ }
+ if (i < 0)
+ goto out;
+
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ lru_pages += zone_lru_pages(zone);
+ }
+
+ /*
+ * Now scan the zone in the dma->highmem direction, stopping
+ * at the last zone which needs scanning.
+ *
+ * We do this because the page allocator works in the opposite
+ * direction. This prevents the page allocator from allocating
+ * pages behind kswapd's direction of progress, which would
+ * cause too much scanning of the lower zones.
+ */
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ int nr_slab;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
+ continue;
+
+ if (!zone_watermark_ok(zone, order, zone->pages_high,
+ end_zone, 0))
+ all_zones_ok = 0;
+ temp_priority[i] = priority;
+ sc.nr_scanned = 0;
+ note_zone_scanning_priority(zone, priority);
+ /*
+ * We put equal pressure on every zone, unless one
+ * zone has way too many pages free already.
+ */
+ if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
+ end_zone, 0))
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
+ nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+ if (zone_is_all_unreclaimable(zone))
+ continue;
+ if (nr_slab == 0 && zone->pages_scanned >=
+ (zone_lru_pages(zone) * 6))
+ zone_set_flag(zone,
+ ZONE_ALL_UNRECLAIMABLE);
+ /*
+ * If we've done a decent amount of scanning and
+ * the reclaim ratio is low, start doing writepage
+ * even in laptop mode
+ */
+ if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+ total_scanned > nr_reclaimed + nr_reclaimed / 2)
+ sc.may_writepage = 1;
+ }
+ if (all_zones_ok)
+ break; /* kswapd: all done */
+ /*
+ * OK, kswapd is getting into trouble. Take a nap, then take
+ * another pass across the zones.
+ */
+ if (total_scanned && priority < DEF_PRIORITY - 2)
+ congestion_wait(WRITE, HZ/10);
+
+ /*
+ * We do this so kswapd doesn't build up large priorities for
+ * example when it is freeing in parallel with allocators. It
+ * matches the direct reclaim path behaviour in terms of impact
+ * on zone->*_priority.
+ */
+ if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+ break;
+ }
+out:
+ /*
+ * Note within each zone the priority level at which this zone was
+ * brought into a happy state. So that the next thread which scans this
+ * zone will start out at that priority level.
+ */
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ zone->prev_priority = temp_priority[i];
+ }
+ if (!all_zones_ok) {
+ cond_resched();
+
+ try_to_freeze();
+
+ goto loop_again;
+ }
+
+ return nr_reclaimed;
+}
+
+/*
+ * The background pageout daemon, started as a kernel thread
+ * from the init process.
+ *
+ * This basically trickles out pages so that we have _some_
+ * free memory available even if there is no other activity
+ * that frees anything up. This is needed for things like routing
+ * etc, where we otherwise might have all activity going on in
+ * asynchronous contexts that cannot page things out.
+ *
+ * If there are applications that are active memory-allocators
+ * (most normal use), this basically shouldn't matter.
+ */
+static int kswapd(void *p)
+{
+ unsigned long order;
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ DEFINE_WAIT(wait);
+ struct reclaim_state reclaim_state = {
+ .reclaimed_slab = 0,
+ };
+ node_to_cpumask_ptr(cpumask, pgdat->node_id);
+
+ if (!cpus_empty(*cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+ current->reclaim_state = &reclaim_state;
+
+ /*
+ * Tell the memory management that we're a "memory allocator",
+ * and that if we need more memory we should get access to it
+ * regardless (see "__alloc_pages()"). "kswapd" should
+ * never get caught in the normal page freeing logic.
+ *
+ * (Kswapd normally doesn't need memory anyway, but sometimes
+ * you need a small amount of memory in order to be able to
+ * page out something else, and this flag essentially protects
+ * us from recursively trying to free more memory as we're
+ * trying to free the first piece of memory in the first place).
+ */
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ set_freezable();
+
+ order = 0;
+ for ( ; ; ) {
+ unsigned long new_order;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ new_order = pgdat->kswapd_max_order;
+ pgdat->kswapd_max_order = 0;
+ if (order < new_order) {
+ /*
+ * Don't sleep if someone wants a larger 'order'
+ * allocation
+ */
+ order = new_order;
+ } else {
+ if (!freezing(current))
+ schedule();
+
+ order = pgdat->kswapd_max_order;
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
+
+ if (!try_to_freeze()) {
+ /* We can speed up thawing tasks if we don't call
+ * balance_pgdat after returning from the refrigerator
+ */
+ balance_pgdat(pgdat, order);
+ }
+ }
+ return 0;
+}
+
+/*
+ * A zone is low on free memory, so wake its kswapd task to service it.
+ */
+void wakeup_kswapd(struct zone *zone, int order)
+{
+ pg_data_t *pgdat;
+
+ if (!populated_zone(zone))
+ return;
+
+ pgdat = zone->zone_pgdat;
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+ return;
+ if (pgdat->kswapd_max_order < order)
+ pgdat->kswapd_max_order = order;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ return;
+ if (!waitqueue_active(&pgdat->kswapd_wait))
+ return;
+ wake_up_interruptible(&pgdat->kswapd_wait);
+}
+
+ /*
+  * Sum of the four global LRU counters: active and inactive, anon and file.
+  */
+ unsigned long global_lru_pages(void)
+ {
+ 	unsigned long active, inactive;
+
+ 	active = global_page_state(NR_ACTIVE_ANON) +
+ 		 global_page_state(NR_ACTIVE_FILE);
+ 	inactive = global_page_state(NR_INACTIVE_ANON) +
+ 		   global_page_state(NR_INACTIVE_FILE);
+
+ 	return active + inactive;
+ }
+
+#ifdef CONFIG_PM
+/*
+ * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+ int pass, struct scan_control *sc)
+{
+ struct zone *zone;
+ unsigned long nr_to_scan, ret = 0;
+ enum lru_list l;
+
+ for_each_zone(zone) {
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
+ continue;
+
+ for_each_evictable_lru(l) {
+ /* For pass = 0, we don't shrink the active list */
+ if (pass == 0 &&
+ (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
+ continue;
+
+ zone->lru[l].nr_scan +=
+ (zone_page_state(zone, NR_LRU_BASE + l)
+ >> prio) + 1;
+ if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+ zone->lru[l].nr_scan = 0;
+ nr_to_scan = min(nr_pages,
+ zone_page_state(zone,
+ NR_LRU_BASE + l));
+ ret += shrink_list(l, nr_to_scan, zone,
+ sc, prio);
+ if (ret >= nr_pages)
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
+ */
+unsigned long shrink_all_memory(unsigned long nr_pages)
+{
+ unsigned long lru_pages, nr_slab;
+ unsigned long ret = 0;
+ int pass;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 0,
+ .swap_cluster_max = nr_pages,
+ .may_writepage = 1,
+ .swappiness = vm_swappiness,
+ .isolate_pages = isolate_pages_global,
+ };
+
+ current->reclaim_state = &reclaim_state;
+
+ lru_pages = global_lru_pages();
+ nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
+ /* If slab caches are huge, it's better to hit them first */
+ while (nr_slab >= lru_pages) {
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ if (!reclaim_state.reclaimed_slab)
+ break;
+
+ ret += reclaim_state.reclaimed_slab;
+ if (ret >= nr_pages)
+ goto out;
+
+ nr_slab -= reclaim_state.reclaimed_slab;
+ }
+
+ /*
+ * We try to shrink LRUs in 5 passes:
+ * 0 = Reclaim from inactive_list only
+ * 1 = Reclaim from active list but don't reclaim mapped
+ * 2 = 2nd pass of type 1
+ * 3 = Reclaim mapped (normal reclaim)
+ * 4 = 2nd pass of type 3
+ */
+ for (pass = 0; pass < 5; pass++) {
+ int prio;
+
+ /* Force reclaiming mapped pages in the passes #3 and #4 */
+ if (pass > 2) {
+ sc.may_swap = 1;
+ sc.swappiness = 100;
+ }
+
+ for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+ unsigned long nr_to_scan = nr_pages - ret;
+
+ sc.nr_scanned = 0;
+ ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+ if (ret >= nr_pages)
+ goto out;
+
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(sc.nr_scanned, sc.gfp_mask,
+ global_lru_pages());
+ ret += reclaim_state.reclaimed_slab;
+ if (ret >= nr_pages)
+ goto out;
+
+ if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+ congestion_wait(WRITE, HZ / 10);
+ }
+ }
+
+ /*
+ * If ret = 0, we could not shrink LRUs, but there may be something
+ * in slab caches
+ */
+ if (!ret) {
+ do {
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
+ ret += reclaim_state.reclaimed_slab;
+ } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+ }
+
+out:
+ current->reclaim_state = NULL;
+
+ return ret;
+}
+#endif
+
+ /*
+  * CPU hotplug callback.
+  *
+  * Keeping each kswapd on the CPUs of its own node is optimal but not
+  * required for correctness, so a kswapd whose node lost its last CPU is
+  * allowed to run anywhere.  When a CPU comes online, walk all nodes with
+  * memory and re-apply the node's CPU mask to its kswapd wherever at least
+  * one CPU of that mask is online.
+  *
+  * Always returns NOTIFY_OK; actions other than CPU_ONLINE and
+  * CPU_ONLINE_FROZEN are ignored.
+  */
+ static int __devinit cpu_callback(struct notifier_block *nfb,
+ 				  unsigned long action, void *hcpu)
+ {
+ 	int nid;
+
+ 	if (action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+ 		return NOTIFY_OK;
+
+ 	for_each_node_state(nid, N_HIGH_MEMORY) {
+ 		pg_data_t *pgdat = NODE_DATA(nid);
+ 		node_to_cpumask_ptr(mask, pgdat->node_id);
+
+ 		/* None of this node's CPUs is online: nothing to restore. */
+ 		if (any_online_cpu(*mask) >= nr_cpu_ids)
+ 			continue;
+
+ 		set_cpus_allowed_ptr(pgdat->kswapd, mask);
+ 	}
+
+ 	return NOTIFY_OK;
+ }
+
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kswapd)
+ return 0;
+
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state == SYSTEM_BOOTING);
+ printk("Failed to start kswapd on node %d\n",nid);
+ ret = -1;
+ }
+ return ret;
+}
+
+ /*
+  * Boot-time setup: run swap_setup(), start one kswapd for every node in the
+  * N_HIGH_MEMORY state and register cpu_callback() as a CPU hotplug notifier.
+  * The result of kswapd_run() is not checked here.  Always returns 0.
+  */
+ static int __init kswapd_init(void)
+ {
+ 	int nid;
+
+ 	swap_setup();
+
+ 	for_each_node_state(nid, N_HIGH_MEMORY) {
+ 		kswapd_run(nid);
+ 	}
+
+ 	hotcpu_notifier(cpu_callback, 0);
+
+ 	return 0;
+ }
+
+ module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ */
+int zone_reclaim_mode __read_mostly;
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+/*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ int priority;
+ unsigned long nr_reclaimed = 0;
+ struct scan_control sc = {
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .swap_cluster_max = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
+ .swappiness = vm_swappiness,
+ .isolate_pages = isolate_pages_global,
+ };
+ unsigned long slab_reclaimable;
+
+ disable_swap_token();
+ cond_resched();
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ if (zone_page_state(zone, NR_FILE_PAGES) -
+ zone_page_state(zone, NR_FILE_MAPPED) >
+ zone->min_unmapped_pages) {
+ /*
+ * Free memory by calling shrink zone with increasing
+ * priorities until we have enough memory freed.
+ */
+ priority = ZONE_RECLAIM_PRIORITY;
+ do {
+ note_zone_scanning_priority(zone, priority);
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && nr_reclaimed < nr_pages);
+ }
+
+ slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (slab_reclaimable > zone->min_slab_pages) {
+ /*
+ * shrink_slab() does not currently allow us to determine how
+ * many pages were freed in this zone. So we take the current
+ * number of slab pages and shake the slab until it is reduced
+ * by the same nr_pages that we used for reclaiming unmapped
+ * pages.
+ *
+ * Note that shrink_slab will free memory on all zones and may
+ * take a long time.
+ */
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+ slab_reclaimable - nr_pages)
+ ;
+
+ /*
+ * Update nr_reclaimed by the number of slab pages we
+ * reclaimed from this zone.
+ */
+ nr_reclaimed += slab_reclaimable -
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ }
+
+ p->reclaim_state = NULL;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ return nr_reclaimed >= nr_pages;
+}
+
+/*
+ * zone_reclaim - decide whether to run zone-local reclaim and, if so, do it.
+ * @zone: zone to reclaim from
+ * @gfp_mask: allocation flags of the request that triggered reclaim
+ * @order: allocation order of that request
+ *
+ * Returns 0 when reclaim is skipped, otherwise whatever __zone_reclaim()
+ * returns.  ZONE_RECLAIM_LOCKED is held around the call so only one
+ * reclaimer works on the zone at a time.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	unsigned long unmapped_file;
+	int nid;
+	int reclaimed;
+
+	/*
+	 * Zone reclaim reclaims unmapped file backed pages and
+	 * slab pages if we are over the defined limits.
+	 *
+	 * A small portion of unmapped file backed pages is needed for
+	 * file I/O otherwise pages read by file I/O will be immediately
+	 * thrown out if the zone is overallocated. So we do not reclaim
+	 * if less than a specified percentage of the zone is used by
+	 * unmapped file backed pages.
+	 */
+	unmapped_file = zone_page_state(zone, NR_FILE_PAGES) -
+			zone_page_state(zone, NR_FILE_MAPPED);
+	if (unmapped_file <= zone->min_unmapped_pages &&
+	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+		return 0;
+
+	if (zone_is_all_unreclaimable(zone))
+		return 0;
+
+	/* Callers that cannot wait, or are already reclaiming, must not scan. */
+	if (!(gfp_mask & __GFP_WAIT))
+		return 0;
+	if (current->flags & PF_MEMALLOC)
+		return 0;
+
+	/*
+	 * Only run zone reclaim on the local zone or on zones that do not
+	 * have associated processors. This will favor the local processor
+	 * over remote processors and spread off node memory allocations
+	 * as wide as possible.
+	 */
+	nid = zone_to_nid(zone);
+	if (node_state(nid, N_CPU) && nid != numa_node_id())
+		return 0;
+
+	/* Somebody else is already reclaiming this zone. */
+	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+		return 0;
+
+	reclaimed = __zone_reclaim(zone, gfp_mask, order);
+	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+	return reclaimed;
+}
+#endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ * @vma: the VMA in which the page is or will be mapped, may be NULL
+ *
+ * Decides whether @page belongs on the active/inactive lists (returns 1)
+ * or on the unevictable list (returns 0).  The vma argument is !NULL when
+ * called from the fault path to determine how to instantiate a new page.
+ *
+ * A page is not evictable when:
+ * (1) its mapping is marked unevictable, or
+ * (2) it is mlocked, or part of an mlocked VMA.
+ */
+int page_evictable(struct page *page, struct vm_area_struct *vma)
+{
+	int mlocked;
+
+	if (mapping_unevictable(page_mapping(page)))
+		return 0;
+
+	/* The VMA is only consulted when the page flag is not already set. */
+	mlocked = PageMlocked(page) || (vma && is_mlocked_vma(vma, page));
+
+	return !mlocked;
+}
+
+/**
+ * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
+ * @page: page to check evictability and move to appropriate lru list
+ * @zone: zone page is in
+ *
+ * Clears PageUnevictable and re-tests the page.  An evictable page goes to
+ * the inactive anon/file list of @zone; otherwise the flag is set again and
+ * the page is rotated on the unevictable list.  The test is repeated after
+ * the flag is restored, and the whole sequence is retried if the page turned
+ * evictable in the meantime.
+ *
+ * Restrictions: zone->lru_lock must be held, page must be on LRU and must
+ * have PageUnevictable set.
+ */
+static void check_move_unevictable_page(struct page *page, struct zone *zone)
+{
+	VM_BUG_ON(PageActive(page));
+
+	for (;;) {
+		ClearPageUnevictable(page);
+		if (page_evictable(page, NULL)) {
+			enum lru_list lru = LRU_INACTIVE_ANON +
+						page_is_file_cache(page);
+
+			__dec_zone_state(zone, NR_UNEVICTABLE);
+			list_move(&page->lru, &zone->lru[lru].list);
+			__inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+			__count_vm_event(UNEVICTABLE_PGRESCUED);
+			return;
+		}
+
+		/* Still unevictable: rotate it on the unevictable list. */
+		SetPageUnevictable(page);
+		list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
+		if (!page_evictable(page, NULL))
+			return;
+		/* Became evictable after the flag was restored: go again. */
+	}
+}
+
+/**
+ * scan_mapping_unevictable_pages - scan an address space for evictable pages
+ * @mapping: struct address_space to scan for evictable pages
+ *
+ * Walks the page cache of @mapping one pagevec at a time, up to the page
+ * index covering i_size.  Each page that is on an LRU and flagged
+ * unevictable is re-checked under its zone's lru_lock and moved to the
+ * appropriate zone lru list.
+ */
+void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+	pgoff_t index = 0;
+	pgoff_t limit = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
+							PAGE_CACHE_SHIFT;
+	struct pagevec pvec;
+
+	if (!mapping->nrpages)
+		return;
+
+	pagevec_init(&pvec, 0);
+	while (index < limit &&
+	       pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
+		struct zone *locked_zone = NULL;
+		int nr_seen = 0;
+		int i;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t pgidx = page->index;
+			struct zone *pagezone = page_zone(page);
+
+			nr_seen++;
+			/* Next lookup resumes just past the highest index seen. */
+			if (pgidx > index)
+				index = pgidx;
+			index++;
+
+			/* Switch lru_lock only when the zone changes. */
+			if (pagezone != locked_zone) {
+				if (locked_zone)
+					spin_unlock_irq(&locked_zone->lru_lock);
+				locked_zone = pagezone;
+				spin_lock_irq(&locked_zone->lru_lock);
+			}
+
+			if (PageLRU(page) && PageUnevictable(page))
+				check_move_unevictable_page(page, locked_zone);
+		}
+		if (locked_zone)
+			spin_unlock_irq(&locked_zone->lru_lock);
+		pagevec_release(&pvec);
+
+		count_vm_events(UNEVICTABLE_PGSCANNED, nr_seen);
+	}
+
+}
+
+/**
+ * scan_zone_unevictable_pages - check unevictable list for evictable pages
+ * @zone: zone of which to scan the unevictable list
+ *
+ * Scan @zone's unevictable LRU list to check for pages that have become
+ * evictable.  Those are moved to @zone's inactive list, where they become
+ * candidates for reclaim; pages that are still unevictable are rotated
+ * back onto @zone's unevictable list.
+ *
+ * The number of pages to visit is sampled once from NR_UNEVICTABLE, and
+ * lru_lock is dropped between batches of SCAN_UNEVICTABLE_BATCH_SIZE.
+ */
+#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
+void scan_zone_unevictable_pages(struct zone *zone)
+{
+	struct list_head *unevictable = &zone->lru[LRU_UNEVICTABLE].list;
+	unsigned long remaining = zone_page_state(zone, NR_UNEVICTABLE);
+	unsigned long done;
+
+	while (remaining > 0) {
+		unsigned long batch = min(remaining,
+					  SCAN_UNEVICTABLE_BATCH_SIZE);
+
+		spin_lock_irq(&zone->lru_lock);
+		for (done = 0; done < batch; done++) {
+			struct page *page = lru_to_page(unevictable);
+
+			/* A page we cannot lock still counts against the batch. */
+			if (trylock_page(page)) {
+				prefetchw_prev_lru_page(page, unevictable, flags);
+
+				if (likely(PageLRU(page) && PageUnevictable(page)))
+					check_move_unevictable_page(page, zone);
+
+				unlock_page(page);
+			}
+		}
+		spin_unlock_irq(&zone->lru_lock);
+
+		remaining -= batch;
+	}
+}
+
+
+/**
+ * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
+ *
+ * A really big hammer: run scan_zone_unevictable_pages() on every zone so
+ * that pages which have become evictable go back to the zones' inactive
+ * lists, where they become candidates for reclaim.
+ * This occurs when, e.g., we have unswappable pages on the unevictable lists,
+ * and we add swap to the system. As such, it runs in the context of a task
+ * that has possibly/probably made some previously unevictable pages
+ * evictable.
+ */
+void scan_all_zones_unevictable_pages(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone)
+		scan_zone_unevictable_pages(zone);
+}
+
+/*
+ * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
+ * all nodes' unevictable lists for evictable pages
+ *
+ * Writing a non-zero value triggers scan_all_zones_unevictable_pages().
+ * The variable is reset to zero afterwards, so reads always report 0.
+ *
+ * Returns the result of proc_doulongvec_minmax().  A failed parse is
+ * reported to the caller and never triggers a scan.
+ */
+unsigned long scan_unevictable_pages;
+
+int scan_unevictable_handler(struct ctl_table *table, int write,
+			   struct file *file, void __user *buffer,
+			   size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	/* Only act on a value that was actually parsed and stored. */
+	if (!ret && write && *(unsigned long *)table->data)
+		scan_all_zones_unevictable_pages();
+
+	scan_unevictable_pages = 0;
+	return ret;
+}
+
+/*
+ * per node 'scan_unevictable_pages' attribute. On demand re-scan of
+ * a specified node's per zone unevictable lists for evictable pages.
+ */
+
+/* Show handler: the attribute is a trigger only, so it always reads "0". */
+static ssize_t read_scan_unevictable_node(struct sys_device *dev,
+					  struct sysdev_attribute *attr,
+					  char *buf)
+{
+	ssize_t len;
+
+	len = sprintf(buf, "0\n");	/* always zero; should fit... */
+
+	return len;
+}
+
+/*
+ * Store handler for the per node 'scan_unevictable_pages' attribute.
+ *
+ * Parses @buf as a decimal number.  A non-zero value re-scans the
+ * unevictable list of every populated zone of node dev->id; zero is a no-op.
+ *
+ * Returns @count when the input was accepted, so the whole write is
+ * consumed in one call, or the error from strict_strtoul() when @buf is
+ * not a valid number.
+ */
+static ssize_t write_scan_unevictable_node(struct sys_device *dev,
+					   struct sysdev_attribute *attr,
+					   const char *buf, size_t count)
+{
+	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
+	struct zone *zone;
+	unsigned long req;
+	int err;
+
+	/* strict_strtoul() returns a status; the parsed value lands in req. */
+	err = strict_strtoul(buf, 10, &req);
+	if (err)
+		return err;
+
+	if (!req)
+		return count;	/* zero is no-op */
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+		scan_zone_unevictable_pages(zone);
+	}
+	return count;
+}
+
+
+static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+ read_scan_unevictable_node,
+ write_scan_unevictable_node); /* attribute object referenced elsewhere in this file as attr_scan_unevictable_pages */
+
+int scan_unevictable_register_node(struct node *node)
+{
+ return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); /* add the scan_unevictable_pages attribute to @node; result is passed through */
+}
+
+void scan_unevictable_unregister_node(struct node *node)
+{
+ sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); /* undo scan_unevictable_register_node() for @node */
+}
+
+#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
new file mode 100644
index 0000000..c3ccfda
--- /dev/null
+++ b/mm/vmstat.c
@@ -0,0 +1,969 @@
+/*
+ * linux/mm/vmstat.c
+ *
+ * Manages VM statistics
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * zoned VM statistics
+ * Copyright (C) 2006 Silicon Graphics, Inc.,
+ * Christoph Lameter <christoph@lameter.com>
+ */
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/vmstat.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; /* per-cpu event counters, zero-initialised; summed by sum_vm_events() */
+EXPORT_PER_CPU_SYMBOL(vm_event_states);
+
+/*
+ * Zero @ret (NR_VM_EVENT_ITEMS entries) and add up the per-cpu event
+ * counters of every cpu set in @cpumask.
+ */
+static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+{
+	int item;
+	int cpu;
+
+	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+
+	for_each_cpu_mask_nr(cpu, *cpumask) {
+		struct vm_event_state *state = &per_cpu(vm_event_states, cpu);
+
+		item = 0;
+		while (item < NR_VM_EVENT_ITEMS) {
+			ret[item] += state->event[item];
+			item++;
+		}
+	}
+}
+
+/*
+ * Accumulate the vm event counters across all online CPUs into @ret,
+ * which must have room for NR_VM_EVENT_ITEMS entries (sum_vm_events()
+ * clears and fills that many).
+ * The walk over cpu_online_map is bracketed by get_online_cpus() and
+ * put_online_cpus().
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+*/
+void all_vm_events(unsigned long *ret)
+{
+ get_online_cpus();
+ sum_vm_events(ret, &cpu_online_map);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(all_vm_events);
+
+#ifdef CONFIG_HOTPLUG
+/*
+ * Fold the foreign cpu events into our own.
+ *
+ * Every counter of @cpu is added to the counters of the processor running
+ * this code and then cleared, so the global totals stay constant.
+ */
+void vm_events_fold_cpu(int cpu)
+{
+	struct vm_event_state *src = &per_cpu(vm_event_states, cpu);
+	int item = 0;
+
+	while (item < NR_VM_EVENT_ITEMS) {
+		count_vm_events(item, src->event[item]);
+		src->event[item] = 0;
+		item++;
+	}
+}
+#endif /* CONFIG_HOTPLUG */
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
+/*
+ * Manage combined zone based / global counters
+ *
+ * vm_stat contains the global counters, one atomic_long_t per zone stat
+ * item.  refresh_cpu_vm_stats() folds the per-cpu differentials into both
+ * the per-zone counters and this array.
+ */
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+EXPORT_SYMBOL(vm_stat);
+
+#ifdef CONFIG_SMP
+
+/*
+ * Compute the per-cpu differential threshold for @zone.
+ *
+ * The threshold scales with the number of processors and the amount
+ * of memory per zone. More memory means that we can defer updates for
+ * longer, more processors could lead to more contention.
+ * fls() is used to have a cheap way of logarithmic scaling.
+ * The result is 2 * fls(online cpus) * (1 + fls(mem)), capped at 125,
+ * where mem is the zone size in 128 MB units.
+ *
+ * Some sample thresholds:
+ *
+ * Threshold Processors (fls) Zonesize 1+fls(mem)
+ * ------------------------------------------------------------------
+ * 8 1 1 0.9-1 GB 4
+ * 16 2 2 0.9-1 GB 4
+ * 20 2 2 1-2 GB 5
+ * 24 2 2 2-4 GB 6
+ * 28 2 2 4-8 GB 7
+ * 32 2 2 8-16 GB 8
+ * 4 2 2 <128M 1
+ * 30 4 3 2-4 GB 5
+ * 48 4 3 8-16 GB 8
+ * 32 8 4 1-2 GB 4
+ * 32 8 4 0.9-1GB 4
+ * 10 16 5 <128M 1
+ * 40 16 5 900M 4
+ * 70 64 7 2-4 GB 5
+ * 84 64 7 4-8 GB 6
+ * 108 512 9 4-8 GB 6
+ * 125 1024 10 8-16 GB 8
+ * 125 1024 10 16-32 GB 9
+ */
+static int calculate_threshold(struct zone *zone)
+{
+	int mem = zone->present_pages >> (27 - PAGE_SHIFT);	/* 128 MB units */
+	int threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+	/* Maximum threshold is 125 */
+	if (threshold > 125)
+		threshold = 125;
+
+	return threshold;
+}
+
+/*
+ * Refresh the thresholds for each zone.
+ *
+ * Zones without present pages are skipped; for every other zone the value
+ * from calculate_threshold() is stored in the pageset of each online cpu.
+ */
+static void refresh_zone_stat_thresholds(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		int limit;
+		int cpu;
+
+		if (zone->present_pages == 0)
+			continue;
+
+		limit = calculate_threshold(zone);
+
+		for_each_online_cpu(cpu)
+			zone_pcp(zone, cpu)->stat_threshold = limit;
+	}
+}
+
+/*
+ * For use when we know that interrupts are disabled.
+ *
+ * Adds @delta to this cpu's differential for @item.  Once the pending
+ * amount is outside +/- stat_threshold it is pushed to the zone counter
+ * through zone_page_state_add() and the differential restarts at zero.
+ */
+void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+				int delta)
+{
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *diff = &pcp->vm_stat_diff[item];
+	long pending = delta + *diff;
+
+	if (unlikely(pending > pcp->stat_threshold ||
+		     pending < -pcp->stat_threshold)) {
+		zone_page_state_add(pending, zone, item);
+		pending = 0;
+	}
+	*diff = pending;
+}
+EXPORT_SYMBOL(__mod_zone_page_state);
+
+/*
+ * For an unknown interrupt state
+ *
+ * Same as __mod_zone_page_state(), but saves and disables local
+ * interrupts around the update and restores them afterwards.
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+/*
+ * Optimized increment and decrement functions.
+ *
+ * These are only for a single page and therefore can take a struct page *
+ * argument instead of struct zone *. This allows the inclusion of the code
+ * generated for page_zone(page) into the optimized functions.
+ *
+ * No overflow check is necessary and therefore the differential can be
+ * incremented or decremented in place which may allow the compilers to
+ * generate better code.
+ * The increment or decrement is known and therefore one boundary check can
+ * be omitted.
+ *
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
+ * Some processors have inc/dec instructions that are atomic vs an interrupt.
+ * However, the code must first determine the differential location in a zone
+ * based on the processor number and then inc/dec the counter. There is no
+ * guarantee without disabling preemption that the processor will not change
+ * in between and therefore the atomicity vs. interrupt cannot be exploited
+ * in a useful way here.
+ */
+void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *diff = &pcp->vm_stat_diff[item];
+
+	(*diff)++;
+
+	/*
+	 * Past the threshold: fold the differential plus half a threshold
+	 * into the zone counter and leave the differential at minus that
+	 * half, so the next fold is further away.
+	 */
+	if (unlikely(*diff > pcp->stat_threshold)) {
+		int half = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*diff + half, zone, item);
+		*diff = -half;
+	}
+}
+
+void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ __inc_zone_state(page_zone(page), item); /* page-based front end: increments @item for the zone @page belongs to */
+}
+EXPORT_SYMBOL(__inc_zone_page_state);
+
+void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *diff = &pcp->vm_stat_diff[item];
+
+	(*diff)--;
+
+	/*
+	 * Below the negative threshold: fold the differential minus half a
+	 * threshold into the zone counter and leave the differential at plus
+	 * that half.
+	 */
+	if (unlikely(*diff < - pcp->stat_threshold)) {
+		int half = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*diff - half, zone, item);
+		*diff = half;
+	}
+}
+
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ __dec_zone_state(page_zone(page), item); /* page-based front end: decrements @item for the zone @page belongs to */
+}
+EXPORT_SYMBOL(__dec_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags); /* __inc_zone_state() is run with local interrupts off */
+ __inc_zone_state(zone, item);
+ local_irq_restore(flags);
+}
+
+/*
+ * Increment @item for the zone of @page, with local interrupts disabled
+ * around the counter update.
+ */
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	struct zone *zone = page_zone(page);	/* resolved before irqs go off */
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_zone_state(zone, item);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags); /* the zone lookup and the decrement both run with local interrupts off */
+ __dec_zone_page_state(page, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+
+/*
+ * Update the zone counters for one cpu.
+ *
+ * The cpu specified must be either the current cpu or a processor that
+ * is not online. If it is the current cpu then the execution thread must
+ * be pinned to the current cpu.
+ *
+ * Every non-zero per-cpu differential of a populated zone is cleared with
+ * interrupts off and added to the zone counter; the same amounts are
+ * collected in a local array and added to the global vm_stat[] counters
+ * once, after all zones have been visited.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
+ */
+void refresh_cpu_vm_stats(int cpu)
+{
+	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	struct zone *zone;
+	int item;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+
+		if (!populated_zone(zone))
+			continue;
+
+		pset = zone_pcp(zone, cpu);
+
+		for (item = 0; item < NR_VM_ZONE_STAT_ITEMS; item++) {
+			unsigned long flags;
+			int delta;
+
+			if (!pset->vm_stat_diff[item])
+				continue;
+
+			local_irq_save(flags);
+			delta = pset->vm_stat_diff[item];
+			pset->vm_stat_diff[item] = 0;
+			local_irq_restore(flags);
+			atomic_long_add(delta, &zone->vm_stat[item]);
+			global_diff[item] += delta;
+#ifdef CONFIG_NUMA
+			/* 3 seconds idle till flush */
+			pset->expire = 3;
+#endif
+		}
+		cond_resched();
+#ifdef CONFIG_NUMA
+		/*
+		 * Deal with draining the remote pageset of this
+		 * processor
+		 *
+		 * Check if there are pages remaining in this pageset
+		 * if not then there is nothing to expire.
+		 */
+		if (!pset->expire || !pset->pcp.count)
+			continue;
+
+		/*
+		 * We never drain zones local to this processor.
+		 */
+		if (zone_to_nid(zone) == numa_node_id()) {
+			pset->expire = 0;
+			continue;
+		}
+
+		pset->expire--;
+		if (pset->expire)
+			continue;
+
+		if (pset->pcp.count)
+			drain_zone_pages(zone, &pset->pcp);
+#endif
+	}
+
+	for (item = 0; item < NR_VM_ZONE_STAT_ITEMS; item++) {
+		if (global_diff[item])
+			atomic_long_add(global_diff[item], &vm_stat[item]);
+	}
+}
+
+#endif
+
+#ifdef CONFIG_NUMA
+/*
+ * preferred_zone = the zone the allocation was aimed at
+ * z = the zone from which the allocation occurred.
+ *
+ * Counts NUMA_HIT when both zones sit on the same node, otherwise
+ * NUMA_MISS on @z and NUMA_FOREIGN on @preferred_zone.  Independently,
+ * @z gets NUMA_LOCAL or NUMA_OTHER depending on whether it is on the
+ * node of the running cpu.
+ *
+ * Must be called with interrupts disabled.
+ */
+void zone_statistics(struct zone *preferred_zone, struct zone *z)
+{
+	if (z->zone_pgdat != preferred_zone->zone_pgdat) {
+		__inc_zone_state(z, NUMA_MISS);
+		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
+	} else {
+		__inc_zone_state(z, NUMA_HIT);
+	}
+
+	__inc_zone_state(z, z->node == numa_node_id() ?
+					NUMA_LOCAL : NUMA_OTHER);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static char * const migratetype_names[MIGRATE_TYPES] = { /* labels printed by the pagetypeinfo code, indexed by migrate type; presumably in MIGRATE_* enum order - confirm */
+ "Unmovable",
+ "Reclaimable",
+ "Movable",
+ "Reserve",
+ "Isolate",
+};
+
+/*
+ * seq_file start: return the online pgdat at position *pos, or NULL when
+ * *pos is past the last online node.
+ */
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *pgdat = first_online_pgdat();
+	loff_t skip = *pos;
+
+	while (pgdat && skip) {
+		pgdat = next_online_pgdat(pgdat);
+		--skip;
+	}
+
+	return pgdat;
+}
+
+/* seq_file next: bump the position and step to the following online pgdat. */
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *cur = arg;
+
+	++*pos;
+
+	return next_online_pgdat(cur);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{ /* nothing to release: frag_start()/frag_next() only walk pgdat pointers */
+}
+
+/*
+ * Walk all the zones in a node and print using a callback.
+ *
+ * Unpopulated zones are skipped.  @print runs with zone->lock held and
+ * interrupts disabled.
+ */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+{
+	unsigned long flags;
+	int idx;
+
+	for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+		struct zone *zone = &pgdat->node_zones[idx];
+
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		print(m, pgdat, zone);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+}
+
+/* Print one line for @zone: the nr_free count of every order below MAX_ORDER. */
+static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
+						struct zone *zone)
+{
+	int order = 0;
+
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	while (order < MAX_ORDER) {
+		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+		++order;
+	}
+	seq_putc(m, '\n');
+}
+
+/*
+ * This walks the free areas for each zone.
+ *
+ * seq_file show callback: @arg is the pgdat handed out by frag_start() or
+ * frag_next(); frag_show_print() is run on each of its populated zones.
+ * Always returns 0.
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ walk_zones_in_node(m, pgdat, frag_show_print);
+ return 0;
+}
+
+/*
+ * For every migrate type print one line for @zone holding, per order, the
+ * number of entries on that type's free list (counted by walking the list).
+ */
+static void pagetypeinfo_showfree_print(struct seq_file *m,
+					pg_data_t *pgdat, struct zone *zone)
+{
+	int type;
+
+	for (type = 0; type < MIGRATE_TYPES; type++) {
+		int order;
+
+		seq_printf(m, "Node %4d, zone %8s, type %12s ",
+					pgdat->node_id,
+					zone->name,
+					migratetype_names[type]);
+		for (order = 0; order < MAX_ORDER; ++order) {
+			struct free_area *area = &zone->free_area[order];
+			struct list_head *pos;
+			unsigned long nr_free = 0;
+
+			list_for_each(pos, &area->free_list[type])
+				nr_free++;
+			seq_printf(m, "%6lu ", nr_free);
+		}
+		seq_putc(m, '\n');
+	}
+}
+
+/*
+ * Print out the free pages at each order for each migratetype: a header
+ * line listing the orders, then the per-zone lines for the node in @arg.
+ */
+static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+	int order = 0;
+
+	/* Print header */
+	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
+	while (order < MAX_ORDER) {
+		seq_printf(m, "%6d ", order);
+		++order;
+	}
+	seq_putc(m, '\n');
+
+	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+
+	return 0;
+}
+
+/*
+ * Count the pageblocks of @zone per migrate type and print one line with
+ * the totals.  The zone's spanned pfn range is sampled once per pageblock.
+ */
+static void pagetypeinfo_showblockcount_print(struct seq_file *m,
+					pg_data_t *pgdat, struct zone *zone)
+{
+	unsigned long blocks[MIGRATE_TYPES] = { 0, };
+	unsigned long first_pfn = zone->zone_start_pfn;
+	unsigned long limit_pfn = first_pfn + zone->spanned_pages;
+	unsigned long pfn;
+	int type;
+
+	for (pfn = first_pfn; pfn < limit_pfn; pfn += pageblock_nr_pages) {
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
+		/*
+		 * Ordinarily, memory holes in flatmem still have a valid
+		 * memmap for the PFN range. However, an architecture for
+		 * embedded systems (e.g. ARM) can free up the memmap backing
+		 * holes to save memory on the assumption the memmap is
+		 * never used. The page_zone linkages are then broken even
+		 * though pfn_valid() returns true. Skip the page if the
+		 * linkages are broken. Even if this test passed, the impact
+		 * is that the counters for the movable type are off but
+		 * fragmentation monitoring is likely meaningless on small
+		 * systems.
+		 */
+		if (page_zone(page) != zone)
+			continue;
+#endif
+		type = get_pageblock_migratetype(page);
+
+		/* Ignore values outside the table rather than overrun it. */
+		if (type < MIGRATE_TYPES)
+			blocks[type]++;
+	}
+
+	/* Print counts */
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	for (type = 0; type < MIGRATE_TYPES; type++)
+		seq_printf(m, "%12lu ", blocks[type]);
+	seq_putc(m, '\n');
+}
+
+/*
+ * Print out the number of pageblocks of each migratetype: a header line
+ * naming the types, then one line per populated zone of the node in @arg.
+ */
+static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+	int type = 0;
+
+	seq_printf(m, "\n%-23s", "Number of blocks type ");
+	while (type < MIGRATE_TYPES) {
+		seq_printf(m, "%12s ", migratetype_names[type]);
+		type++;
+	}
+	seq_putc(m, '\n');
+	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+
+	return 0;
+}
+
+/*
+ * This prints out statistics in relation to grouping pages by mobility.
+ * It is expensive to collect so do not constantly read the file.
+ *
+ * Nodes without the N_HIGH_MEMORY state produce no output.  Always
+ * returns 0.
+ */
+static int pagetypeinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+
+	/* Only nodes that have memory are reported. */
+	if (node_state(pgdat->node_id, N_HIGH_MEMORY)) {
+		seq_printf(m, "Page block order: %d\n", pageblock_order);
+		seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
+		seq_putc(m, '\n');
+		pagetypeinfo_showfree(m, pgdat);
+		pagetypeinfo_showblockcount(m, pgdat);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations fragmentation_op = { /* iterator used by fragmentation_open(): one record per online pgdat */
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = frag_show,
+};
+
+static int fragmentation_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fragmentation_op); /* attach the per-node iterator; seq_open()'s result is passed through */
+}
+
+static const struct file_operations fragmentation_file_operations = { /* registered as the "buddyinfo" proc entry by setup_vmstat() */
+ .open = fragmentation_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct seq_operations pagetypeinfo_op = { /* same per-node iteration as fragmentation_op, different show callback */
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = pagetypeinfo_show,
+};
+
+static int pagetypeinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &pagetypeinfo_op); /* attach the per-node iterator; seq_open()'s result is passed through */
+}
+
+static const struct file_operations pagetypeinfo_file_ops = { /* registered as the "pagetypeinfo" proc entry by setup_vmstat() */
+ .open = pagetypeinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#ifdef CONFIG_ZONE_DMA /* each TEXT_FOR_* helper yields xx plus a zone suffix and a trailing comma, or nothing when that zone type is not configured */
+#define TEXT_FOR_DMA(xx) xx "_dma",
+#else
+#define TEXT_FOR_DMA(xx)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+
+#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
+ TEXT_FOR_HIGHMEM(xx) xx "_movable", /* expands to one string per configured zone type, each followed by a comma, for use inside an array initializer */
+
+static const char * const vmstat_text[] = { /* names printed by vmstat_show() and zoneinfo_show_print() */
+ /*
+ * Zoned VM counters
+ *
+ * The first NR_VM_ZONE_STAT_ITEMS entries are indexed by zone stat item
+ * (see the loop in zoneinfo_show_print()).  vmstat_start() lays out the
+ * event counters directly after them, so the strings below must stay in
+ * the same order as the values they label.
+ */
+ "nr_free_pages",
+ "nr_inactive_anon",
+ "nr_active_anon",
+ "nr_inactive_file",
+ "nr_active_file",
+#ifdef CONFIG_UNEVICTABLE_LRU
+ "nr_unevictable",
+ "nr_mlock",
+#endif
+ "nr_anon_pages",
+ "nr_mapped",
+ "nr_file_pages",
+ "nr_dirty",
+ "nr_writeback",
+ "nr_slab_reclaimable",
+ "nr_slab_unreclaimable",
+ "nr_page_table_pages",
+ "nr_unstable",
+ "nr_bounce",
+ "nr_vmscan_write",
+ "nr_writeback_temp",
+
+#ifdef CONFIG_NUMA
+ "numa_hit",
+ "numa_miss",
+ "numa_foreign",
+ "numa_interleave",
+ "numa_local",
+ "numa_other",
+#endif
+
+#ifdef CONFIG_VM_EVENT_COUNTERS /* event counter names; only present when the counters themselves are built */
+ "pgpgin",
+ "pgpgout",
+ "pswpin",
+ "pswpout",
+
+ TEXTS_FOR_ZONES("pgalloc")
+
+ "pgfree",
+ "pgactivate",
+ "pgdeactivate",
+
+ "pgfault",
+ "pgmajfault",
+
+ TEXTS_FOR_ZONES("pgrefill")
+ TEXTS_FOR_ZONES("pgsteal")
+ TEXTS_FOR_ZONES("pgscan_kswapd")
+ TEXTS_FOR_ZONES("pgscan_direct")
+
+ "pginodesteal",
+ "slabs_scanned",
+ "kswapd_steal",
+ "kswapd_inodesteal",
+ "pageoutrun",
+ "allocstall",
+
+ "pgrotated",
+#ifdef CONFIG_HUGETLB_PAGE
+ "htlb_buddy_alloc_success",
+ "htlb_buddy_alloc_fail",
+#endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+ "unevictable_pgs_culled",
+ "unevictable_pgs_scanned",
+ "unevictable_pgs_rescued",
+ "unevictable_pgs_mlocked",
+ "unevictable_pgs_munlocked",
+ "unevictable_pgs_cleared",
+ "unevictable_pgs_stranded",
+ "unevictable_pgs_mlockfreed",
+#endif
+#endif
+};
+
+static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ struct zone *zone)
+{ /* print callback for walk_zones_in_node(): dumps watermarks, counters, lowmem reserves and per-cpu pagesets of @zone */
+ int i;
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ seq_printf(m,
+ "\n pages free %lu"
+ "\n min %lu"
+ "\n low %lu"
+ "\n high %lu"
+ "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
+ "\n spanned %lu"
+ "\n present %lu",
+ zone_page_state(zone, NR_FREE_PAGES),
+ zone->pages_min,
+ zone->pages_low,
+ zone->pages_high,
+ zone->pages_scanned,
+ zone->lru[LRU_ACTIVE_ANON].nr_scan,
+ zone->lru[LRU_INACTIVE_ANON].nr_scan,
+ zone->lru[LRU_ACTIVE_FILE].nr_scan,
+ zone->lru[LRU_INACTIVE_FILE].nr_scan,
+ zone->spanned_pages,
+ zone->present_pages);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) /* one line per zone counter, labelled from the start of vmstat_text[] */
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ zone_page_state(zone, i));
+
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) /* remaining lowmem_reserve entries, comma separated */
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ for_each_online_cpu(i) { /* i is reused as the cpu number here */
+ struct per_cpu_pageset *pageset;
+
+ pageset = zone_pcp(zone, i);
+ seq_printf(m,
+ "\n cpu: %i"
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i,
+ pageset->pcp.count,
+ pageset->pcp.high,
+ pageset->pcp.batch);
+#ifdef CONFIG_SMP
+ seq_printf(m, "\n vm stats threshold: %d",
+ pageset->stat_threshold);
+#endif
+ }
+ seq_printf(m,
+ "\n all_unreclaimable: %u"
+ "\n prev_priority: %i"
+ "\n start_pfn: %lu"
+ "\n inactive_ratio: %u",
+ zone_is_all_unreclaimable(zone),
+ zone->prev_priority,
+ zone->zone_start_pfn,
+ zone->inactive_ratio);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Output information about zones in @pgdat.
+ *
+ * seq_file show callback: @arg is the pgdat produced by the iterator;
+ * zoneinfo_show_print() is run on each of its populated zones.
+ * Always returns 0.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+ return 0;
+}
+
+static const struct seq_operations zoneinfo_op = { /* iterator used by zoneinfo_open() */
+ .start = frag_start, /* iterate over all zones. The same as in
+ * fragmentation. */
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = zoneinfo_show,
+};
+
+static int zoneinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &zoneinfo_op); /* attach the per-node iterator; seq_open()'s result is passed through */
+}
+
+static const struct file_operations proc_zoneinfo_file_operations = { /* registered as the "zoneinfo" proc entry by setup_vmstat() */
+ .open = zoneinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * seq_file start for the vmstat file.
+ *
+ * Takes a snapshot of all counters into a freshly allocated array kept in
+ * m->private: the global zone counters first and, with
+ * CONFIG_VM_EVENT_COUNTERS, the summed event counters right after them.
+ * Returns a pointer to the entry at *pos, NULL when *pos is past the end
+ * of vmstat_text[], or ERR_PTR(-ENOMEM) when the allocation fails.
+ * The array is freed by vmstat_stop().
+ */
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	size_t bytes = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long);
+	unsigned long *stats;
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	unsigned long *events;
+#endif
+	int item;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	bytes += sizeof(struct vm_event_state);
+#endif
+	stats = kmalloc(bytes, GFP_KERNEL);
+	m->private = stats;
+	if (!stats)
+		return ERR_PTR(-ENOMEM);
+
+	for (item = 0; item < NR_VM_ZONE_STAT_ITEMS; item++)
+		stats[item] = global_page_state(item);
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	events = stats + NR_VM_ZONE_STAT_ITEMS;
+	all_vm_events(events);
+	events[PGPGIN] /= 2;		/* sectors -> kbytes */
+	events[PGPGOUT] /= 2;
+#endif
+	return stats + *pos;
+}
+
+/*
+ * seq_file next: advance *pos and return the matching entry of the
+ * snapshot in m->private, or NULL once the end of vmstat_text[] is reached.
+ */
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	unsigned long *stats = m->private;
+
+	++*pos;
+	if (*pos < ARRAY_SIZE(vmstat_text))
+		return stats + *pos;
+
+	return NULL;
+}
+
+/*
+ * seq_file show: @arg points into the snapshot held in m->private; its
+ * offset from the start selects the name in vmstat_text[].
+ */
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *value = arg;
+	unsigned long *base = m->private;
+	unsigned long idx = value - base;
+
+	seq_printf(m, "%s %lu\n", vmstat_text[idx], *value);
+	return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+ kfree(m->private); /* release the snapshot allocated by vmstat_start() */
+ m->private = NULL; /* cleared so a later stop does not free the same pointer again */
+}
+
+static const struct seq_operations vmstat_op = { /* iterator used by vmstat_open(): one record per vmstat_text[] entry */
+ .start = vmstat_start,
+ .next = vmstat_next,
+ .stop = vmstat_stop,
+ .show = vmstat_show,
+};
+
+static int vmstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &vmstat_op); /* attach the counter iterator; seq_open()'s result is passed through */
+}
+
+static const struct file_operations proc_vmstat_file_operations = { /* registered as the "vmstat" proc entry by setup_vmstat() */
+ .open = vmstat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work); /* per-cpu work item that runs vmstat_update() */
+int sysctl_stat_interval __read_mostly = HZ; /* delay, in jiffies, used when vmstat_update() re-queues itself */
+
+static void vmstat_update(struct work_struct *w)
+{
+ refresh_cpu_vm_stats(smp_processor_id()); /* fold this cpu's differentials into the zone and global counters */
+ schedule_delayed_work(&__get_cpu_var(vmstat_work),
+ sysctl_stat_interval); /* re-arm: run again after sysctl_stat_interval jiffies */
+}
+
+/*
+ * Initialise @cpu's deferrable vmstat work item and queue it on that cpu.
+ * The first run is delayed by HZ + cpu jiffies, so each cpu gets a
+ * different initial delay.
+ */
+static void __cpuinit start_cpu_timer(int cpu)
+{
+	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+
+	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+	schedule_delayed_work_on(cpu, work, HZ + cpu);
+}
+
+/*
+ * Use the cpu notifier to ensure that the thresholds are recalculated
+ * when necessary.
+ *
+ * The per-cpu update work is started when a cpu comes online or when
+ * taking it down failed, and cancelled before a cpu goes down.  The zone
+ * thresholds are refreshed once a cpu is dead.  Always returns NOTIFY_OK.
+ */
+static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		start_cpu_timer(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+		per_cpu(vmstat_work, cpu).work.func = NULL;
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata vmstat_notifier =
+ { &vmstat_cpuup_callback, NULL, 0 }; /* positional initialiser; registered by setup_vmstat() via register_cpu_notifier() */
+#endif
+
+static int __init setup_vmstat(void)
+{ /* init hook: set thresholds, start the per-cpu update work and create the proc files; always returns 0 */
+#ifdef CONFIG_SMP
+ int cpu;
+
+ refresh_zone_stat_thresholds();
+ register_cpu_notifier(&vmstat_notifier);
+
+ for_each_online_cpu(cpu) /* cpus that are already online get their work started here; later ones via the notifier */
+ start_cpu_timer(cpu);
+#endif
+#ifdef CONFIG_PROC_FS
+ proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+ proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
+ proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+ proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
+#endif
+ return 0;
+}
+module_init(setup_vmstat)
OpenPOWER on IntegriCloud