diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 224 | ||||
-rw-r--r-- | mm/Makefile | 36 | ||||
-rw-r--r-- | mm/allocpercpu.c | 143 | ||||
-rw-r--r-- | mm/backing-dev.c | 306 | ||||
-rw-r--r-- | mm/bootmem.c | 727 | ||||
-rw-r--r-- | mm/bounce.c | 293 | ||||
-rw-r--r-- | mm/dmapool.c | 504 | ||||
-rw-r--r-- | mm/fadvise.c | 151 | ||||
-rw-r--r-- | mm/filemap.c | 2478 | ||||
-rw-r--r-- | mm/filemap_xip.c | 474 | ||||
-rw-r--r-- | mm/fremap.c | 260 | ||||
-rw-r--r-- | mm/highmem.c | 375 | ||||
-rw-r--r-- | mm/hugetlb.c | 2293 | ||||
-rw-r--r-- | mm/internal.h | 283 | ||||
-rw-r--r-- | mm/maccess.c | 55 | ||||
-rw-r--r-- | mm/madvise.c | 365 | ||||
-rw-r--r-- | mm/memcontrol.c | 1174 | ||||
-rw-r--r-- | mm/memory.c | 3051 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 867 | ||||
-rw-r--r-- | mm/mempolicy.c | 2338 | ||||
-rw-r--r-- | mm/mempool.c | 340 | ||||
-rw-r--r-- | mm/migrate.c | 1151 | ||||
-rw-r--r-- | mm/mincore.c | 229 | ||||
-rw-r--r-- | mm/mlock.c | 629 | ||||
-rw-r--r-- | mm/mm_init.c | 152 | ||||
-rw-r--r-- | mm/mmap.c | 2482 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 277 | ||||
-rw-r--r-- | mm/mmzone.c | 74 | ||||
-rw-r--r-- | mm/mprotect.c | 319 | ||||
-rw-r--r-- | mm/mremap.c | 433 | ||||
-rw-r--r-- | mm/msync.c | 103 | ||||
-rw-r--r-- | mm/nommu.c | 1523 | ||||
-rw-r--r-- | mm/oom_kill.c | 593 | ||||
-rw-r--r-- | mm/page-writeback.c | 1389 | ||||
-rw-r--r-- | mm/page_alloc.c | 4779 | ||||
-rw-r--r-- | mm/page_cgroup.c | 275 | ||||
-rw-r--r-- | mm/page_io.c | 141 | ||||
-rw-r--r-- | mm/page_isolation.c | 142 | ||||
-rw-r--r-- | mm/pagewalk.c | 137 | ||||
-rw-r--r-- | mm/pdflush.c | 241 | ||||
-rw-r--r-- | mm/prio_tree.c | 207 | ||||
-rw-r--r-- | mm/quicklist.c | 103 | ||||
-rw-r--r-- | mm/readahead.c | 483 | ||||
-rw-r--r-- | mm/rmap.c | 1236 | ||||
-rw-r--r-- | mm/shmem.c | 2608 | ||||
-rw-r--r-- | mm/shmem_acl.c | 197 | ||||
-rw-r--r-- | mm/slab.c | 4522 | ||||
-rw-r--r-- | mm/slob.c | 647 | ||||
-rw-r--r-- | mm/slub.c | 4515 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 159 | ||||
-rw-r--r-- | mm/sparse.c | 636 | ||||
-rw-r--r-- | mm/swap.c | 606 | ||||
-rw-r--r-- | mm/swap_state.c | 372 | ||||
-rw-r--r-- | mm/swapfile.c | 1870 | ||||
-rw-r--r-- | mm/thrash.c | 79 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 134 | ||||
-rw-r--r-- | mm/truncate.c | 473 | ||||
-rw-r--r-- | mm/util.c | 188 | ||||
-rw-r--r-- | mm/vmalloc.c | 1812 | ||||
-rw-r--r-- | mm/vmscan.c | 2592 | ||||
-rw-r--r-- | mm/vmstat.c | 969 |
61 files changed, 56214 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig new file mode 100644 index 0000000..5b5790f --- /dev/null +++ b/mm/Kconfig @@ -0,0 +1,224 @@ +config SELECT_MEMORY_MODEL + def_bool y + depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL + +choice + prompt "Memory model" + depends on SELECT_MEMORY_MODEL + default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT + default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT + default FLATMEM_MANUAL + +config FLATMEM_MANUAL + bool "Flat Memory" + depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here: FLATMEM. This is normal + and a correct option. + + Some users of more advanced features like NUMA and + memory hotplug may have different options here. + DISCONTIGMEM is an more mature, better tested system, + but is incompatible with memory hotplug and may suffer + decreased performance over SPARSEMEM. If unsure between + "Sparse Memory" and "Discontiguous Memory", choose + "Discontiguous Memory". + + If unsure, choose this option (Flat Memory) over any other. + +config DISCONTIGMEM_MANUAL + bool "Discontiguous Memory" + depends on ARCH_DISCONTIGMEM_ENABLE + help + This option provides enhanced support for discontiguous + memory systems, over FLATMEM. These systems have holes + in their physical address spaces, and this option provides + more efficient handling of these holes. However, the vast + majority of hardware has quite flat address spaces, and + can have degraded performance from the extra overhead that + this option imposes. + + Many NUMA configurations will have this as the only option. + + If unsure, choose "Flat Memory" over this option. + +config SPARSEMEM_MANUAL + bool "Sparse Memory" + depends on ARCH_SPARSEMEM_ENABLE + help + This will be the only option for some systems, including + memory hotplug systems. This is normal. + + For many other systems, this will be an alternative to + "Discontiguous Memory". This option provides some potential + performance benefits, along with decreased code complexity, + but it is newer, and more experimental. + + If unsure, choose "Discontiguous Memory" or "Flat Memory" + over this option. + +endchoice + +config DISCONTIGMEM + def_bool y + depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL + +config SPARSEMEM + def_bool y + depends on SPARSEMEM_MANUAL + +config FLATMEM + def_bool y + depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL + +config FLAT_NODE_MEM_MAP + def_bool y + depends on !SPARSEMEM + +# +# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's +# to represent different areas of memory. This variable allows +# those dependencies to exist individually. +# +config NEED_MULTIPLE_NODES + def_bool y + depends on DISCONTIGMEM || NUMA + +config HAVE_MEMORY_PRESENT + def_bool y + depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this cannot +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + bool + +# +# Architecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. +# +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC + +config SPARSEMEM_VMEMMAP_ENABLE + bool + +config SPARSEMEM_VMEMMAP + bool "Sparse Memory virtual memmap" + depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE + default y + help + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. + +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM || X86_64_ACPI_NUMA + depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG + depends on (IA64 || X86 || PPC64 || SUPERH || S390) + +comment "Memory hotplug is currently incompatible with Software Suspend" + depends on SPARSEMEM && HOTPLUG && HIBERNATION + +config MEMORY_HOTPLUG_SPARSE + def_bool y + depends on SPARSEMEM && MEMORY_HOTPLUG + +config MEMORY_HOTREMOVE + bool "Allow for memory hot remove" + depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE + depends on MIGRATION + +# +# If we have space for more page flags then we can enable additional +# optimizations and functionality. +# +# Regular Sparsemem takes page flag bits for the sectionid if it does not +# use a virtual memmap. Disable extended page flags for 32 bit platforms +# that require the use of a sectionid in the page flags. +# +config PAGEFLAGS_EXTENDED + def_bool y + depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. +# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# +config SPLIT_PTLOCK_CPUS + int + default "4096" if ARM && !CPU_CACHE_VIPT + default "4096" if PARISC && !PA20 + default "4" + +# +# support for page migration +# +config MIGRATION + bool "Page migration" + def_bool y + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. + +config RESOURCES_64BIT + bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) + default 64BIT + help + This option allows memory and IO resources to be 64 bit. + +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + +config ZONE_DMA_FLAG + int + default "0" if !ZONE_DMA + default "1" + +config BOUNCE + def_bool y + depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + +config NR_QUICK + int + depends on QUICKLIST + default "2" if SUPERH || AVR32 + default "1" + +config VIRT_TO_BUS + def_bool y + depends on !ARCH_NO_VIRT_TO_BUS + +config UNEVICTABLE_LRU + bool "Add LRU list to track non-evictable pages" + default y + depends on MMU + help + Keeps unevictable pages off of the active and inactive pageout + lists, so kswapd will not waste CPU time or have its balancing + algorithms thrown off by scanning these pages. Selecting this + will use one page flag and increase the code size a little, + say Y unless you know what you are doing. + +config MMU_NOTIFIER + bool diff --git a/mm/Makefile b/mm/Makefile new file mode 100644 index 0000000..c06b45a --- /dev/null +++ b/mm/Makefile @@ -0,0 +1,36 @@ +# +# Makefile for the linux memory manager. +# + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + vmalloc.o + +obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ + maccess.o page_alloc.o page-writeback.o pdflush.o \ + readahead.o swap.o truncate.o vmscan.o \ + prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + page_isolation.o mm_init.o $(mmu-y) + +obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o +obj-$(CONFIG_BOUNCE) += bounce.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_HAS_DMA) += dmapool.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_SPARSEMEM) += sparse.o +obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o +obj-$(CONFIG_SHMEM) += shmem.o +obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o +obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_SMP) += allocpercpu.o +obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c new file mode 100644 index 0000000..4297bc4 --- /dev/null +++ b/mm/allocpercpu.c @@ -0,0 +1,143 @@ +/* + * linux/mm/allocpercpu.c + * + * Separated from slab.c August 11, 2006 Christoph Lameter + */ +#include <linux/mm.h> +#include <linux/module.h> + +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif + +/** + * percpu_depopulate - depopulate per-cpu data for given cpu + * @__pdata: per-cpu data to depopulate + * @cpu: depopulate per-cpu data for this cpu + * + * Depopulating per-cpu data for a cpu going offline would be a typical + * use case. You need to register a cpu hotplug handler for that purpose. + */ +static void percpu_depopulate(void *__pdata, int cpu) +{ + struct percpu_data *pdata = __percpu_disguise(__pdata); + + kfree(pdata->ptrs[cpu]); + pdata->ptrs[cpu] = NULL; +} + +/** + * percpu_depopulate_mask - depopulate per-cpu data for some cpu's + * @__pdata: per-cpu data to depopulate + * @mask: depopulate per-cpu data for cpu's selected through mask bits + */ +static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +{ + int cpu; + for_each_cpu_mask_nr(cpu, *mask) + percpu_depopulate(__pdata, cpu); +} + +#define percpu_depopulate_mask(__pdata, mask) \ + __percpu_depopulate_mask((__pdata), &(mask)) + +/** + * percpu_populate - populate per-cpu data for given cpu + * @__pdata: per-cpu data to populate further + * @size: size of per-cpu object + * @gfp: may sleep or not etc. + * @cpu: populate per-data for this cpu + * + * Populating per-cpu data for a cpu coming online would be a typical + * use case. You need to register a cpu hotplug handler for that purpose. + * Per-cpu object is populated with zeroed buffer. + */ +static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) +{ + struct percpu_data *pdata = __percpu_disguise(__pdata); + int node = cpu_to_node(cpu); + + /* + * We should make sure each CPU gets private memory. + */ + size = roundup(size, cache_line_size()); + + BUG_ON(pdata->ptrs[cpu]); + if (node_online(node)) + pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); + else + pdata->ptrs[cpu] = kzalloc(size, gfp); + return pdata->ptrs[cpu]; +} + +/** + * percpu_populate_mask - populate per-cpu data for more cpu's + * @__pdata: per-cpu data to populate further + * @size: size of per-cpu object + * @gfp: may sleep or not etc. + * @mask: populate per-cpu data for cpu's selected through mask bits + * + * Per-cpu objects are populated with zeroed buffers. + */ +static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask) +{ + cpumask_t populated; + int cpu; + + cpus_clear(populated); + for_each_cpu_mask_nr(cpu, *mask) + if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { + __percpu_depopulate_mask(__pdata, &populated); + return -ENOMEM; + } else + cpu_set(cpu, populated); + return 0; +} + +#define percpu_populate_mask(__pdata, size, gfp, mask) \ + __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) + +/** + * percpu_alloc_mask - initial setup of per-cpu data + * @size: size of per-cpu object + * @gfp: may sleep or not etc. + * @mask: populate per-data for cpu's selected through mask bits + * + * Populating per-cpu data for all online cpu's would be a typical use case, + * which is simplified by the percpu_alloc() wrapper. + * Per-cpu objects are populated with zeroed buffers. + */ +void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +{ + /* + * We allocate whole cache lines to avoid false sharing + */ + size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); + void *pdata = kzalloc(sz, gfp); + void *__pdata = __percpu_disguise(pdata); + + if (unlikely(!pdata)) + return NULL; + if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + return __pdata; + kfree(pdata); + return NULL; +} +EXPORT_SYMBOL_GPL(__percpu_alloc_mask); + +/** + * percpu_free - final cleanup of per-cpu data + * @__pdata: object to clean up + * + * We simply clean up any per-cpu object left. No need for the client to + * track and specify through a bis mask which per-cpu objects are to free. + */ +void percpu_free(void *__pdata) +{ + if (unlikely(!__pdata)) + return; + __percpu_depopulate_mask(__pdata, &cpu_possible_map); + kfree(__percpu_disguise(__pdata)); +} +EXPORT_SYMBOL_GPL(percpu_free); diff --git a/mm/backing-dev.c b/mm/backing-dev.c new file mode 100644 index 0000000..801c08b --- /dev/null +++ b/mm/backing-dev.c @@ -0,0 +1,306 @@ + +#include <linux/wait.h> +#include <linux/backing-dev.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/writeback.h> +#include <linux/device.h> + + +static struct class *bdi_class; + +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +static struct dentry *bdi_debug_root; + +static void bdi_debug_init(void) +{ + bdi_debug_root = debugfs_create_dir("bdi", NULL); +} + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + long background_thresh; + long dirty_thresh; + long bdi_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + +#define K(x) ((x) << (PAGE_SHIFT - 10)) + seq_printf(m, + "BdiWriteback: %8lu kB\n" + "BdiReclaimable: %8lu kB\n" + "BdiDirtyThresh: %8lu kB\n" + "DirtyThresh: %8lu kB\n" + "BackgroundThresh: %8lu kB\n", + (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), + (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh)); +#undef K + + return 0; +} + +static int bdi_debug_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, bdi_debug_stats_show, inode->i_private); +} + +static const struct file_operations bdi_debug_stats_fops = { + .open = bdi_debug_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) +{ + bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); + bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, + bdi, &bdi_debug_stats_fops); +} + +static void bdi_debug_unregister(struct backing_dev_info *bdi) +{ + debugfs_remove(bdi->debug_stats); + debugfs_remove(bdi->debug_dir); +} +#else +static inline void bdi_debug_init(void) +{ +} +static inline void bdi_debug_register(struct backing_dev_info *bdi, + const char *name) +{ +} +static inline void bdi_debug_unregister(struct backing_dev_info *bdi) +{ +} +#endif + +static ssize_t read_ahead_kb_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned long read_ahead_kb; + ssize_t ret = -EINVAL; + + read_ahead_kb = simple_strtoul(buf, &end, 10); + if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { + bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); + ret = count; + } + return ret; +} + +#define K(pages) ((pages) << (PAGE_SHIFT - 10)) + +#define BDI_SHOW(name, expr) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *page) \ +{ \ + struct backing_dev_info *bdi = dev_get_drvdata(dev); \ + \ + return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ +} + +BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) + +static ssize_t min_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned int ratio; + ssize_t ret = -EINVAL; + + ratio = simple_strtoul(buf, &end, 10); + if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; + } + return ret; +} +BDI_SHOW(min_ratio, bdi->min_ratio) + +static ssize_t max_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned int ratio; + ssize_t ret = -EINVAL; + + ratio = simple_strtoul(buf, &end, 10); + if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; + } + return ret; +} +BDI_SHOW(max_ratio, bdi->max_ratio) + +#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) + +static struct device_attribute bdi_dev_attrs[] = { + __ATTR_RW(read_ahead_kb), + __ATTR_RW(min_ratio), + __ATTR_RW(max_ratio), + __ATTR_NULL, +}; + +static __init int bdi_class_init(void) +{ + bdi_class = class_create(THIS_MODULE, "bdi"); + bdi_class->dev_attrs = bdi_dev_attrs; + bdi_debug_init(); + return 0; +} + +postcore_initcall(bdi_class_init); + +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) +{ + va_list args; + int ret = 0; + struct device *dev; + + if (bdi->dev) /* The driver needs to use separate queues per device */ + goto exit; + + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto exit; + } + + bdi->dev = dev; + bdi_debug_register(bdi, dev_name(dev)); + +exit: + return ret; +} +EXPORT_SYMBOL(bdi_register); + +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) +{ + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); + +void bdi_unregister(struct backing_dev_info *bdi) +{ + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } +} +EXPORT_SYMBOL(bdi_unregister); + +int bdi_init(struct backing_dev_info *bdi) +{ + int i; + int err; + + bdi->dev = NULL; + + bdi->min_ratio = 0; + bdi->max_ratio = 100; + bdi->max_prop_frac = PROP_FRAC_BASE; + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { + err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); + if (err) + goto err; + } + + bdi->dirty_exceeded = 0; + err = prop_local_init_percpu(&bdi->completions); + + if (err) { +err: + while (i--) + percpu_counter_destroy(&bdi->bdi_stat[i]); + } + + return err; +} +EXPORT_SYMBOL(bdi_init); + +void bdi_destroy(struct backing_dev_info *bdi) +{ + int i; + + bdi_unregister(bdi); + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) + percpu_counter_destroy(&bdi->bdi_stat[i]); + + prop_local_destroy_percpu(&bdi->completions); +} +EXPORT_SYMBOL(bdi_destroy); + +static wait_queue_head_t congestion_wqh[2] = { + __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), + __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) + }; + + +void clear_bdi_congested(struct backing_dev_info *bdi, int rw) +{ + enum bdi_state bit; + wait_queue_head_t *wqh = &congestion_wqh[rw]; + + bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + clear_bit(bit, &bdi->state); + smp_mb__after_clear_bit(); + if (waitqueue_active(wqh)) + wake_up(wqh); +} +EXPORT_SYMBOL(clear_bdi_congested); + +void set_bdi_congested(struct backing_dev_info *bdi, int rw) +{ + enum bdi_state bit; + + bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + set_bit(bit, &bdi->state); +} +EXPORT_SYMBOL(set_bdi_congested); + +/** + * congestion_wait - wait for a backing_dev to become uncongested + * @rw: READ or WRITE + * @timeout: timeout in jiffies + * + * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit + * write congestion. If no backing_devs are congested then just wait for the + * next write to be completed. + */ +long congestion_wait(int rw, long timeout) +{ + long ret; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[rw]; + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + return ret; +} +EXPORT_SYMBOL(congestion_wait); + diff --git a/mm/bootmem.c b/mm/bootmem.c new file mode 100644 index 0000000..ac5a891 --- /dev/null +++ b/mm/bootmem.c @@ -0,0 +1,727 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/bootmem.h> +#include <linux/module.h> + +#include <asm/bug.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "internal.h" + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +#ifdef CONFIG_CRASH_DUMP +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; +#endif + +bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; + +static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); + +static int bootmem_debug; + +static int __init bootmem_debug_setup(char *buf) +{ + bootmem_debug = 1; + return 0; +} +early_param("bootmem_debug", bootmem_debug_setup); + +#define bdebug(fmt, args...) ({ \ + if (unlikely(bootmem_debug)) \ + printk(KERN_INFO \ + "bootmem::%s " fmt, \ + __func__, ## args); \ +}) + +static unsigned long __init bootmap_bytes(unsigned long pages) +{ + unsigned long bytes = (pages + 7) / 8; + + return ALIGN(bytes, sizeof(long)); +} + +/** + * bootmem_bootmap_pages - calculate bitmap size in pages + * @pages: number of pages the bitmap has to represent + */ +unsigned long __init bootmem_bootmap_pages(unsigned long pages) +{ + unsigned long bytes = bootmap_bytes(pages); + + return PAGE_ALIGN(bytes) >> PAGE_SHIFT; +} + +/* + * link bdata in order + */ +static void __init link_bootmem(bootmem_data_t *bdata) +{ + struct list_head *iter; + + list_for_each(iter, &bdata_list) { + bootmem_data_t *ent; + + ent = list_entry(iter, bootmem_data_t, list); + if (bdata->node_min_pfn < ent->node_min_pfn) + break; + } + list_add_tail(&bdata->list, iter); +} + +/* + * Called once to set up the allocator itself. + */ +static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, + unsigned long mapstart, unsigned long start, unsigned long end) +{ + unsigned long mapsize; + + mminit_validate_memmodel_limits(&start, &end); + bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); + bdata->node_min_pfn = start; + bdata->node_low_pfn = end; + link_bootmem(bdata); + + /* + * Initially all pages are reserved - setup_arch() has to + * register free RAM areas explicitly. + */ + mapsize = bootmap_bytes(end - start); + memset(bdata->node_bootmem_map, 0xff, mapsize); + + bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", + bdata - bootmem_node_data, start, mapstart, end, mapsize); + + return mapsize; +} + +/** + * init_bootmem_node - register a node as boot memory + * @pgdat: node to register + * @freepfn: pfn where the bitmap for this node is to be placed + * @startpfn: first pfn on the node + * @endpfn: first pfn after the node + * + * Returns the number of bytes needed to hold the bitmap for this node. + */ +unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, + unsigned long startpfn, unsigned long endpfn) +{ + return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); +} + +/** + * init_bootmem - register boot memory + * @start: pfn where the bitmap is to be placed + * @pages: number of available physical pages + * + * Returns the number of bytes needed to hold the bitmap. + */ +unsigned long __init init_bootmem(unsigned long start, unsigned long pages) +{ + max_low_pfn = pages; + min_low_pfn = start; + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); +} + +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) +{ + int aligned; + struct page *page; + unsigned long start, end, pages, count = 0; + + if (!bdata->node_bootmem_map) + return 0; + + start = bdata->node_min_pfn; + end = bdata->node_low_pfn; + + /* + * If the start is aligned to the machines wordsize, we might + * be able to free pages in bulks of that order. + */ + aligned = !(start & (BITS_PER_LONG - 1)); + + bdebug("nid=%td start=%lx end=%lx aligned=%d\n", + bdata - bootmem_node_data, start, end, aligned); + + while (start < end) { + unsigned long *map, idx, vec; + + map = bdata->node_bootmem_map; + idx = start - bdata->node_min_pfn; + vec = ~map[idx / BITS_PER_LONG]; + + if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + int order = ilog2(BITS_PER_LONG); + + __free_pages_bootmem(pfn_to_page(start), order); + count += BITS_PER_LONG; + } else { + unsigned long off = 0; + + while (vec && off < BITS_PER_LONG) { + if (vec & 1) { + page = pfn_to_page(start + off); + __free_pages_bootmem(page, 0); + count++; + } + vec >>= 1; + off++; + } + } + start += BITS_PER_LONG; + } + + page = virt_to_page(bdata->node_bootmem_map); + pages = bdata->node_low_pfn - bdata->node_min_pfn; + pages = bootmem_bootmap_pages(pages); + count += pages; + while (pages--) + __free_pages_bootmem(page++, 0); + + bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); + + return count; +} + +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + return free_all_bootmem_core(pgdat->bdata); +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + return free_all_bootmem_core(NODE_DATA(0)->bdata); +} + +static void __init __free(bootmem_data_t *bdata, + unsigned long sidx, unsigned long eidx) +{ + unsigned long idx; + + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn); + + if (bdata->hint_idx > sidx) + bdata->hint_idx = sidx; + + for (idx = sidx; idx < eidx; idx++) + if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) + BUG(); +} + +static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, + unsigned long eidx, int flags) +{ + unsigned long idx; + int exclusive = flags & BOOTMEM_EXCLUSIVE; + + bdebug("nid=%td start=%lx end=%lx flags=%x\n", + bdata - bootmem_node_data, + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn, + flags); + + for (idx = sidx; idx < eidx; idx++) + if (test_and_set_bit(idx, bdata->node_bootmem_map)) { + if (exclusive) { + __free(bdata, sidx, idx); + return -EBUSY; + } + bdebug("silent double reserve of PFN %lx\n", + idx + bdata->node_min_pfn); + } + return 0; +} + +static int __init mark_bootmem_node(bootmem_data_t *bdata, + unsigned long start, unsigned long end, + int reserve, int flags) +{ + unsigned long sidx, eidx; + + bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", + bdata - bootmem_node_data, start, end, reserve, flags); + + BUG_ON(start < bdata->node_min_pfn); + BUG_ON(end > bdata->node_low_pfn); + + sidx = start - bdata->node_min_pfn; + eidx = end - bdata->node_min_pfn; + + if (reserve) + return __reserve(bdata, sidx, eidx, flags); + else + __free(bdata, sidx, eidx); + return 0; +} + +static int __init mark_bootmem(unsigned long start, unsigned long end, + int reserve, int flags) +{ + unsigned long pos; + bootmem_data_t *bdata; + + pos = start; + list_for_each_entry(bdata, &bdata_list, list) { + int err; + unsigned long max; + + if (pos < bdata->node_min_pfn || + pos >= bdata->node_low_pfn) { + BUG_ON(pos != start); + continue; + } + + max = min(bdata->node_low_pfn, end); + + err = mark_bootmem_node(bdata, pos, max, reserve, flags); + if (reserve && err) { + mark_bootmem(start, pos, 0, 0); + return err; + } + + if (max == end) + return 0; + pos = bdata->node_low_pfn; + } + BUG(); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + unsigned long start, end; + + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); + + mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + unsigned long start, end; + + start = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + mark_bootmem(start, end, 0, 0); +} + +/** + * reserve_bootmem_node - mark a page range as reserved + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * The range must reside completely on the specified node. + */ +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size, int flags) +{ + unsigned long start, end; + + start = PFN_DOWN(physaddr); + end = PFN_UP(physaddr + size); + + return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +} + +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +/** + * reserve_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * The range must be contiguous but may span node boundaries. + */ +int __init reserve_bootmem(unsigned long addr, unsigned long size, + int flags) +{ + unsigned long start, end; + + start = PFN_DOWN(addr); + end = PFN_UP(addr + size); + + return mark_bootmem(start, end, 1, flags); +} +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ + +static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, + unsigned long step) +{ + unsigned long base = bdata->node_min_pfn; + + /* + * Align the index with respect to the node start so that the + * combination of both satisfies the requested alignment. + */ + + return ALIGN(base + idx, step) - base; +} + +static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, + unsigned long align) +{ + unsigned long base = PFN_PHYS(bdata->node_min_pfn); + + /* Same as align_idx for byte offsets */ + + return ALIGN(base + off, align) - base; +} + +static void * __init alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + unsigned long fallback = 0; + unsigned long min, max, start, sidx, midx, step; + + BUG_ON(!size); + BUG_ON(align & (align - 1)); + BUG_ON(limit && goal + size > limit); + + if (!bdata->node_bootmem_map) + return NULL; + + bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", + bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, + align, goal, limit); + + min = bdata->node_min_pfn; + max = bdata->node_low_pfn; + + goal >>= PAGE_SHIFT; + limit >>= PAGE_SHIFT; + + if (limit && max > limit) + max = limit; + if (max <= min) + return NULL; + + step = max(align >> PAGE_SHIFT, 1UL); + + if (goal && min < goal && goal < max) + start = ALIGN(goal, step); + else + start = ALIGN(min, step); + + sidx = start - bdata->node_min_pfn; + midx = max - bdata->node_min_pfn; + + if (bdata->hint_idx > sidx) { + /* + * Handle the valid case of sidx being zero and still + * catch the fallback below. + */ + fallback = sidx + 1; + sidx = align_idx(bdata, bdata->hint_idx, step); + } + + while (1) { + int merge; + void *region; + unsigned long eidx, i, start_off, end_off; +find_block: + sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); + sidx = align_idx(bdata, sidx, step); + eidx = sidx + PFN_UP(size); + + if (sidx >= midx || eidx > midx) + break; + + for (i = sidx; i < eidx; i++) + if (test_bit(i, bdata->node_bootmem_map)) { + sidx = align_idx(bdata, i, step); + if (sidx == i) + sidx += step; + goto find_block; + } + + if (bdata->last_end_off & (PAGE_SIZE - 1) && + PFN_DOWN(bdata->last_end_off) + 1 == sidx) + start_off = align_off(bdata, bdata->last_end_off, align); + else + start_off = PFN_PHYS(sidx); + + merge = PFN_DOWN(start_off) < sidx; + end_off = start_off + size; + + bdata->last_end_off = end_off; + bdata->hint_idx = PFN_UP(end_off); + + /* + * Reserve the area now: + */ + if (__reserve(bdata, PFN_DOWN(start_off) + merge, + PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) + BUG(); + + region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + + start_off); + memset(region, 0, size); + return region; + } + + if (fallback) { + sidx = align_idx(bdata, fallback - 1, step); + fallback = 0; + goto find_block; + } + + return NULL; +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + bootmem_data_t *bdata; + +restart: + list_for_each_entry(bdata, &bdata_list, list) { + void *region; + + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) + continue; + if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) + break; + + region = alloc_bootmem_core(bdata, size, align, goal, limit); + if (region) + return region; + } + + if (goal) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, 0); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, 0); +} + +static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *ptr; + + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); + if (ptr) + return ptr; + + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +} + +#ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + bootmem_data_t *bdata; + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; + + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +} +#endif + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return ___alloc_bootmem_node(pgdat->bdata, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/mm/bounce.c b/mm/bounce.c new file mode 100644 index 0000000..06722c4 --- /dev/null +++ b/mm/bounce.c @@ -0,0 +1,293 @@ +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/highmem.h> +#include <linux/blktrace_api.h> +#include <asm/tlbflush.h> + +#define POOL_SIZE 64 +#define ISA_POOL_SIZE 16 + +static mempool_t *page_pool, *isa_page_pool; + +#ifdef CONFIG_HIGHMEM +static __init int init_emergency_pool(void) +{ + struct sysinfo i; + si_meminfo(&i); + si_swapinfo(&i); + + if (!i.totalhigh) + return 0; + + page_pool = mempool_create_page_pool(POOL_SIZE, 0); + BUG_ON(!page_pool); + printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + + return 0; +} + +__initcall(init_emergency_pool); + +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned long flags; + unsigned char *vto; + + local_irq_save(flags); + vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto, KM_BOUNCE_READ); + local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. + */ +int init_emergency_isa_pool(void) +{ + if (isa_page_pool) + return 0; + + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(!isa_page_pool); + + printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec *tovec, *fromvec; + int i; + + __bio_for_each_segment(tovec, to, i, 0) { + fromvec = from->bi_io_vec + i; + + /* + * not bounced + */ + if (tovec->bv_page == fromvec->bv_page) + continue; + + /* + * fromvec->bv_offset and fromvec->bv_len might have been + * modified by the block layer, so use the original copy, + * bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + + flush_dcache_page(tovec->bv_page); + bounce_copy_vec(tovec, vfrom); + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, *org_vec; + int i; + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + + /* + * free up bounce indirect pages used + */ + __bio_for_each_segment(bvec, bio, i, 0) { + org_vec = bio_orig->bi_io_vec + i; + if (bvec->bv_page == org_vec->bv_page) + continue; + + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + + bio_endio(bio_orig, err); + bio_put(bio); +} + +static void bounce_end_io_write(struct bio *bio, int err) +{ + bounce_end_io(bio, page_pool, err); +} + +static void bounce_end_io_write_isa(struct bio *bio, int err) +{ + + bounce_end_io(bio, isa_page_pool, err); +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool, err); +} + +static void bounce_end_io_read(struct bio *bio, int err) +{ + __bounce_end_io_read(bio, page_pool, err); +} + +static void bounce_end_io_read_isa(struct bio *bio, int err) +{ + __bounce_end_io_read(bio, isa_page_pool, err); +} + +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, + mempool_t *pool) +{ + struct page *page; + struct bio *bio = NULL; + int i, rw = bio_data_dir(*bio_orig); + struct bio_vec *to, *from; + + bio_for_each_segment(from, *bio_orig, i) { + page = from->bv_page; + + /* + * is destination page below bounce pfn? + */ + if (page_to_pfn(page) <= q->bounce_pfn) + continue; + + /* + * irk, bounce it + */ + if (!bio) + bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); + + to = bio->bi_io_vec + i; + + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + inc_zone_page_state(to->bv_page, NR_BOUNCE); + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(from->bv_page); + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap(from->bv_page) + from->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap(from->bv_page); + } + } + + /* + * no pages bounced + */ + if (!bio) + return; + + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + + /* + * at least one page was bounced, fill in possible non-highmem + * pages + */ + __bio_for_each_segment(from, *bio_orig, i, 0) { + to = bio_iovec_idx(bio, i); + if (!to->bv_page) { + to->bv_page = from->bv_page; + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + } + } + + bio->bi_bdev = (*bio_orig)->bi_bdev; + bio->bi_flags |= (1 << BIO_BOUNCED); + bio->bi_sector = (*bio_orig)->bi_sector; + bio->bi_rw = (*bio_orig)->bi_rw; + + bio->bi_vcnt = (*bio_orig)->bi_vcnt; + bio->bi_idx = (*bio_orig)->bi_idx; + bio->bi_size = (*bio_orig)->bi_size; + + if (pool == page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) +{ + mempool_t *pool; + + /* + * Data-less bio, nothing to bounce + */ + if (!bio_has_data(*bio_orig)) + return; + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (q->bounce_pfn >= blk_max_pfn) + return; + pool = page_pool; + } else { + BUG_ON(!isa_page_pool); + pool = isa_page_pool; + } + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool); +} + +EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/dmapool.c b/mm/dmapool.c new file mode 100644 index 0000000..b1f0885 --- /dev/null +++ b/mm/dmapool.c @@ -0,0 +1,504 @@ +/* + * DMA Pool allocator + * + * Copyright 2001 David Brownell + * Copyright 2007 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 as published by the + * Free Software Foundation. + * + * This allocator returns small blocks of a given size which are DMA-able by + * the given device. It uses the dma_alloc_coherent page allocator to get + * new pages, then splits them up into blocks of the required size. + * Many older drivers still have their own code to do this. + * + * The current design of this allocator is fairly simple. The pool is + * represented by the 'struct dma_pool' which keeps a doubly-linked list of + * allocated pages. Each page in the page_list is split into blocks of at + * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked + * list of free blocks within the page. Used blocks aren't tracked, but we + * keep a count of how many are currently allocated from each page. + */ + +#include <linux/device.h> +#include <linux/dma-mapping.h> +#include <linux/dmapool.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/poison.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/wait.h> + +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#define DMAPOOL_DEBUG 1 +#endif + +struct dma_pool { /* the pool */ + struct list_head page_list; + spinlock_t lock; + size_t size; + struct device *dev; + size_t allocation; + size_t boundary; + char name[32]; + wait_queue_head_t waitq; + struct list_head pools; +}; + +struct dma_page { /* cacheable header for 'allocation' bytes */ + struct list_head page_list; + void *vaddr; + dma_addr_t dma; + unsigned int in_use; + unsigned int offset; +}; + +#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) + +static DEFINE_MUTEX(pools_lock); + +static ssize_t +show_pools(struct device *dev, struct device_attribute *attr, char *buf) +{ + unsigned temp; + unsigned size; + char *next; + struct dma_page *page; + struct dma_pool *pool; + + next = buf; + size = PAGE_SIZE; + + temp = scnprintf(next, size, "poolinfo - 0.1\n"); + size -= temp; + next += temp; + + mutex_lock(&pools_lock); + list_for_each_entry(pool, &dev->dma_pools, pools) { + unsigned pages = 0; + unsigned blocks = 0; + + list_for_each_entry(page, &pool->page_list, page_list) { + pages++; + blocks += page->in_use; + } + + /* per-pool info, no real statistics yet */ + temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", + pool->name, blocks, + pages * (pool->allocation / pool->size), + pool->size, pages); + size -= temp; + next += temp; + } + mutex_unlock(&pools_lock); + + return PAGE_SIZE - size; +} + +static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL); + +/** + * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * @name: name of pool, for diagnostics + * @dev: device that will be doing the DMA + * @size: size of the blocks in this pool. + * @align: alignment requirement for blocks; must be a power of two + * @boundary: returned blocks won't cross this power of two boundary + * Context: !in_interrupt() + * + * Returns a dma allocation pool with the requested characteristics, or + * null if one can't be created. Given one of these pools, dma_pool_alloc() + * may be used to allocate memory. Such memory will all have "consistent" + * DMA mappings, accessible by the device and its driver without using + * cache flushing primitives. The actual size of blocks allocated may be + * larger than requested because of alignment. + * + * If @boundary is nonzero, objects returned from dma_pool_alloc() won't + * cross that size boundary. This is useful for devices which have + * addressing restrictions on individual DMA transfers, such as not crossing + * boundaries of 4KBytes. + */ +struct dma_pool *dma_pool_create(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary) +{ + struct dma_pool *retval; + size_t allocation; + + if (align == 0) { + align = 1; + } else if (align & (align - 1)) { + return NULL; + } + + if (size == 0) { + return NULL; + } else if (size < 4) { + size = 4; + } + + if ((size % align) != 0) + size = ALIGN(size, align); + + allocation = max_t(size_t, size, PAGE_SIZE); + + if (!boundary) { + boundary = allocation; + } else if ((boundary < size) || (boundary & (boundary - 1))) { + return NULL; + } + + retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + if (!retval) + return retval; + + strlcpy(retval->name, name, sizeof(retval->name)); + + retval->dev = dev; + + INIT_LIST_HEAD(&retval->page_list); + spin_lock_init(&retval->lock); + retval->size = size; + retval->boundary = boundary; + retval->allocation = allocation; + init_waitqueue_head(&retval->waitq); + + if (dev) { + int ret; + + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools)) + ret = device_create_file(dev, &dev_attr_pools); + else + ret = 0; + /* note: not currently insisting "name" be unique */ + if (!ret) + list_add(&retval->pools, &dev->dma_pools); + else { + kfree(retval); + retval = NULL; + } + mutex_unlock(&pools_lock); + } else + INIT_LIST_HEAD(&retval->pools); + + return retval; +} +EXPORT_SYMBOL(dma_pool_create); + +static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) +{ + unsigned int offset = 0; + unsigned int next_boundary = pool->boundary; + + do { + unsigned int next = offset + pool->size; + if (unlikely((next + pool->size) >= next_boundary)) { + next = next_boundary; + next_boundary += pool->boundary; + } + *(int *)(page->vaddr + offset) = next; + offset = next; + } while (offset < pool->allocation); +} + +static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) +{ + struct dma_page *page; + + page = kmalloc(sizeof(*page), mem_flags); + if (!page) + return NULL; + page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, + &page->dma, mem_flags); + if (page->vaddr) { +#ifdef DMAPOOL_DEBUG + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +#endif + pool_initialise_page(pool, page); + list_add(&page->page_list, &pool->page_list); + page->in_use = 0; + page->offset = 0; + } else { + kfree(page); + page = NULL; + } + return page; +} + +static inline int is_page_busy(struct dma_page *page) +{ + return page->in_use != 0; +} + +static void pool_free_page(struct dma_pool *pool, struct dma_page *page) +{ + dma_addr_t dma = page->dma; + +#ifdef DMAPOOL_DEBUG + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +#endif + dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); + list_del(&page->page_list); + kfree(page); +} + +/** + * dma_pool_destroy - destroys a pool of dma memory blocks. + * @pool: dma pool that will be destroyed + * Context: !in_interrupt() + * + * Caller guarantees that no more memory from the pool is in use, + * and that nothing will try to use the pool after this call. + */ +void dma_pool_destroy(struct dma_pool *pool) +{ + mutex_lock(&pools_lock); + list_del(&pool->pools); + if (pool->dev && list_empty(&pool->dev->dma_pools)) + device_remove_file(pool->dev, &dev_attr_pools); + mutex_unlock(&pools_lock); + + while (!list_empty(&pool->page_list)) { + struct dma_page *page; + page = list_entry(pool->page_list.next, + struct dma_page, page_list); + if (is_page_busy(page)) { + if (pool->dev) + dev_err(pool->dev, + "dma_pool_destroy %s, %p busy\n", + pool->name, page->vaddr); + else + printk(KERN_ERR + "dma_pool_destroy %s, %p busy\n", + pool->name, page->vaddr); + /* leak the still-in-use consistent memory */ + list_del(&page->page_list); + kfree(page); + } else + pool_free_page(pool, page); + } + + kfree(pool); +} +EXPORT_SYMBOL(dma_pool_destroy); + +/** + * dma_pool_alloc - get a block of consistent memory + * @pool: dma pool that will produce the block + * @mem_flags: GFP_* bitmask + * @handle: pointer to dma address of block + * + * This returns the kernel virtual address of a currently unused block, + * and reports its dma address through the handle. + * If such a memory block can't be allocated, %NULL is returned. + */ +void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, + dma_addr_t *handle) +{ + unsigned long flags; + struct dma_page *page; + size_t offset; + void *retval; + + spin_lock_irqsave(&pool->lock, flags); + restart: + list_for_each_entry(page, &pool->page_list, page_list) { + if (page->offset < pool->allocation) + goto ready; + } + page = pool_alloc_page(pool, GFP_ATOMIC); + if (!page) { + if (mem_flags & __GFP_WAIT) { + DECLARE_WAITQUEUE(wait, current); + + __set_current_state(TASK_INTERRUPTIBLE); + __add_wait_queue(&pool->waitq, &wait); + spin_unlock_irqrestore(&pool->lock, flags); + + schedule_timeout(POOL_TIMEOUT_JIFFIES); + + spin_lock_irqsave(&pool->lock, flags); + __remove_wait_queue(&pool->waitq, &wait); + goto restart; + } + retval = NULL; + goto done; + } + + ready: + page->in_use++; + offset = page->offset; + page->offset = *(int *)(page->vaddr + offset); + retval = offset + page->vaddr; + *handle = offset + page->dma; +#ifdef DMAPOOL_DEBUG + memset(retval, POOL_POISON_ALLOCATED, pool->size); +#endif + done: + spin_unlock_irqrestore(&pool->lock, flags); + return retval; +} +EXPORT_SYMBOL(dma_pool_alloc); + +static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) +{ + unsigned long flags; + struct dma_page *page; + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry(page, &pool->page_list, page_list) { + if (dma < page->dma) + continue; + if (dma < (page->dma + pool->allocation)) + goto done; + } + page = NULL; + done: + spin_unlock_irqrestore(&pool->lock, flags); + return page; +} + +/** + * dma_pool_free - put block back into dma pool + * @pool: the dma pool holding the block + * @vaddr: virtual address of block + * @dma: dma address of block + * + * Caller promises neither device nor driver will again touch this block + * unless it is first re-allocated. + */ +void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) +{ + struct dma_page *page; + unsigned long flags; + unsigned int offset; + + page = pool_find_page(pool, dma); + if (!page) { + if (pool->dev) + dev_err(pool->dev, + "dma_pool_free %s, %p/%lx (bad dma)\n", + pool->name, vaddr, (unsigned long)dma); + else + printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n", + pool->name, vaddr, (unsigned long)dma); + return; + } + + offset = vaddr - page->vaddr; +#ifdef DMAPOOL_DEBUG + if ((dma - page->dma) != offset) { + if (pool->dev) + dev_err(pool->dev, + "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pool->name, vaddr, (unsigned long long)dma); + else + printk(KERN_ERR + "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pool->name, vaddr, (unsigned long long)dma); + return; + } + { + unsigned int chain = page->offset; + while (chain < pool->allocation) { + if (chain != offset) { + chain = *(int *)(page->vaddr + chain); + continue; + } + if (pool->dev) + dev_err(pool->dev, "dma_pool_free %s, dma %Lx " + "already free\n", pool->name, + (unsigned long long)dma); + else + printk(KERN_ERR "dma_pool_free %s, dma %Lx " + "already free\n", pool->name, + (unsigned long long)dma); + return; + } + } + memset(vaddr, POOL_POISON_FREED, pool->size); +#endif + + spin_lock_irqsave(&pool->lock, flags); + page->in_use--; + *(int *)vaddr = page->offset; + page->offset = offset; + if (waitqueue_active(&pool->waitq)) + wake_up_locked(&pool->waitq); + /* + * Resist a temptation to do + * if (!is_page_busy(page)) pool_free_page(pool, page); + * Better have a few empty pages hang around. + */ + spin_unlock_irqrestore(&pool->lock, flags); +} +EXPORT_SYMBOL(dma_pool_free); + +/* + * Managed DMA pool + */ +static void dmam_pool_release(struct device *dev, void *res) +{ + struct dma_pool *pool = *(struct dma_pool **)res; + + dma_pool_destroy(pool); +} + +static int dmam_pool_match(struct device *dev, void *res, void *match_data) +{ + return *(struct dma_pool **)res == match_data; +} + +/** + * dmam_pool_create - Managed dma_pool_create() + * @name: name of pool, for diagnostics + * @dev: device that will be doing the DMA + * @size: size of the blocks in this pool. + * @align: alignment requirement for blocks; must be a power of two + * @allocation: returned blocks won't cross this boundary (or zero) + * + * Managed dma_pool_create(). DMA pool created with this function is + * automatically destroyed on driver detach. + */ +struct dma_pool *dmam_pool_create(const char *name, struct device *dev, + size_t size, size_t align, size_t allocation) +{ + struct dma_pool **ptr, *pool; + + ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + pool = *ptr = dma_pool_create(name, dev, size, align, allocation); + if (pool) + devres_add(dev, ptr); + else + devres_free(ptr); + + return pool; +} +EXPORT_SYMBOL(dmam_pool_create); + +/** + * dmam_pool_destroy - Managed dma_pool_destroy() + * @pool: dma pool that will be destroyed + * + * Managed dma_pool_destroy(). + */ +void dmam_pool_destroy(struct dma_pool *pool) +{ + struct device *dev = pool->dev; + + dma_pool_destroy(pool); + WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); +} +EXPORT_SYMBOL(dmam_pool_destroy); diff --git a/mm/fadvise.c b/mm/fadvise.c new file mode 100644 index 0000000..54a0f80 --- /dev/null +++ b/mm/fadvise.c @@ -0,0 +1,151 @@ +/* + * mm/fadvise.c + * + * Copyright (C) 2002, Linus Torvalds + * + * 11Jan2003 Andrew Morton + * Initial version. + */ + +#include <linux/kernel.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> +#include <linux/fadvise.h> +#include <linux/writeback.h> +#include <linux/syscalls.h> + +#include <asm/unistd.h> + +/* + * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could + * deactivate the pages and clear PG_Referenced. + */ +SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) +{ + struct file *file = fget(fd); + struct address_space *mapping; + struct backing_dev_info *bdi; + loff_t endbyte; /* inclusive */ + pgoff_t start_index; + pgoff_t end_index; + unsigned long nrpages; + int ret = 0; + + if (!file) + return -EBADF; + + if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { + ret = -ESPIPE; + goto out; + } + + mapping = file->f_mapping; + if (!mapping || len < 0) { + ret = -EINVAL; + goto out; + } + + if (mapping->a_ops->get_xip_mem) { + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_NOREUSE: + case POSIX_FADV_DONTNEED: + /* no bad return value, but ignore advice */ + break; + default: + ret = -EINVAL; + } + goto out; + } + + /* Careful about overflows. Len == 0 means "as much as possible" */ + endbyte = offset + len; + if (!len || endbyte < len) + endbyte = -1; + else + endbyte--; /* inclusive */ + + bdi = mapping->backing_dev_info; + + switch (advice) { + case POSIX_FADV_NORMAL: + file->f_ra.ra_pages = bdi->ra_pages; + break; + case POSIX_FADV_RANDOM: + file->f_ra.ra_pages = 0; + break; + case POSIX_FADV_SEQUENTIAL: + file->f_ra.ra_pages = bdi->ra_pages * 2; + break; + case POSIX_FADV_WILLNEED: + if (!mapping->a_ops->readpage) { + ret = -EINVAL; + break; + } + + /* First and last PARTIAL page! */ + start_index = offset >> PAGE_CACHE_SHIFT; + end_index = endbyte >> PAGE_CACHE_SHIFT; + + /* Careful about overflow on the "+1" */ + nrpages = end_index - start_index + 1; + if (!nrpages) + nrpages = ~0UL; + + ret = force_page_cache_readahead(mapping, file, + start_index, + max_sane_readahead(nrpages)); + if (ret > 0) + ret = 0; + break; + case POSIX_FADV_NOREUSE: + break; + case POSIX_FADV_DONTNEED: + if (!bdi_write_congested(mapping->backing_dev_info)) + filemap_flush(mapping); + + /* First and last FULL page! */ + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + end_index = (endbyte >> PAGE_CACHE_SHIFT); + + if (end_index >= start_index) + invalidate_mapping_pages(mapping, start_index, + end_index); + break; + default: + ret = -EINVAL; + } +out: + fput(file); + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) +{ + return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); +} +SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); +#endif + +#ifdef __ARCH_WANT_SYS_FADVISE64 + +SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) +{ + return sys_fadvise64_64(fd, offset, len, advice); +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) +{ + return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); +} +SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); +#endif + +#endif diff --git a/mm/filemap.c b/mm/filemap.c new file mode 100644 index 0000000..6f62ef9 --- /dev/null +++ b/mm/filemap.c @@ -0,0 +1,2478 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/aio.h> +#include <linux/capability.h> +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/hash.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> +#include <linux/blkdev.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/cpuset.h> +#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> /* for page_is_file_cache() */ +#include "internal.h" + +/* + * FIXME: remove all knowledge of the buffer layer from the core VM + */ +#include <linux/buffer_head.h> /* for generic_osync_inode */ + +#include <asm/mman.h> + + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> + */ + +/* + * Lock ordering: + * + * ->i_mmap_lock (vmtruncate) + * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->swap_lock (exclusive_swap_page, others) + * ->mapping->tree_lock + * + * ->i_mutex + * ->i_mmap_lock (truncate->unmap_mapping_range) + * + * ->mmap_sem + * ->i_mmap_lock + * ->page_table_lock or pte_lock (various, mainly in memory.c) + * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * + * ->mmap_sem + * ->lock_page (access_process_vm) + * + * ->i_mutex (generic_file_buffered_write) + * ->mmap_sem (fault_in_pages_readable->do_page_fault) + * + * ->i_mutex + * ->i_alloc_sem (various) + * + * ->inode_lock + * ->sb_lock (fs/fs-writeback.c) + * ->mapping->tree_lock (__sync_single_inode) + * + * ->i_mmap_lock + * ->anon_vma.lock (vma_adjust) + * + * ->anon_vma.lock + * ->page_table_lock or pte_lock (anon_vma_prepare and various) + * + * ->page_table_lock or pte_lock + * ->swap_lock (try_to_unmap_one) + * ->private_lock (try_to_unmap_one) + * ->tree_lock (try_to_unmap_one) + * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->zone.lru_lock (check_pte_range->isolate_lru_page) + * ->private_lock (page_remove_rmap->set_page_dirty) + * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->inode_lock (page_remove_rmap->set_page_dirty) + * ->inode_lock (zap_pte_range->set_page_dirty) + * ->private_lock (zap_pte_range->__set_page_dirty_buffers) + * + * ->task->proc_lock + * ->dcache_lock (proc_pid_lookup) + */ + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold the mapping's tree_lock. + */ +void __remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + BUG_ON(page_mapped(page)); + mem_cgroup_uncharge_cache_page(page); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * + * Fix it up by doing a final dirty accounting check after + * having removed the page entirely. + */ + if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } +} + +void remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + + spin_lock_irq(&mapping->tree_lock); + __remove_from_page_cache(page); + spin_unlock_irq(&mapping->tree_lock); +} + +static int sync_page(void *word) +{ + struct address_space *mapping; + struct page *page; + + page = container_of((unsigned long *)word, struct page, flags); + + /* + * page_mapping() is being called without PG_locked held. + * Some knowledge of the state and use of the page is used to + * reduce the requirements down to a memory barrier. + * The danger here is of a stale page_mapping() return value + * indicating a struct address_space different from the one it's + * associated with when it is associated with one. + * After smp_mb(), it's either the correct page_mapping() for + * the page, or an old page_mapping() and the page's own + * page_mapping() has gone NULL. + * The ->sync_page() address_space operation must tolerate + * page_mapping() going NULL. By an amazing coincidence, + * this comes about because none of the users of the page + * in the ->sync_page() methods make essential use of the + * page_mapping(), merely passing the page down to the backing + * device's unplug functions when it's non-NULL, which in turn + * ignore it for all cases but swap, where only page_private(page) is + * of interest. When page_mapping() does go NULL, the entire + * call stack gracefully ignores the page and returns. + * -- wli + */ + smp_mb(); + mapping = page_mapping(page); + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + mapping->a_ops->sync_page(page); + io_schedule(); + return 0; +} + +static int sync_page_killable(void *word) +{ + sync_page(word); + return fatal_signal_pending(current) ? -EINTR : 0; +} + +/** + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets <start, end> inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. + */ +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + if (!mapping_cap_writeback_dirty(mapping)) + return 0; + + ret = do_writepages(mapping, &wbc); + return ret; +} + +static inline int __filemap_fdatawrite(struct address_space *mapping, + int sync_mode) +{ + return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); +} + +int filemap_fdatawrite(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_ALL); +} +EXPORT_SYMBOL(filemap_fdatawrite); + +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); +} +EXPORT_SYMBOL(filemap_fdatawrite_range); + +/** + * filemap_flush - mostly a non-blocking flush + * @mapping: target address_space + * + * This is a mostly non-blocking flush. Not suitable for data-integrity + * purposes - I/O may not be started against all dirty pages. + */ +int filemap_flush(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_NONE); +} +EXPORT_SYMBOL(filemap_flush); + +/** + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +int wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} + +/** + * sync_page_range - write and wait on all pages in the passed range + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * + * Write and wait upon all the pages in the passed range. This is a "data + * integrity" operation. It waits upon in-flight writeout before starting and + * waiting upon new writeout. If there was an IO error, return it. + * + * We need to re-take i_mutex during the generic_osync_inode list walk because + * it is otherwise livelockable. + */ +int sync_page_range(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) +{ + pgoff_t start = pos >> PAGE_CACHE_SHIFT; + pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + int ret; + + if (!mapping_cap_writeback_dirty(mapping) || !count) + return 0; + ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); + if (ret == 0) { + mutex_lock(&inode->i_mutex); + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + mutex_unlock(&inode->i_mutex); + } + if (ret == 0) + ret = wait_on_page_writeback_range(mapping, start, end); + return ret; +} +EXPORT_SYMBOL(sync_page_range); + +/** + * sync_page_range_nolock - write & wait on all pages in the passed range without locking + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * + * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea + * as it forces O_SYNC writers to different parts of the same file + * to be serialised right until io completion. + */ +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) +{ + pgoff_t start = pos >> PAGE_CACHE_SHIFT; + pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + int ret; + + if (!mapping_cap_writeback_dirty(mapping) || !count) + return 0; + ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); + if (ret == 0) + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + if (ret == 0) + ret = wait_on_page_writeback_range(mapping, start, end); + return ret; +} +EXPORT_SYMBOL(sync_page_range_nolock); + +/** + * filemap_fdatawait - wait for all under-writeback pages to complete + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. + */ +int filemap_fdatawait(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return 0; + + return wait_on_page_writeback_range(mapping, 0, + (i_size - 1) >> PAGE_CACHE_SHIFT); +} +EXPORT_SYMBOL(filemap_fdatawait); + +int filemap_write_and_wait(struct address_space *mapping) +{ + int err = 0; + + if (mapping->nrpages) { + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } + } + return err; +} +EXPORT_SYMBOL(filemap_write_and_wait); + +/** + * filemap_write_and_wait_range - write out & wait on a file range + * @mapping: the address_space for the pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). + */ +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int err = 0; + + if (mapping->nrpages) { + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } + } + return err; +} + +/** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add a page to the pagecache. It must be locked. + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, + gfp_mask & ~__GFP_HIGHMEM); + if (error) + goto out; + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + if (error == 0) { + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + if (likely(!error)) { + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + } else { + page->mapping = NULL; + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } + + spin_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + } else + mem_cgroup_uncharge_cache_page(page); +out: + return error; +} +EXPORT_SYMBOL(add_to_page_cache_locked); + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + int ret; + + /* + * Splice_read and readahead add shmem/tmpfs pages into the page cache + * before shmem_readpage has a chance to mark them as SwapBacked: they + * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * (called in add_to_page_cache) needs to know where they're going too. + */ + if (mapping_cap_swap_backed(mapping)) + SetPageSwapBacked(page); + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) { + if (page_is_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } + return ret; +} + +#ifdef CONFIG_NUMA +struct page *__page_cache_alloc(gfp_t gfp) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, gfp, 0); + } + return alloc_pages(gfp, 0); +} +EXPORT_SYMBOL(__page_cache_alloc); +#endif + +static int __sleep_on_page_lock(void *word) +{ + io_schedule(); + return 0; +} + +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ +static wait_queue_head_t *page_waitqueue(struct page *page) +{ + const struct zone *zone = page_zone(page); + + return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; +} + +static inline void wake_up_page(struct page *page, int bit) +{ + __wake_up_bit(page_waitqueue(page), &page->flags, bit); +} + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (test_bit(bit_nr, &page->flags)) + __wait_on_bit(page_waitqueue(page), &wait, sync_page, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_on_page_bit); + +/** + * unlock_page - unlock a locked page + * @page: the page + * + * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). + * Also wakes sleepers in wait_on_page_writeback() because the wakeup + * mechananism between PageLocked pages and PageWriteback pages is shared. + * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. + * + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). + */ +void unlock_page(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); + wake_up_page(page, PG_locked); +} +EXPORT_SYMBOL(unlock_page); + +/** + * end_page_writeback - end writeback against a page + * @page: the page + */ +void end_page_writeback(struct page *page) +{ + if (TestClearPageReclaim(page)) + rotate_reclaimable_page(page); + + if (!test_clear_page_writeback(page)) + BUG(); + + smp_mb__after_clear_bit(); + wake_up_page(page, PG_writeback); +} +EXPORT_SYMBOL(end_page_writeback); + +/** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock + * + * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * random driver's requestfn sets TASK_RUNNING, we could busywait. However + * chances are that on the second loop, the block layer's plug list is empty, + * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. + */ +void __lock_page(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_page); + +int __lock_page_killable(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + return __wait_on_bit_lock(page_waitqueue(page), &wait, + sync_page_killable, TASK_KILLABLE); +} + +/** + * __lock_page_nosync - get a lock on the page, without calling sync_page() + * @page: the page to lock + * + * Variant of lock_page that does not require the caller to hold a reference + * on the page's mapping. + */ +void __lock_page_nosync(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, + TASK_UNINTERRUPTIBLE); +} + +/** + * find_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * + * Is there a pagecache struct page at the given (mapping, offset) tuple? + * If yes, increment its refcount and return it; if no, return NULL. + */ +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +{ + void **pagep; + struct page *page; + + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + + return page; +} +EXPORT_SYMBOL(find_get_page); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * @mapping: the address_space to search + * @offset: the page index + * + * Locates the desired pagecache page, locks it, increments its reference + * count and returns its address. + * + * Returns zero if the page was not present. find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page; + +repeat: + page = find_get_page(mapping, offset); + if (page) { + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON(page->index != offset); + } + return page; +} +EXPORT_SYMBOL(find_lock_page); + +/** + * find_or_create_page - locate or add a pagecache page + * @mapping: the page's address_space + * @index: the page's index into the mapping + * @gfp_mask: page allocation mode + * + * Locates a page in the pagecache. If the page is not present, a new page + * is allocated using @gfp_mask and is added to the pagecache and to the VM's + * LRU list. The returned page is locked and has its reference count + * incremented. + * + * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic + * allocation! + * + * find_or_create_page() returns the desired page's address, or zero on + * memory exhaustion. + */ +struct page *find_or_create_page(struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask) +{ + struct page *page; + int err; +repeat: + page = find_lock_page(mapping, index); + if (!page) { + page = __page_cache_alloc(gfp_mask); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; + if (err == -EEXIST) + goto repeat; + } + } + return page; +} +EXPORT_SYMBOL(find_or_create_page); + +/** + * find_get_pages - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages() will search for and return a group of up to + * @nr_pages pages in the mapping. The pages are placed at @pages. + * find_get_pages() takes a reference against the returned pages. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * find_get_pages() returns the number of pages which were found. + */ +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); + return ret; +} + +/** + * find_get_pages_contig - gang contiguous pagecache lookup + * @mapping: The address_space to search + * @index: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages_contig() works exactly like find_get_pages(), except + * that the returned number of pages are guaranteed to be contiguous. + * + * find_get_pages_contig() returns the number of pages which were found. + */ +unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (page->mapping == NULL || page->index != index) + break; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + index++; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_pages_contig); + +/** + * find_get_pages_tag - find and return pages that match @tag + * @mapping: the address_space to search + * @index: the starting page index + * @tag: the tag index + * @nr_pages: the maximum number of pages + * @pages: where the resulting pages are placed + * + * Like find_get_pages, except we only return pages which are tagged with + * @tag. We update @index to index the next page for the traversal. + */ +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); + + if (ret) + *index = pages[ret - 1]->index + 1; + + return ret; +} +EXPORT_SYMBOL(find_get_pages_tag); + +/** + * grab_cache_page_nowait - returns locked page at given index in given cache + * @mapping: target address_space + * @index: the page index + * + * Same as grab_cache_page(), but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + * + * Clear __GFP_FS when allocating the page to avoid recursion into the fs + * and deadlock against the caller's locked page. + */ +struct page * +grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) +{ + struct page *page = find_get_page(mapping, index); + + if (page) { + if (trylock_page(page)) + return page; + page_cache_release(page); + return NULL; + } + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { + page_cache_release(page); + page = NULL; + } + return page; +} +EXPORT_SYMBOL(grab_cache_page_nowait); + +/* + * CD/DVDs are error prone. When a medium error occurs, the driver may fail + * a _large_ part of the i/o request. Imagine the worst scenario: + * + * ---R__________________________________________B__________ + * ^ reading here ^ bad block(assume 4k) + * + * read(R) => miss => readahead(R...B) => media error => frustrating retries + * => failing the whole request => read(R) => read(R+1) => + * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => + * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => + * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... + * + * It is going insane. Fix it by quickly scaling down the readahead size. + */ +static void shrink_readahead_size_eio(struct file *filp, + struct file_ra_state *ra) +{ + if (!ra->ra_pages) + return; + + ra->ra_pages /= 4; +} + +/** + * do_generic_file_read - generic file read routine + * @filp: the file to read + * @ppos: current file position + * @desc: read_descriptor + * @actor: read method + * + * This is a generic file read routine, and uses the + * mapping->a_ops->readpage() function for the actual low-level stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +static void do_generic_file_read(struct file *filp, loff_t *ppos, + read_descriptor_t *desc, read_actor_t actor) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index; + pgoff_t last_index; + pgoff_t prev_index; + unsigned long offset; /* offset into pagecache page */ + unsigned int prev_offset; + int error; + + index = *ppos >> PAGE_CACHE_SHIFT; + prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; + prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); + last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page; + pgoff_t end_index; + loff_t isize; + unsigned long nr, ret; + + cond_resched(); +find_page: + page = find_get_page(mapping, index); + if (!page) { + page_cache_sync_readahead(mapping, + ra, filp, + index, last_index - index); + page = find_get_page(mapping, index); + if (unlikely(page == NULL)) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + ra, filp, page, + index, last_index - index); + } + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } +page_ok: + /* + * i_size must be checked after we know the page is Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + page_cache_release(page); + goto out; + } + } + nr = nr - offset; + + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + /* + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. + */ + if (prev_index != index || offset != prev_offset) + mark_page_accessed(page); + prev_index = index; + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + goto out; + +page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; + +page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + goto page_ok; + } + +readpage: + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } + goto readpage_error; + } + + if (!PageUptodate(page)) { + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_inode_pages got it + */ + unlock_page(page); + page_cache_release(page); + goto find_page; + } + unlock_page(page); + shrink_readahead_size_eio(filp, ra); + error = -EIO; + goto readpage_error; + } + unlock_page(page); + } + + goto page_ok; + +readpage_error: + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + goto out; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + page = page_cache_alloc_cold(mapping); + if (!page) { + desc->error = -ENOMEM; + goto out; + } + error = add_to_page_cache_lru(page, mapping, + index, GFP_KERNEL); + if (error) { + page_cache_release(page); + if (error == -EEXIST) + goto find_page; + desc->error = error; + goto out; + } + goto readpage; + } + +out: + ra->prev_pos = prev_index; + ra->prev_pos <<= PAGE_CACHE_SHIFT; + ra->prev_pos |= prev_offset; + + *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; + file_accessed(filp); +} + +int file_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + /* + * Faults on the destination of a read are common, so do it before + * taking the kmap. + */ + if (!fault_in_pages_writeable(desc->arg.buf, size)) { + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_to_user_inatomic(desc->arg.buf, + kaddr + offset, size); + kunmap_atomic(kaddr, KM_USER0); + if (left == 0) + goto success; + } + + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_to_user(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } +success: + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +/* + * Performs necessary checks before doing a write + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @count: number of bytes to write + * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE + * + * Adjust number of segments and amount of bytes to write (nr_segs should be + * properly initialized first). Returns appropriate error code that caller + * should return or zero in case that write should be allowed. + */ +int generic_segment_checks(const struct iovec *iov, + unsigned long *nr_segs, size_t *count, int access_flags) +{ + unsigned long seg; + size_t cnt = 0; + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(access_flags, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} +EXPORT_SYMBOL(generic_segment_checks); + +/** + * generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + * + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + count = 0; + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (filp->f_flags & O_DIRECT) { + loff_t size; + struct address_space *mapping; + struct inode *inode; + + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + goto out; /* skip atime */ + size = i_size_read(inode); + if (pos < size) { + retval = filemap_write_and_wait_range(mapping, pos, + pos + iov_length(iov, nr_segs) - 1); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } + if (retval > 0) + *ppos = pos + retval; + if (retval) { + file_accessed(filp); + goto out; + } + } + } + + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } +out: + return retval; +} +EXPORT_SYMBOL(generic_file_aio_read); + +static ssize_t +do_readahead(struct address_space *mapping, struct file *filp, + pgoff_t index, unsigned long nr) +{ + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + force_page_cache_readahead(mapping, filp, index, + max_sane_readahead(nr)); + return 0; +} + +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + struct address_space *mapping = file->f_mapping; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; + ret = do_readahead(mapping, file, start, len); + } + fput(file); + } + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif + +#ifdef CONFIG_MMU +/** + * page_cache_read - adds requested page to the page cache if not already there + * @file: file to read + * @offset: page index + * + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ +static int page_cache_read(struct file *file, pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + struct page *page; + int ret; + + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ + + page_cache_release(page); + + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; +} + +#define MMAP_LOTSAMISS (100) + +/** + * filemap_fault - read in file data for page fault handling + * @vma: vma in which the fault was taken + * @vmf: struct vm_fault containing details of the fault + * + * filemap_fault() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int error; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct file_ra_state *ra = &file->f_ra; + struct inode *inode = mapping->host; + struct page *page; + pgoff_t size; + int did_readaround = 0; + int ret = 0; + + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + goto no_cached_page; + + /* + * Do we have something in the page cache already? + */ +retry_find: + page = find_lock_page(mapping, vmf->pgoff); + /* + * For sequential accesses, we use the generic readahead logic. + */ + if (VM_SequentialReadHint(vma)) { + if (!page) { + page_cache_sync_readahead(mapping, ra, file, + vmf->pgoff, 1); + page = find_lock_page(mapping, vmf->pgoff); + if (!page) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, ra, file, page, + vmf->pgoff, 1); + } + } + + if (!page) { + unsigned long ra_pages; + + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + goto no_cached_page; + + /* + * To keep the pgmajfault counter straight, we need to + * check did_readaround, as this is an inner loop. + */ + if (!did_readaround) { + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + } + did_readaround = 1; + ra_pages = max_sane_readahead(file->f_ra.ra_pages); + if (ra_pages) { + pgoff_t start = 0; + + if (vmf->pgoff > ra_pages / 2) + start = vmf->pgoff - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); + } + page = find_lock_page(mapping, vmf->pgoff); + if (!page) + goto no_cached_page; + } + + if (!did_readaround) + ra->mmap_miss--; + + /* + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. + */ + if (unlikely(!PageUptodate(page))) + goto page_not_uptodate; + + /* Must recheck i_size under page lock */ + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + /* + * Found the page and have a reference on it. + */ + mark_page_accessed(page); + ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + vmf->page = page; + return ret | VM_FAULT_LOCKED; + +no_cached_page: + /* + * We're only likely to ever get here if MADV_RANDOM is in + * effect. + */ + error = page_cache_read(file, vmf->pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + if (error == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; + +page_not_uptodate: + /* IO error path */ + if (!did_readaround) { + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + ClearPageError(page); + error = mapping->a_ops->readpage(file, page); + if (!error) { + wait_on_page_locked(page); + if (!PageUptodate(page)) + error = -EIO; + } + page_cache_release(page); + + if (!error || error == AOP_TRUNCATED_PAGE) + goto retry_find; + + /* Things didn't work out. Return zero to tell the mm layer so. */ + shrink_readahead_size_eio(file, ra); + return VM_FAULT_SIGBUS; +} +EXPORT_SYMBOL(filemap_fault); + +struct vm_operations_struct generic_file_vm_ops = { + .fault = filemap_fault, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * This is for filesystems which do not implement ->writepage. + */ +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + return generic_file_mmap(file, vma); +} +#else +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +#endif /* CONFIG_MMU */ + +EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap); + +static struct page *__read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = page_cache_alloc_cold(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + /* Presumably ENOMEM for radix tree node */ + return ERR_PTR(err); + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + } + return page; +} + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data); + if (IS_ERR(page)) + return page; + if (PageUptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry; + } + if (PageUptodate(page)) { + unlock_page(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + return ERR_PTR(err); + } +out: + mark_page_accessed(page); + return page; +} +EXPORT_SYMBOL(read_cache_page_async); + +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page then wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + + page = read_cache_page_async(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + out: + return page; +} +EXPORT_SYMBOL(read_cache_page); + +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int should_remove_suid(struct dentry *dentry) +{ + mode_t mode = dentry->d_inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID))) + return kill; + + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +static int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + return notify_change(dentry, &newattrs); +} + +int file_remove_suid(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; + + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); + + return error; +} +EXPORT_SYMBOL(file_remove_suid); + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, + buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) +{ + struct inode *inode = file->f_mapping->host; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + + if (unlikely(*pos < 0)) + return -EINVAL; + + if (!isblk) { + /* FIXME: this is for backwards compatibility with 2.4 */ + if (file->f_flags & O_APPEND) + *pos = i_size_read(inode); + + if (limit != RLIM_INFINITY) { + if (*pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (*count > limit - (typeof(limit))*pos) { + *count = limit - (typeof(limit))*pos; + } + } + } + + /* + * LFS rule + */ + if (unlikely(*pos + *count > MAX_NON_LFS && + !(file->f_flags & O_LARGEFILE))) { + if (*pos >= MAX_NON_LFS) { + return -EFBIG; + } + if (*count > MAX_NON_LFS - (unsigned long)*pos) { + *count = MAX_NON_LFS - (unsigned long)*pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write. If we have + * exceeded without writing data we send a signal and return EFBIG. + * Linus frestrict idea will clean these up nicely.. + */ + if (likely(!isblk)) { + if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { + if (*count || *pos > inode->i_sb->s_maxbytes) { + return -EFBIG; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) + *count = inode->i_sb->s_maxbytes - *pos; + } else { +#ifdef CONFIG_BLOCK + loff_t isize; + if (bdev_read_only(I_BDEV(inode))) + return -EPERM; + isize = i_size_read(inode); + if (*pos >= isize) { + if (*count || *pos > isize) + return -ENOSPC; + } + + if (*pos + *count > isize) + *count = isize - *pos; +#else + return -EPERM; +#endif + } + return 0; +} +EXPORT_SYMBOL(generic_write_checks); + +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + mark_page_accessed(page); + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); +} +EXPORT_SYMBOL(pagecache_write_end); + +ssize_t +generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long *nr_segs, loff_t pos, loff_t *ppos, + size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t written; + size_t write_len; + pgoff_t end; + + if (count != ocount) + *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; + } + } + + written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + + if (written > 0) { + loff_t end = pos + written; + if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, end); + mark_inode_dirty(inode); + } + *ppos = end; + } + + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + * i_mutex is held, which protects generic_osync_inode() from + * livelocking. AIO O_DIRECT ops attempt to sync metadata here. + */ +out: + if ((written >= 0 || written == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); + if (err < 0) + written = err; + } + return written; +} +EXPORT_SYMBOL(generic_file_direct_write); + +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. + */ +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) +{ + int status; + struct page *page; + gfp_t gfp_notmask = 0; + if (flags & AOP_FLAG_NOFS) + gfp_notmask = __GFP_FS; +repeat: + page = find_lock_page(mapping, index); + if (likely(page)) + return page; + + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); + if (!page) + return NULL; + status = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL & ~gfp_notmask); + if (unlikely(status)) { + page_cache_release(page); + if (status == -EEXIST) + goto repeat; + return NULL; + } + return page; +} +EXPORT_SYMBOL(grab_cache_page_write_begin); + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). + */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + status = generic_perform_write(file, &i, pos); + + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (!a_ops->writepage || !is_sync_kiocb(iocb)) + status = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + } + } + + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait_range(mapping, + pos, pos + written - 1); + + return written ? written : status; +} +EXPORT_SYMBOL(generic_file_buffered_write); + +static ssize_t +__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + struct inode *inode = mapping->host; + loff_t pos; + ssize_t written; + ssize_t err; + + ocount = 0; + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + return err; + + count = ocount; + pos = *ppos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + file_update_time(file); + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (unlikely(file->f_flags & O_DIRECT)) { + loff_t endbyte; + ssize_t written_buffered; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + ppos, count, ocount); + if (written < 0 || written == count) + goto out; + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + pos += written; + count -= written; + written_buffered = generic_file_buffered_write(iocb, iov, + nr_segs, pos, ppos, count, + written); + /* + * If generic_file_buffered_write() retuned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + written_buffered - written - 1; + err = do_sync_mapping_range(file->f_mapping, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + if (err == 0) { + written = written_buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); + } +out: + current->backing_dev_info = NULL; + return written ? written : err; +} + +ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range_nolock(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_aio_write_nolock); + +ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + mutex_lock(&inode->i_mutex); + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_aio_write); + +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). If the release was successful, return `1'. + * Otherwise return zero. + * + * The @gfp_mask argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * + */ +int try_to_release_page(struct page *page, gfp_t gfp_mask) +{ + struct address_space * const mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); + return try_to_free_buffers(page); +} + +EXPORT_SYMBOL(try_to_release_page); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c new file mode 100644 index 0000000..b5167df --- /dev/null +++ b/mm/filemap_xip.c @@ -0,0 +1,474 @@ +/* + * linux/mm/filemap_xip.c + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte <cotte@de.ibm.com> + * + * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds + * + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/uio.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> +#include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +/* + * We do use our own empty page to avoid interference with other users + * of ZERO_PAGE(), such as /dev/zero + */ +static DEFINE_MUTEX(xip_sparse_mutex); +static seqcount_t xip_sparse_seq = SEQCNT_ZERO; +static struct page *__xip_sparse_page; + +/* called under xip_sparse_mutex */ +static struct page *xip_sparse_page(void) +{ + if (!__xip_sparse_page) { + struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); + + if (page) + __xip_sparse_page = page; + } + return __xip_sparse_page; +} + +/* + * This is a file read routine for execute in place files, and uses + * the mapping->a_ops->get_xip_mem() function for the actual low-level + * stuff. + * + * Note the struct file* is not used at all. It may be NULL. + */ +static ssize_t +do_xip_mapping_read(struct address_space *mapping, + struct file_ra_state *_ra, + struct file *filp, + char __user *buf, + size_t len, + loff_t *ppos) +{ + struct inode *inode = mapping->host; + pgoff_t index, end_index; + unsigned long offset; + loff_t isize, pos; + size_t copied = 0, error = 0; + + BUG_ON(!mapping->a_ops->get_xip_mem); + + pos = *ppos; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + isize = i_size_read(inode); + if (!isize) + goto out; + + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + do { + unsigned long nr, left; + void *xip_mem; + unsigned long xip_pfn; + int zero = 0; + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index >= end_index) { + if (index > end_index) + goto out; + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + goto out; + } + } + nr = nr - offset; + if (nr > len) + nr = len; + + error = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(error)) { + if (error == -ENODATA) { + /* sparse */ + zero = 1; + } else + goto out; + } + + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + /* address based flush */ ; + + /* + * Ok, we have the mem, so now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + if (!zero) + left = __copy_to_user(buf+copied, xip_mem+offset, nr); + else + left = __clear_user(buf + copied, nr); + + if (left) { + error = -EFAULT; + goto out; + } + + copied += (nr - left); + offset += (nr - left); + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + } while (copied < len); + +out: + *ppos = pos + copied; + if (filp) + file_accessed(filp); + + return (copied ? copied : error); +} + +ssize_t +xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; + + return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, + buf, len, ppos); +} +EXPORT_SYMBOL_GPL(xip_file_read); + +/* + * __xip_unmap is invoked from xip_unmap and + * xip_write + * + * This function walks all vmas of the address_space and unmaps the + * __xip_sparse_page when found at pgoff. + */ +static void +__xip_unmap (struct address_space * mapping, + unsigned long pgoff) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + struct prio_tree_iter iter; + unsigned long address; + pte_t *pte; + pte_t pteval; + spinlock_t *ptl; + struct page *page; + unsigned count; + int locked = 0; + + count = read_seqcount_begin(&xip_sparse_seq); + + page = __xip_sparse_page; + if (!page) + return; + +retry: + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + mm = vma->vm_mm; + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + pte = page_check_address(page, mm, address, &ptl, 1); + if (pte) { + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pte_pfn(*pte)); + pteval = ptep_clear_flush_notify(vma, address, pte); + page_remove_rmap(page, vma); + dec_mm_counter(mm, file_rss); + BUG_ON(pte_dirty(pteval)); + pte_unmap_unlock(pte, ptl); + page_cache_release(page); + } + } + spin_unlock(&mapping->i_mmap_lock); + + if (locked) { + mutex_unlock(&xip_sparse_mutex); + } else if (read_seqcount_retry(&xip_sparse_seq, count)) { + mutex_lock(&xip_sparse_mutex); + locked = 1; + goto retry; + } +} + +/* + * xip_fault() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * This function is derived from filemap_fault, but used for execute in place + */ +static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + pgoff_t size; + void *xip_mem; + unsigned long xip_pfn; + struct page *page; + int error; + + /* XXX: are VM_FAULT_ codes OK? */ +again: + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (likely(!error)) + goto found; + if (error != -ENODATA) + return VM_FAULT_OOM; + + /* sparse block */ + if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && + (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && + (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { + int err; + + /* maybe shared writable, allocate new block */ + mutex_lock(&xip_sparse_mutex); + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, + &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); + if (error) + return VM_FAULT_SIGBUS; + /* unmap sparse mappings at pgoff from all other vmas */ + __xip_unmap(mapping, vmf->pgoff); + +found: + err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, + xip_pfn); + if (err == -ENOMEM) + return VM_FAULT_OOM; + BUG_ON(err); + return VM_FAULT_NOPAGE; + } else { + int err, ret = VM_FAULT_OOM; + + mutex_lock(&xip_sparse_mutex); + write_seqcount_begin(&xip_sparse_seq); + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (unlikely(!error)) { + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + goto again; + } + if (error != -ENODATA) + goto out; + /* not shared and writable, use xip_sparse_page() */ + page = xip_sparse_page(); + if (!page) + goto out; + err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, + page); + if (err == -ENOMEM) + goto out; + + ret = VM_FAULT_NOPAGE; +out: + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + + return ret; + } +} + +static struct vm_operations_struct xip_file_vm_ops = { + .fault = xip_file_fault, +}; + +int xip_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + BUG_ON(!file->f_mapping->a_ops->get_xip_mem); + + file_accessed(file); + vma->vm_ops = &xip_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; + return 0; +} +EXPORT_SYMBOL_GPL(xip_file_mmap); + +static ssize_t +__xip_file_write(struct file *filp, const char __user *buf, + size_t count, loff_t pos, loff_t *ppos) +{ + struct address_space * mapping = filp->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + size_t bytes; + ssize_t written = 0; + + BUG_ON(!mapping->a_ops->get_xip_mem); + + do { + unsigned long index; + unsigned long offset; + size_t copied; + void *xip_mem; + unsigned long xip_pfn; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (status == -ENODATA) { + /* we allocate a new page unmap it */ + mutex_lock(&xip_sparse_mutex); + status = a_ops->get_xip_mem(mapping, index, 1, + &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); + if (!status) + /* unmap page at pgoff from all other vmas */ + __xip_unmap(mapping, index); + } + + if (status) + break; + + copied = bytes - + __copy_from_user_nocache(xip_mem + offset, buf, bytes); + + if (likely(copied > 0)) { + status = copied; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } + } + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; + if (status < 0) + break; + } while (count); + *ppos = pos; + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + */ + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + + return written ? written : status; +} + +ssize_t +xip_file_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + size_t count; + loff_t pos; + ssize_t ret; + + mutex_lock(&inode->i_mutex); + + if (!access_ok(VERIFY_READ, buf, len)) { + ret=-EFAULT; + goto out_up; + } + + pos = *ppos; + count = len; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); + if (ret) + goto out_backing; + if (count == 0) + goto out_backing; + + ret = file_remove_suid(filp); + if (ret) + goto out_backing; + + file_update_time(filp); + + ret = __xip_file_write (filp, buf, count, pos, ppos); + + out_backing: + current->backing_dev_info = NULL; + out_up: + mutex_unlock(&inode->i_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(xip_file_write); + +/* + * truncate a page used for execute in place + * functionality is analog to block_truncate_page but does use get_xip_mem + * to get the page instead of page cache + */ +int +xip_truncate_page(struct address_space *mapping, loff_t from) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + unsigned length; + void *xip_mem; + unsigned long xip_pfn; + int err; + + BUG_ON(!mapping->a_ops->get_xip_mem); + + blocksize = 1 << mapping->host->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + + err = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(err)) { + if (err == -ENODATA) + /* Hole? No need to truncate */ + return 0; + else + return err; + } + memset(xip_mem + offset, 0, length); + return 0; +} +EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/fremap.c b/mm/fremap.c new file mode 100644 index 0000000..b602c27 --- /dev/null +++ b/mm/fremap.c @@ -0,0 +1,260 @@ +/* + * linux/mm/fremap.c + * + * Explicit pagetable population and nonlinear (random) mappings support. + * + * started by Ingo Molnar, Copyright (C) 2002, 2003 + */ +#include <linux/backing-dev.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/file.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/swapops.h> +#include <linux/rmap.h> +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/mmu_notifier.h> + +#include <asm/mmu_context.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#include "internal.h" + +static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + + if (pte_present(pte)) { + struct page *page; + + flush_cache_page(vma, addr, pte_pfn(pte)); + pte = ptep_clear_flush(vma, addr, ptep); + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page, vma); + page_cache_release(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, file_rss); + } + } else { + if (!pte_file(pte)) + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear_not_present_full(mm, addr, ptep, 0); + } +} + +/* + * Install a file pte to a given virtual memory address, release any + * previously existing mapping. + */ +static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long pgoff, pgprot_t prot) +{ + int err = -ENOMEM; + pte_t *pte; + spinlock_t *ptl; + + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + + if (!pte_none(*pte)) + zap_pte(mm, vma, addr, pte); + + set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + /* + * We don't need to run update_mmu_cache() here because the "file pte" + * being installed by install_file_pte() is not a real pte - it's a + * non-present entry (like a swap entry), noting what file offset should + * be mapped there when there's a fault (in a non-linear vma where + * that's not obvious). + */ + pte_unmap_unlock(pte, ptl); + err = 0; +out: + return err; +} + +static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + int err; + + do { + err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); + if (err) + return err; + + size -= PAGE_SIZE; + addr += PAGE_SIZE; + pgoff++; + } while (size); + + return 0; + +} + +/** + * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma + * @start: start of the remapped virtual memory range + * @size: size of the remapped virtual memory range + * @prot: new protection bits of the range (see NOTE) + * @pgoff: to-be-mapped page of the backing store file + * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. + * + * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma + * (shared backing store file). + * + * This syscall works purely via pagetables, so it's the most efficient + * way to map the same (large) file into a given virtual window. Unlike + * mmap()/mremap() it does not create any new vmas. The new mappings are + * also safe across swapout. + * + * NOTE: the @prot parameter right now is ignored (but must be zero), + * and the vma's default protection is used. Arbitrary protections + * might be implemented in the future. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct address_space *mapping; + unsigned long end = start + size; + struct vm_area_struct *vma; + int err = -EINVAL; + int has_write_lock = 0; + + if (prot) + return err; + /* + * Sanitize the syscall parameters: + */ + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + /* Does the address range wrap, or is the span zero-sized? */ + if (start + size <= start) + return err; + + /* Can we represent this offset inside this architecture's pte's? */ +#if PTE_FILE_MAX_BITS < BITS_PER_LONG + if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) + return err; +#endif + + /* We need down_write() to change vma->vm_flags. */ + down_read(&mm->mmap_sem); + retry: + vma = find_vma(mm, start); + + /* + * Make sure the vma is shared, that it supports prefaulting, + * and that the remapped range is valid and fully within + * the single existing vma. vm_private_data is used as a + * swapout cursor in a VM_NONLINEAR vma. + */ + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) + goto out; + + if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + goto out; + + if (end <= start || start < vma->vm_start || end > vma->vm_end) + goto out; + + /* Must set VM_NONLINEAR before any pages are populated. */ + if (!(vma->vm_flags & VM_NONLINEAR)) { + /* Don't need a nonlinear mapping, exit success */ + if (pgoff == linear_page_index(vma, start)) { + err = 0; + goto out; + } + + if (!has_write_lock) { + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + has_write_lock = 1; + goto retry; + } + mapping = vma->vm_file->f_mapping; + /* + * page_mkclean doesn't work on nonlinear vmas, so if + * dirty pages need to be accounted, emulate with linear + * vmas. + */ + if (mapping_cap_account_dirty(mapping)) { + unsigned long addr; + struct file *file = vma->vm_file; + + flags &= MAP_NONBLOCK; + get_file(file); + addr = mmap_region(file, start, size, + flags, vma->vm_flags, pgoff, 1); + fput(file); + if (IS_ERR_VALUE(addr)) { + err = addr; + } else { + BUG_ON(addr != start); + err = 0; + } + goto out; + } + spin_lock(&mapping->i_mmap_lock); + flush_dcache_mmap_lock(mapping); + vma->vm_flags |= VM_NONLINEAR; + vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); + } + + if (vma->vm_flags & VM_LOCKED) { + /* + * drop PG_Mlocked flag for over-mapped range + */ + unsigned int saved_flags = vma->vm_flags; + munlock_vma_pages_range(vma, start, start + size); + vma->vm_flags = saved_flags; + } + + mmu_notifier_invalidate_range_start(mm, start, start + size); + err = populate_range(mm, vma, start, size, pgoff); + mmu_notifier_invalidate_range_end(mm, start, start + size); + if (!err && !(flags & MAP_NONBLOCK)) { + if (vma->vm_flags & VM_LOCKED) { + /* + * might be mapping previously unmapped range of file + */ + mlock_vma_pages_range(vma, start, start + size); + } else { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); + } + } + + /* + * We can't clear VM_NONLINEAR because we'd have to do + * it after ->populate completes, and that would prevent + * downgrading the lock. (Locks can't be upgraded). + */ + +out: + if (likely(!has_write_lock)) + up_read(&mm->mmap_sem); + else + up_write(&mm->mmap_sem); + + return err; +} diff --git a/mm/highmem.c b/mm/highmem.c new file mode 100644 index 0000000..b36b83b --- /dev/null +++ b/mm/highmem.c @@ -0,0 +1,375 @@ +/* + * High memory handling common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * 64-bit physical space. With current x86 CPUs this + * means up to 64 Gigabytes physical RAM. + * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/highmem.h> +#include <linux/blktrace_api.h> +#include <asm/tlbflush.h> + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. + * n means that there are (n-1) current users of it. + */ +#ifdef CONFIG_HIGHMEM + +unsigned long totalhigh_pages __read_mostly; +EXPORT_SYMBOL(totalhigh_pages); + +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_online_pgdat(pgdat) { + pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], + NR_FREE_PAGES); + if (zone_movable_is_highmem()) + pages += zone_page_state( + &pgdat->node_zones[ZONE_MOVABLE], + NR_FREE_PAGES); + } + + return pages; +} + +static int pkmap_count[LAST_PKMAP]; +static unsigned int last_pkmap_nr; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); + +pte_t * pkmap_page_table; + +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + +static void flush_all_zero_pkmaps(void) +{ + int i; + int need_flush = 0; + + flush_cache_kmaps(); + + for (i = 0; i < LAST_PKMAP; i++) { + struct page *page; + + /* + * zero means we don't have anything to do, + * >1 means that it is still in use. Only + * a count of 1 means that it is free but + * needs to be unmapped + */ + if (pkmap_count[i] != 1) + continue; + pkmap_count[i] = 0; + + /* sanity check */ + BUG_ON(pte_none(pkmap_page_table[i])); + + /* + * Don't need an atomic fetch-and-clear op here; + * no-one has the page mapped, and cannot get at + * its virtual address (and hence PTE) without first + * getting the kmap_lock (which is held here). + * So no dangers, even with speculative execution. + */ + page = pte_page(pkmap_page_table[i]); + pte_clear(&init_mm, (unsigned long)page_address(page), + &pkmap_page_table[i]); + + set_page_address(page, NULL); + need_flush = 1; + } + if (need_flush) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); +} + +/** + * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings + */ +void kmap_flush_unused(void) +{ + spin_lock(&kmap_lock); + flush_all_zero_pkmaps(); + spin_unlock(&kmap_lock); +} + +static inline unsigned long map_new_virtual(struct page *page) +{ + unsigned long vaddr; + int count; + +start: + count = LAST_PKMAP; + /* Find an empty entry */ + for (;;) { + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + if (!last_pkmap_nr) { + flush_all_zero_pkmaps(); + count = LAST_PKMAP; + } + if (!pkmap_count[last_pkmap_nr]) + break; /* Found a usable entry */ + if (--count) + continue; + + /* + * Sleep for somebody else to unmap their entries + */ + { + DECLARE_WAITQUEUE(wait, current); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&pkmap_map_wait, &wait); + spin_unlock(&kmap_lock); + schedule(); + remove_wait_queue(&pkmap_map_wait, &wait); + spin_lock(&kmap_lock); + + /* Somebody else might have mapped it while we slept */ + if (page_address(page)) + return (unsigned long)page_address(page); + + /* Re-start */ + goto start; + } + } + vaddr = PKMAP_ADDR(last_pkmap_nr); + set_pte_at(&init_mm, vaddr, + &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + + pkmap_count[last_pkmap_nr] = 1; + set_page_address(page, (void *)vaddr); + + return vaddr; +} + +/** + * kmap_high - map a highmem page into memory + * @page: &struct page to map + * + * Returns the page's virtual memory address. + * + * We cannot call this from interrupts, as it may block. + */ +void *kmap_high(struct page *page) +{ + unsigned long vaddr; + + /* + * For highmem pages, we can't trust "virtual" until + * after we have the lock. + */ + spin_lock(&kmap_lock); + vaddr = (unsigned long)page_address(page); + if (!vaddr) + vaddr = map_new_virtual(page); + pkmap_count[PKMAP_NR(vaddr)]++; + BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); + spin_unlock(&kmap_lock); + return (void*) vaddr; +} + +EXPORT_SYMBOL(kmap_high); + +/** + * kunmap_high - map a highmem page into memory + * @page: &struct page to unmap + */ +void kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + int need_wakeup; + + spin_lock(&kmap_lock); + vaddr = (unsigned long)page_address(page); + BUG_ON(!vaddr); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! + */ + need_wakeup = 0; + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + /* + * Avoid an unnecessary wake_up() function call. + * The common case is pkmap_count[] == 1, but + * no waiters. + * The tasks queued in the wait-queue are guarded + * by both the lock in the wait-queue-head and by + * the kmap_lock. As the kmap_lock is held here, + * no need for the wait-queue-head's lock. Simply + * test if the queue is empty. + */ + need_wakeup = waitqueue_active(&pkmap_map_wait); + } + spin_unlock(&kmap_lock); + + /* do wake-up, if needed, race-free outside of the spin lock */ + if (need_wakeup) + wake_up(&pkmap_map_wait); +} + +EXPORT_SYMBOL(kunmap_high); +#endif + +#if defined(HASHED_PAGE_VIRTUAL) + +#define PA_HASH_ORDER 7 + +/* + * Describes one page->virtual association + */ +struct page_address_map { + struct page *page; + void *virtual; + struct list_head list; +}; + +/* + * page_address_map freelist, allocated from page_address_maps. + */ +static struct list_head page_address_pool; /* freelist */ +static spinlock_t pool_lock; /* protects page_address_pool */ + +/* + * Hash table bucket + */ +static struct page_address_slot { + struct list_head lh; /* List of page_address_maps */ + spinlock_t lock; /* Protect this bucket's list */ +} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; + +static struct page_address_slot *page_slot(struct page *page) +{ + return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; +} + +/** + * page_address - get the mapped virtual address of a page + * @page: &struct page to get the virtual address of + * + * Returns the page's virtual address. + */ +void *page_address(struct page *page) +{ + unsigned long flags; + void *ret; + struct page_address_slot *pas; + + if (!PageHighMem(page)) + return lowmem_page_address(page); + + pas = page_slot(page); + ret = NULL; + spin_lock_irqsave(&pas->lock, flags); + if (!list_empty(&pas->lh)) { + struct page_address_map *pam; + + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + ret = pam->virtual; + goto done; + } + } + } +done: + spin_unlock_irqrestore(&pas->lock, flags); + return ret; +} + +EXPORT_SYMBOL(page_address); + +/** + * set_page_address - set a page's virtual address + * @page: &struct page to set + * @virtual: virtual address to use + */ +void set_page_address(struct page *page, void *virtual) +{ + unsigned long flags; + struct page_address_slot *pas; + struct page_address_map *pam; + + BUG_ON(!PageHighMem(page)); + + pas = page_slot(page); + if (virtual) { /* Add */ + BUG_ON(list_empty(&page_address_pool)); + + spin_lock_irqsave(&pool_lock, flags); + pam = list_entry(page_address_pool.next, + struct page_address_map, list); + list_del(&pam->list); + spin_unlock_irqrestore(&pool_lock, flags); + + pam->page = page; + pam->virtual = virtual; + + spin_lock_irqsave(&pas->lock, flags); + list_add_tail(&pam->list, &pas->lh); + spin_unlock_irqrestore(&pas->lock, flags); + } else { /* Remove */ + spin_lock_irqsave(&pas->lock, flags); + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + list_del(&pam->list); + spin_unlock_irqrestore(&pas->lock, flags); + spin_lock_irqsave(&pool_lock, flags); + list_add_tail(&pam->list, &page_address_pool); + spin_unlock_irqrestore(&pool_lock, flags); + goto done; + } + } + spin_unlock_irqrestore(&pas->lock, flags); + } +done: + return; +} + +static struct page_address_map page_address_maps[LAST_PKMAP]; + +void __init page_address_init(void) +{ + int i; + + INIT_LIST_HEAD(&page_address_pool); + for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) + list_add(&page_address_maps[i].list, &page_address_pool); + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { + INIT_LIST_HEAD(&page_address_htable[i].lh); + spin_lock_init(&page_address_htable[i].lock); + } + spin_lock_init(&pool_lock); +} + +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c new file mode 100644 index 0000000..6058b53 --- /dev/null +++ b/mm/hugetlb.c @@ -0,0 +1,2293 @@ +/* + * Generic hugetlb support. + * (C) William Irwin, April 2004 + */ +#include <linux/gfp.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/highmem.h> +#include <linux/mmu_notifier.h> +#include <linux/nodemask.h> +#include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/cpuset.h> +#include <linux/mutex.h> +#include <linux/bootmem.h> +#include <linux/sysfs.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/io.h> + +#include <linux/hugetlb.h> +#include "internal.h" + +const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; +static gfp_t htlb_alloc_mask = GFP_HIGHUSER; +unsigned long hugepages_treat_as_movable; + +static int max_hstate; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +__initdata LIST_HEAD(huge_boot_pages); + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; + +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) + +/* + * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + */ +static DEFINE_SPINLOCK(hugetlb_lock); + +/* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. + * + * The region data structures are protected by a combination of the mmap_sem + * and the hugetlb_instantion_mutex. To access or modify a region the caller + * must either hold the mmap_sem for write, or the mmap_sem for read and + * the hugetlb_instantiation mutex: + * + * down_write(&mm->mmap_sem); + * or + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); + */ +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarantee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + +static long region_count(struct list_head *head, long f, long t) +{ + struct file_region *rg; + long chg = 0; + + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + int seg_from; + int seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + + return chg; +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. + */ +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. + */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. + * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. + */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + +struct resv_map { + struct kref refs; + struct list_head regions; +}; + +static struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +static void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(&resv_map->regions, 0); + kfree(resv_map); +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); + return NULL; +} + +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + + return (get_vma_private_data(vma) & flag) != 0; +} + +/* Decrement the reserved pages in the hugepage pool by one */ +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_NORESERVE) + return; + + if (vma->vm_flags & VM_SHARED) { + /* Shared mappings always use reserves */ + h->resv_huge_pages--; + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + h->resv_huge_pages--; + } +} + +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_reserves(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return 1; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + return 0; +} + +static void clear_gigantic_page(struct page *page, + unsigned long addr, unsigned long sz) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) +{ + int i; + + if (unlikely(sz > MAX_ORDER_NR_PAGES)) + return clear_gigantic_page(page, addr, sz); + + might_sleep(); + for (i = 0; i < sz/PAGE_SIZE; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma) +{ + int i; + struct hstate *h = hstate_vma(vma); + struct page *dst_base = dst; + struct page *src_base = src; + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} +static void copy_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma) +{ + int i; + struct hstate *h = hstate_vma(vma); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) + return copy_gigantic_page(dst, src, addr, vma); + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} + +static void enqueue_huge_page(struct hstate *h, struct page *page) +{ + int nid = page_to_nid(page); + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; +} + +static struct page *dequeue_huge_page(struct hstate *h) +{ + int nid; + struct page *page = NULL; + + for (nid = 0; nid < MAX_NUMNODES; ++nid) { + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + break; + } + } + return page; +} + +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address, int avoid_reserve) +{ + int nid; + struct page *page = NULL; + struct mempolicy *mpol; + nodemask_t *nodemask; + struct zonelist *zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); + struct zone *zone; + struct zoneref *z; + + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_reserves(vma) && + h->free_huge_pages - h->resv_huge_pages == 0) + return NULL; + + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + return NULL; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + MAX_NR_ZONES - 1, nodemask) { + nid = zone_to_nid(zone); + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + + break; + } + } + mpol_cond_put(mpol); + return page; +} + +static void update_and_free_page(struct hstate *h, struct page *page) +{ + int i; + + VM_BUG_ON(h->order >= MAX_ORDER); + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | + 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1<< PG_writeback); + } + set_compound_page_dtor(page, NULL); + set_page_refcounted(page); + arch_release_hugepage(page); + __free_pages(page, huge_page_order(h)); +} + +struct hstate *size_to_hstate(unsigned long size) +{ + struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; +} + +static void free_huge_page(struct page *page) +{ + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + struct address_space *mapping; + + mapping = (struct address_space *) page_private(page); + set_page_private(page, 0); + BUG_ON(page_count(page)); + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } else { + enqueue_huge_page(h, page); + } + spin_unlock(&hugetlb_lock); + if (mapping) + hugetlb_put_quota(mapping, 1); +} + +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ +} + +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + if (h->order >= MAX_ORDER) + return NULL; + + page = alloc_pages_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + if (page) { + if (arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + prep_new_huge_page(h, page, nid); + } + + return page; +} + +/* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. + */ +static int hstate_next_node(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->hugetlb_next_nid = next_nid; + return next_nid; +} + +static int alloc_fresh_huge_page(struct hstate *h) +{ + struct page *page; + int start_nid; + int next_nid; + int ret = 0; + + start_nid = h->hugetlb_next_nid; + + do { + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); + if (page) + ret = 1; + next_nid = hstate_next_node(h); + } while (!page && h->hugetlb_next_nid != start_nid); + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct page *page; + unsigned int nid; + + if (h->order >= MAX_ORDER) + return NULL; + + /* + * Assume we will successfully allocate the surplus page to + * prevent racing processes from causing the surplus to exceed + * overcommit + * + * This however introduces a different race, where a process B + * tries to grow the static hugepage pool while alloc_pages() is + * called by process A. B will only examine the per-node + * counters in determining if surplus huge pages can be + * converted to normal huge pages in adjust_pool_surplus(). A + * won't be able to increment the per-node counter, until the + * lock is dropped by B, but B doesn't drop hugetlb_lock until + * no more huge pages can be converted from surplus to normal + * state (and doesn't try to convert again). Thus, we have a + * case where a surplus huge page exists, the pool is grown, and + * the surplus huge page still exists after, even though it + * should just have been converted to a normal huge page. This + * does not leak memory, though, as the hugepage will be freed + * once it is out of use. It also does not allow the counters to + * go out of whack in adjust_pool_surplus() as we don't modify + * the node values until we've gotten the hugepage and only the + * per-node value is checked there. + */ + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { + spin_unlock(&hugetlb_lock); + return NULL; + } else { + h->nr_huge_pages++; + h->surplus_huge_pages++; + } + spin_unlock(&hugetlb_lock); + + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + + spin_lock(&hugetlb_lock); + if (page) { + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. + */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); + nid = page_to_nid(page); + set_compound_page_dtor(page, free_huge_page); + /* + * We incremented the global counters already + */ + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; + __count_vm_event(HTLB_BUDDY_PGALLOC); + } else { + h->nr_huge_pages--; + h->surplus_huge_pages--; + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + } + spin_unlock(&hugetlb_lock); + + return page; +} + +/* + * Increase the hugetlb pool such that it can accomodate a reservation + * of size 'delta'. + */ +static int gather_surplus_pages(struct hstate *h, int delta) +{ + struct list_head surplus_list; + struct page *page, *tmp; + int ret, i; + int needed, allocated; + + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; + if (needed <= 0) { + h->resv_huge_pages += delta; + return 0; + } + + allocated = 0; + INIT_LIST_HEAD(&surplus_list); + + ret = -ENOMEM; +retry: + spin_unlock(&hugetlb_lock); + for (i = 0; i < needed; i++) { + page = alloc_buddy_huge_page(h, NULL, 0); + if (!page) { + /* + * We were not able to allocate enough pages to + * satisfy the entire reservation so we free what + * we've allocated so far. + */ + spin_lock(&hugetlb_lock); + needed = 0; + goto free; + } + + list_add(&page->lru, &surplus_list); + } + allocated += needed; + + /* + * After retaking hugetlb_lock, we need to recalculate 'needed' + * because either resv_huge_pages or free_huge_pages may have changed. + */ + spin_lock(&hugetlb_lock); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); + if (needed > 0) + goto retry; + + /* + * The surplus_list now contains _at_least_ the number of extra pages + * needed to accomodate the reservation. Add the appropriate number + * of pages to the hugetlb pool and free the extras back to the buddy + * allocator. Commit the entire reservation here to prevent another + * process from stealing the pages as they are added to the pool but + * before they are reserved. + */ + needed += allocated; + h->resv_huge_pages += delta; + ret = 0; +free: + /* Free the needed pages to the hugetlb pool */ + list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + if ((--needed) < 0) + break; + list_del(&page->lru); + enqueue_huge_page(h, page); + } + + /* Free unnecessary surplus pages to the buddy allocator */ + if (!list_empty(&surplus_list)) { + spin_unlock(&hugetlb_lock); + list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + list_del(&page->lru); + /* + * The page has a reference count of zero already, so + * call free_huge_page directly instead of using + * put_page. This must be done with hugetlb_lock + * unlocked which is safe because free_huge_page takes + * hugetlb_lock before deciding how to free the page. + */ + free_huge_page(page); + } + spin_lock(&hugetlb_lock); + } + + return ret; +} + +/* + * When releasing a hugetlb pool reservation, any surplus pages that were + * allocated to satisfy the reservation must be explicitly freed if they were + * never used. + */ +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) +{ + static int nid = -1; + struct page *page; + unsigned long nr_pages; + + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + */ + unsigned long remaining_iterations = num_online_nodes(); + + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; + + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; + + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); + + while (remaining_iterations-- && nr_pages) { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + if (!h->surplus_huge_pages_node[nid]) + continue; + + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + nr_pages--; + remaining_iterations = num_online_nodes(); + } + } +} + +/* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase + * reservation and actually increase quota before an allocation can occur. + * Where any new reservation would be required the reservation change is + * prepared, but not committed. Once the page has been quota'd allocated + * an instantiated the change should be committed via vma_commit_reservation. + * No action is required on failure. + */ +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + return region_chg(&inode->i_mapping->private_list, + idx, idx + 1); + + } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + return 1; + + } else { + int err; + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + err = region_chg(&reservations->regions, idx, idx + 1); + if (err < 0) + return err; + return 0; + } +} +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + region_add(&inode->i_mapping->private_list, idx, idx + 1); + + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + /* Mark this page used in the map. */ + region_add(&reservations->regions, idx, idx + 1); + } +} + +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct hstate *h = hstate_vma(vma); + struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned int chg; + + /* + * Processes that did not create the mapping will have no reserves and + * will not have accounted against quota. Check that the quota can be + * made before satisfying the allocation + * MAP_NORESERVE mappings may also need pages and quota allocated + * if no reserve mapping overlaps. + */ + chg = vma_needs_reservation(h, vma, addr); + if (chg < 0) + return ERR_PTR(chg); + if (chg) + if (hugetlb_get_quota(inode->i_mapping, chg)) + return ERR_PTR(-ENOSPC); + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); + spin_unlock(&hugetlb_lock); + + if (!page) { + page = alloc_buddy_huge_page(h, vma, addr); + if (!page) { + hugetlb_put_quota(inode->i_mapping, chg); + return ERR_PTR(-VM_FAULT_OOM); + } + } + + set_page_refcounted(page); + set_page_private(page, (unsigned long) mapping); + + vma_commit_reservation(h, vma, addr); + + return page; +} + +__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) +{ + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); + + while (nr_nodes) { + void *addr; + + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + huge_page_size(h), huge_page_size(h), 0); + + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + m = addr; + if (m) + goto found; + } + hstate_next_node(h); + nr_nodes--; + } + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; +} + +static void prep_compound_huge_page(struct page *page, int order) +{ + if (unlikely(order > (MAX_ORDER - 1))) + prep_compound_gigantic_page(page, order); + else + prep_compound_page(page, order); +} + +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) +{ + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_huge_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); + } +} + +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) + break; + } + h->max_huge_pages = i; +} + +static void __init hugetlb_init_hstates(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* oversize hugepages were init'ed in early boot */ + if (h->order < MAX_ORDER) + hugetlb_hstate_alloc_pages(h); + } +} + +static char * __init memfmt(char *buf, unsigned long n) +{ + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} + +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + char buf[32]; + printk(KERN_INFO "HugeTLB registered %s page size, " + "pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); + } +} + +#ifdef CONFIG_HIGHMEM +static void try_to_free_low(struct hstate *h, unsigned long count) +{ + int i; + + if (h->order >= MAX_ORDER) + return; + + for (i = 0; i < MAX_NUMNODES; ++i) { + struct page *page, *next; + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) + return; + if (PageHighMem(page)) + continue; + list_del(&page->lru); + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; + } + } +} +#else +static inline void try_to_free_low(struct hstate *h, unsigned long count) +{ +} +#endif + +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) +{ + unsigned long min_count, ret; + + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + + /* + * Increase the pool size + * First take pages out of surplus state. Then make up the + * remaining difference by allocating fresh huge pages. + * + * We might race with alloc_buddy_huge_page() here and be unable + * to convert a surplus huge page to a normal huge page. That is + * not critical, though, it just means the overall size of the + * pool might be one hugepage larger than it needs to be, but + * within all the constraints specified by the sysctls. + */ + spin_lock(&hugetlb_lock); + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) + break; + } + + while (count > persistent_huge_pages(h)) { + /* + * If this allocation races such that we no longer need the + * page, free_huge_page will handle it by freeing the page + * and reducing the surplus. + */ + spin_unlock(&hugetlb_lock); + ret = alloc_fresh_huge_page(h); + spin_lock(&hugetlb_lock); + if (!ret) + goto out; + + } + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. + * + * By placing pages into the surplus state independent of the + * overcommit value, we are allowing the surplus pool size to + * exceed overcommit. There are few sane options here. Since + * alloc_buddy_huge_page() is checking the global counter, + * though, we'll note that we're not allowed to exceed surplus + * and won't grow the pool anywhere else. Not until one of the + * sysctls are changed, or the surplus pages go out of use. + */ + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + min_count = max(count, min_count); + try_to_free_low(h, min_count); + while (min_count < persistent_huge_pages(h)) { + struct page *page = dequeue_huge_page(h); + if (!page) + break; + update_and_free_page(h, page); + } + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, 1)) + break; + } +out: + ret = persistent_huge_pages(h); + spin_unlock(&hugetlb_lock); + return ret; +} + +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input); + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, + hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[h - hstates], + &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) { + kobject_put(hstate_kobjs[h - hstates]); + } + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + /* Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + if (HPAGE_SHIFT == 0) + return 0; + + if (!size_to_hstate(default_hstate_size)) { + default_hstate_size = HPAGE_SIZE; + if (!size_to_hstate(default_hstate_size)) + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + } + default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + if (default_hstate_max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + unsigned long i; + + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->nr_huge_pages = 0; + h->free_huge_pages = 0; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + h->hugetlb_next_nid = first_node(node_online_map); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + + parsed_hstate = h; +} + +static int __init hugetlb_nrpages_setup(char *s) +{ + unsigned long *mhp; + static unsigned long *last_mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (mhp == last_mhp) { + printk(KERN_WARNING "hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); + return 1; + } + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. + */ + if (max_hstate && parsed_hstate->order >= MAX_ORDER) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + + return 1; +} +__setup("hugepages=", hugetlb_nrpages_setup); + +static int __init hugetlb_default_setup(char *s) +{ + default_hstate_size = memparse(s, &s); + return 1; +} +__setup("default_hugepagesz=", hugetlb_default_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + +#ifdef CONFIG_SYSCTL +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->max_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write) + h->max_huge_pages = set_max_huge_pages(h, tmp); + + return 0; +} + +int hugetlb_treat_movable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if (hugepages_treat_as_movable) + htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; + else + htlb_alloc_mask = GFP_HIGHUSER; + return 0; +} + +int hugetlb_overcommit_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->nr_overcommit_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write) { + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock(&hugetlb_lock); + } + + return 0; +} + +#endif /* CONFIG_SYSCTL */ + +void hugetlb_report_meminfo(struct seq_file *m) +{ + struct hstate *h = &default_hstate; + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); +} + +int hugetlb_report_node_meminfo(int nid, char *buf) +{ + struct hstate *h = &default_hstate; + return sprintf(buf, + "Node %d HugePages_Total: %5u\n" + "Node %d HugePages_Free: %5u\n" + "Node %d HugePages_Surp: %5u\n", + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); +} + +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + struct hstate *h = &default_hstate; + return h->nr_huge_pages * pages_per_huge_page(h); +} + +static int hugetlb_acct_memory(struct hstate *h, long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (delta > 0) { + if (gather_surplus_pages(h, delta) < 0) + goto out; + + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); + goto out; + } + } + + ret = 0; + if (delta < 0) + return_unused_surplus_pages(h, (unsigned long) -delta); + +out: + spin_unlock(&hugetlb_lock); + return ret; +} + +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + /* + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. As that VMA + * has a reference to the reservation map it cannot dissappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. + */ + if (reservations) + kref_get(&reservations->refs); +} + +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + struct resv_map *reservations = vma_resv_map(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; + + if (reservations) { + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); + + reserve = (end - start) - + region_count(&reservations->regions, start, end); + + kref_put(&reservations->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + } + } +} + +/* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the + * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get + * this far. + */ +static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); + return 0; +} + +struct vm_operations_struct hugetlb_vm_ops = { + .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, + .close = hugetlb_vm_op_close, +}; + +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) +{ + pte_t entry; + + if (writable) { + entry = + pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } else { + entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + } + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + + return entry; +} + +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { + update_mmu_cache(vma, address, entry); + } +} + + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr; + int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + src_pte = huge_pte_offset(src, addr); + if (!src_pte) + continue; + dst_pte = huge_pte_alloc(dst, addr, sz); + if (!dst_pte) + goto nomem; + + /* If the pagetables are shared don't copy or take references */ + if (dst_pte == src_pte) + continue; + + spin_lock(&dst->page_table_lock); + spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + if (!huge_pte_none(huge_ptep_get(src_pte))) { + if (cow) + huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_ptep_get(src_pte); + ptepage = pte_page(entry); + get_page(ptepage); + set_huge_pte_at(dst, addr, dst_pte, entry); + } + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); + } + return 0; + +nomem: + return -ENOMEM; +} + +void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *ptep; + pte_t pte; + struct page *page; + struct page *tmp; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + + /* + * A page gathering list, protected by per file i_mmap_lock. The + * lock is used to avoid list corruption from multiple unmapping + * of the same page since we are using page->lru. + */ + LIST_HEAD(page_list); + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); + + mmu_notifier_invalidate_range_start(mm, start, end); + spin_lock(&mm->page_table_lock); + for (address = start; address < end; address += sz) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + + if (huge_pmd_unshare(mm, &address, ptep)) + continue; + + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. + */ + if (ref_page) { + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + continue; + page = pte_page(pte); + if (page != ref_page) + continue; + + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + + pte = huge_ptep_get_and_clear(mm, address, ptep); + if (huge_pte_none(pte)) + continue; + + page = pte_page(pte); + if (pte_dirty(pte)) + set_page_dirty(page); + list_add(&page->lru, &page_list); + } + spin_unlock(&mm->page_table_lock); + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + list_for_each_entry_safe(page, tmp, &page_list, lru) { + list_del(&page->lru); + put_page(page); + } +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end, ref_page); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); +} + +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mappping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) +{ + struct hstate *h = hstate_vma(vma); + struct vm_area_struct *iter_vma; + struct address_space *mapping; + struct prio_tree_iter iter; + pgoff_t pgoff; + + /* + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. + */ + address = address & huge_page_mask(h); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); + mapping = (struct address_space *)page_private(page); + + vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, + address, address + huge_page_size(h), + page); + } + + return 1; +} + +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page) +{ + struct hstate *h = hstate_vma(vma); + struct page *old_page, *new_page; + int avoidcopy; + int outside_reserve = 0; + + old_page = pte_page(pte); + +retry_avoidcopy: + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + avoidcopy = (page_count(old_page) == 1); + if (avoidcopy) { + set_huge_ptep_writable(vma, address, ptep); + return 0; + } + + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. + */ + if (!(vma->vm_flags & VM_SHARED) && + is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + + page_cache_get(old_page); + new_page = alloc_huge_page(vma, address, outside_reserve); + + if (IS_ERR(new_page)) { + page_cache_release(old_page); + + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + BUG_ON(huge_pte_none(pte)); + if (unmap_ref_private(mm, vma, old_page, address)) { + BUG_ON(page_count(old_page) != 1); + BUG_ON(huge_pte_none(pte)); + goto retry_avoidcopy; + } + WARN_ON_ONCE(1); + } + + return -PTR_ERR(new_page); + } + + spin_unlock(&mm->page_table_lock); + copy_huge_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(pte_same(huge_ptep_get(ptep), pte))) { + /* Break COW */ + huge_ptep_clear_flush(vma, address, ptep); + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + /* Make the old page be freed below */ + new_page = old_page; + } + page_cache_release(new_page); + page_cache_release(old_page); + return 0; +} + +/* Return the pagecache page at a given address within a VMA */ +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + return find_lock_page(mapping, idx); +} + +static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, int write_access) +{ + struct hstate *h = hstate_vma(vma); + int ret = VM_FAULT_SIGBUS; + pgoff_t idx; + unsigned long size; + struct page *page; + struct address_space *mapping; + pte_t new_pte; + + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW. Warn that such a situation has occured as it may not be obvious + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + printk(KERN_WARNING + "PID %d killed due to inadequate hugepage pool\n", + current->pid); + return ret; + } + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + /* + * Use page lock to guard against racing truncation + * before we get page_table_lock. + */ +retry: + page = find_lock_page(mapping, idx); + if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + page = alloc_huge_page(vma, address, 0); + if (IS_ERR(page)) { + ret = -PTR_ERR(page); + goto out; + } + clear_huge_page(page, address, huge_page_size(h)); + __SetPageUptodate(page); + + if (vma->vm_flags & VM_SHARED) { + int err; + struct inode *inode = mapping->host; + + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + if (err == -EEXIST) + goto retry; + goto out; + } + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + } else + lock_page(page); + } + + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + + spin_lock(&mm->page_table_lock); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + + ret = 0; + if (!huge_pte_none(huge_ptep_get(ptep))) + goto backout; + + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if (write_access && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); + } + + spin_unlock(&mm->page_table_lock); + unlock_page(page); +out: + return ret; + +backout: + spin_unlock(&mm->page_table_lock); +backout_unlocked: + unlock_page(page); + put_page(page); + goto out; +} + +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pte_t *ptep; + pte_t entry; + int ret; + struct page *pagecache_page = NULL; + static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); + + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; + + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mutex_lock(&hugetlb_instantiation_mutex); + entry = huge_ptep_get(ptep); + if (huge_pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + goto out_mutex; + } + + ret = 0; + + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_mutex; + } + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_page_table_lock; + + + if (write_access) { + if (!pte_write(entry)) { + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); + goto out_page_table_lock; + } + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + update_mmu_cache(vma, address, entry); + +out_page_table_lock: + spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + +out_mutex: + mutex_unlock(&hugetlb_instantiation_mutex); + + return ret; +} + +/* Can be overriden by architectures */ +__attribute__((weak)) struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + +static int huge_zeropage_ok(pte_t *ptep, int write, int shared) +{ + if (!ptep || write || shared) + return 0; + else + return huge_pte_none(huge_ptep_get(ptep)); +} + +int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, int *length, int i, + int write) +{ + unsigned long pfn_offset; + unsigned long vaddr = *position; + int remainder = *length; + struct hstate *h = hstate_vma(vma); + int zeropage_ok = 0; + int shared = vma->vm_flags & VM_SHARED; + + spin_lock(&mm->page_table_lock); + while (vaddr < vma->vm_end && remainder) { + pte_t *pte; + struct page *page; + + /* + * Some archs (sparc64, sh*) have multiple pte_ts to + * each hugepage. We have to make * sure we get the + * first, for the page indexing below to work. + */ + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (huge_zeropage_ok(pte, write, shared)) + zeropage_ok = 1; + + if (!pte || + (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || + (write && !pte_write(huge_ptep_get(pte)))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = hugetlb_fault(mm, vma, vaddr, write); + spin_lock(&mm->page_table_lock); + if (!(ret & VM_FAULT_ERROR)) + continue; + + remainder = 0; + if (!i) + i = -EFAULT; + break; + } + + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; + page = pte_page(huge_ptep_get(pte)); +same_page: + if (pages) { + if (zeropage_ok) + pages[i] = ZERO_PAGE(0); + else + pages[i] = mem_map_offset(page, pfn_offset); + get_page(pages[i]); + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + ++pfn_offset; + --remainder; + ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < pages_per_huge_page(h)) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. + */ + goto same_page; + } + } + spin_unlock(&mm->page_table_lock); + *length = remainder; + *position = vaddr; + + return i; +} + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + struct hstate *h = hstate_vma(vma); + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + spin_lock(&mm->page_table_lock); + for (; address < end; address += huge_page_size(h)) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (huge_pmd_unshare(mm, &address, ptep)) + continue; + if (!huge_pte_none(huge_ptep_get(ptep))) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + } + } + spin_unlock(&mm->page_table_lock); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + + flush_tlb_range(vma, start, end); +} + +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma) +{ + long ret, chg; + struct hstate *h = hstate_inode(inode); + + if (vma && vma->vm_flags & VM_NORESERVE) + return 0; + + /* + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. Assume !vma is a shm mapping + */ + if (!vma || vma->vm_flags & VM_SHARED) + chg = region_chg(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + + chg = to - from; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + + if (chg < 0) + return chg; + + if (hugetlb_get_quota(inode->i_mapping, chg)) + return -ENOSPC; + ret = hugetlb_acct_memory(h, chg); + if (ret < 0) { + hugetlb_put_quota(inode->i_mapping, chg); + return ret; + } + if (!vma || vma->vm_flags & VM_SHARED) + region_add(&inode->i_mapping->private_list, from, to); + return 0; +} + +void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +{ + struct hstate *h = hstate_inode(inode); + long chg = region_truncate(&inode->i_mapping->private_list, offset); + + spin_lock(&inode->i_lock); + inode->i_blocks -= blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + + hugetlb_put_quota(inode->i_mapping, (chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); +} diff --git a/mm/internal.h b/mm/internal.h new file mode 100644 index 0000000..13333bc --- /dev/null +++ b/mm/internal.h @@ -0,0 +1,283 @@ +/* internal.h: mm/ internal definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H + +#include <linux/mm.h> + +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + +extern void prep_compound_page(struct page *page, unsigned long order); +extern void prep_compound_gigantic_page(struct page *page, unsigned long order); + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + VM_BUG_ON(atomic_read(&page->_count)); + set_page_count(page, 1); +} + +static inline void __put_page(struct page *page) +{ + atomic_dec(&page->_count); +} + +/* + * in mm/vmscan.c: + */ +extern int isolate_lru_page(struct page *page); +extern void putback_lru_page(struct page *page); + +/* + * in mm/page_alloc.c + */ +extern void __free_pages_bootmem(struct page *page, unsigned int order); + +/* + * function for dealing with page's order in buddy system. + * zone->lock is already acquired when we use these. + * So, we don't need atomic page->flags operations here. + */ +static inline unsigned long page_order(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + return page_private(page); +} + +extern long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +extern void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static inline void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * unevictable_migrate_page() called only from migrate_page_copy() to + * migrate unevictable flag to new page. + * Note that the old page has been isolated from the LRU lists at this + * point so we don't need to worry about LRU statistics. + */ +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ + if (TestClearPageUnevictable(old)) + SetPageUnevictable(new); +} +#else +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ +} +#endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Called only in fault path via page_evictable() for a new page + * to determine if it's being mapped into a LOCKED vma. + * If so, mark page as mlocked. + */ +static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +{ + VM_BUG_ON(PageLRU(page)); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) + return 0; + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + return 1; +} + +/* + * must be called with vma's mmap_sem held for read, and page locked. + */ +extern void mlock_vma_page(struct page *page); + +/* + * Clear the page's PageMlocked(). This can be useful in a situation where + * we want to unconditionally remove a page from the pagecache -- e.g., + * on truncation or freeing. + * + * It is legal to call this function for any page, mlocked or not. + * If called for a page that is still mapped by mlocked vmas, all we do + * is revert to lazy LRU behaviour -- semantics are not broken. + */ +extern void __clear_page_mlock(struct page *page); +static inline void clear_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) + __clear_page_mlock(page); +} + +/* + * mlock_migrate_page - called only from migrate_page_copy() to + * migrate the Mlocked page flag; update statistics. + */ +static inline void mlock_migrate_page(struct page *newpage, struct page *page) +{ + if (TestClearPageMlocked(page)) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + SetPageMlocked(newpage); + __inc_zone_page_state(newpage, NR_MLOCK); + local_irq_restore(flags); + } +} + +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); + local_irq_restore(flags); + } +} + +#else /* CONFIG_UNEVICTABLE_LRU */ +static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +{ + return 0; +} +static inline void clear_page_mlock(struct page *page) { } +static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_migrate_page(struct page *new, struct page *old) { } +static inline void free_page_mlock(struct page *page) { } + +#endif /* CONFIG_UNEVICTABLE_LRU */ + +/* + * Return the mem_map entry representing the 'offset' subpage within + * the maximally aligned gigantic page 'base'. Handle any discontiguity + * in the mem_map at MAX_ORDER_NR_PAGES boundaries. + */ +static inline struct page *mem_map_offset(struct page *base, int offset) +{ + if (unlikely(offset >= MAX_ORDER_NR_PAGES)) + return pfn_to_page(page_to_pfn(base) + offset); + return base + offset; +} + +/* + * Iterator over all subpages withing the maximally aligned gigantic + * page 'base'. Handle any discontiguity in the mem_map. + */ +static inline struct page *mem_map_next(struct page *iter, + struct page *base, int offset) +{ + if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { + unsigned long pfn = page_to_pfn(base) + offset; + if (!pfn_valid(pfn)) + return NULL; + return pfn_to_page(pfn); + } + return iter + 1; +} + +/* + * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, + * so all functions starting at paging_init should be marked __init + * in those cases. SPARSEMEM, however, allows for memory hotplug, + * and alloc_bootmem_node is not used. + */ +#ifdef CONFIG_SPARSEMEM +#define __paginginit __meminit +#else +#define __paginginit __init +#endif + +/* Memory initialisation debug and verification */ +enum mminit_level { + MMINIT_WARNING, + MMINIT_VERIFY, + MMINIT_TRACE +}; + +#ifdef CONFIG_DEBUG_MEMORY_INIT + +extern int mminit_loglevel; + +#define mminit_dprintk(level, prefix, fmt, arg...) \ +do { \ + if (level < mminit_loglevel) { \ + printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ + printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ + } \ +} while (0) + +extern void mminit_verify_pageflags_layout(void); +extern void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn); +extern void mminit_verify_zonelist(void); + +#else + +static inline void mminit_dprintk(enum mminit_level level, + const char *prefix, const char *fmt, ...) +{ +} + +static inline void mminit_verify_pageflags_layout(void) +{ +} + +static inline void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn) +{ +} + +static inline void mminit_verify_zonelist(void) +{ +} +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ +#if defined(CONFIG_SPARSEMEM) +extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn); +#else +static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ +} +#endif /* CONFIG_SPARSEMEM */ + +#define GUP_FLAGS_WRITE 0x1 +#define GUP_FLAGS_FORCE 0x2 +#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas); + +#endif diff --git a/mm/maccess.c b/mm/maccess.c new file mode 100644 index 0000000..ac40796 --- /dev/null +++ b/mm/maccess.c @@ -0,0 +1,55 @@ +/* + * Access kernel memory without faulting. + */ +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/mm.h> + +/** + * probe_kernel_read(): safely attempt to read from a location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long probe_kernel_read(void *dst, void *src, size_t size) +{ + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, + (__force const void __user *)src, size); + pagefault_enable(); + set_fs(old_fs); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(probe_kernel_read); + +/** + * probe_kernel_write(): safely attempt to write to a location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long probe_kernel_write(void *dst, void *src, size_t size) +{ + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); + pagefault_enable(); + set_fs(old_fs); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(probe_kernel_write); diff --git a/mm/madvise.c b/mm/madvise.c new file mode 100644 index 0000000..b9ce574 --- /dev/null +++ b/mm/madvise.c @@ -0,0 +1,365 @@ +/* + * linux/mm/madvise.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 2002 Christoph Hellwig + */ + +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/mempolicy.h> +#include <linux/hugetlb.h> +#include <linux/sched.h> + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. + */ +static long madvise_behavior(struct vm_area_struct * vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) +{ + struct mm_struct * mm = vma->vm_mm; + int error = 0; + pgoff_t pgoff; + int new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + new_flags &= ~VM_DONTCOPY; + break; + } + + if (new_flags == vma->vm_flags) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + +success: + /* + * vm_flags is protected by the mmap_sem held in write mode. + */ + vma->vm_flags = new_flags; + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +/* + * Schedule all required I/O operations. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, + unsigned long start, unsigned long end) +{ + struct file *file = vma->vm_file; + + if (!file) + return -EBADF; + + if (file->f_mapping->a_ops->get_xip_mem) { + /* no bad return value, but ignore advice */ + return 0; + } + + *prev = vma; + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + force_page_cache_readahead(file->f_mapping, + file, start, max_sane_readahead(end - start)); + return 0; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for shrink_active_list to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * shrink_active_list to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). + */ +static long madvise_dontneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + struct zap_details details = { + .nonlinear_vma = vma, + .last_index = ULONG_MAX, + }; + zap_page_range(vma, start, end - start, &details); + } else + zap_page_range(vma, start, end - start, NULL); + return 0; +} + +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + int error; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + up_read(¤t->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); + down_read(¤t->mm->mmap_sem); + return error; +} + +static long +madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) +{ + long error; + + switch (behavior) { + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + break; + } + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + error = madvise_behavior(vma, prev, start, end, behavior); + break; + case MADV_REMOVE: + error = madvise_remove(vma, prev, start, end); + break; + + case MADV_WILLNEED: + error = madvise_willneed(vma, prev, start, end); + break; + + case MADV_DONTNEED: + error = madvise_dontneed(vma, prev, start, end); + break; + + default: + error = -EINVAL; + break; + } + return error; +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +{ + unsigned long end, tmp; + struct vm_area_struct * vma, *prev; + int unmapped_error = 0; + int error = -EINVAL; + int write; + size_t len; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(¤t->mm->mmap_sem); + else + down_read(¤t->mm->mmap_sem); + + if (start & ~PAGE_MASK) + goto out; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + goto out; + + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + goto out; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = madvise_vma(vma, &prev, start, tmp, behavior); + if (error) + goto out; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + goto out; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); + } +out: + if (write) + up_write(¤t->mm->mmap_sem); + else + up_read(¤t->mm->mmap_sem); + + return error; +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c new file mode 100644 index 0000000..866dcc7 --- /dev/null +++ b/mm/memcontrol.c @@ -0,0 +1,1174 @@ +/* memcontrol.c - Memory Controller + * + * Copyright IBM Corporation, 2007 + * Author Balbir Singh <balbir@linux.vnet.ibm.com> + * + * Copyright 2007 OpenVZ SWsoft Inc + * Author: Pavel Emelianov <xemul@openvz.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/res_counter.h> +#include <linux/memcontrol.h> +#include <linux/cgroup.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/page-flags.h> +#include <linux/backing-dev.h> +#include <linux/bit_spinlock.h> +#include <linux/rcupdate.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/vmalloc.h> +#include <linux/mm_inline.h> +#include <linux/page_cgroup.h> + +#include <asm/uaccess.h> + +struct cgroup_subsys mem_cgroup_subsys __read_mostly; +#define MEM_CGROUP_RECLAIM_RETRIES 5 + +/* + * Statistics for memory cgroup. + */ +enum mem_cgroup_stat_index { + /* + * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. + */ + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ + MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ + + MEM_CGROUP_STAT_NSTATS, +}; + +struct mem_cgroup_stat_cpu { + s64 count[MEM_CGROUP_STAT_NSTATS]; +} ____cacheline_aligned_in_smp; + +struct mem_cgroup_stat { + struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; +}; + +/* + * For accounting under irq disable, no need for increment preempt count. + */ +static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx, int val) +{ + stat->count[idx] += val; +} + +static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, + enum mem_cgroup_stat_index idx) +{ + int cpu; + s64 ret = 0; + for_each_possible_cpu(cpu) + ret += stat->cpustat[cpu].count[idx]; + return ret; +} + +/* + * per-zone information in memory controller. + */ +struct mem_cgroup_per_zone { + /* + * spin_lock to protect the per cgroup LRU + */ + spinlock_t lru_lock; + struct list_head lists[NR_LRU_LISTS]; + unsigned long count[NR_LRU_LISTS]; +}; +/* Macro for accessing counter */ +#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) + +struct mem_cgroup_per_node { + struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; +}; + +struct mem_cgroup_lru_info { + struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; +}; + +/* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per cgroup. We would eventually like to provide + * statistics based on the statistics developed by Rik Van Riel for clock-pro, + * to help the administrator determine what knobs to tune. + * + * TODO: Add a water mark for the memory controller. Reclaim will begin when + * we hit the water mark. May be even add a low water mark, such that + * no reclaim occurs from a cgroup at it's low water mark, this is + * a feature that will be implemented much later in the future. + */ +struct mem_cgroup { + struct cgroup_subsys_state css; + /* + * the counter to account for memory usage + */ + struct res_counter res; + /* + * Per cgroup active and inactive list, similar to the + * per zone LRU lists. + */ + struct mem_cgroup_lru_info info; + + int prev_priority; /* for recording reclaim priority */ + /* + * statistics. + */ + struct mem_cgroup_stat stat; +}; +static struct mem_cgroup init_mem_cgroup; + +enum charge_type { + MEM_CGROUP_CHARGE_TYPE_CACHE = 0, + MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ + MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + NR_CHARGE_TYPE, +}; + +/* only for here (for easy reading.) */ +#define PCGF_CACHE (1UL << PCG_CACHE) +#define PCGF_USED (1UL << PCG_USED) +#define PCGF_ACTIVE (1UL << PCG_ACTIVE) +#define PCGF_LOCK (1UL << PCG_LOCK) +#define PCGF_FILE (1UL << PCG_FILE) +static const unsigned long +pcg_default_flags[NR_CHARGE_TYPE] = { + PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ + PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ + PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ + 0, /* FORCE */ +}; + +/* + * Always modified under lru lock. Then, not necessary to preempt_disable() + */ +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, + struct page_cgroup *pc, + bool charge) +{ + int val = (charge)? 1 : -1; + struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; + + VM_BUG_ON(!irqs_disabled()); + + cpustat = &stat->cpustat[smp_processor_id()]; + if (PageCgroupCache(pc)) + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); + else + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); + + if (charge) + __mem_cgroup_stat_add_safe(cpustat, + MEM_CGROUP_STAT_PGPGIN_COUNT, 1); + else + __mem_cgroup_stat_add_safe(cpustat, + MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); +} + +static struct mem_cgroup_per_zone * +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +{ + return &mem->info.nodeinfo[nid]->zoneinfo[zid]; +} + +static struct mem_cgroup_per_zone * +page_cgroup_zoneinfo(struct page_cgroup *pc) +{ + struct mem_cgroup *mem = pc->mem_cgroup; + int nid = page_cgroup_nid(pc); + int zid = page_cgroup_zid(pc); + + return mem_cgroup_zoneinfo(mem, nid, zid); +} + +static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, + enum lru_list idx) +{ + int nid, zid; + struct mem_cgroup_per_zone *mz; + u64 total = 0; + + for_each_online_node(nid) + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + total += MEM_CGROUP_ZSTAT(mz, idx); + } + return total; +} + +static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, + mem_cgroup_subsys_id), struct mem_cgroup, + css); +} + +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) +{ + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. + */ + if (unlikely(!p)) + return NULL; + + return container_of(task_subsys_state(p, mem_cgroup_subsys_id), + struct mem_cgroup, css); +} + +static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, + struct page_cgroup *pc) +{ + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; + } + + MEM_CGROUP_ZSTAT(mz, lru) -= 1; + + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); + list_del(&pc->lru); +} + +static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, + struct page_cgroup *pc) +{ + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; + } + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_add(&pc->lru, &mz->lists[lru]); + + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); +} + +static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); + int active = PageCgroupActive(pc); + int file = PageCgroupFile(pc); + int unevictable = PageCgroupUnevictable(pc); + enum lru_list from = unevictable ? LRU_UNEVICTABLE : + (LRU_FILE * !!file + !!active); + + if (lru == from) + return; + + MEM_CGROUP_ZSTAT(mz, from) -= 1; + /* + * However this is done under mz->lru_lock, another flags, which + * are not related to LRU, will be modified from out-of-lock. + * We have to use atomic set/clear flags. + */ + if (is_unevictable_lru(lru)) { + ClearPageCgroupActive(pc); + SetPageCgroupUnevictable(pc); + } else { + if (is_active_lru(lru)) + SetPageCgroupActive(pc); + else + ClearPageCgroupActive(pc); + ClearPageCgroupUnevictable(pc); + } + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_move(&pc->lru, &mz->lists[lru]); +} + +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) +{ + int ret; + + task_lock(task); + ret = task->mm && mm_match_cgroup(task->mm, mem); + task_unlock(task); + return ret; +} + +/* + * This routine assumes that the appropriate zone's lru lock is already held + */ +void mem_cgroup_move_lists(struct page *page, enum lru_list lru) +{ + struct page_cgroup *pc; + struct mem_cgroup_per_zone *mz; + unsigned long flags; + + if (mem_cgroup_subsys.disabled) + return; + + /* + * We cannot lock_page_cgroup while holding zone's lru_lock, + * because other holders of lock_page_cgroup can be interrupted + * with an attempt to rotate_reclaimable_page. But we cannot + * safely get to page_cgroup without it, so just try_lock it: + * mem_cgroup_isolate_pages allows for page left on wrong list. + */ + pc = lookup_page_cgroup(page); + if (!trylock_page_cgroup(pc)) + return; + if (pc && PageCgroupUsed(pc)) { + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_move_lists(pc, lru); + spin_unlock_irqrestore(&mz->lru_lock, flags); + } + unlock_page_cgroup(pc); +} + +/* + * Calculate mapped_ratio under memory controller. This will be used in + * vmscan.c for deteremining we have to reclaim mapped pages. + */ +int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) +{ + long total, rss; + + /* + * usage is recorded in bytes. But, here, we assume the number of + * physical pages can be represented by "long" on any arch. + */ + total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; + rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + return (int)((rss * 100L) / total); +} + +/* + * prev_priority control...this will be used in memory reclaim path. + */ +int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) +{ + return mem->prev_priority; +} + +void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) +{ + if (priority < mem->prev_priority) + mem->prev_priority = priority; +} + +void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) +{ + mem->prev_priority = priority; +} + +/* + * Calculate # of pages to be scanned in this priority/zone. + * See also vmscan.c + * + * priority starts from "DEF_PRIORITY" and decremented in each loop. + * (see include/linux/mmzone.h) + */ + +long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru) +{ + long nr_pages; + int nid = zone->zone_pgdat->node_id; + int zid = zone_idx(zone); + struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); + + nr_pages = MEM_CGROUP_ZSTAT(mz, lru); + + return (nr_pages >> priority); +} + +unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active, int file) +{ + unsigned long nr_taken = 0; + struct page *page; + unsigned long scan; + LIST_HEAD(pc_list); + struct list_head *src; + struct page_cgroup *pc, *tmp; + int nid = z->zone_pgdat->node_id; + int zid = zone_idx(z); + struct mem_cgroup_per_zone *mz; + int lru = LRU_FILE * !!file + !!active; + + BUG_ON(!mem_cont); + mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); + src = &mz->lists[lru]; + + spin_lock(&mz->lru_lock); + scan = 0; + list_for_each_entry_safe_reverse(pc, tmp, src, lru) { + if (scan >= nr_to_scan) + break; + if (unlikely(!PageCgroupUsed(pc))) + continue; + page = pc->page; + + if (unlikely(!PageLRU(page))) + continue; + + /* + * TODO: play better with lumpy reclaim, grabbing anything. + */ + if (PageUnevictable(page) || + (PageActive(page) && !active) || + (!PageActive(page) && active)) { + __mem_cgroup_move_lists(pc, page_lru(page)); + continue; + } + + scan++; + list_move(&pc->lru, &pc_list); + + if (__isolate_lru_page(page, mode, file) == 0) { + list_move(&page->lru, dst); + nr_taken++; + } + } + + list_splice(&pc_list, src); + spin_unlock(&mz->lru_lock); + + *scanned = scan; + return nr_taken; +} + +/* + * Charge the memory controller for page usage. + * Return + * 0 if the charge was successful + * < 0 if the cgroup is over its limit + */ +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, enum charge_type ctype, + struct mem_cgroup *memcg) +{ + struct mem_cgroup *mem; + struct page_cgroup *pc; + unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct mem_cgroup_per_zone *mz; + unsigned long flags; + + pc = lookup_page_cgroup(page); + /* can happen at boot */ + if (unlikely(!pc)) + return 0; + prefetchw(pc); + /* + * We always charge the cgroup the mm_struct belongs to. + * The mm_struct's mem_cgroup changes on task migration if the + * thread group leader migrates. It's possible that mm is not + * set, if so charge the init_mm (happens for pagecache usage). + */ + + if (likely(!memcg)) { + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } + /* + * For every charge from the cgroup, increment reference count + */ + css_get(&mem->css); + rcu_read_unlock(); + } else { + mem = memcg; + css_get(&memcg->css); + } + + while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { + if (!(gfp_mask & __GFP_WAIT)) + goto out; + + if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) + continue; + + /* + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up + */ + if (res_counter_check_under_limit(&mem->res)) + continue; + + if (!nr_retries--) { + mem_cgroup_out_of_memory(mem, gfp_mask); + goto out; + } + } + + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); + + goto done; + } + pc->mem_cgroup = mem; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ + pc->flags = pcg_default_flags[ctype]; + + mz = page_cgroup_zoneinfo(pc); + + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_add_list(mz, pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); + +done: + return 0; +out: + css_put(&mem->css); + return -ENOMEM; +} + +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +{ + if (mem_cgroup_subsys.disabled) + return 0; + if (PageCompound(page)) + return 0; + /* + * If already mapped, we don't have to account. + * If page cache, page->mapping has address_space. + * But page->mapping may have out-of-use anon_vma pointer, + * detecit it by PageAnon() check. newly-mapped-anon's page->mapping + * is NULL. + */ + if (page_mapped(page) || (page->mapping && !PageAnon(page))) + return 0; + if (unlikely(!mm)) + mm = &init_mm; + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); +} + +int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) +{ + if (mem_cgroup_subsys.disabled) + return 0; + if (PageCompound(page)) + return 0; + /* + * Corner case handling. This is called from add_to_page_cache() + * in usual. But some FS (shmem) precharges this page before calling it + * and call add_to_page_cache() with GFP_NOWAIT. + * + * For GFP_NOWAIT case, the page may be pre-charged before calling + * add_to_page_cache(). (See shmem.c) check it here and avoid to call + * charge twice. (It works but has to pay a bit larger cost.) + */ + if (!(gfp_mask & __GFP_WAIT)) { + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + if (!pc) + return 0; + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + unlock_page_cgroup(pc); + return 0; + } + unlock_page_cgroup(pc); + } + + if (unlikely(!mm)) + mm = &init_mm; + + if (page_is_file_cache(page)) + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + else + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); +} + +/* + * uncharge if !page_mapped(page) + */ +static void +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem; + struct mem_cgroup_per_zone *mz; + unsigned long flags; + + if (mem_cgroup_subsys.disabled) + return; + + /* + * Check if our page_cgroup is valid + */ + pc = lookup_page_cgroup(page); + if (unlikely(!pc || !PageCgroupUsed(pc))) + return; + + lock_page_cgroup(pc); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) + || !PageCgroupUsed(pc)) { + /* This happens at race in zap_pte_range() and do_swap_page()*/ + unlock_page_cgroup(pc); + return; + } + ClearPageCgroupUsed(pc); + mem = pc->mem_cgroup; + + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_remove_list(mz, pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); + + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); + + return; +} + +void mem_cgroup_uncharge_page(struct page *page) +{ + /* early check. */ + if (page_mapped(page)) + return; + if (page->mapping && !PageAnon(page)) + return; + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + +void mem_cgroup_uncharge_cache_page(struct page *page) +{ + VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); +} + +/* + * Before starting migration, account against new page. + */ +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem = NULL; + enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + int ret = 0; + + if (mem_cgroup_subsys.disabled) + return 0; + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + mem = pc->mem_cgroup; + css_get(&mem->css); + if (PageCgroupCache(pc)) { + if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + } + } + unlock_page_cgroup(pc); + if (mem) { + ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, + ctype, mem); + css_put(&mem->css); + } + return ret; +} + +/* remove redundant charge if migration failed*/ +void mem_cgroup_end_migration(struct page *newpage) +{ + /* + * At success, page->mapping is not NULL. + * special rollback care is necessary when + * 1. at migration failure. (newpage->mapping is cleared in this case) + * 2. the newpage was moved but not remapped again because the task + * exits and the newpage is obsolete. In this case, the new page + * may be a swapcache. So, we just call mem_cgroup_uncharge_page() + * always for avoiding mess. The page_cgroup will be removed if + * unnecessary. File cache pages is still on radix-tree. Don't + * care it. + */ + if (!newpage->mapping) + __mem_cgroup_uncharge_common(newpage, + MEM_CGROUP_CHARGE_TYPE_FORCE); + else if (PageAnon(newpage)) + mem_cgroup_uncharge_page(newpage); +} + +/* + * A call to try to shrink memory usage under specified resource controller. + * This is typically used for page reclaiming for shmem for reducing side + * effect of page allocation from shmem, which is used by some mem_cgroup. + */ +int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) +{ + struct mem_cgroup *mem; + int progress = 0; + int retry = MEM_CGROUP_RECLAIM_RETRIES; + + if (mem_cgroup_subsys.disabled) + return 0; + if (!mm) + return 0; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } + css_get(&mem->css); + rcu_read_unlock(); + + do { + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress += res_counter_check_under_limit(&mem->res); + } while (!progress && --retry); + + css_put(&mem->css); + if (!retry) + return -ENOMEM; + return 0; +} + +int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +{ + + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int progress; + int ret = 0; + + while (res_counter_set_limit(&memcg->res, val)) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!retry_count) { + ret = -EBUSY; + break; + } + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); + if (!progress) + retry_count--; + } + return ret; +} + + +/* + * This routine traverse page_cgroup in given list and drop them all. + * *And* this routine doesn't reclaim page itself, just removes page_cgroup. + */ +#define FORCE_UNCHARGE_BATCH (128) +static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + enum lru_list lru) +{ + struct page_cgroup *pc; + struct page *page; + int count = FORCE_UNCHARGE_BATCH; + unsigned long flags; + struct list_head *list; + + list = &mz->lists[lru]; + + spin_lock_irqsave(&mz->lru_lock, flags); + while (!list_empty(list)) { + pc = list_entry(list->prev, struct page_cgroup, lru); + page = pc->page; + if (!PageCgroupUsed(pc)) + break; + get_page(page); + spin_unlock_irqrestore(&mz->lru_lock, flags); + /* + * Check if this page is on LRU. !LRU page can be found + * if it's under page migration. + */ + if (PageLRU(page)) { + __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_FORCE); + put_page(page); + if (--count <= 0) { + count = FORCE_UNCHARGE_BATCH; + cond_resched(); + } + } else { + spin_lock_irqsave(&mz->lru_lock, flags); + break; + } + spin_lock_irqsave(&mz->lru_lock, flags); + } + spin_unlock_irqrestore(&mz->lru_lock, flags); +} + +/* + * make mem_cgroup's charge to be 0 if there is no task. + * This enables deleting this mem_cgroup. + */ +static int mem_cgroup_force_empty(struct mem_cgroup *mem) +{ + int ret = -EBUSY; + int node, zid; + + css_get(&mem->css); + /* + * page reclaim code (kswapd etc..) will move pages between + * active_list <-> inactive_list while we don't take a lock. + * So, we have to do loop here until all lists are empty. + */ + while (mem->res.usage > 0) { + if (atomic_read(&mem->css.cgroup->count) > 0) + goto out; + /* This is for making all *used* pages to be on LRU. */ + lru_add_drain_all(); + for_each_node_state(node, N_POSSIBLE) + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list l; + mz = mem_cgroup_zoneinfo(mem, node, zid); + for_each_lru(l) + mem_cgroup_force_empty_list(mem, mz, l); + } + cond_resched(); + } + ret = 0; +out: + css_put(&mem->css); + return ret; +} + +static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) +{ + return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, + cft->private); +} +/* + * The user of this function is... + * RES_LIMIT. + */ +static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret; + + switch (cft->private) { + case RES_LIMIT: + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (!ret) + ret = mem_cgroup_resize_limit(memcg, val); + break; + default: + ret = -EINVAL; /* should be BUG() ? */ + break; + } + return ret; +} + +static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) +{ + struct mem_cgroup *mem; + + mem = mem_cgroup_from_cont(cont); + switch (event) { + case RES_MAX_USAGE: + res_counter_reset_max(&mem->res); + break; + case RES_FAILCNT: + res_counter_reset_failcnt(&mem->res); + break; + } + return 0; +} + +static int mem_force_empty_write(struct cgroup *cont, unsigned int event) +{ + return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); +} + +static const struct mem_cgroup_stat_desc { + const char *msg; + u64 unit; +} mem_cgroup_stat_desc[] = { + [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, + [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, + [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, + [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, +}; + +static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + struct mem_cgroup_stat *stat = &mem_cont->stat; + int i; + + for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { + s64 val; + + val = mem_cgroup_read_stat(stat, i); + val *= mem_cgroup_stat_desc[i].unit; + cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); + } + /* showing # of active pages */ + { + unsigned long active_anon, inactive_anon; + unsigned long active_file, inactive_file; + unsigned long unevictable; + + inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_ANON); + active_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_ANON); + inactive_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_FILE); + active_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_FILE); + unevictable = mem_cgroup_get_all_zonestat(mem_cont, + LRU_UNEVICTABLE); + + cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); + cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); + cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); + cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); + cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + + } + return 0; +} + +static struct cftype mem_cgroup_files[] = { + { + .name = "usage_in_bytes", + .private = RES_USAGE, + .read_u64 = mem_cgroup_read, + }, + { + .name = "max_usage_in_bytes", + .private = RES_MAX_USAGE, + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, + }, + { + .name = "limit_in_bytes", + .private = RES_LIMIT, + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { + .name = "failcnt", + .private = RES_FAILCNT, + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, + }, + { + .name = "force_empty", + .trigger = mem_force_empty_write, + }, + { + .name = "stat", + .read_map = mem_control_stat_show, + }, +}; + +static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +{ + struct mem_cgroup_per_node *pn; + struct mem_cgroup_per_zone *mz; + enum lru_list l; + int zone, tmp = node; + /* + * This routine is called against possible nodes. + * But it's BUG to call kmalloc() against offline node. + * + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + if (!pn) + return 1; + + mem->info.nodeinfo[node] = pn; + memset(pn, 0, sizeof(*pn)); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = &pn->zoneinfo[zone]; + spin_lock_init(&mz->lru_lock); + for_each_lru(l) + INIT_LIST_HEAD(&mz->lists[l]); + } + return 0; +} + +static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +{ + kfree(mem->info.nodeinfo[node]); +} + +static struct mem_cgroup *mem_cgroup_alloc(void) +{ + struct mem_cgroup *mem; + + if (sizeof(*mem) < PAGE_SIZE) + mem = kmalloc(sizeof(*mem), GFP_KERNEL); + else + mem = vmalloc(sizeof(*mem)); + + if (mem) + memset(mem, 0, sizeof(*mem)); + return mem; +} + +static void mem_cgroup_free(struct mem_cgroup *mem) +{ + if (sizeof(*mem) < PAGE_SIZE) + kfree(mem); + else + vfree(mem); +} + + +static struct cgroup_subsys_state * +mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct mem_cgroup *mem; + int node; + + if (unlikely((cont->parent) == NULL)) { + mem = &init_mem_cgroup; + } else { + mem = mem_cgroup_alloc(); + if (!mem) + return ERR_PTR(-ENOMEM); + } + + res_counter_init(&mem->res); + + for_each_node_state(node, N_POSSIBLE) + if (alloc_mem_cgroup_per_zone_info(mem, node)) + goto free_out; + + return &mem->css; +free_out: + for_each_node_state(node, N_POSSIBLE) + free_mem_cgroup_per_zone_info(mem, node); + if (cont->parent != NULL) + mem_cgroup_free(mem); + return ERR_PTR(-ENOMEM); +} + +static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + mem_cgroup_force_empty(mem); +} + +static void mem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + int node; + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + + for_each_node_state(node, N_POSSIBLE) + free_mem_cgroup_per_zone_info(mem, node); + + mem_cgroup_free(mem_cgroup_from_cont(cont)); +} + +static int mem_cgroup_populate(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, mem_cgroup_files, + ARRAY_SIZE(mem_cgroup_files)); +} + +static void mem_cgroup_move_task(struct cgroup_subsys *ss, + struct cgroup *cont, + struct cgroup *old_cont, + struct task_struct *p) +{ + struct mm_struct *mm; + struct mem_cgroup *mem, *old_mem; + + mm = get_task_mm(p); + if (mm == NULL) + return; + + mem = mem_cgroup_from_cont(cont); + old_mem = mem_cgroup_from_cont(old_cont); + + /* + * Only thread group leaders are allowed to migrate, the mm_struct is + * in effect owned by the leader + */ + if (!thread_group_leader(p)) + goto out; + +out: + mmput(mm); +} + +struct cgroup_subsys mem_cgroup_subsys = { + .name = "memory", + .subsys_id = mem_cgroup_subsys_id, + .create = mem_cgroup_create, + .pre_destroy = mem_cgroup_pre_destroy, + .destroy = mem_cgroup_destroy, + .populate = mem_cgroup_populate, + .attach = mem_cgroup_move_task, + .early_init = 0, +}; diff --git a/mm/memory.c b/mm/memory.c new file mode 100644 index 0000000..fe2257f --- /dev/null +++ b/mm/memory.c @@ -0,0 +1,3051 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) + */ + +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/module.h> +#include <linux/delayacct.h> +#include <linux/init.h> +#include <linux/writeback.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> + +#include <linux/swapops.h> +#include <linux/elf.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +/* use the per-pgdat data instead for discontigmem - mbligh */ +unsigned long max_mapnr; +struct page *mem_map; + +EXPORT_SYMBOL(max_mapnr); +EXPORT_SYMBOL(mem_map); +#endif + +unsigned long num_physpages; +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ +void * high_memory; + +EXPORT_SYMBOL(num_physpages); +EXPORT_SYMBOL(high_memory); + +/* + * Randomize the address space (stacks, mmaps, brk, etc.). + * + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +int randomize_va_space __read_mostly = +#ifdef CONFIG_COMPAT_BRK + 1; +#else + 2; +#endif + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 1; +} +__setup("norandmaps", disable_randmaps); + + +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +{ + pgtable_t token = pmd_pgtable(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, token); + tlb->mm->nr_ptes--; +} + +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); +} + +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); +} + +/* + * This function frees user-level page tables of a process. + * + * Must be called with pagetable lock held. + */ +void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. + */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset(tlb->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); +} + +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + /* + * Hide vma from rmap and vmtruncate before freeing pgtables + */ + anon_vma_unlink(vma); + unlink_file_vma(vma); + + if (is_vm_hugetlb_page(vma)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_vm_hugetlb_page(next)) { + vma = next; + next = vma->vm_next; + anon_vma_unlink(vma); + unlink_file_vma(vma); + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } + vma = next; + } +} + +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pgtable_t new = pte_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_read_barrier_depends() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + + spin_lock(&mm->page_table_lock); + if (!pmd_present(*pmd)) { /* Has another populated it ? */ + mm->nr_ptes++; + pmd_populate(mm, pmd, new); + new = NULL; + } + spin_unlock(&mm->page_table_lock); + if (new) + pte_free(mm, new); + return 0; +} + +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +{ + pte_t *new = pte_alloc_one_kernel(&init_mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&init_mm.page_table_lock); + if (!pmd_present(*pmd)) { /* Has another populated it ? */ + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } + spin_unlock(&init_mm.page_table_lock); + if (new) + pte_free_kernel(&init_mm, new); + return 0; +} + +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +{ + if (file_rss) + add_mm_counter(mm, file_rss, file_rss); + if (anon_rss) + add_mm_counter(mm, anon_rss, anon_rss); +} + +/* + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. + * + * The calling function must still handle the error. + */ +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, + unsigned long vaddr) +{ + printk(KERN_ERR "Bad pte = %08llx, process = %s, " + "vm_flags = %lx, vaddr = %lx\n", + (long long)pte_val(pte), + (vma->vm_mm == current->mm ? current->comm : "???"), + vma->vm_flags, vaddr); + dump_stack(); +} + +static inline int is_cow_mapping(unsigned int flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* + * vm_normal_page -- This function gets the "struct page" associated with a pte. + * + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. + * + * There are 2 broad cases. Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. + * + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. + * + */ +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + unsigned long pfn; + + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) { + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + return pte_page(pte); + } + VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + return NULL; + } + + /* !HAVE_PTE_SPECIAL case follows: */ + + pfn = pte_pfn(pte); + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + VM_BUG_ON(!pfn_valid(pfn)); + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + */ + +static inline void +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss) +{ + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + + /* pte contains position in swap or file, so copy. */ + if (unlikely(!pte_present(pte))) { + if (!pte_file(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } + } + goto out_set_pte; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if (is_cow_mapping(vm_flags)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page, vma, addr); + rss[!!PageAnon(page)]++; + } + +out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); +} + +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; + int rss[2]; + +again: + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) + return -ENOMEM; + src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); + + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. + */ + if (progress >= 32) { + progress = 0; + if (need_resched() || + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) + break; + } + if (pte_none(*src_pte)) { + progress++; + continue; + } + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap_nested(src_pte - 1); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); + if (addr != end) + goto again; + return 0; +} + +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + unsigned long next; + + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; +} + +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + unsigned long next; + + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + int ret; + + /* + * Don't copy ptes where a page fault will fill them correctly. + * Fork becomes much lighter when there are big shared or private + * readonly mappings. The tradeoff is that copy_page_range is more + * efficient than faulting. + */ + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!vma->anon_vma) + return 0; + } + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, vma); + + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + + ret = 0; + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return ret; +} + +static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + struct mm_struct *mm = tlb->mm; + pte_t *pte; + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) { + (*zap_work)--; + continue; + } + + (*zap_work) -= PAGE_SIZE; + + if (pte_present(ptent)) { + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (unlikely(details) && page) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping && + details->check_mapping != page->mapping) + continue; + /* + * Each page->index must be checked when + * invalidating or truncating nonlinear. + */ + if (details->nonlinear_vma && + (page->index < details->first_index || + page->index > details->last_index)) + continue; + } + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; + if (unlikely(details) && details->nonlinear_vma + && linear_page_index(details->nonlinear_vma, + addr) != page->index) + set_pte_at(mm, addr, pte, + pgoff_to_pte(page->index)); + if (PageAnon(page)) + anon_rss--; + else { + if (pte_dirty(ptent)) + set_page_dirty(page); + if (pte_young(ptent)) + SetPageReferenced(page); + file_rss--; + } + page_remove_rmap(page, vma); + tlb_remove_page(tlb, page); + continue; + } + /* + * If details->check_mapping, we leave swap entries; + * if details->nonlinear_vma, we leave file entries. + */ + if (unlikely(details)) + continue; + if (!pte_file(ptent)) + free_swap_and_cache(pte_to_swp_entry(ptent)); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + + add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) { + (*zap_work)--; + continue; + } + next = zap_pte_range(tlb, vma, pmd, addr, next, + zap_work, details); + } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) { + (*zap_work)--; + continue; + } + next = zap_pmd_range(tlb, vma, pud, addr, next, + zap_work, details); + } while (pud++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static unsigned long unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pgd_t *pgd; + unsigned long next; + + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + (*zap_work)--; + continue; + } + next = zap_pud_range(tlb, vma, pgd, addr, next, + zap_work, details); + } while (pgd++, addr = next, (addr != end && *zap_work > 0)); + tlb_end_vma(tlb, vma); + + return addr; +} + +#ifdef CONFIG_PREEMPT +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) +#else +/* No preempt: go for improved straight-line efficiency */ +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) +#endif + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlbp: address of the caller's struct mmu_gather + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here + * @details: details of nonlinear truncation or shared cache invalidation + * + * Returns the end address of the unmapping (restart addr if interrupted). + * + * Unmap all pages in the vma list. + * + * We aim to not hold locks for too long (for scheduling latency reasons). + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * return the ending mmu_gather to the caller. + * + * Only addresses between `start' and `end' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order. + * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. + */ +unsigned long unmap_vmas(struct mmu_gather **tlbp, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *details) +{ + long zap_work = ZAP_BLOCK_SIZE; + unsigned long tlb_start = 0; /* For tlb_finish_mmu */ + int tlb_start_valid = 0; + unsigned long start = start_addr; + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; + int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { + unsigned long end; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + continue; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + continue; + + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += (end - start) >> PAGE_SHIFT; + + while (start != end) { + if (!tlb_start_valid) { + tlb_start = start; + tlb_start_valid = 1; + } + + if (unlikely(is_vm_hugetlb_page(vma))) { + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / + pages_per_huge_page(hstate_vma(vma)); + } + + start = end; + } else + start = unmap_page_range(*tlbp, vma, + start, end, &zap_work, details); + + if (zap_work > 0) { + BUG_ON(start != end); + break; + } + + tlb_finish_mmu(*tlbp, tlb_start, start); + + if (need_resched() || + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { + if (i_mmap_lock) { + *tlbp = NULL; + goto out; + } + cond_resched(); + } + + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); + tlb_start_valid = 0; + zap_work = ZAP_BLOCK_SIZE; + } + } +out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); + return start; /* which is now the end (or restart) address */ +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of nonlinear truncation or shared cache invalidation + */ +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather *tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + if (tlb) + tlb_finish_mmu(tlb, address, end); + return end; +} + +/** + * zap_vma_ptes - remove ptes mapping the vma + * @vma: vm_area_struct holding ptes to be zapped + * @address: starting address of pages to zap + * @size: number of bytes to zap + * + * This function only unmaps ptes assigned to VM_PFNMAP vmas. + * + * The entire address range must be fully contained within the vma. + * + * Returns 0 if successful. + */ +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (address < vma->vm_start || address + size > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + return -1; + zap_page_range(vma, address, size, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + +/* + * Do a quick page-table lookup for a single page. + */ +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto no_page_table; + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + + pte = *ptep; + if (!pte_present(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) + goto bad_page; + + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return page; + +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + /* Fall through to ZERO_PAGE handling */ +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(0); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; +} + +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault routine, it's not an anonymous region. + */ + return !vma->vm_ops || !vma->vm_ops->fault; +} + + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) +{ + int i; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); + + if (len <= 0) + return 0; + /* + * Require read or write permissions. + * If 'force' is set, we only require the "MAY" flags. + */ + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + + do { + struct vm_area_struct *vma; + unsigned int foll_flags; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(tsk, start)) { + unsigned long pg = start & PAGE_MASK; + struct vm_area_struct *gate_vma = get_gate_vma(tsk); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* user gate pages are read-only */ + if (!ignore && write) + return i ? : -EFAULT; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + if (pmd_none(*pmd)) + return i ? : -EFAULT; + pte = pte_offset_map(pmd, pg); + if (pte_none(*pte)) { + pte_unmap(pte); + return i ? : -EFAULT; + } + if (pages) { + struct page *page = vm_normal_page(gate_vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); + } + pte_unmap(pte); + if (vmas) + vmas[i] = gate_vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) + return i ? : -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &len, i, write); + continue; + } + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && use_zero_page(vma)) + foll_flags |= FOLL_ANON; + + do { + struct page *page; + + /* + * If tsk is ooming, cut off its access to large memory + * allocations. It has a pending SIGKILL, but it can't + * be processed until returning to user space. + */ + if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) + return i ? i : -ENOMEM; + + if (write) + foll_flags |= FOLL_WRITE; + + cond_resched(); + while (!(page = follow_page(vma, start, foll_flags))) { + int ret; + ret = handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + else if (ret & VM_FAULT_SIGBUS) + return i ? i : -EFAULT; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + + /* + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. + */ + if (ret & VM_FAULT_WRITE) + foll_flags &= ~FOLL_WRITE; + + cond_resched(); + } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + + flush_anon_page(vma, page, start); + flush_dcache_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + } while (len && start < vma->vm_end); + } while (len); + return i; +} + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + +EXPORT_SYMBOL(get_user_pages); + +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -EINVAL; + if (PageAnon(page)) + goto out; + retval = -ENOMEM; + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + + retval = 0; + pte_unmap_unlock(pte, ptl); + return retval; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (see split_page()). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter. This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. + */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; + return insert_page(vma, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); + +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte, entry; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + entry = pte_mkspecial(pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, entry); + update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_inert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). + */ + if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + struct page *page; + + page = pfn_to_page(pfn); + return insert_page(vma, addr, page, vma->vm_page_prot); + } + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_mixed); + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + return 0; +} + +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + PAGE_ALIGN(size); + struct mm_struct *mm = vma->vm_mm; + int err; + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + */ + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL(remap_pfn_range); + +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + pgtable_t token; + spinlock_t *uninitialized_var(ptl); + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + token = pmd_pgtable(*pmd); + + do { + err = fn(pte, token, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + BUG_ON(pud_huge(*pud)); + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start = addr, end = addr + size; + int err; + + BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + +/* + * handle_pte_fault chooses page fault handler according to an entry + * which was read non-atomically. Before making any commitment, on + * those architectures or configurations (e.g. i386 with PAE) which + * might give a mix of unmatched parts, do_swap_page and do_file_page + * must check under lock before unmapping the pte and proceeding + * (but do_wp_page is only called after already making such a check; + * and do_anonymous_page and do_no_page can safely check later on). + */ +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, + pte_t *page_table, pte_t orig_pte) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + same = pte_same(*page_table, orig_pte); + spin_unlock(ptl); + } +#endif + pte_unmap(page_table); + return same; +} + +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); + } else + copy_user_highpage(dst, src, va, vma); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) +{ + struct page *old_page, *new_page; + pte_t entry; + int reuse = 0, ret = 0; + int page_mkwrite = 0; + struct page *dirty_page = NULL; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable as we can't do any dirty + * accounting on raw pfn maps. + */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + goto reuse; + goto gotten; + } + + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page)) { + if (trylock_page(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); + } + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + /* + * Notify the address space that the page is about to + * become writable so that it can prohibit this or wait + * for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can + * sleep if it needs to. + */ + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + + if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + goto unwritable_page; + + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + page_cache_release(old_page); + if (!pte_same(*page_table, orig_pte)) + goto unlock; + + page_mkwrite = 1; + } + dirty_page = old_page; + get_page(dirty_page); + reuse = 1; + } + + if (reuse) { +reuse: + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, address, page_table, entry,1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto unlock; + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); +gotten: + pte_unmap_unlock(page_table, ptl); + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + VM_BUG_ON(old_page == ZERO_PAGE(0)); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((vma->vm_flags & VM_LOCKED) && old_page) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } + cow_user_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); + + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) + goto oom_free_new; + + /* + * Re-check the pte - we dropped the lock + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } + } else + inc_mm_counter(mm, anon_rss); + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush_notify(vma, address, page_table); + SetPageSwapBacked(new_page); + lru_cache_add_active_or_unevictable(new_page, vma); + page_add_new_anon_rmap(new_page, vma, address); + +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + + /* Free the old page.. */ + new_page = old_page; + ret |= VM_FAULT_WRITE; + } else + mem_cgroup_uncharge_page(new_page); + + if (new_page) + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); +unlock: + pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } + return ret; +oom_free_new: + page_cache_release(new_page); +oom: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; + +unwritable_page: + page_cache_release(old_page); + return VM_FAULT_SIGBUS; +} + +/* + * Helper functions for unmap_mapping_range(). + * + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * + * We have to restart searching the prio_tree whenever we drop the lock, + * since the iterator is only valid while the lock is held, and anyway + * a later vma might be split and reinserted earlier while lock dropped. + * + * The list of nonlinear vmas could be handled more efficiently, using + * a placeholder, but handle it in the same way until a need is shown. + * It is important to search the prio_tree before nonlinear list: a vma + * may become nonlinear and be shifted from prio_tree to nonlinear list + * while the lock is dropped; but never shifted from list to prio_tree. + * + * In order to make forward progress despite restarting the search, + * vm_truncate_count is used to mark a vma as now dealt with, so we can + * quickly skip it next time around. Since the prio_tree search only + * shows us those vmas affected by unmapping the range in question, we + * can't efficiently keep all vmas in step with mapping->truncate_count: + * so instead reset them all whenever it wraps back to 0 (then go to 1). + * mapping->truncate_count and vma->vm_truncate_count are protected by + * i_mmap_lock. + * + * In order to make forward progress despite repeatedly restarting some + * large vma, note the restart_addr from unmap_vmas when it breaks out: + * and restart from that address when we reach that vma again. It might + * have been split or merged, shrunk or extended, but never shifted: so + * restart_addr remains valid so long as it remains in the vma's range. + * unmap_mapping_range forces truncate_count to leap over page-aligned + * values so we can save vma's restart_addr in its truncate_count field. + */ +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) + +static void reset_vma_truncate_counts(struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + vma->vm_truncate_count = 0; + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_truncate_count = 0; +} + +static int unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + unsigned long restart_addr; + int need_break; + + /* + * files that support invalidating or truncating portions of the + * file from under mmaped areas must have their ->fault function + * return a locked page (and set VM_FAULT_LOCKED in the return). + * This provides synchronisation against concurrent unmapping here. + */ + +again: + restart_addr = vma->vm_truncate_count; + if (is_restart_addr(restart_addr) && start_addr < restart_addr) { + start_addr = restart_addr; + if (start_addr >= end_addr) { + /* Top of vma has been split off since last time */ + vma->vm_truncate_count = details->truncate_count; + return 0; + } + } + + restart_addr = zap_page_range(vma, start_addr, + end_addr - start_addr, details); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock); + + if (restart_addr >= end_addr) { + /* We have now completed this vma: mark it so */ + vma->vm_truncate_count = details->truncate_count; + if (!need_break) + return 0; + } else { + /* Note restart_addr in vma's truncate_count field */ + vma->vm_truncate_count = restart_addr; + if (!need_break) + goto again; + } + + spin_unlock(details->i_mmap_lock); + cond_resched(); + spin_lock(details->i_mmap_lock); + return -EINTR; +} + +static inline void unmap_mapping_range_tree(struct prio_tree_root *root, + struct zap_details *details) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + pgoff_t vba, vea, zba, zea; + +restart: + vma_prio_tree_foreach(vma, &iter, root, + details->first_index, details->last_index) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + + vba = vma->vm_pgoff; + vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ + zba = details->first_index; + if (zba < vba) + zba = vba; + zea = details->last_index; + if (zea > vea) + zea = vea; + + if (unmap_mapping_range_vma(vma, + ((zba - vba) << PAGE_SHIFT) + vma->vm_start, + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details) < 0) + goto restart; + } +} + +static inline void unmap_mapping_range_list(struct list_head *head, + struct zap_details *details) +{ + struct vm_area_struct *vma; + + /* + * In nonlinear VMAs there is no correspondence between virtual address + * offset and file offset. So we must perform an exhaustive search + * across *all* the pages in each nonlinear VMA, not just the pages + * whose virtual address lies outside the file truncation point. + */ +restart: + list_for_each_entry(vma, head, shared.vm_set.list) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + details->nonlinear_vma = vma; + if (unmap_mapping_range_vma(vma, vma->vm_start, + vma->vm_end, details) < 0) + goto restart; + } +} + +/** + * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * @mapping: the address space containing mmaps to be unmapped. + * @holebegin: byte in first page to unmap, relative to the start of + * the underlying file. This will be rounded down to a PAGE_SIZE + * boundary. Note that this is different from vmtruncate(), which + * must keep the partial page. In contrast, we must get rid of + * partial pages. + * @holelen: size of prospective hole in bytes. This will be rounded + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * end of the file. + * @even_cows: 1 when truncating a file, unmap even private COWed pages; + * but 0 when invalidating pagecache, don't throw away private data. + */ +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) +{ + struct zap_details details; + pgoff_t hba = holebegin >> PAGE_SHIFT; + pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Check for overflow. */ + if (sizeof(holelen) > sizeof(hlen)) { + long long holeend = + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (holeend & ~(long long)ULONG_MAX) + hlen = ULONG_MAX - hba + 1; + } + + details.check_mapping = even_cows? NULL: mapping; + details.nonlinear_vma = NULL; + details.first_index = hba; + details.last_index = hba + hlen - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + details.i_mmap_lock = &mapping->i_mmap_lock; + + spin_lock(&mapping->i_mmap_lock); + + /* Protect against endless unmapping loops */ + mapping->truncate_count++; + if (unlikely(is_restart_addr(mapping->truncate_count))) { + if (mapping->truncate_count == 0) + reset_vma_truncate_counts(mapping); + mapping->truncate_count++; + } + details.truncate_count = mapping->truncate_count; + + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); + spin_unlock(&mapping->i_mmap_lock); +} +EXPORT_SYMBOL(unmap_mapping_range); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } + + if (inode->i_op && inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; + +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(vmtruncate); + +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op || !inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + mutex_unlock(&inode->i_mutex); + + return 0; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + spinlock_t *ptl; + struct page *page; + swp_entry_t entry; + pte_t pte; + int ret = 0; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + goto out; + + entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); + page = lookup_swap_cache(entry); + if (!page) { + grab_swap_token(); /* Contend for token _before_ read-in */ + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, address); + if (!page) { + /* + * Back out if somebody else faulted in this pte + * while we released the pte lock. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) + ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto unlock; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + } + + mark_page_accessed(page); + + lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + unlock_page(page); + goto out; + } + + /* + * Back out if somebody else already faulted in this pte. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*page_table, orig_pte))) + goto out_nomap; + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; + } + + /* The page isn't present yet, go ahead with the fault. */ + + inc_mm_counter(mm, anon_rss); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + write_access = 0; + } + + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); + + swap_free(entry); + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + remove_exclusive_swap_page(page); + unlock_page(page); + + if (write_access) { + ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); + if (ret & VM_FAULT_ERROR) + ret &= VM_FAULT_ERROR; + goto out; + } + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); +unlock: + pte_unmap_unlock(page_table, ptl); +out: + return ret; +out_nomap: + mem_cgroup_uncharge_page(page); + pte_unmap_unlock(page_table, ptl); + unlock_page(page); + page_cache_release(page); + return ret; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + struct page *page; + spinlock_t *ptl; + pte_t entry; + + /* Allocate our own private page. */ + pte_unmap(page_table); + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage_movable(vma, address); + if (!page) + goto oom; + __SetPageUptodate(page); + + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) + goto oom_free_page; + + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); + page_add_new_anon_rmap(page, vma, address); + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); +unlock: + pte_unmap_unlock(page_table, ptl); + return 0; +release: + mem_cgroup_uncharge_page(page); + page_cache_release(page); + goto unlock; +oom_free_page: + page_cache_release(page); +oom: + return VM_FAULT_OOM; +} + +/* + * __do_fault() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid + * the next page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte neither mapped nor locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + pte_t *page_table; + spinlock_t *ptl; + struct page *page; + pte_t entry; + int anon = 0; + int charged = 0; + struct page *dirty_page = NULL; + struct vm_fault vmf; + int ret; + int page_mkwrite = 0; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; + + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + + /* + * For consistency in subsequent calls, make the faulted page always + * locked. + */ + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON(!PageLocked(vmf.page)); + + /* + * Should we do an early C-O-W break? + */ + page = vmf.page; + if (flags & FAULT_FLAG_WRITE) { + if (!(vma->vm_flags & VM_SHARED)) { + anon = 1; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out; + } + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + page_cache_release(page); + goto out; + } + charged = 1; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); + copy_user_highpage(page, vmf.page, address, vma); + __SetPageUptodate(page); + } else { + /* + * If the page will be shareable, see if the backing + * address space wants to know that the page is about + * to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(page); + if (vma->vm_ops->page_mkwrite(vma, page) < 0) { + ret = VM_FAULT_SIGBUS; + anon = 1; /* no anon but release vmf.page */ + goto out_unlocked; + } + lock_page(page); + /* + * XXX: this is not quite right (racy vs + * invalidate) to unlock and relock the page + * like this, however a better fix requires + * reworking page_mkwrite locking API, which + * is better done later. + */ + if (!page->mapping) { + ret = 0; + anon = 1; /* no anon but release vmf.page */ + goto out; + } + page_mkwrite = 1; + } + } + + } + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (anon) { + inc_mm_counter(mm, anon_rss); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; + get_page(dirty_page); + } + } +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); + } else { + if (charged) + mem_cgroup_uncharge_page(page); + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ + } + + pte_unmap_unlock(page_table, ptl); + +out: + unlock_page(vmf.page); +out_unlocked: + if (anon) + page_cache_release(vmf.page); + else if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } + + return ret; +} + +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); + + pte_unmap(page_table); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + +/* + * Fault of a previously existing named mapping. Repopulate the pte + * from the encoded file_pte if possible. This enables swappable + * nonlinear vmas. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + unsigned int flags = FAULT_FLAG_NONLINEAR | + (write_access ? FAULT_FLAG_WRITE : 0); + pgoff_t pgoff; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + return 0; + + if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || + !(vma->vm_flags & VM_CAN_NONLINEAR))) { + /* + * Page table corrupted: show pte and kill process. + */ + print_bad_pte(vma, orig_pte, address); + return VM_FAULT_OOM; + } + + pgoff = pte_to_pgoff(orig_pte); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, int write_access) +{ + pte_t entry; + spinlock_t *ptl; + + entry = *pte; + if (!pte_present(entry)) { + if (pte_none(entry)) { + if (vma->vm_ops) { + if (likely(vma->vm_ops->fault)) + return do_linear_fault(mm, vma, address, + pte, pmd, write_access, entry); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); + } + if (pte_file(entry)) + return do_nonlinear_fault(mm, vma, address, + pte, pmd, write_access, entry); + return do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*pte, entry))) + goto unlock; + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, + pte, pmd, ptl, entry); + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { + update_mmu_cache(vma, address, entry); + } else { + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (write_access) + flush_tlb_page(vma, address); + } +unlock: + pte_unmap_unlock(pte, ptl); + return 0; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + + if (unlikely(is_vm_hugetlb_page(vma))) + return hugetlb_fault(mm, vma, address, write_access); + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (!pud) + return VM_FAULT_OOM; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + return VM_FAULT_OOM; + pte = pte_alloc_map(mm, pmd, address); + if (!pte) + return VM_FAULT_OOM; + + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); +} + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * We've already handled the fast-path in-line. + */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new = pud_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + pud_free(mm, new); + else + pgd_populate(mm, pgd, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * We've already handled the fast-path in-line. + */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); +#ifndef __ARCH_HAS_4LEVEL_HACK + if (pud_present(*pud)) /* Another has populated it */ + pmd_free(mm, new); + else + pud_populate(mm, pud, new); +#else + if (pgd_present(*pud)) /* Another has populated it */ + pmd_free(mm, new); + else + pgd_populate(mm, pud, new); +#endif /* __ARCH_HAS_4LEVEL_HACK */ + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PMD_FOLDED */ + +int make_pages_present(unsigned long addr, unsigned long end) +{ + int ret, len, write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + if (!vma) + return -ENOMEM; + write = (vma->vm_flags & VM_WRITE) != 0; + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); + len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; + ret = get_user_pages(current, current->mm, addr, + len, write, 0, NULL, NULL); + if (ret < 0) + return ret; + return ret == len ? 0 : -EFAULT; +} + +#if !defined(__HAVE_ARCH_GATE_AREA) + +#if defined(AT_SYSINFO_EHDR) +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} +__initcall(gate_vma_init); +#endif + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef AT_SYSINFO_EHDR + return &gate_vma; +#else + return NULL; +#endif +} + +int in_gate_area_no_task(unsigned long addr) +{ +#ifdef AT_SYSINFO_EHDR + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; +#endif + return 0; +} + +#endif /* __HAVE_ARCH_GATE_AREA */ + +#ifdef CONFIG_HAVE_IOREMAP_PROT +static resource_size_t follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + resource_size_t phys_addr = 0; + struct mm_struct *mm = vma->vm_mm; + + VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + phys_addr = pte_pfn(pte); + phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ + + *prot = pgprot_val(pte_pgprot(pte)); + +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return phys_addr; +no_page_table: + return 0; +} + +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void *maddr; + int offset = addr & (PAGE_SIZE-1); + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + phys_addr = follow_phys(vma, addr, write, &prot); + + if (!phys_addr) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +#endif + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; + void *maddr; + struct page *page = NULL; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) { + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. + */ +#ifdef CONFIG_HAVE_IOREMAP_PROT + vma = find_vma(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) +#endif + break; + bytes = ret; + } else { + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + } + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} + +/* + * Print the name of a VMA. + */ +void print_vma_addr(char *prefix, unsigned long ip) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + /* + * Do not print if we are in atomic + * contexts (in exception stacks, etc.): + */ + if (preempt_count()) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma && vma->vm_file) { + struct file *f = vma->vm_file; + char *buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) { + char *p, *s; + + p = d_path(&f->f_path, buf, PAGE_SIZE); + if (IS_ERR(p)) + p = "?"; + s = strrchr(p, '/'); + if (s) + p = s+1; + printk("%s%s[%lx+%lx]", prefix, p, + vma->vm_start, + vma->vm_end - vma->vm_start); + free_page((unsigned long)buf); + } + } + up_read(¤t->mm->mmap_sem); +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c new file mode 100644 index 0000000..b173711 --- /dev/null +++ b/mm/memory_hotplug.c @@ -0,0 +1,867 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include <linux/stddef.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/interrupt.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/pagevec.h> +#include <linux/writeback.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/cpu.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/highmem.h> +#include <linux/vmalloc.h> +#include <linux/ioport.h> +#include <linux/delay.h> +#include <linux/migrate.h> +#include <linux/page-isolation.h> +#include <linux/pfn.h> + +#include <asm/tlbflush.h> + +#include "internal.h" + +/* add this memory to iomem resource */ +static struct resource *register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res) < 0) { + printk("System RAM resource %llx - %llx cannot be added\n", + (unsigned long long)res->start, (unsigned long long)res->end); + kfree(res); + res = NULL; + } + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; +} + +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void get_page_bootmem(unsigned long info, struct page *page, int type) +{ + atomic_set(&page->_mapcount, type); + SetPagePrivate(page); + set_page_private(page, info); + atomic_inc(&page->_count); +} + +void put_page_bootmem(struct page *page) +{ + int type; + + type = atomic_read(&page->_mapcount); + BUG_ON(type >= -1); + + if (atomic_dec_return(&page->_count) == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + reset_page_mapcount(page); + __free_pages_bootmem(page, 0); + } + +} + +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... + */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); + +} + +void register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + struct zone *zone; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + zone = &pgdat->node_zones[0]; + for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { + if (zone->wait_table) { + nr_pages = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; + page = virt_to_page(zone->wait_table); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + } + } + + pfn = pgdat->node_start_pfn; + end_pfn = pfn + pgdat->node_spanned_pages; + + /* register_section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) + register_page_bootmem_info_section(pfn); + +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +static void grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + if (start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - + zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long old_pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - + pgdat->node_start_pfn; +} + +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int nid = pgdat->node_id; + int zone_type; + unsigned long flags; + + zone_type = zone - pgdat->node_zones; + if (!zone->wait_table) { + int ret; + + ret = init_currently_empty_zone(zone, phys_start_pfn, + nr_pages, MEMMAP_HOTPLUG); + if (ret) + return ret; + } + pgdat_resize_lock(zone->zone_pgdat, &flags); + grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); + grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, + phys_start_pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + memmap_init_zone(nr_pages, nid, zone_type, + phys_start_pfn, MEMMAP_HOTPLUG); + return 0; +} + +static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) +{ + int nr_pages = PAGES_PER_SECTION; + int ret; + + if (pfn_valid(phys_start_pfn)) + return -EEXIST; + + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + + if (ret < 0) + return ret; + + ret = __add_zone(zone, phys_start_pfn); + + if (ret < 0) + return ret; + + return register_new_memory(__pfn_to_section(phys_start_pfn)); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + /* + * XXX: Freeing memmap with vmemmap is not implement yet. + * This should be removed later. + */ + return -EBUSY; +} +#else +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long flags; + struct pglist_data *pgdat = zone->zone_pgdat; + int ret = -EINVAL; + + if (!valid_section(ms)) + return ret; + + ret = unregister_memory_section(ms); + if (ret) + return ret; + + pgdat_resize_lock(pgdat, &flags); + sparse_remove_one_section(zone, ms); + pgdat_resize_unlock(pgdat, &flags); + return 0; +} +#endif + +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(zone, i << PFN_SECTION_SHIFT); + + /* + * EEXIST is finally dealt with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. + */ + if (err && (err != -EEXIST)) + break; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(__add_pages); + +/** + * __remove_pages() - remove sections of pages from a zone + * @zone: zone from which pages need to be removed + * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @nr_pages: number of pages to remove (must be multiple of section size) + * + * Generic helper function to remove section mappings and sysfs entries + * for the section of the memory we are removing. Caller needs to make + * sure that pages are marked reserved and zones are adjust properly by + * calling offline_pages(). + */ +int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i, ret = 0; + int sections_to_remove; + + /* + * We can only remove entire sections + */ + BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); + BUG_ON(nr_pages % PAGES_PER_SECTION); + + sections_to_remove = nr_pages / PAGES_PER_SECTION; + for (i = 0; i < sections_to_remove; i++) { + unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + release_mem_region(pfn << PAGE_SHIFT, + PAGES_PER_SECTION << PAGE_SHIFT); + ret = __remove_section(zone, __pfn_to_section(pfn)); + if (ret) + break; + } + return ret; +} +EXPORT_SYMBOL_GPL(__remove_pages); + +void online_page(struct page *page) +{ + totalram_pages++; + num_physpages++; + +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages++; +#endif + +#ifdef CONFIG_FLATMEM + max_mapnr = max(page_to_pfn(page), max_mapnr); +#endif + + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +} + +static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + unsigned long onlined_pages = *(unsigned long *)arg; + struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(start_pfn + i); + online_page(page); + onlined_pages++; + } + *(unsigned long *)arg = onlined_pages; + return 0; +} + + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long onlined_pages = 0; + struct zone *zone; + int need_zonelists_rebuild = 0; + int nid; + int ret; + struct memory_notify arg; + + arg.start_pfn = pfn; + arg.nr_pages = nr_pages; + arg.status_change_nid = -1; + + nid = page_to_nid(pfn_to_page(pfn)); + if (node_present_pages(nid) == 0) + arg.status_change_nid = nid; + + ret = memory_notify(MEM_GOING_ONLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) { + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; + } + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_mutex. + */ + zone = page_zone(pfn_to_page(pfn)); + /* + * If this zone is not populated, then it is not in zonelist. + * This means the page allocator ignores this zone. + * So, zonelist must be updated after online. + */ + if (!populated_zone(zone)) + need_zonelists_rebuild = 1; + + ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, + online_pages_range); + if (ret) { + printk(KERN_DEBUG "online_pages %lx at %lx failed\n", + nr_pages, pfn); + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; + } + + zone->present_pages += onlined_pages; + zone->zone_pgdat->node_present_pages += onlined_pages; + + setup_per_zone_pages_min(); + if (onlined_pages) { + kswapd_run(zone_to_nid(zone)); + node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + } + + if (need_zonelists_rebuild) + build_all_zonelists(); + else + vm_total_pages = nr_free_pagecache_pages(); + + writeback_set_ratelimit(); + + if (onlined_pages) + memory_notify(MEM_ONLINE, &arg); + + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ + +static pg_data_t *hotadd_new_pgdat(int nid, u64 start) +{ + struct pglist_data *pgdat; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; + unsigned long start_pfn = start >> PAGE_SHIFT; + + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; + + arch_refresh_nodedata(nid, pgdat); + + /* we can use NODE_DATA(nid) from here */ + + /* init node's zones as empty zones, we don't have any present pages.*/ + free_area_init_node(nid, zones_size, start_pfn, zholes_size); + + return pgdat; +} + +static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +{ + arch_refresh_nodedata(nid, NULL); + arch_free_nodedata(pgdat); + return; +} + + +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +int __ref add_memory(int nid, u64 start, u64 size) +{ + pg_data_t *pgdat = NULL; + int new_pgdat = 0; + struct resource *res; + int ret; + + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + + if (!node_online(nid)) { + pgdat = hotadd_new_pgdat(nid, start); + if (!pgdat) + return -ENOMEM; + new_pgdat = 1; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size); + + if (ret < 0) + goto error; + + /* we online node here. we can't roll back from here. */ + node_set_online(nid); + + if (new_pgdat) { + ret = register_one_node(nid); + /* + * If sysfs file of new node can't create, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. + */ + BUG_ON(ret); + } + + return ret; +error: + /* rollback pgdat allocation and others */ + if (new_pgdat) + rollback_node_hotadd(nid, pgdat); + if (res) + release_memory_resource(res); + + return ret; +} +EXPORT_SYMBOL_GPL(add_memory); + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy + * set and the size of the free page is given by page_order(). Using this, + * the function determines if the pageblock contains only free pages. + * Due to buddy contraints, a free page at least the size of a pageblock will + * be located at the start of the pageblock + */ +static inline int pageblock_free(struct page *page) +{ + return PageBuddy(page) && page_order(page) >= pageblock_order; +} + +/* Return the start of the next active pageblock after a given page */ +static struct page *next_active_pageblock(struct page *page) +{ + int pageblocks_stride; + + /* Ensure the starting page is pageblock-aligned */ + BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + + /* Move forward by at least 1 * pageblock_nr_pages */ + pageblocks_stride = 1; + + /* If the entire pageblock is free, move to the end of free page */ + if (pageblock_free(page)) + pageblocks_stride += page_order(page) - pageblock_order; + + return page + (pageblocks_stride * pageblock_nr_pages); +} + +/* Checks if this range of memory is likely to be hot-removable. */ +int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +{ + int type; + struct page *page = pfn_to_page(start_pfn); + struct page *end_page = page + nr_pages; + + /* Check the starting page of each pageblock within the range */ + for (; page < end_page; page = next_active_pageblock(page)) { + type = get_pageblock_migratetype(page); + + /* + * A pageblock containing MOVABLE or free pages is considered + * removable + */ + if (type != MIGRATE_MOVABLE && !pageblock_free(page)) + return 0; + + /* + * A pageblock starting with a PageReserved page is not + * considered removable. + */ + if (PageReserved(page)) + return 0; + } + + /* All pageblocks in the memory block are likely to be hot-removable */ + return 1; +} + +/* + * Confirm all pages in a range [start, end) is belongs to the same zone. + */ +static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct zone *zone = NULL; + struct page *page; + int i; + for (pfn = start_pfn; + pfn < end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } + return 1; +} + +/* + * Scanning pfn is much easier than scanning lru list. + * Scan pfn from start to end and Find LRU page. + */ +int scan_lru_pages(unsigned long start, unsigned long end) +{ + unsigned long pfn; + struct page *page; + for (pfn = start; pfn < end; pfn++) { + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageLRU(page)) + return pfn; + } + } + return 0; +} + +static struct page * +hotremove_migrate_alloc(struct page *page, + unsigned long private, + int **x) +{ + /* This should be improoooooved!! */ + return alloc_page(GFP_HIGHUSER_PAGECACHE); +} + + +#define NR_OFFLINE_AT_ONCE_PAGES (256) +static int +do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + int move_pages = NR_OFFLINE_AT_ONCE_PAGES; + int not_managed = 0; + int ret = 0; + LIST_HEAD(source); + + for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (!page_count(page)) + continue; + /* + * We can skip free pages. And we can only deal with pages on + * LRU. + */ + ret = isolate_lru_page(page); + if (!ret) { /* Success */ + list_add_tail(&page->lru, &source); + move_pages--; + } else { + /* Becasue we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) + not_managed++; +#ifdef CONFIG_DEBUG_VM + printk(KERN_INFO "removing from LRU failed" + " %lx/%d/%lx\n", + pfn, page_count(page), page->flags); +#endif + } + } + ret = -EBUSY; + if (not_managed) { + if (!list_empty(&source)) + putback_lru_pages(&source); + goto out; + } + ret = 0; + if (list_empty(&source)) + goto out; + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0); + +out: + return ret; +} + +/* + * remove from free_area[] and mark all as Reserved. + */ +static int +offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, + void *data) +{ + __offline_isolated_pages(start, start + nr_pages); + return 0; +} + +static void +offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, + offline_isolated_pages_cb); +} + +/* + * Check all pages in range, recoreded as memory resource, are isolated. + */ +static int +check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, + void *data) +{ + int ret; + long offlined = *(long *)data; + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); + offlined = nr_pages; + if (!ret) + *(long *)data += offlined; + return ret; +} + +static long +check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +{ + long offlined = 0; + int ret; + + ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, + check_pages_isolated_cb); + if (ret < 0) + offlined = (long)ret; + return offlined; +} + +int offline_pages(unsigned long start_pfn, + unsigned long end_pfn, unsigned long timeout) +{ + unsigned long pfn, nr_pages, expire; + long offlined_pages; + int ret, drain, retry_max, node; + struct zone *zone; + struct memory_notify arg; + + BUG_ON(start_pfn >= end_pfn); + /* at least, alignment against pageblock is necessary */ + if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) + return -EINVAL; + if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) + return -EINVAL; + /* This makes hotplug much easier...and readable. + we assume this for now. .*/ + if (!test_pages_in_a_zone(start_pfn, end_pfn)) + return -EINVAL; + + zone = page_zone(pfn_to_page(start_pfn)); + node = zone_to_nid(zone); + nr_pages = end_pfn - start_pfn; + + /* set above range as isolated */ + ret = start_isolate_page_range(start_pfn, end_pfn); + if (ret) + return ret; + + arg.start_pfn = start_pfn; + arg.nr_pages = nr_pages; + arg.status_change_nid = -1; + if (nr_pages >= node_present_pages(node)) + arg.status_change_nid = node; + + ret = memory_notify(MEM_GOING_OFFLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) + goto failed_removal; + + pfn = start_pfn; + expire = jiffies + timeout; + drain = 0; + retry_max = 5; +repeat: + /* start memory hot removal */ + ret = -EAGAIN; + if (time_after(jiffies, expire)) + goto failed_removal; + ret = -EINTR; + if (signal_pending(current)) + goto failed_removal; + ret = 0; + if (drain) { + lru_add_drain_all(); + flush_scheduled_work(); + cond_resched(); + drain_all_pages(); + } + + pfn = scan_lru_pages(start_pfn, end_pfn); + if (pfn) { /* We have page on LRU */ + ret = do_migrate_range(pfn, end_pfn); + if (!ret) { + drain = 1; + goto repeat; + } else { + if (ret < 0) + if (--retry_max == 0) + goto failed_removal; + yield(); + drain = 1; + goto repeat; + } + } + /* drain all zone's lru pagevec, this is asyncronous... */ + lru_add_drain_all(); + flush_scheduled_work(); + yield(); + /* drain pcp pages , this is synchrouns. */ + drain_all_pages(); + /* check again */ + offlined_pages = check_pages_isolated(start_pfn, end_pfn); + if (offlined_pages < 0) { + ret = -EBUSY; + goto failed_removal; + } + printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + /* Ok, all of our target is islaoted. + We cannot do rollback at this point. */ + offline_isolated_pages(start_pfn, end_pfn); + /* reset pagetype flags and makes migrate type to be MOVABLE */ + undo_isolate_page_range(start_pfn, end_pfn); + /* removal success */ + zone->present_pages -= offlined_pages; + zone->zone_pgdat->node_present_pages -= offlined_pages; + totalram_pages -= offlined_pages; + num_physpages -= offlined_pages; + + vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); + + memory_notify(MEM_OFFLINE, &arg); + return 0; + +failed_removal: + printk(KERN_INFO "memory offlining %lx to %lx failed\n", + start_pfn, end_pfn); + memory_notify(MEM_CANCEL_OFFLINE, &arg); + /* pushback to free area */ + undo_isolate_page_range(start_pfn, end_pfn); + + return ret; +} + +int remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = PFN_DOWN(start); + end_pfn = start_pfn + PFN_DOWN(size); + return offline_pages(start_pfn, end_pfn, 120 * HZ); +} +#else +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ +EXPORT_SYMBOL_GPL(remove_memory); diff --git a/mm/mempolicy.c b/mm/mempolicy.c new file mode 100644 index 0000000..ac8f8f3 --- /dev/null +++ b/mm/mempolicy.c @@ -0,0 +1,2338 @@ +/* + * Simple NUMA memory policy for the Linux kernel. + * + * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. + * Subject to the GNU Public License, version 2. + * + * NUMA policy allows the user to give hints in which node(s) memory should + * be allocated. + * + * Support four policies per VMA and per process: + * + * The VMA policy has priority over the process policy for a page fault. + * + * interleave Allocate memory interleaved over a set of nodes, + * with normal fallback if it fails. + * For VMA based allocations this interleaves based on the + * offset into the backing object or offset into the mapping + * for anonymous memory. For process policy an process counter + * is used. + * + * bind Only allocate memory on a specific set of nodes, + * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * + * preferred Try a specific node first before normal fallback. + * As a special case node -1 here means do the allocation + * on the local CPU. This is normally identical to default, + * but useful to set in a VMA when you have a non default + * process policy. + * + * default Allocate on the local node first, or when on a VMA + * use the process policy. This is what Linux always did + * in a NUMA aware kernel and still does by, ahem, default. + * + * The process policy is applied for most non interrupt memory allocations + * in that process' context. Interrupts ignore the policies and always + * try to allocate on the local CPU. The VMA policy is only applied for memory + * allocations for a VMA in the VM. + * + * Currently there are a few corner cases in swapping where the policy + * is not applied, but the majority should be handled. When process policy + * is used it is not remembered over swap outs/swap ins. + * + * Only the highest zone in the zone hierarchy gets policied. Allocations + * requesting a lower zone just use default policy. This implies that + * on systems with highmem kernel lowmem allocation don't get policied. + * Same with GFP_DMA allocations. + * + * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * all users and remembered even when nobody has memory mapped. + */ + +/* Notebook: + fix mmap readahead to honour policy and enable policy for any page cache + object + statistics for bigpages + global policy for page cache? currently it uses process policy. Requires + first item above. + handle mremap for shared memory (currently ignored for the policy) + grows down? + make bind policy root only? It can trigger oom much faster and the + kernel is not always grateful with that. +*/ + +#include <linux/mempolicy.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/nodemask.h> +#include <linux/cpuset.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/nsproxy.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/compat.h> +#include <linux/swap.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/migrate.h> +#include <linux/rmap.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/ctype.h> + +#include <asm/tlbflush.h> +#include <asm/uaccess.h> + +#include "internal.h" + +/* Internal flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ + +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; + +/* Highest zone. An specific allocation for a zone below that is not + policied. */ +enum zone_type policy_zone = 0; + +/* + * run-time system-wide default policy => local allocation + */ +struct mempolicy default_policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .mode = MPOL_PREFERRED, + .flags = MPOL_F_LOCAL, +}; + +static const struct mempolicy_operations { + int (*create)(struct mempolicy *pol, const nodemask_t *nodes); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); +} mpol_ops[MPOL_MAX]; + +/* Check that the nodemask contains at least one populated zone */ +static int is_valid_nodemask(const nodemask_t *nodemask) +{ + int nd, k; + + /* Check that there is something useful in this mask */ + k = policy_zone; + + for_each_node_mask(nd, *nodemask) { + struct zone *z; + + for (k = 0; k <= policy_zone; k++) { + z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + return 1; + } + } + + return 0; +} + +static inline int mpol_store_user_nodemask(const struct mempolicy *pol) +{ + return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); +} + +static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, + const nodemask_t *rel) +{ + nodemask_t tmp; + nodes_fold(tmp, *orig, nodes_weight(*rel)); + nodes_onto(*ret, tmp, *rel); +} + +static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + +static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!nodes) + pol->flags |= MPOL_F_LOCAL; /* local allocation */ + else if (nodes_empty(*nodes)) + return -EINVAL; /* no allowed nodes */ + else + pol->v.preferred_node = first_node(*nodes); + return 0; +} + +static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!is_valid_nodemask(nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + +/* Create a new policy */ +static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *policy; + nodemask_t cpuset_context_nmask; + int ret; + + pr_debug("setting mode %d flags %d nodes[0] %lx\n", + mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); + + if (mode == MPOL_DEFAULT) { + if (nodes && !nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + return NULL; /* simply delete any existing policy */ + } + VM_BUG_ON(!nodes); + + /* + * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or + * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). + * All other modes require a valid pointer to a non-empty nodemask. + */ + if (mode == MPOL_PREFERRED) { + if (nodes_empty(*nodes)) { + if (((flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES))) + return ERR_PTR(-EINVAL); + nodes = NULL; /* flag local alloc */ + } + } else if (nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!policy) + return ERR_PTR(-ENOMEM); + atomic_set(&policy->refcnt, 1); + policy->mode = mode; + policy->flags = flags; + + if (nodes) { + /* + * cpuset related setup doesn't apply to local allocation + */ + cpuset_update_task_memory_state(); + if (flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(policy)) + policy->w.user_nodemask = *nodes; + else + policy->w.cpuset_mems_allowed = + cpuset_mems_allowed(current); + } + + ret = mpol_ops[mode].create(policy, + nodes ? &cpuset_context_nmask : NULL); + if (ret < 0) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(ret); + } + return policy; +} + +/* Slow path of a mpol destructor. */ +void __mpol_put(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + kmem_cache_free(policy_cache, p); +} + +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +{ +} + +static void mpol_rebind_nodemask(struct mempolicy *pol, + const nodemask_t *nodes) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) + nodes_and(tmp, pol->w.user_nodemask, *nodes); + else if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + else { + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } + + pol->v.nodes = tmp; + if (!node_isset(current->il_next, tmp)) { + current->il_next = next_node(current->il_next, tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = first_node(tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = numa_node_id(); + } +} + +static void mpol_rebind_preferred(struct mempolicy *pol, + const nodemask_t *nodes) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) { + int node = first_node(pol->w.user_nodemask); + + if (node_isset(node, *nodes)) { + pol->v.preferred_node = node; + pol->flags &= ~MPOL_F_LOCAL; + } else + pol->flags |= MPOL_F_LOCAL; + } else if (pol->flags & MPOL_F_RELATIVE_NODES) { + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + pol->v.preferred_node = first_node(tmp); + } else if (!(pol->flags & MPOL_F_LOCAL)) { + pol->v.preferred_node = node_remap(pol->v.preferred_node, + pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } +} + +/* Migrate a policy to a different set of nodes */ +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask) +{ + if (!pol) + return; + if (!mpol_store_user_nodemask(pol) && + nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) + return; + mpol_ops[pol->mode].rebind(pol, newmask); +} + +/* + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + +static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { + [MPOL_DEFAULT] = { + .rebind = mpol_rebind_default, + }, + [MPOL_INTERLEAVE] = { + .create = mpol_new_interleave, + .rebind = mpol_rebind_nodemask, + }, + [MPOL_PREFERRED] = { + .create = mpol_new_preferred, + .rebind = mpol_rebind_preferred, + }, + [MPOL_BIND] = { + .create = mpol_new_bind, + .rebind = mpol_rebind_nodemask, + }, +}; + +static void gather_stats(struct page *, void *, int pte_dirty); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); + +/* Scan through pages checking if pages follow certain conditions. */ +static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pte_t *orig_pte; + pte_t *pte; + spinlock_t *ptl; + + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + do { + struct page *page; + int nid; + + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page) + continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ + if (PageReserved(page)) + continue; + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & MPOL_MF_STATS) + gather_stats(page, private, pte_dirty(*pte)); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); + else + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return addr != end; +} + +static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + if (check_pte_range(vma, pmd, addr, next, nodes, + flags, private)) + return -EIO; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + if (check_pmd_range(vma, pud, addr, next, nodes, + flags, private)) + return -EIO; + } while (pud++, addr = next, addr != end); + return 0; +} + +static inline int check_pgd_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pgd_t *pgd; + unsigned long next; + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + if (check_pud_range(vma, pgd, addr, next, nodes, + flags, private)) + return -EIO; + } while (pgd++, addr = next, addr != end); + return 0; +} + +/* + * Check if all pages in a range are on a set of nodes. + * If pagelist != NULL then isolate pages from the LRU and + * put them on the pagelist. + */ +static struct vm_area_struct * +check_range(struct mm_struct *mm, unsigned long start, unsigned long end, + const nodemask_t *nodes, unsigned long flags, void *private) +{ + int err; + struct vm_area_struct *first, *vma, *prev; + + + first = find_vma(mm, start); + if (!first) + return ERR_PTR(-EFAULT); + prev = NULL; + for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + } + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma)))) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + err = check_pgd_range(vma, start, endvma, nodes, + flags, private); + if (err) { + first = ERR_PTR(err); + break; + } + } + prev = vma; + } + return first; +} + +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_put(old); + } + return err; +} + +/* Step 2: apply policy to a range and do splits. */ +static int mbind_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct mempolicy *new) +{ + struct vm_area_struct *next; + int err; + + err = 0; + for (; vma && vma->vm_start < end; vma = next) { + next = vma->vm_next; + if (vma->vm_start < start) + err = split_vma(vma->vm_mm, vma, start, 1); + if (!err && vma->vm_end > end) + err = split_vma(vma->vm_mm, vma, end, 0); + if (!err) + err = policy_vma(vma, new); + if (err) + break; + } + return err; +} + +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ + +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} + +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); +} + +/* Set the process memory policy */ +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *new; + struct mm_struct *mm = current->mm; + + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * prevent changing our mempolicy while show_numa_maps() + * is using it. + * Note: do_set_mempolicy() can be called at init time + * with no 'mm'. + */ + if (mm) + down_write(&mm->mmap_sem); + mpol_put(current->mempolicy); + current->mempolicy = new; + mpol_set_task_struct_flag(); + if (new && new->mode == MPOL_INTERLEAVE && + nodes_weight(new->v.nodes)) + current->il_next = first_node(new->v.nodes); + if (mm) + up_write(&mm->mmap_sem); + + return 0; +} + +/* + * Return nodemask for policy for get_mempolicy() query + */ +static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +{ + nodes_clear(*nodes); + if (p == &default_policy) + return; + + switch (p->mode) { + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *nodes = p->v.nodes; + break; + case MPOL_PREFERRED: + if (!(p->flags & MPOL_F_LOCAL)) + node_set(p->v.preferred_node, *nodes); + /* else return empty node mask for local allocation */ + break; + default: + BUG(); + } +} + +static int lookup_node(struct mm_struct *mm, unsigned long addr) +{ + struct page *p; + int err; + + err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + if (err >= 0) { + err = page_to_nid(p); + put_page(p); + } + return err; +} + +/* Retrieve NUMA policy */ +static long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) +{ + int err; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct mempolicy *pol = current->mempolicy; + + cpuset_update_task_memory_state(); + if (flags & + ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) + return -EINVAL; + + if (flags & MPOL_F_MEMS_ALLOWED) { + if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + *policy = 0; /* just so it's initialized */ + *nmask = cpuset_current_mems_allowed; + return 0; + } + + if (flags & MPOL_F_ADDR) { + /* + * Do NOT fall back to task policy if the + * vma/shared policy at addr is NULL. We + * want to return MPOL_DEFAULT in this case. + */ + down_read(&mm->mmap_sem); + vma = find_vma_intersection(mm, addr, addr+1); + if (!vma) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else + pol = vma->vm_policy; + } else if (addr) + return -EINVAL; + + if (!pol) + pol = &default_policy; /* indicates default behavior */ + + if (flags & MPOL_F_NODE) { + if (flags & MPOL_F_ADDR) { + err = lookup_node(mm, addr); + if (err < 0) + goto out; + *policy = err; + } else if (pol == current->mempolicy && + pol->mode == MPOL_INTERLEAVE) { + *policy = current->il_next; + } else { + err = -EINVAL; + goto out; + } + } else { + *policy = pol == &default_policy ? MPOL_DEFAULT : + pol->mode; + /* + * Internal mempolicy flags must be masked off before exposing + * the policy to userspace. + */ + *policy |= (pol->flags & MPOL_MODE_FLAGS); + } + + if (vma) { + up_read(¤t->mm->mmap_sem); + vma = NULL; + } + + err = 0; + if (nmask) + get_policy_nodemask(pol, nmask); + + out: + mpol_cond_put(pol); + if (vma) + up_read(¤t->mm->mmap_sem); + return err; +} + +#ifdef CONFIG_MIGRATION +/* + * page migration + */ +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ + /* + * Avoid migrating a page that is shared with others. + */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (!isolate_lru_page(page)) { + list_add_tail(&page->lru, pagelist); + } + } +} + +static struct page *new_node_page(struct page *page, unsigned long node, int **x) +{ + return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. + */ +static int migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_node_page, dest); + + return err; +} + +/* + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. + * + * Returns the number of page that could not be moved. + */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + int busy = 0; + int err; + nodemask_t tmp; + + err = migrate_prep(); + if (err) + return err; + + down_read(&mm->mmap_sem); + + err = migrate_vmas(mm, from_nodes, to_nodes, flags); + if (err) + goto out; + +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that <source, dest> pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent <s, d> pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent <s, d> pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } +out: + up_read(&mm->mmap_sem); + if (err < 0) + return err; + return busy; + +} + +/* + * Allocate a new page for page migration based on vma policy. + * Start assuming that page is mapped by vma pointed to by @private. + * Search forward from there, if not. N.B., this assumes that the + * list of pages handed to migrate_pages()--which is how we get here-- + * is in virtual address order. + */ +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + unsigned long uninitialized_var(address); + + while (vma) { + address = page_address_in_vma(page, vma); + if (address != -EFAULT) + break; + vma = vma->vm_next; + } + + /* + * if !vma, alloc_page_vma() will use task or system default policy + */ + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); +} +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + return NULL; +} +#endif + +static long do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if (flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + new = mpol_new(mode, mode_flags, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", + start, start + len, mode, mode_flags, + nmask ? nodes_addr(*nmask)[0] : -1); + + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + return err; + } + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + + if (!list_empty(&pagelist)) + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + + up_write(&mm->mmap_sem); + mpol_put(new); + return err; +} + +/* + * User space interface with variable sized bitmaps for nodelists. + */ + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; +} + +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, unsigned long __user *, nmask, + unsigned long, maxnode, unsigned, flags) +{ + nodemask_t nodes; + int err; + unsigned short mode_flags; + + mode_flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if (mode >= MPOL_MAX) + return -EINVAL; + if ((mode_flags & MPOL_F_STATIC_NODES) && + (mode_flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, mode_flags, &nodes, flags); +} + +/* Set the process memory policy */ +SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, + unsigned long, maxnode) +{ + int err; + nodemask_t nodes; + unsigned short flags; + + flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if ((unsigned int)mode >= MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, flags, &nodes); +} + +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_vpid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { + err = -EINVAL; + goto out; + } + + err = security_task_movememory(task); + if (err) + goto out; + + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + +/* Retrieve NUMA policy */ +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) +{ + int err; + int uninitialized_var(pval); + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} + +#ifdef CONFIG_COMPAT + +asmlinkage long compat_sys_get_mempolicy(int __user *policy, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, + compat_ulong_t addr, compat_ulong_t flags) +{ + long err; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) + nm = compat_alloc_user_space(alloc_size); + + err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); + + if (!err && nmask) { + err = copy_from_user(bm, nm, alloc_size); + /* ensure entire bitmap is zeroed */ + err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); + err |= compat_put_bitmap(nmask, bm, nr_bits); + } + + return err; +} + +asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(bm, nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, bm, alloc_size); + } + + if (err) + return -EFAULT; + + return sys_set_mempolicy(mode, nm, nr_bits+1); +} + +asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, + compat_ulong_t mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode, compat_ulong_t flags) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + nodemask_t bm; + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, nodes_addr(bm), alloc_size); + } + + if (err) + return -EFAULT; + + return sys_mbind(start, len, mode, nm, nr_bits+1, flags); +} + +#endif + +/* + * get_vma_policy(@task, @vma, @addr) + * @task - task for fallback if vma policy == default + * @vma - virtual memory area whose policy is sought + * @addr - address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to @task or system default policy, as necessary. + * Current or other task's task mempolicy and non-shared vma policies + * are protected by the task's mmap_sem, which must be held for read by + * the caller. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. + */ +static struct mempolicy *get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = task->mempolicy; + + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + struct mempolicy *vpol = vma->vm_ops->get_policy(vma, + addr); + if (vpol) + pol = vpol; + } else if (vma->vm_policy) + pol = vma->vm_policy; + } + if (!pol) + pol = &default_policy; + return pol; +} + +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) +{ + /* Lower zones don't get a nodemask applied for MPOL_BIND */ + if (unlikely(policy->mode == MPOL_BIND) && + gfp_zone(gfp) >= policy_zone && + cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) + return &policy->v.nodes; + + return NULL; +} + +/* Return a zonelist indicated by gfp for node representing a mempolicy */ +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +{ + int nd = numa_node_id(); + + switch (policy->mode) { + case MPOL_PREFERRED: + if (!(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + break; + case MPOL_BIND: + /* + * Normally, MPOL_BIND allocations are node-local within the + * allowed nodemask. However, if __GFP_THISNODE is set and the + * current node is part of the mask, we use the zonelist for + * the first node in the mask instead. + */ + if (unlikely(gfp & __GFP_THISNODE) && + unlikely(!node_isset(nd, policy->v.nodes))) + nd = first_node(policy->v.nodes); + break; + case MPOL_INTERLEAVE: /* should not happen */ + break; + default: + BUG(); + } + return node_zonelist(nd, gfp); +} + +/* Do dynamic interleaving for a process */ +static unsigned interleave_nodes(struct mempolicy *policy) +{ + unsigned nid, next; + struct task_struct *me = current; + + nid = me->il_next; + next = next_node(nid, policy->v.nodes); + if (next >= MAX_NUMNODES) + next = first_node(policy->v.nodes); + if (next < MAX_NUMNODES) + me->il_next = next; + return nid; +} + +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + * @policy must be protected by freeing by the caller. If @policy is + * the current task's mempolicy, this protection is implicit, as only the + * task can change it's policy. The system default policy requires no + * such protection. + */ +unsigned slab_node(struct mempolicy *policy) +{ + if (!policy || policy->flags & MPOL_F_LOCAL) + return numa_node_id(); + + switch (policy->mode) { + case MPOL_PREFERRED: + /* + * handled MPOL_F_LOCAL above + */ + return policy->v.preferred_node; + + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: { + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + struct zonelist *zonelist; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + (void)first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes, + &zone); + return zone->node; + } + + default: + BUG(); + } +} + +/* Do static interleaving for a VMA with known offset. */ +static unsigned offset_il_node(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long off) +{ + unsigned nnodes = nodes_weight(pol->v.nodes); + unsigned target; + int c; + int nid = -1; + + if (!nnodes) + return numa_node_id(); + target = (unsigned int)off % nnodes; + c = 0; + do { + nid = next_node(nid, pol->v.nodes); + c++; + } while (c <= target); + return nid; +} + +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +#ifdef CONFIG_HUGETLBFS +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma = virtual memory area whose policy is sought + * @addr = address in @vma for shared policy lookup and interleave policy + * @gfp_flags = for requested zone + * @mpol = pointer to mempolicy pointer for reference counted mempolicy + * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * + * Returns a zonelist suitable for a huge page allocation and a pointer + * to the struct mempolicy for conditional unref after allocation. + * If the effective policy is 'BIND, returns a pointer to the mempolicy's + * @nodemask for filtering the zonelist. + */ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags, struct mempolicy **mpol, + nodemask_t **nodemask) +{ + struct zonelist *zl; + + *mpol = get_vma_policy(current, vma, addr); + *nodemask = NULL; /* assume !MPOL_BIND */ + + if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + zl = node_zonelist(interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))), gfp_flags); + } else { + zl = policy_zonelist(gfp_flags, *mpol); + if ((*mpol)->mode == MPOL_BIND) + *nodemask = &(*mpol)->v.nodes; + } + return zl; +} +#endif + +/* Allocate a page in interleaved policy. + Own path because it needs to do special accounting. */ +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) +{ + struct zonelist *zl; + struct page *page; + + zl = node_zonelist(nid, gfp); + page = __alloc_pages(gfp, order, zl); + if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); + return page; +} + +/** + * alloc_page_vma - Allocate a page for a VMA. + * + * @gfp: + * %GFP_USER user allocation. + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * + * This function allocates a page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with the mm_sem of the vma hold. + */ +struct page * +alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; + + cpuset_update_task_memory_state(); + + if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + mpol_cond_put(pol); + return alloc_page_interleave(gfp, 0, nid); + } + zl = policy_zonelist(gfp, pol); + if (unlikely(mpol_needs_cond_ref(pol))) { + /* + * slow path: ref counted shared policy + */ + struct page *page = __alloc_pages_nodemask(gfp, 0, + zl, policy_nodemask(gfp, pol)); + __mpol_put(pol); + return page; + } + /* + * fast path: default or task policy + */ + return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); +} + +/** + * alloc_pages_current - Allocate pages. + * + * @gfp: + * %GFP_USER user allocation, + * %GFP_KERNEL kernel allocation, + * %GFP_HIGHMEM highmem allocation, + * %GFP_FS don't call back into a file system. + * %GFP_ATOMIC don't sleep. + * @order: Power of two of allocation size in pages. 0 is a single page. + * + * Allocate a page from the kernel page pool. When not in + * interrupt context and apply the current process NUMA policy. + * Returns NULL when no page can be allocated. + * + * Don't call cpuset_update_task_memory_state() unless + * 1) it's ok to take cpuset_sem (can WAIT), and + * 2) allocating for current task (not interrupt). + */ +struct page *alloc_pages_current(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = current->mempolicy; + + if ((gfp & __GFP_WAIT) && !in_interrupt()) + cpuset_update_task_memory_state(); + if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) + pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (pol->mode == MPOL_INTERLEAVE) + return alloc_page_interleave(gfp, order, interleave_nodes(pol)); + return __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); +} +EXPORT_SYMBOL(alloc_pages_current); + +/* + * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + */ + +/* Slow path of a mempolicy duplicate */ +struct mempolicy *__mpol_dup(struct mempolicy *old) +{ + struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + + if (!new) + return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } + *new = *old; + atomic_set(&new->refcnt, 1); + return new; +} + +/* + * If *frompol needs [has] an extra ref, copy *frompol to *tompol , + * eliminate the * MPOL_F_* flags that require conditional ref and + * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly + * after return. Use the returned value. + * + * Allows use of a mempolicy for, e.g., multiple allocations with a single + * policy lookup, even if the policy needs/has extra ref on lookup. + * shmem_readahead needs this. + */ +struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, + struct mempolicy *frompol) +{ + if (!mpol_needs_cond_ref(frompol)) + return frompol; + + *tompol = *frompol; + tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ + __mpol_put(frompol); + return tompol; +} + +static int mpol_match_intent(const struct mempolicy *a, + const struct mempolicy *b) +{ + if (a->flags != b->flags) + return 0; + if (!mpol_store_user_nodemask(a)) + return 1; + return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); +} + +/* Slow path of a mempolicy comparison */ +int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (!a || !b) + return 0; + if (a->mode != b->mode) + return 0; + if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) + return 0; + switch (a->mode) { + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + return nodes_equal(a->v.nodes, b->v.nodes); + case MPOL_PREFERRED: + return a->v.preferred_node == b->v.preferred_node && + a->flags == b->flags; + default: + BUG(); + return 0; + } +} + +/* + * Shared memory backing store policy support. + * + * Remember policies even when nobody has shared memory mapped. + * The policies are kept in Red-Black tree linked from the inode. + * They are protected by the sp->lock spinlock, which should be held + * for any accesses to the tree. + */ + +/* lookup first element intersecting start-end */ +/* Caller holds sp->lock */ +static struct sp_node * +sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +{ + struct rb_node *n = sp->root.rb_node; + + while (n) { + struct sp_node *p = rb_entry(n, struct sp_node, nd); + + if (start >= p->end) + n = n->rb_right; + else if (end <= p->start) + n = n->rb_left; + else + break; + } + if (!n) + return NULL; + for (;;) { + struct sp_node *w = NULL; + struct rb_node *prev = rb_prev(n); + if (!prev) + break; + w = rb_entry(prev, struct sp_node, nd); + if (w->end <= start) + break; + n = prev; + } + return rb_entry(n, struct sp_node, nd); +} + +/* Insert a new shared policy into the list. */ +/* Caller holds sp->lock */ +static void sp_insert(struct shared_policy *sp, struct sp_node *new) +{ + struct rb_node **p = &sp->root.rb_node; + struct rb_node *parent = NULL; + struct sp_node *nd; + + while (*p) { + parent = *p; + nd = rb_entry(parent, struct sp_node, nd); + if (new->start < nd->start) + p = &(*p)->rb_left; + else if (new->end > nd->end) + p = &(*p)->rb_right; + else + BUG(); + } + rb_link_node(&new->nd, parent, p); + rb_insert_color(&new->nd, &sp->root); + pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, + new->policy ? new->policy->mode : 0); +} + +/* Find shared policy intersecting idx */ +struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + struct mempolicy *pol = NULL; + struct sp_node *sn; + + if (!sp->root.rb_node) + return NULL; + spin_lock(&sp->lock); + sn = sp_lookup(sp, idx, idx+1); + if (sn) { + mpol_get(sn->policy); + pol = sn->policy; + } + spin_unlock(&sp->lock); + return pol; +} + +static void sp_delete(struct shared_policy *sp, struct sp_node *n) +{ + pr_debug("deleting %lx-l%lx\n", n->start, n->end); + rb_erase(&n->nd, &sp->root); + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + +static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) +{ + struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + + if (!n) + return NULL; + n->start = start; + n->end = end; + mpol_get(pol); + pol->flags |= MPOL_F_SHARED; /* for unref */ + n->policy = pol; + return n; +} + +/* Replace a policy range. */ +static int shared_policy_replace(struct shared_policy *sp, unsigned long start, + unsigned long end, struct sp_node *new) +{ + struct sp_node *n, *new2 = NULL; + +restart: + spin_lock(&sp->lock); + n = sp_lookup(sp, start, end); + /* Take care of old policies in the same range. */ + while (n && n->start < end) { + struct rb_node *next = rb_next(&n->nd); + if (n->start >= start) { + if (n->end <= end) + sp_delete(sp, n); + else + n->start = end; + } else { + /* Old policy spanning whole new range. */ + if (n->end > end) { + if (!new2) { + spin_unlock(&sp->lock); + new2 = sp_alloc(end, n->end, n->policy); + if (!new2) + return -ENOMEM; + goto restart; + } + n->end = start; + sp_insert(sp, new2); + new2 = NULL; + break; + } else + n->end = start; + } + if (!next) + break; + n = rb_entry(next, struct sp_node, nd); + } + if (new) + sp_insert(sp, new); + spin_unlock(&sp->lock); + if (new2) { + mpol_put(new2->policy); + kmem_cache_free(sn_cache, new2); + } + return 0; +} + +/** + * mpol_shared_policy_init - initialize shared policy for inode + * @sp: pointer to inode shared policy + * @mpol: struct mempolicy to install + * + * Install non-NULL @mpol in inode's shared policy rb-tree. + * On entry, the current task has a reference on a non-NULL @mpol. + * This must be released on exit. + */ +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) +{ + sp->root = RB_ROOT; /* empty tree == default mempolicy */ + spin_lock_init(&sp->lock); + + if (mpol) { + struct vm_area_struct pvma; + struct mempolicy *new; + + /* contextualize the tmpfs mount point mempolicy */ + new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (IS_ERR(new)) + return; /* no valid nodemask intersection */ + + /* Create pseudo-vma that contains just the policy */ + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_end = TASK_SIZE; /* policy covers entire file */ + mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + mpol_put(new); /* drop initial ref */ + } +} + +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, struct mempolicy *npol) +{ + int err; + struct sp_node *new = NULL; + unsigned long sz = vma_pages(vma); + + pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", + vma->vm_pgoff, + sz, npol ? npol->mode : -1, + npol ? npol->flags : -1, + npol ? nodes_addr(npol->v.nodes)[0] : -1); + + if (npol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (!new) + return -ENOMEM; + } + err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + if (err && new) + kmem_cache_free(sn_cache, new); + return err; +} + +/* Free a backing policy store on inode delete. */ +void mpol_free_shared_policy(struct shared_policy *p) +{ + struct sp_node *n; + struct rb_node *next; + + if (!p->root.rb_node) + return; + spin_lock(&p->lock); + next = rb_first(&p->root); + while (next) { + n = rb_entry(next, struct sp_node, nd); + next = rb_next(&n->nd); + rb_erase(&n->nd, &p->root); + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); + } + spin_unlock(&p->lock); +} + +/* assumes fs == KERNEL_DS */ +void __init numa_policy_init(void) +{ + nodemask_t interleave_nodes; + unsigned long largest = 0; + int nid, prefer = 0; + + policy_cache = kmem_cache_create("numa_policy", + sizeof(struct mempolicy), + 0, SLAB_PANIC, NULL); + + sn_cache = kmem_cache_create("shared_policy_node", + sizeof(struct sp_node), + 0, SLAB_PANIC, NULL); + + /* + * Set interleaving policy for system init. Interleaving is only + * enabled across suitably sized nodes (default is >= 16MB), or + * fall back to the largest node if they're all smaller. + */ + nodes_clear(interleave_nodes); + for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long total_pages = node_present_pages(nid); + + /* Preserve the largest node */ + if (largest < total_pages) { + largest = total_pages; + prefer = nid; + } + + /* Interleave this node? */ + if ((total_pages << PAGE_SHIFT) >= (16 << 20)) + node_set(nid, interleave_nodes); + } + + /* All too small, use the largest */ + if (unlikely(nodes_empty(interleave_nodes))) + node_set(prefer, interleave_nodes); + + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) + printk("numa_policy_init: interleaving failed\n"); +} + +/* Reset policy of current process to default */ +void numa_default_policy(void) +{ + do_set_mempolicy(MPOL_DEFAULT, 0, NULL); +} + +/* + * Parse and format mempolicy from/to strings + */ + +/* + * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag + * Used only for mpol_parse_str() and mpol_to_str() + */ +#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) +static const char * const policy_types[] = + { "default", "prefer", "bind", "interleave", "local" }; + + +#ifdef CONFIG_TMPFS +/** + * mpol_parse_str - parse string to mempolicy + * @str: string containing mempolicy to parse + * @mpol: pointer to struct mempolicy pointer, returned on success. + * @no_context: flag whether to "contextualize" the mempolicy + * + * Format of input: + * <mode>[=<flags>][:<nodelist>] + * + * if @no_context is true, save the input nodemask in w.user_nodemask in + * the returned mempolicy. This will be used to "clone" the mempolicy in + * a specific context [cpuset] at a later time. Used to parse tmpfs mpol + * mount option. Note that if 'static' or 'relative' mode flags were + * specified, the input nodemask will already have been saved. Saving + * it again is redundant, but safe. + * + * On success, returns 0, else 1 + */ +int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +{ + struct mempolicy *new = NULL; + unsigned short uninitialized_var(mode); + unsigned short uninitialized_var(mode_flags); + nodemask_t nodes; + char *nodelist = strchr(str, ':'); + char *flags = strchr(str, '='); + int i; + int err = 1; + + if (nodelist) { + /* NUL-terminate mode or flags string */ + *nodelist++ = '\0'; + if (nodelist_parse(nodelist, nodes)) + goto out; + if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) + goto out; + } else + nodes_clear(nodes); + + if (flags) + *flags++ = '\0'; /* terminate mode string */ + + for (i = 0; i <= MPOL_LOCAL; i++) { + if (!strcmp(str, policy_types[i])) { + mode = i; + break; + } + } + if (i > MPOL_LOCAL) + goto out; + + switch (mode) { + case MPOL_PREFERRED: + /* + * Insist on a nodelist of one node only + */ + if (nodelist) { + char *rest = nodelist; + while (isdigit(*rest)) + rest++; + if (!*rest) + err = 0; + } + break; + case MPOL_INTERLEAVE: + /* + * Default to online nodes with memory if no nodelist + */ + if (!nodelist) + nodes = node_states[N_HIGH_MEMORY]; + err = 0; + break; + case MPOL_LOCAL: + /* + * Don't allow a nodelist; mpol_new() checks flags + */ + if (nodelist) + goto out; + mode = MPOL_PREFERRED; + break; + + /* + * case MPOL_BIND: mpol_new() enforces non-empty nodemask. + * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. + */ + } + + mode_flags = 0; + if (flags) { + /* + * Currently, we only support two mutually exclusive + * mode flags. + */ + if (!strcmp(flags, "static")) + mode_flags |= MPOL_F_STATIC_NODES; + else if (!strcmp(flags, "relative")) + mode_flags |= MPOL_F_RELATIVE_NODES; + else + err = 1; + } + + new = mpol_new(mode, mode_flags, &nodes); + if (IS_ERR(new)) + err = 1; + else if (no_context) + new->w.user_nodemask = nodes; /* save for contextualization */ + +out: + /* Restore string for error message */ + if (nodelist) + *--nodelist = ':'; + if (flags) + *--flags = '='; + if (!err) + *mpol = new; + return err; +} +#endif /* CONFIG_TMPFS */ + +/** + * mpol_to_str - format a mempolicy structure for printing + * @buffer: to contain formatted mempolicy string + * @maxlen: length of @buffer + * @pol: pointer to mempolicy to be formatted + * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask + * + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +{ + char *p = buffer; + int l; + nodemask_t nodes; + unsigned short mode; + unsigned short flags = pol ? pol->flags : 0; + + /* + * Sanity check: room for longest mode, flag and some nodes + */ + VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + + if (!pol || pol == &default_policy) + mode = MPOL_DEFAULT; + else + mode = pol->mode; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + if (flags & MPOL_F_LOCAL) + mode = MPOL_LOCAL; /* pseudo-policy */ + else + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + if (no_context) + nodes = pol->w.user_nodemask; + else + nodes = pol->v.nodes; + break; + + default: + BUG(); + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (flags & MPOL_MODE_FLAGS) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + + /* + * Currently, the only defined flags are mutually exclusive + */ + if (flags & MPOL_F_STATIC_NODES) + p += snprintf(p, buffer + maxlen - p, "static"); + else if (flags & MPOL_F_RELATIVE_NODES) + p += snprintf(p, buffer + maxlen - p, "relative"); + } + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = ':'; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private, int pte_dirty) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; + + if (PageSwapCache(page)) + md->swapcache++; + + if (PageActive(page) || PageUnevictable(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; + + if (PageAnon(page)) + md->anon++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + + for (addr = start; addr < end; addr += sz) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, + addr & huge_page_mask(h)); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +int show_numa_map(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); + } + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + kfree(md); + + if (m->count < m->size) + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; + return 0; +} diff --git a/mm/mempool.c b/mm/mempool.c new file mode 100644 index 0000000..a46eb1b --- /dev/null +++ b/mm/mempool.c @@ -0,0 +1,340 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + */ + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + BUG_ON(pool->curr_nr <= 0); + return pool->elements[--pool->curr_nr]; +} + +static void free_pool(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc() and mempool_free() + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc() function is not called + * from IRQ contexts. + */ +mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); +} +EXPORT_SYMBOL(mempool_create); + +mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, int node_id) +{ + mempool_t *pool; + pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); + if (!pool) + return NULL; + pool->elements = kmalloc_node(min_nr * sizeof(void *), + GFP_KERNEL, node_id); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (unlikely(!element)) { + free_pool(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} +EXPORT_SYMBOL(mempool_create_node); + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { + while (new_min_nr < pool->curr_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(new_min_nr <= pool->min_nr)) { + /* Raced, other resize will do our work */ + spin_unlock_irqrestore(&pool->lock, flags); + kfree(new_elements); + goto out; + } + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(gfp_mask, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + } else { + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); /* Raced */ + goto out; + } + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} +EXPORT_SYMBOL(mempool_resize); + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that all elements have been returned to the pool (ie: + * freed) prior to calling mempool_destroy(). + */ +void mempool_destroy(mempool_t *pool) +{ + /* Check for outstanding elements */ + BUG_ON(pool->curr_nr != pool->min_nr); + free_pool(pool); +} +EXPORT_SYMBOL(mempool_destroy); + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn() function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +{ + void *element; + unsigned long flags; + wait_queue_t wait; + gfp_t gfp_temp; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ + gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ + gfp_mask |= __GFP_NOWARN; /* failures are OK */ + + gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + +repeat_alloc: + + element = pool->alloc(gfp_temp, pool->pool_data); + if (likely(element != NULL)) + return element; + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (!(gfp_mask & __GFP_WAIT)) + return NULL; + + /* Now start performing page reclaim */ + gfp_temp = gfp_mask; + init_wait(&wait); + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + smp_mb(); + if (!pool->curr_nr) { + /* + * FIXME: this should be io_schedule(). The timeout is there + * as a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + } + finish_wait(&pool->wait, &wait); + + goto repeat_alloc; +} +EXPORT_SYMBOL(mempool_alloc); + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (unlikely(element == NULL)) + return; + + smp_mb(); + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} +EXPORT_SYMBOL(mempool_free); + +/* + * A commonly used alloc and free fn. + */ +void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) +{ + struct kmem_cache *mem = pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} +EXPORT_SYMBOL(mempool_alloc_slab); + +void mempool_free_slab(void *element, void *pool_data) +{ + struct kmem_cache *mem = pool_data; + kmem_cache_free(mem, element); +} +EXPORT_SYMBOL(mempool_free_slab); + +/* + * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory + * specified by pool_data + */ +void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)(long)pool_data; + return kmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kmalloc); + +void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + return kzalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kzalloc); + +void mempool_kfree(void *element, void *pool_data) +{ + kfree(element); +} +EXPORT_SYMBOL(mempool_kfree); + +/* + * A simple mempool-backed page allocator that allocates pages + * of the order specified by pool_data. + */ +void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) +{ + int order = (int)(long)pool_data; + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL(mempool_alloc_pages); + +void mempool_free_pages(void *element, void *pool_data) +{ + int order = (int)(long)pool_data; + __free_pages(element, order); +} +EXPORT_SYMBOL(mempool_free_pages); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 0000000..7fc57cc --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,1151 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> + * Hirokazu Takahashi <taka@valinux.co.jp> + * Dave Hansen <haveblue@us.ibm.com> + * Christoph Lameter + */ + +#include <linux/migrate.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/mm_inline.h> +#include <linux/nsproxy.h> +#include <linux/pagevec.h> +#include <linux/rmap.h> +#include <linux/topology.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/writeback.h> +#include <linux/mempolicy.h> +#include <linux/vmalloc.h> +#include <linux/security.h> +#include <linux/memcontrol.h> +#include <linux/syscalls.h> + +#include "internal.h" + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). + */ +int migrate_prep(void) +{ + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +/* + * Add isolated pages on the list back to the LRU under page lock + * to avoid leaking evictable pages back onto unevictable list. + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + list_del(&page->lru); + putback_lru_page(page); + count++; + } + return count; +} + +/* + * Restore a potential migration pte to a working pte entry + */ +static void remove_migration_pte(struct vm_area_struct *vma, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + unsigned long addr = page_address_in_vma(new, vma); + + if (addr == -EFAULT) + return; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; + + /* + * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge. + * Failure is not an option here: we're now expected to remove every + * migration pte, and will cause crashes otherwise. Normally this + * is not an issue: mem_cgroup_prepare_migration bumped up the old + * page_cgroup count for safety, that's now attached to the new page, + * so this charge should just be another incrementation of the count, + * to keep in balance with rmap.c's mem_cgroup_uncharging. But if + * there's been a force_empty, those reference counts may no longer + * be reliable, and this charge can actually fail: oh well, we don't + * make the situation any worse by proceeding as if it had succeeded. + */ + mem_cgroup_charge(new, mm, GFP_ATOMIC); + + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + flush_cache_page(vma, addr, pte_pfn(pte)); + set_pte_at(mm, addr, ptep, pte); + + if (PageAnon(new)) + page_add_anon_rmap(new, vma, addr); + else + page_add_file_rmap(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, pte); + +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Note that remove_file_migration_ptes will only work on regular mappings, + * Nonlinear mappings do not use migration entries. + */ +static void remove_file_migration_ptes(struct page *old, struct page *new) +{ + struct vm_area_struct *vma; + struct address_space *mapping = page_mapping(new); + struct prio_tree_iter iter; + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!mapping) + return; + + spin_lock(&mapping->i_mmap_lock); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) + remove_migration_pte(vma, old, new); + + spin_unlock(&mapping->i_mmap_lock); +} + +/* + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. + */ +static void remove_anon_migration_ptes(struct page *old, struct page *new) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; + + mapping = (unsigned long)new->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + if (PageAnon(new)) + remove_anon_migration_ptes(old, new); + else + remove_file_migration_ptes(old, new); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + /* + * Once radix-tree replacement of page migration started, page_count + * *must* be zero. And, we don't want to call wait_on_page_locked() + * against a page without get_page(). + * So, we use get_page_unless_zero(), here. Even failed, page fault + * will occur again. + */ + if (!get_page_unless_zero(page)) + goto out; + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Replace the page in the mapping. + * + * The number of remaining references must be: + * 1 for anonymous pages without a mapping + * 2 for pages with a mapping + * 3 for pages with a mapping and PagePrivate set. + */ +static int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int expected_count; + void **pslot; + + if (!mapping) { + /* Anonymous page without mapping */ + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + + spin_lock_irq(&mapping->tree_lock); + + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); + + expected_count = 2 + !!PagePrivate(page); + if (page_count(page) != expected_count || + (struct page *)radix_tree_deref_slot(pslot) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + /* + * Now we know that no one else is looking at the page. + */ + get_page(newpage); /* add cache reference */ +#ifdef CONFIG_SWAP + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } +#endif + + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); + /* + * Drop cache reference from old page. + * We know this isn't the last reference. + */ + __put_page(page); + + /* + * If moved to a different zone then also account + * the page for that zone. Other VM counters will be + * taken care of when we establish references to the + * new page and drop references to the old page. + * + * Note that anonymous pages are accounted for + * via NR_FILE_PAGES and NR_ANON_PAGES if they + * are mapped to swap space. + */ + __dec_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + + spin_unlock_irq(&mapping->tree_lock); + + return 0; +} + +/* + * Copy the page to its new location + */ +static void migrate_page_copy(struct page *newpage, struct page *page) +{ + int anon; + + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (TestClearPageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); + SetPageActive(newpage); + } else + unevictable_migrate_page(newpage, page); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + /* + * Want to mark the page and the radix tree as dirty, and + * redo the accounting that clear_page_dirty_for_io undid, + * but we can't use set_page_dirty because that function + * is actually a signal that all of the page has become dirty. + * Wheras only part of our page may be dirty. + */ + __set_page_dirty_nobuffers(newpage); + } + + mlock_migrate_page(newpage, page); + +#ifdef CONFIG_SWAP + ClearPageSwapCache(page); +#endif + ClearPagePrivate(page); + set_page_private(page, 0); + /* page->mapping contains a flag for PageAnon() */ + anon = PageAnon(page); + page->mapping = NULL; + + if (!anon) /* This page was removed from radix-tree. */ + mem_cgroup_uncharge_cache_page(page); + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} + +/************************************************************ + * Migration functions + ***********************************************************/ + +/* Always fail migration. Used for mappings that are not movable */ +int fail_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_move_mapping(mapping, newpage, page); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +#ifdef CONFIG_BLOCK +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + struct buffer_head *bh, *head; + int rc; + + if (!page_has_buffers(page)) + return migrate_page(mapping, newpage, page); + + head = page_buffers(page); + + rc = migrate_page_move_mapping(mapping, newpage, page); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); +#endif + +/* + * Writeback a page to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct page *page) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + return -EAGAIN; + + /* + * A dirty page may imply that the underlying filesystem has + * the page on some queue. So the page must be clean for + * migration. Writeout may mean we loose the lock and the + * page state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. + */ + remove_migration_ptes(page, page); + + rc = mapping->a_ops->writepage(page, &wbc); + + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + lock_page(page); + + return (rc < 0) ? -EIO : -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. + */ +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + if (PageDirty(page)) + return writeout(mapping, page); + + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (PagePrivate(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + return migrate_page(mapping, newpage, page); +} + +/* + * Move a page to a newly allocated page + * The page is locked and all ptes have been successfully removed. + * + * The new page will have replaced the old page if this function + * is successful. + * + * Return value: + * < 0 - error code + * == 0 - success + */ +static int move_to_new_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping; + int rc; + + /* + * Block others from accessing the page when we get around to + * establishing additional references. We are the only one + * holding a reference to the new page at this point. + */ + if (!trylock_page(newpage)) + BUG(); + + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + + mapping = page_mapping(page); + if (!mapping) + rc = migrate_page(mapping, newpage, page); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + + if (!rc) { + remove_migration_ptes(page, newpage); + } else + newpage->mapping = NULL; + + unlock_page(newpage); + + return rc; +} + +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force) +{ + int rc = 0; + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); + int rcu_locked = 0; + int charge = 0; + + if (!newpage) + return -ENOMEM; + + if (page_count(page) == 1) { + /* page was freed from under us. So we are done. */ + goto move_newpage; + } + + charge = mem_cgroup_prepare_migration(page, newpage); + if (charge == -ENOMEM) { + rc = -ENOMEM; + goto move_newpage; + } + /* prepare cgroup just returns 0 or -ENOMEM */ + BUG_ON(charge); + + rc = -EAGAIN; + if (!trylock_page(page)) { + if (!force) + goto move_newpage; + lock_page(page); + } + + if (PageWriteback(page)) { + if (!force) + goto unlock; + wait_on_page_writeback(page); + } + /* + * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrates a page. + * This rcu_read_lock() delays freeing anon_vma pointer until the end + * of migration. File cache pages are no problem because of page_lock() + * File Caches may use write_page() or lock_page() in migration, then, + * just care Anon page here. + */ + if (PageAnon(page)) { + rcu_read_lock(); + rcu_locked = 1; + } + + /* + * Corner case handling: + * 1. When a new swap-cache page is read into, it is added to the LRU + * and treated as swapcache but it has no rmap yet. + * Calling try_to_unmap() against a page->mapping==NULL page will + * trigger a BUG. So handle it here. + * 2. An orphaned page (see truncate_complete_page) might have + * fs-private metadata. The page can be picked up due to memory + * offlining. Everywhere else except page reclaim, the page is + * invisible to the vm, so the page can not be migrated. So try to + * free the metadata, so the page can be freed. + */ + if (!page->mapping) { + if (!PageAnon(page) && PagePrivate(page)) { + /* + * Go direct to try_to_free_buffers() here because + * a) that's what try_to_release_page() would do anyway + * b) we may be under rcu_read_lock() here, so we can't + * use GFP_KERNEL which is what try_to_release_page() + * needs to be effective. + */ + try_to_free_buffers(page); + } + goto rcu_unlock; + } + + /* Establish migration ptes or remove ptes */ + try_to_unmap(page, 1); + + if (!page_mapped(page)) + rc = move_to_new_page(newpage, page); + + if (rc) + remove_migration_ptes(page, page); +rcu_unlock: + if (rcu_locked) + rcu_read_unlock(); + +unlock: + unlock_page(page); + + if (rc != -EAGAIN) { + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); + putback_lru_page(page); + } + +move_newpage: + if (!charge) + mem_cgroup_end_migration(newpage); + + /* + * Move the new page to the LRU. If migration was not successful + * then this will free the page. + */ + putback_lru_page(newpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } + return rc; +} + +/* + * migrate_pages + * + * The function takes one list of pages to migrate and a function + * that determines from the page to be migrated and the private data + * the target of the move and allocates the page. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. All pages will be + * returned to the LRU or freed. + * + * Return: Number of pages not migrated or error code. + */ +int migrate_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + + for(pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + + rc = unmap_and_move(get_new_page, private, + page, pass > 2); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + putback_lru_pages(from); + + if (rc) + return rc; + + return nr_failed + retry; +} + +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, + GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + * The pm array ends with node = MAX_NUMNODES. + */ +static int do_move_page_to_node_array(struct mm_struct *mm, + struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + migrate_prep(); + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma || !vma_migratable(vma)) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page); + if (!err) + list_add_tail(&page->lru, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + err = 0; + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Migrate an array of page address onto an array of nodes and fill + * the corresponding array of status. + */ +static int do_pages_move(struct mm_struct *mm, struct task_struct *task, + unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + struct page_to_node *pm = NULL; + nodemask_t task_nodes; + int err = 0; + int i; + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void __user *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_pm; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out_pm; + + err = -ENODEV; + if (!node_state(node, N_HIGH_MEMORY)) + goto out_pm; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_pm; + + pm[i].node = node; + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out_pm: + vfree(pm); +out: + return err; +} + +/* + * Determine the nodes of an array of pages and store it in an array of status. + */ +static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, + const void __user **pages, int *status) +{ + unsigned long i; + + down_read(&mm->mmap_sem); + + for (i = 0; i < nr_pages; i++) { + unsigned long addr = (unsigned long)(*pages); + struct vm_area_struct *vma; + struct page *page; + int err = -EFAULT; + + vma = find_vma(mm, addr); + if (!vma) + goto set_status; + + page = follow_page(vma, addr, 0); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + *status = err; + + pages++; + status++; + } + + up_read(&mm->mmap_sem); +} + +/* + * Determine the nodes of a user array of pages and store it in + * a user array of status. + */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ +#define DO_PAGES_STAT_CHUNK_NR 16 + const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; + int chunk_status[DO_PAGES_STAT_CHUNK_NR]; + unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; + int err; + + for (i = 0; i < nr_pages; i += chunk_nr) { + if (chunk_nr + i > nr_pages) + chunk_nr = nr_pages - i; + + err = copy_from_user(chunk_pages, &pages[i], + chunk_nr * sizeof(*chunk_pages)); + if (err) { + err = -EFAULT; + goto out; + } + + do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); + + err = copy_to_user(&status[i], chunk_status, + chunk_nr * sizeof(*chunk_status)); + if (err) { + err = -EFAULT; + goto out; + } + } + err = 0; + +out: + return err; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) +{ + struct task_struct *task; + struct mm_struct *mm; + int err; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_vpid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + err = security_task_movememory(task); + if (err) + goto out; + + if (nodes) { + err = do_pages_move(mm, task, nr_pages, pages, nodes, status, + flags); + } else { + err = do_pages_stat(mm, nr_pages, pages, status); + } + +out: + mmput(mm); + return err; +} + +/* + * Call migration functions in the vma_ops that may prepare + * memory in a vm for migration. migration functions may perform + * the migration for vmas that do not have an underlying page struct. + */ +int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, + const nodemask_t *from, unsigned long flags) +{ + struct vm_area_struct *vma; + int err = 0; + + for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops->migrate) { + err = vma->vm_ops->migrate(vma, to, from, flags); + if (err) + break; + } + } + return err; +} +#endif diff --git a/mm/mincore.c b/mm/mincore.c new file mode 100644 index 0000000..8cb508f --- /dev/null +++ b/mm/mincore.c @@ -0,0 +1,229 @@ +/* + * linux/mm/mincore.c + * + * Copyright (C) 1994-2006 Linus Torvalds + */ + +/* + * The mincore() system call. + */ +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/syscalls.h> +#include <linux/swap.h> +#include <linux/swapops.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. + */ +static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) +{ + unsigned char present = 0; + struct page *page; + + /* + * When tmpfs swaps out a page from a file, any process mapping that + * file will not get a swp_entry_t in its pte, but rather it is like + * any other file mapping (ie. marked !present and faulted in with + * tmpfs's .fault). So swapped out tmpfs mappings are tested here. + * + * However when tmpfs moves the page from pagecache and into swapcache, + * it is still in core, but the find_get_page below won't find it. + * No big deal, but make a note of it. + */ + page = find_get_page(mapping, pgoff); + if (page) { + present = PageUptodate(page); + page_cache_release(page); + } + + return present; +} + +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. + */ +static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long nr; + int i; + pgoff_t pgoff; + struct vm_area_struct *vma = find_vma(current->mm, addr); + + /* + * find_vma() didn't find anything above us, or we're + * in an unmapped hole in the address space: ENOMEM. + */ + if (!vma || addr < vma->vm_start) + return -ENOMEM; + + /* + * Calculate how many pages there are left in the last level of the + * PTE array for our address. + */ + nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)); + + /* + * Don't overrun this vma + */ + nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT); + + /* + * Don't return more than the caller asked for + */ + nr = min(nr, pages); + + pgd = pgd_offset(vma->vm_mm, addr); + if (pgd_none_or_clear_bad(pgd)) + goto none_mapped; + pud = pud_offset(pgd, addr); + if (pud_none_or_clear_bad(pud)) + goto none_mapped; + pmd = pmd_offset(pud, addr); + if (pmd_none_or_clear_bad(pmd)) + goto none_mapped; + + ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { + unsigned char present; + pte_t pte = *ptep; + + if (pte_present(pte)) { + present = 1; + + } else if (pte_none(pte)) { + if (vma->vm_file) { + pgoff = linear_page_index(vma, addr); + present = mincore_page(vma->vm_file->f_mapping, + pgoff); + } else + present = 0; + + } else if (pte_file(pte)) { + pgoff = pte_to_pgoff(pte); + present = mincore_page(vma->vm_file->f_mapping, pgoff); + + } else { /* pte is a swap entry */ + swp_entry_t entry = pte_to_swp_entry(pte); + if (is_migration_entry(entry)) { + /* migration entries are always uptodate */ + present = 1; + } else { +#ifdef CONFIG_SWAP + pgoff = entry.val; + present = mincore_page(&swapper_space, pgoff); +#else + WARN_ON(1); + present = 1; +#endif + } + } + + vec[i] = present; + } + pte_unmap_unlock(ptep-1, ptl); + + return nr; + +none_mapped: + if (vma->vm_file) { + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; + } + + return nr; +} + +/* + * The mincore(2) system call. + * + * mincore() returns the memory residency status of the pages in the + * current process's address space specified by [addr, addr + len). + * The status is returned in a vector of bytes. The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, + unsigned char __user *, vec) +{ + long retval; + unsigned long pages; + unsigned char *tmp; + + /* Check the start address: needs to be page-aligned.. */ + if (start & ~PAGE_CACHE_MASK) + return -EINVAL; + + /* ..and we need to be passed a valid user-space range */ + if (!access_ok(VERIFY_READ, (void __user *) start, len)) + return -ENOMEM; + + /* This also avoids any overflows on PAGE_CACHE_ALIGN */ + pages = len >> PAGE_SHIFT; + pages += (len & ~PAGE_MASK) != 0; + + if (!access_ok(VERIFY_WRITE, vec, pages)) + return -EFAULT; + + tmp = (void *) __get_free_page(GFP_USER); + if (!tmp) + return -EAGAIN; + + retval = 0; + while (pages) { + /* + * Do at most PAGE_SIZE entries per iteration, due to + * the temporary buffer size. + */ + down_read(¤t->mm->mmap_sem); + retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + up_read(¤t->mm->mmap_sem); + + if (retval <= 0) + break; + if (copy_to_user(vec, tmp, retval)) { + retval = -EFAULT; + break; + } + pages -= retval; + vec += retval; + start += retval << PAGE_SHIFT; + retval = 0; + } + free_page((unsigned long) tmp); + return retval; +} diff --git a/mm/mlock.c b/mm/mlock.c new file mode 100644 index 0000000..64dca47 --- /dev/null +++ b/mm/mlock.c @@ -0,0 +1,629 @@ +/* + * linux/mm/mlock.c + * + * (C) Copyright 1995 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + */ + +#include <linux/capability.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/syscalls.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/rmap.h> +#include <linux/mmzone.h> +#include <linux/hugetlb.h> + +#include "internal.h" + +int can_do_mlock(void) +{ + if (capable(CAP_IPC_LOCK)) + return 1; + if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + return 1; + return 0; +} +EXPORT_SYMBOL(can_do_mlock); + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). + */ + +/* + * LRU accounting for clear_page_mlock() + */ +void __clear_page_mlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + + if (!page->mapping) { /* truncated ? */ + return; + } + + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + } else { + /* + * We lost the race. the page already moved to evictable list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + } +} + +/* + * Mark page as mlocked if not already. + * If page on LRU, isolate and putback to move to unevictable list. + */ +void mlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } +} + +/* + * called from munlock()/munmap() path with page supposedly on the LRU. + * + * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked + * [in try_to_munlock()] and then attempt to isolate the page. We must + * isolate the page to keep others from messing with its unevictable + * and mlocked state while trying to munlock. However, we pre-clear the + * mlocked state anyway as we might lose the isolation race and we might + * not get another chance to clear PageMlocked. If we successfully + * isolate the page and try_to_munlock() detects other VM_LOCKED vmas + * mapping the page, it will restore the PageMlocked state, unless the page + * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * perhaps redundantly. + * If we lose the isolation race, and the page is mapped by other VM_LOCKED + * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() + * either of which will restore the PageMlocked state by calling + * mlock_vma_page() above, if it can grab the vma's mmap sem. + */ +static void munlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (TestClearPageMlocked(page)) { + dec_zone_page_state(page, NR_MLOCK); + if (!isolate_lru_page(page)) { + int ret = try_to_munlock(page); + /* + * did try_to_unlock() succeed or punt? + */ + if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); + } else { + /* + * We lost the race. let try_to_unmap() deal + * with it. At least we get the page state and + * mlock stats right. However, page is still on + * the noreclaim list. We'll fix that up when + * the page is eventually freed or we scan the + * noreclaim list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } + } +} + +/** + * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @mlock: 0 indicate munlock, otherwise mlock. + * + * If @mlock == 0, unlock an mlocked range; + * else mlock the range of pages. This takes care of making the pages present , + * too. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held for at least read. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + struct page *pages[16]; /* 16 gives a reasonable batch */ + int nr_pages = (end - start) / PAGE_SIZE; + int ret = 0; + int gup_flags = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && + (atomic_read(&mm->mm_users) != 0)); + + /* + * mlock: don't page populate if page has PROT_NONE permission. + * munlock: the pages always do munlock althrough + * its has PROT_NONE permission. + */ + if (!mlock) + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + + if (vma->vm_flags & VM_WRITE) + gup_flags |= GUP_FLAGS_WRITE; + + while (nr_pages > 0) { + int i; + + cond_resched(); + + /* + * get_user_pages makes pages present if we are + * setting mlock. and this extra reference count will + * disable migration of this page. However, page may + * still be truncated out from under us. + */ + ret = __get_user_pages(current, mm, addr, + min_t(int, nr_pages, ARRAY_SIZE(pages)), + gup_flags, pages, NULL); + /* + * This can happen for, e.g., VM_NONLINEAR regions before + * a page has been allocated and mapped at a given offset, + * or for addresses that map beyond end of a file. + * We'll mlock the the pages if/when they get faulted in. + */ + if (ret < 0) + break; + if (ret == 0) { + /* + * We know the vma is there, so the only time + * we cannot get a single page should be an + * error (ret < 0) case. + */ + WARN_ON(1); + break; + } + + lru_add_drain(); /* push cached pages to LRU */ + + for (i = 0; i < ret; i++) { + struct page *page = pages[i]; + + lock_page(page); + /* + * Because we lock page here and migration is blocked + * by the elevated reference, we need only check for + * page truncation (file-cache only). + */ + if (page->mapping) { + if (mlock) + mlock_vma_page(page); + else + munlock_vma_page(page); + } + unlock_page(page); + put_page(page); /* ref from get_user_pages() */ + + /* + * here we assume that get_user_pages() has given us + * a list of virtually contiguous pages. + */ + addr += PAGE_SIZE; /* for next get_user_pages() */ + nr_pages--; + } + ret = 0; + } + + return ret; /* count entire vma as locked_vm */ +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +/* + * Just make pages present if VM_LOCKED. No-op if unlocking. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + if (mlock && (vma->vm_flags & VM_LOCKED)) + return make_pages_present(start, end); + return 0; +} + +static inline int __mlock_posix_error_return(long retval) +{ + return 0; +} + +#endif /* CONFIG_UNEVICTABLE_LRU */ + +/** + * mlock_vma_pages_range() - mlock pages in specified vma range. + * @vma - the vma containing the specfied address range + * @start - starting address in @vma to mlock + * @end - end address [+1] in @vma to mlock + * + * For mmap()/mremap()/expansion of mlocked vma. + * + * return 0 on success for "normal" vmas. + * + * return number of pages [> 0] to be removed from locked_vm on success + * of "special" vmas. + */ +long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int nr_pages = (end - start) / PAGE_SIZE; + BUG_ON(!(vma->vm_flags & VM_LOCKED)); + + /* + * filter unlockable vmas + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto no_mlock; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current))) { + + __mlock_vma_pages_range(vma, start, end, 1); + + /* Hide errors from mmap() and other callers */ + return 0; + } + + /* + * User mapped kernel pages or huge pages: + * make these pages present to populate the ptes, but + * fall thru' to reset VM_LOCKED--no need to unlock, and + * return nr_pages so these don't get counted against task's + * locked limit. huge pages are already counted against + * locked vm limit. + */ + make_pages_present(start, end); + +no_mlock: + vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ + return nr_pages; /* error or pages NOT mlocked */ +} + + +/* + * munlock_vma_pages_range() - munlock all pages in the vma range.' + * @vma - vma containing range to be munlock()ed. + * @start - start address in @vma of the range + * @end - end of range in @vma. + * + * For mremap(), munmap() and exit(). + * + * Called with @vma VM_LOCKED. + * + * Returns with VM_LOCKED cleared. Callers must be prepared to + * deal with this. + * + * We don't save and restore VM_LOCKED here because pages are + * still on lru. In unmap path, pages might be scanned by reclaim + * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * free them. This will result in freeing mlocked pages. + */ +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + vma->vm_flags &= ~VM_LOCKED; + __mlock_vma_pages_range(vma, start, end, 0); +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes via make_pages_present(). + * + * For vmas that pass the filters, merge/split as appropriate. + */ +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, unsigned int newflags) +{ + struct mm_struct *mm = vma->vm_mm; + pgoff_t pgoff; + int nr_pages; + int ret = 0; + int lock = newflags & VM_LOCKED; + + if (newflags == vma->vm_flags || + (vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; /* don't set VM_LOCKED, don't count */ + + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current)) { + if (lock) + make_pages_present(start, end); + goto out; /* don't set VM_LOCKED, don't count */ + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + if (start != vma->vm_start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + goto out; + } + + if (end != vma->vm_end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + goto out; + } + +success: + /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; + + /* + * vm_flags is protected by the mmap_sem held in write mode. + * It's okay if try_to_unmap_one unmaps a page just after we + * set VM_LOCKED, __mlock_vma_pages_range will bring it back. + */ + vma->vm_flags = newflags; + + if (lock) { + ret = __mlock_vma_pages_range(vma, start, end, 1); + + if (ret > 0) { + mm->locked_vm -= ret; + ret = 0; + } else + ret = __mlock_posix_error_return(ret); /* translate if needed */ + } else { + __mlock_vma_pages_range(vma, start, end, 0); + } + +out: + *prev = vma; + return ret; +} + +static int do_mlock(unsigned long start, size_t len, int on) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * prev; + int error; + + len = PAGE_ALIGN(len); + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + vma = find_vma_prev(current->mm, start, &prev); + if (!vma || vma->vm_start > start) + return -ENOMEM; + + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + newflags = vma->vm_flags | VM_LOCKED; + if (!on) + newflags &= ~VM_LOCKED; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + break; + nstart = tmp; + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + break; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + break; + } + } + return error; +} + +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + unsigned long locked; + unsigned long lock_limit; + int error = -ENOMEM; + + if (!can_do_mlock()) + return -EPERM; + + lru_add_drain_all(); /* flush pagevec */ + + down_write(¤t->mm->mmap_sem); + len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + start &= PAGE_MASK; + + locked = len >> PAGE_SHIFT; + locked += current->mm->locked_vm; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + /* check against resource limits */ + if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) + error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); + return error; +} + +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + start &= PAGE_MASK; + ret = do_mlock(start, len, 0); + up_write(¤t->mm->mmap_sem); + return ret; +} + +static int do_mlockall(int flags) +{ + struct vm_area_struct * vma, * prev = NULL; + unsigned int def_flags = 0; + + if (flags & MCL_FUTURE) + def_flags = VM_LOCKED; + current->mm->def_flags = def_flags; + if (flags == MCL_FUTURE) + goto out; + + for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + unsigned int newflags; + + newflags = vma->vm_flags | VM_LOCKED; + if (!(flags & MCL_CURRENT)) + newflags &= ~VM_LOCKED; + + /* Ignore errors */ + mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + } +out: + return 0; +} + +SYSCALL_DEFINE1(mlockall, int, flags) +{ + unsigned long lock_limit; + int ret = -EINVAL; + + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + goto out; + + ret = -EPERM; + if (!can_do_mlock()) + goto out; + + lru_add_drain_all(); /* flush pagevec */ + + down_write(¤t->mm->mmap_sem); + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + ret = -ENOMEM; + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || + capable(CAP_IPC_LOCK)) + ret = do_mlockall(flags); + up_write(¤t->mm->mmap_sem); +out: + return ret; +} + +SYSCALL_DEFINE0(munlockall) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mlockall(0); + up_write(¤t->mm->mmap_sem); + return ret; +} + +/* + * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB + * shm segments) get accounted against the user_struct instead. + */ +static DEFINE_SPINLOCK(shmlock_user_lock); + +int user_shm_lock(size_t size, struct user_struct *user) +{ + unsigned long lock_limit, locked; + int allowed = 0; + + locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + if (lock_limit == RLIM_INFINITY) + allowed = 1; + lock_limit >>= PAGE_SHIFT; + spin_lock(&shmlock_user_lock); + if (!allowed && + locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + goto out; + get_uid(user); + user->locked_shm += locked; + allowed = 1; +out: + spin_unlock(&shmlock_user_lock); + return allowed; +} + +void user_shm_unlock(size_t size, struct user_struct *user) +{ + spin_lock(&shmlock_user_lock); + user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + spin_unlock(&shmlock_user_lock); + free_uid(user); +} diff --git a/mm/mm_init.c b/mm/mm_init.c new file mode 100644 index 0000000..4e0e265 --- /dev/null +++ b/mm/mm_init.c @@ -0,0 +1,152 @@ +/* + * mm_init.c - Memory initialisation verification and debugging + * + * Copyright 2008 IBM Corporation, 2008 + * Author Mel Gorman <mel@csn.ul.ie> + * + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kobject.h> +#include <linux/module.h> +#include "internal.h" + +#ifdef CONFIG_DEBUG_MEMORY_INIT +int mminit_loglevel; + +#ifndef SECTIONS_SHIFT +#define SECTIONS_SHIFT 0 +#endif + +/* The zonelists are simply reported, validation is manual. */ +void mminit_verify_zonelist(void) +{ + int nid; + + if (mminit_loglevel < MMINIT_VERIFY) + return; + + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + struct zoneref *z; + struct zonelist *zonelist; + int i, listid, zoneid; + + BUG_ON(MAX_ZONELISTS > 2); + for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { + + /* Identify the zone and nodelist */ + zoneid = i % MAX_NR_ZONES; + listid = i / MAX_NR_ZONES; + zonelist = &pgdat->node_zonelists[listid]; + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Print information about the zonelist */ + printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", + listid > 0 ? "thisnode" : "general", nid, + zone->name); + + /* Iterate the zonelist */ + for_each_zone_zonelist(zone, z, zonelist, zoneid) { +#ifdef CONFIG_NUMA + printk(KERN_CONT "%d:%s ", + zone->node, zone->name); +#else + printk(KERN_CONT "0:%s ", zone->name); +#endif /* CONFIG_NUMA */ + } + printk(KERN_CONT "\n"); + } + } +} + +void __init mminit_verify_pageflags_layout(void) +{ + int shift, width; + unsigned long or_mask, add_mask; + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", + "Section %d Node %d Zone %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d\n", + SECTIONS_SHIFT, + NODES_SHIFT, + ZONES_SHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", + "Section %lu Node %lu Zone %lu\n", + (unsigned long)SECTIONS_PGSHIFT, + (unsigned long)NODES_PGSHIFT, + (unsigned long)ZONES_PGSHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", + "Zone ID: %lu -> %lu\n", + (unsigned long)ZONEID_PGOFF, + (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", + "location: %d -> %d unused %d -> %d flags %d -> %d\n", + shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); +#ifdef NODE_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Node not in page flags"); +#endif + + if (SECTIONS_WIDTH) { + shift -= SECTIONS_WIDTH; + BUG_ON(shift != SECTIONS_PGSHIFT); + } + if (NODES_WIDTH) { + shift -= NODES_WIDTH; + BUG_ON(shift != NODES_PGSHIFT); + } + if (ZONES_WIDTH) { + shift -= ZONES_WIDTH; + BUG_ON(shift != ZONES_PGSHIFT); + } + + /* Check for bitmask overlaps */ + or_mask = (ZONES_MASK << ZONES_PGSHIFT) | + (NODES_MASK << NODES_PGSHIFT) | + (SECTIONS_MASK << SECTIONS_PGSHIFT); + add_mask = (ZONES_MASK << ZONES_PGSHIFT) + + (NODES_MASK << NODES_PGSHIFT) + + (SECTIONS_MASK << SECTIONS_PGSHIFT); + BUG_ON(or_mask != add_mask); +} + +void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, + unsigned long nid, unsigned long pfn) +{ + BUG_ON(page_to_nid(page) != nid); + BUG_ON(page_zonenum(page) != zone); + BUG_ON(page_to_pfn(page) != pfn); +} + +static __init int set_mminit_loglevel(char *str) +{ + get_option(&str, &mminit_loglevel); + return 0; +} +early_param("mminit_loglevel", set_mminit_loglevel); +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +struct kobject *mm_kobj; +EXPORT_SYMBOL_GPL(mm_kobj); + +static int __init mm_sysfs_init(void) +{ + mm_kobj = kobject_create_and_add("mm", kernel_kobj); + if (!mm_kobj) + return -ENOMEM; + + return 0; +} + +__initcall(mm_sysfs_init); diff --git a/mm/mmap.c b/mm/mmap.c new file mode 100644 index 0000000..eb61f47 --- /dev/null +++ b/mm/mmap.c @@ -0,0 +1,2482 @@ +/* + * mm/mmap.c + * + * Written by obz. + * + * Address space accounting code <alan@redhat.com> + */ + +#include <linux/slab.h> +#include <linux/backing-dev.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/syscalls.h> +#include <linux/capability.h> +#include <linux/init.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/personality.h> +#include <linux/security.h> +#include <linux/hugetlb.h> +#include <linux/profile.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/mempolicy.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> + +#include <asm/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> + +#include "internal.h" + +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + +#ifndef arch_rebalance_pgtables +#define arch_rebalance_pgtables(addr, len) (addr) +#endif + +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); + +/* + * WARNING: the debugging will use recursive algorithms so never enable this + * unless you know what you are doing. + */ +#undef DEBUG_MM_RB + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + return __pgprot(pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | + pgprot_val(arch_vm_get_page_prot(vm_flags))); +} +EXPORT_SYMBOL(vm_get_page_prot); + +int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + unsigned long n; + + free = global_page_state(NR_FILE_PAGES); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + free -= free / 32; + + if (free > pages) + return 0; + + /* + * nr_free_pages() is very expensive on large systems, + * only call if we're about to fail. + */ + n = nr_free_pages(); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (n <= totalreserve_pages) + goto error; + else + n -= totalreserve_pages; + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + n -= n / 32; + free += n; + + if (free > pages) + return 0; + + goto error; + } + + allowed = (totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100; + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + allowed -= allowed / 32; + allowed += total_swap_pages; + + /* Don't let a single process grow too big: + leave 3% of the size of this process for other processes */ + if (mm) + allowed -= mm->total_vm / 32; + + /* + * cast `allowed' as a signed long because vm_committed_space + * sometimes has a negative value + */ + if (atomic_long_read(&vm_committed_space) < (long)allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + +/* + * Requires inode->i_mapping->i_mmap_lock + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct file *file, struct address_space *mapping) +{ + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&file->f_path.dentry->d_inode->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping->i_mmap_writable--; + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) + list_del_init(&vma->shared.vm_set.list); + else + vma_prio_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Unlink a file-based vm structure from its prio_tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. + */ +void unlink_file_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (file) { + struct address_space *mapping = file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + __remove_shared_vm_struct(vma, file, mapping); + spin_unlock(&mapping->i_mmap_lock); + } +} + +/* + * Close a vm structure and free it, returning the next. + */ +static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + } + mpol_put(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); + return next; +} + +SYSCALL_DEFINE1(brk, unsigned long, brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + unsigned long min_brk; + + down_write(&mm->mmap_sem); + +#ifdef CONFIG_COMPAT_BRK + min_brk = mm->end_code; +#else + min_brk = mm->start_brk; +#endif + if (brk < min_brk) + goto out; + + /* + * Check against rlimit here. If this check is done later after the test + * of oldbrk with newbrk then it can escape the test and let the data + * segment grow beyond its set limit the in case where the limit is + * not page aligned -Ram Gupta + */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +#ifdef DEBUG_MM_RB +static int browse_rb(struct rb_root *root) +{ + int i = 0, j; + struct rb_node *nd, *pn = NULL; + unsigned long prev = 0, pend = 0; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + if (vma->vm_start < prev) + printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; + if (vma->vm_start < pend) + printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + if (vma->vm_start > vma->vm_end) + printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); + i++; + pn = nd; + prev = vma->vm_start; + pend = vma->vm_end; + } + j = 0; + for (nd = pn; nd; nd = rb_prev(nd)) { + j++; + } + if (i != j) + printk("backwards %d, forwards %d\n", j, i), i = 0; + return i; +} + +void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + struct vm_area_struct *tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(&mm->mm_rb); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + BUG_ON(bug); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * +find_vma_prepare(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev, struct rb_node ***rb_link, + struct rb_node ** rb_parent) +{ + struct vm_area_struct * vma; + struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, + struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, + struct rb_node **rb_link, struct rb_node *rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static void __vma_link_file(struct vm_area_struct *vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&file->f_path.dentry->d_inode->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping->i_mmap_writable++; + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); + else + vma_prio_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } +} + +static void +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node **rb_link, + struct rb_node *rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + __anon_vma_link(vma); +} + +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node **rb_link, + struct rb_node *rb_parent) +{ + struct address_space *mapping = NULL; + + if (vma->vm_file) + mapping = vma->vm_file->f_mapping; + + if (mapping) { + spin_lock(&mapping->i_mmap_lock); + vma->vm_truncate_count = mapping->truncate_count; + } + anon_vma_lock(vma); + + __vma_link(mm, vma, prev, rb_link, rb_parent); + __vma_link_file(vma); + + anon_vma_unlock(vma); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + + mm->map_count++; + validate_mm(mm); +} + +/* + * Helper for vma_adjust in the split_vma insert case: + * insert vm structure into list and rbtree and anon_vma, + * but it has already been inserted into prio_tree earlier. + */ +static void +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); + BUG_ON(__vma && __vma->vm_start < vma->vm_end); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; +} + +static inline void +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +/* + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that + * is already present in an i_mmap tree without adjusting the tree. + * The following helper function should be used when such adjustments + * are necessary. The "insert" vma (if any) is to be inserted + * before we drop the necessary locks. + */ +void vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *importer = NULL; + struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; + struct file *file = vma->vm_file; + struct anon_vma *anon_vma = NULL; + long adjust_next = 0; + int remove_next = 0; + + if (next && !insert) { + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and + * perhaps the one after too (mprotect case 6). + */ +again: remove_next = 1 + (end > next->vm_end); + end = next->vm_end; + anon_vma = next->anon_vma; + importer = vma; + } else if (end > next->vm_start) { + /* + * vma expands, overlapping part of the next: + * mprotect case 5 shifting the boundary up. + */ + adjust_next = (end - next->vm_start) >> PAGE_SHIFT; + anon_vma = next->anon_vma; + importer = vma; + } else if (end < vma->vm_end) { + /* + * vma shrinks, and !insert tells it's not + * split_vma inserting another: so it must be + * mprotect case 4 shifting the boundary down. + */ + adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); + anon_vma = next->anon_vma; + importer = next; + } + } + + if (file) { + mapping = file->f_mapping; + if (!(vma->vm_flags & VM_NONLINEAR)) + root = &mapping->i_mmap; + spin_lock(&mapping->i_mmap_lock); + if (importer && + vma->vm_truncate_count != next->vm_truncate_count) { + /* + * unmap_mapping_range might be in progress: + * ensure that the expanding vma is rescanned. + */ + importer->vm_truncate_count = 0; + } + if (insert) { + insert->vm_truncate_count = vma->vm_truncate_count; + /* + * Put into prio_tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(insert); + } + } + + /* + * When changing only vma->vm_end, we don't really need + * anon_vma lock: but is that case worth optimizing out? + */ + if (vma->anon_vma) + anon_vma = vma->anon_vma; + if (anon_vma) { + spin_lock(&anon_vma->lock); + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. + */ + if (importer && !importer->anon_vma) { + importer->anon_vma = anon_vma; + __anon_vma_link(importer); + } + } + + if (root) { + flush_dcache_mmap_lock(mapping); + vma_prio_tree_remove(vma, root); + if (adjust_next) + vma_prio_tree_remove(next, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; + } + + if (root) { + if (adjust_next) + vma_prio_tree_insert(next, root); + vma_prio_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + if (remove_next) { + /* + * vma_merge has merged next into vma, and needs + * us to remove next before dropping the locks. + */ + __vma_unlink(mm, next, vma); + if (file) + __remove_shared_vm_struct(next, file, mapping); + if (next->anon_vma) + __anon_vma_merge(vma, next); + } else if (insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + __insert_vm_struct(mm, insert); + } + + if (anon_vma) + spin_unlock(&anon_vma->lock); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + + if (remove_next) { + if (file) { + fput(file); + if (next->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + mm->map_count--; + mpol_put(vma_policy(next)); + kmem_cache_free(vm_area_cachep, next); + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove another next too. It would clutter + * up the code too much to do both in one go. + */ + if (remove_next == 2) { + next = vma->vm_next; + goto again; + } + } + + validate_mm(mm); +} + +/* + * If the vma has a ->close operation then the driver probably needs to release + * per-vma resources, so we don't attempt to merge those. + */ +static inline int is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, unsigned long vm_flags) +{ + if (vma->vm_flags != vm_flags) + return 0; + if (vma->vm_file != file) + return 0; + if (vma->vm_ops && vma->vm_ops->close) + return 0; + return 1; +} + +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2) +{ + return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * in front of (at a lower virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We don't check here for the merged mmap wrapping around the end of pagecache + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which + * wrap, nor mmaps which cover the final page at index -1UL. + */ +static int +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) +{ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + if (vma->vm_pgoff == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * beyond (at a higher virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + */ +static int +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) +{ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + pgoff_t vm_pglen; + vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (vma->vm_pgoff + vm_pglen == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out + * whether that can be merged with its predecessor or its successor. + * Or both (it neatly fills a hole). + * + * In most cases - when called for mmap, brk or mremap - [addr,end) is + * certain not to be mapped by the time vma_merge is called; but when + * called for mprotect, it is certain to be already mapped (either at + * an offset within prev, or at the start of next), and the flags of + * this area are about to be changed to vm_flags - and the no-change + * case has already been eliminated. + * + * The following mprotect cases have to be considered, where AAAA is + * the area passed down from mprotect_fixup, never extending beyond one + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: + * + * AAAA AAAA AAAA AAAA + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX + * cannot merge might become might become might become + * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or + * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or + * mremap move: PPPPNNNNNNNN 8 + * AAAA + * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN + * might become case 1 below case 2 below case 3 below + * + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: + * mprotect_fixup updates vm_flags & vm_page_prot on successful return. + */ +struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy) +{ + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + struct vm_area_struct *area, *next; + + /* + * We later require that vma->vm_flags == vm_flags, + * so this tests vma->vm_flags & VM_SPECIAL, too. + */ + if (vm_flags & VM_SPECIAL) + return NULL; + + if (prev) + next = prev->vm_next; + else + next = mm->mmap; + area = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = next->vm_next; + + /* + * Can it merge with the predecessor? + */ + if (prev && prev->vm_end == addr && + mpol_equal(vma_policy(prev), policy) && + can_vma_merge_after(prev, vm_flags, + anon_vma, file, pgoff)) { + /* + * OK, it can. Can we now merge in the successor as well? + */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen) && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma)) { + /* cases 1, 6 */ + vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL); + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); + return prev; + } + + /* + * Can this new request be merged in front of next? + */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen)) { + if (prev && addr < prev->vm_end) /* case 4 */ + vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL); + else /* cases 3, 8 */ + vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL); + return area; + } + + return NULL; +} + +/* + * find_mergeable_anon_vma is used by anon_vma_prepare, to check + * neighbouring vmas for a suitable anon_vma, before it goes off + * to allocate a new anon_vma. It checks because a repetitive + * sequence of mprotects and faults may otherwise lead to distinct + * anon_vmas being allocated, preventing vma merge in subsequent + * mprotect. + */ +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *near; + unsigned long vm_flags; + + near = vma->vm_next; + if (!near) + goto try_prev; + + /* + * Since only mprotect tries to remerge vmas, match flags + * which might be mprotected into each other later on. + * Neither mlock nor madvise tries to remerge at present, + * so leave their flags as obstructing a merge. + */ + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); + + if (near->anon_vma && vma->vm_end == near->vm_start && + mpol_equal(vma_policy(vma), vma_policy(near)) && + can_vma_merge_before(near, vm_flags, + NULL, vma->vm_file, vma->vm_pgoff + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) + return near->anon_vma; +try_prev: + /* + * It is potentially slow to have to call find_vma_prev here. + * But it's only on the first write fault on the vma, not + * every time, and we could devise a way to avoid it later + * (e.g. stash info in next's anon_vma_node when assigning + * an anon_vma, or when trying vma_merge). Another time. + */ + BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); + if (!near) + goto none; + + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); + + if (near->anon_vma && near->vm_end == vma->vm_start && + mpol_equal(vma_policy(near), vma_policy(vma)) && + can_vma_merge_after(near, vm_flags, + NULL, vma->vm_file, vma->vm_pgoff)) + return near->anon_vma; +none: + /* + * There's no absolute need to look only at touching neighbours: + * we could search further afield for "compatible" anon_vmas. + * But it would probably just be a waste of time searching, + * or lead to too many vmas hanging off the same anon_vma. + * We're trying to allow mprotect remerging later on, + * not trying to minimize memory used for anon_vmas. + */ + return NULL; +} + +#ifdef CONFIG_PROC_FS +void vm_stat_account(struct mm_struct *mm, unsigned long flags, + struct file *file, long pages) +{ + const unsigned long stack_flags + = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); + + if (file) { + mm->shared_vm += pages; + if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) + mm->exec_vm += pages; + } else if (flags & stack_flags) + mm->stack_vm += pages; + if (flags & (VM_RESERVED|VM_IO)) + mm->reserved_vm += pages; +} +#endif /* CONFIG_PROC_FS */ + +/* + * The caller must hold down_write(current->mm->mmap_sem). + */ + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct inode *inode; + unsigned int vm_flags; + int error; + int accountable = 1; + unsigned long reqprot = prot; + + /* + * Does the application expect PROT_READ to imply PROT_EXEC? + * + * (the exception is when the underlying filesystem is noexec + * mounted, in which case we dont add PROT_EXEC.) + */ + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) + if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) + prot |= PROT_EXEC; + + if (!len) + return -EINVAL; + + if (!(flags & MAP_FIXED)) + addr = round_hint_to_min(addr); + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len || len > TASK_SIZE) + return -ENOMEM; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EOVERFLOW; + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + if (flags & MAP_LOCKED) { + if (!can_do_mlock()) + return -EPERM; + vm_flags |= VM_LOCKED; + } + + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + + inode = file ? file->f_path.dentry->d_inode : NULL; + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) + return -EACCES; + + /* + * Make sure we don't allow writing to an append-only + * file.. + */ + if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* + * Make sure there are no mandatory locks on the file. + */ + if (locks_verify_locked(inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (vm_flags & VM_EXEC) + return -EPERM; + vm_flags &= ~VM_MAYEXEC; + } + if (is_file_hugepages(file)) + accountable = 0; + + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + break; + + default: + return -EINVAL; + } + } else { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + /* + * Ignore pgoff. + */ + pgoff = 0; + vm_flags |= VM_SHARED | VM_MAYSHARE; + break; + case MAP_PRIVATE: + /* + * Set pgoff according to addr for anon_vma. + */ + pgoff = addr >> PAGE_SHIFT; + break; + default: + return -EINVAL; + } + } + + error = security_file_mmap(file, reqprot, prot, flags, addr, 0); + if (error) + return error; + + return mmap_region(file, addr, len, flags, vm_flags, pgoff, + accountable); +} +EXPORT_SYMBOL(do_mmap_pgoff); + +/* + * Some shared mappigns will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +int vma_wants_writenotify(struct vm_area_struct *vma) +{ + unsigned int vm_flags = vma->vm_flags; + + /* If it was private or non-writable, the write bit is already clear */ + if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + return 0; + + /* The backer wishes to know when pages are first written to? */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + return 1; + + /* The open routine did something to the protections already? */ + if (pgprot_val(vma->vm_page_prot) != + pgprot_val(vm_get_page_prot(vm_flags))) + return 0; + + /* Specialty mapping? */ + if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + return 0; + + /* Can the mapping track the dirty pages? */ + return vma->vm_file && vma->vm_file->f_mapping && + mapping_cap_account_dirty(vma->vm_file->f_mapping); +} + +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, unsigned long flags, + unsigned int vm_flags, unsigned long pgoff, + int accountable) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vm_area_struct *merged_vma; + int correct_wcount = 0; + int error; + struct rb_node **rb_link, *rb_parent; + unsigned long charged = 0; + struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + + /* Clear old maps */ + error = -ENOMEM; +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (flags & MAP_NORESERVE) + vm_flags |= VM_NORESERVE; + + if (accountable && (!(flags & MAP_NORESERVE) || + sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { + if (vm_flags & VM_SHARED) { + /* Check memory availability in shmem_file_setup? */ + vm_flags |= VM_ACCOUNT; + } else if (vm_flags & VM_WRITE) { + /* + * Private writable mapping: check memory availability + */ + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + } + + /* + * Can we just expand an old private anonymous mapping? + * The VM_SHARED test is necessary because shmem_zero_setup + * will create the file object for a shared anonymous map below. + */ + if (!file && !(vm_flags & VM_SHARED)) { + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + } + + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->vm_pgoff = pgoff; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + if (vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform + * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) + * that memory reservation must be checked; but that reservation + * belongs to shared memory object, not to vma: so now clear it. + */ + if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) + vma->vm_flags &= ~VM_ACCOUNT; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + pgoff = vma->vm_pgoff; + vm_flags = vma->vm_flags; + + if (vma_wants_writenotify(vma)) + vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); + + merged_vma = NULL; + if (file) + merged_vma = vma_merge(mm, prev, addr, vma->vm_end, + vma->vm_flags, NULL, file, pgoff, vma_policy(vma)); + if (merged_vma) { + mpol_put(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); + fput(file); + if (vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + vma = merged_vma; + } else { + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; + } + + /* Once vma denies write, undo our temporary denial count */ + if (correct_wcount) + atomic_inc(&inode->i_writecount); +out: + mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + /* + * makes pages present; downgrades, drops, reacquires mmap_sem + */ + long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages < 0) + return nr_pages; /* vma gone! */ + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + make_pages_present(addr, addr + len); + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; +free_vma: + kmem_cache_free(vm_area_cachep, vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + return error; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + if (len > mm->cached_hole_size) { + start_addr = addr = mm->free_area_cache; + } else { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + } + +full_search: + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + addr = TASK_UNMAPPED_BASE; + start_addr = addr; + mm->cached_hole_size = 0; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + /* + * Remember the place where we stopped the search: + */ + mm->free_area_cache = addr + len; + return addr; + } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = vma->vm_end; + } +} +#endif + +void arch_unmap_area(struct mm_struct *mm, unsigned long addr) +{ + /* + * Is this a new hole at the lowest possible address? + */ + if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { + mm->free_area_cache = addr; + mm->cached_hole_size = ~0UL; + } +} + +/* + * This mmap-allocator allocates new areas top-down from below the + * stack's low limit (the base): + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (addr > len) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + + if (mm->mmap_base < len) + goto bottomup; + + addr = mm->mmap_base-len; + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (!vma || addr+len <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + } while (len < vma->vm_start); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + + return addr; +} +#endif + +void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) +{ + /* + * Is this a new hole at the highest possible address? + */ + if (addr > mm->free_area_cache) + mm->free_area_cache = addr; + + /* dont allow allocations above current base */ + if (mm->free_area_cache > mm->mmap_base) + mm->free_area_cache = mm->mmap_base; +} + +unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + + return arch_rebalance_pgtables(addr, len); +} + +EXPORT_SYMBOL(get_unmapped_area); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + struct rb_node * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, + struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +EXPORT_SYMBOL(find_vma); + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma = NULL, *prev = NULL; + struct rb_node * rb_node; + if (!mm) + goto out; + + /* Guard against addr being lower than the first VMA */ + vma = mm->mmap; + + /* Go through the RB tree quickly. */ + rb_node = mm->mm_rb.rb_node; + + while (rb_node) { + struct vm_area_struct *vma_tmp; + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (addr < vma_tmp->vm_end) { + rb_node = rb_node->rb_left; + } else { + prev = vma_tmp; + if (!prev->vm_next || (addr < prev->vm_next->vm_end)) + break; + rb_node = rb_node->rb_right; + } + } + +out: + *pprev = prev; + return prev ? prev->vm_next : vma; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + struct rlimit *rlim = current->signal->rlim; + unsigned long new_start; + + /* address space limit tests */ + if (!may_expand_vm(mm, grow)) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlim[RLIMIT_STACK].rlim_cur) + return -ENOMEM; + + /* mlock limit tests */ + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked; + unsigned long limit; + locked = mm->locked_vm + grow; + limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + if (locked > limit && !capable(CAP_IPC_LOCK)) + return -ENOMEM; + } + + /* Check to ensure the stack will not grow into a hugetlb-only region */ + new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : + vma->vm_end - size; + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory(grow)) + return -ENOMEM; + + /* Ok, everything looks good - let it rip */ + mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + return 0; +} + +#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +/* + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +#ifndef CONFIG_IA64 +static +#endif +int expand_upwards(struct vm_area_struct *vma, unsigned long address) +{ + int error; + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + anon_vma_lock(vma); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + * Also guard against wrapping around to address 0. + */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else { + anon_vma_unlock(vma); + return -ENOMEM; + } + error = 0; + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = acct_stack_growth(vma, size, grow); + if (!error) + vma->vm_end = address; + } + anon_vma_unlock(vma); + return error; +} +#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ + +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + */ +static int expand_downwards(struct vm_area_struct *vma, + unsigned long address) +{ + int error; + + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + + address &= PAGE_MASK; + error = security_file_mmap(NULL, 0, 0, 0, address, 1); + if (error) + return error; + + anon_vma_lock(vma); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_start = address; + vma->vm_pgoff -= grow; + } + } + anon_vma_unlock(vma); + return error; +} + +int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) + return NULL; /* vma gone! */ + } + return prev; +} +#else +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(vma, addr, start) < 0) + return NULL; /* vma gone! */ + } + return vma; +} +#endif + +/* + * Ok - we have the memory areas we should free on the vma list, + * so release them, and do the vma updates. + * + * Called with the mm semaphore held. + */ +static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +{ + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); + do { + long nrpages = vma_pages(vma); + + mm->total_vm -= nrpages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vma = remove_vma(vma); + } while (vma); + validate_mm(mm); +} + +/* + * Get rid of page table information in the indicated region. + * + * Called with the mm semaphore held. + */ +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; + struct mmu_gather *tlb; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + next? next->vm_start: 0); + tlb_finish_mmu(tlb, start, end); +} + +/* + * Create a list of vma's touched by the unmap, removing them from the mm's + * vma list as we go.. + */ +static void +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, unsigned long end) +{ + struct vm_area_struct **insertion_point; + struct vm_area_struct *tail_vma = NULL; + unsigned long addr; + + insertion_point = (prev ? &prev->vm_next : &mm->mmap); + do { + rb_erase(&vma->vm_rb, &mm->mm_rb); + mm->map_count--; + tail_vma = vma; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + *insertion_point = vma; + tail_vma->vm_next = NULL; + if (mm->unmap_area == arch_unmap_area) + addr = prev ? prev->vm_end : mm->mmap_base; + else + addr = vma ? vma->vm_start : mm->mmap_base; + mm->unmap_area(mm, addr); + mm->mmap_cache = NULL; /* Kill the cache. */ +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the tail. + */ +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long addr, int new_below) +{ + struct mempolicy *pol; + struct vm_area_struct *new; + + if (is_vm_hugetlb_page(vma) && (addr & + ~(huge_page_mask(hstate_vma(vma))))) + return -EINVAL; + + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + + if (new_below) + new->vm_end = addr; + else { + new->vm_start = addr; + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + + pol = mpol_dup(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new); + return PTR_ERR(pol); + } + vma_set_policy(new, pol); + + if (new->vm_file) { + get_file(new->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + if (new_below) + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); + else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardinge <jeremy@goop.org> + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end; + struct vm_area_struct *vma, *prev, *last; + + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Find the first overlapping VMA */ + vma = find_vma_prev(mm, start, &prev); + if (!vma) + return 0; + /* we have start < vma->vm_end */ + + /* if it doesn't overlap, we have nothing.. */ + end = start + len; + if (vma->vm_start >= end) + return 0; + + /* + * If we need to split any vma, do it now to save pain later. + * + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially + * unmapped vm_area_struct will remain in use: so lower split_vma + * places tmp vma above, and higher split_vma places tmp vma below. + */ + if (start > vma->vm_start) { + int error = split_vma(mm, vma, start, 0); + if (error) + return error; + prev = vma; + } + + /* Does it split the last one? */ + last = find_vma(mm, end); + if (last && end > last->vm_start) { + int error = split_vma(mm, last, end, 1); + if (error) + return error; + } + vma = prev? prev->vm_next: mm->mmap; + + /* + * unlock any mlock()ed ranges before detaching vmas + */ + if (mm->locked_vm) { + struct vm_area_struct *tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + tmp = tmp->vm_next; + } + } + + /* + * Remove the vma's, and unmap the actual pages + */ + detach_vmas_to_be_unmapped(mm, vma, prev, end); + unmap_region(mm, vma, prev, start, end); + + /* Fix up all other VM information */ + remove_vma_list(mm, vma); + + return 0; +} + +EXPORT_SYMBOL(do_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ + int ret; + struct mm_struct *mm = current->mm; + + profile_munmap(addr); + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +static inline void verify_mm_writelocked(struct mm_struct *mm) +{ +#ifdef CONFIG_DEBUG_VM + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +#endif +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + struct rb_node ** rb_link, * rb_parent; + pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + if ((addr + len) > TASK_SIZE || (addr + len) < addr) + return -EINVAL; + + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; + + error = security_file_mmap(NULL, 0, 0, 0, addr, 1); + if (error) + return error; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + + /* + * mm->mmap_sem is required to protect against another thread + * changing the mappings in case we sleep. + */ + verify_mm_writelocked(mm); + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + /* Can we just expand an old private anonymous mapping? */ + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + } + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = pgoff; + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); + } + return addr; +} + +EXPORT_SYMBOL(do_brk); + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct *mm) +{ + struct mmu_gather *tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + unsigned long end; + + /* mm's last user has gone, and its about to be pulled down */ + mmu_notifier_release(mm); + + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + + arch_exit_mmap(mm); + + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + + lru_add_drain(); + flush_cache_mm(mm); + tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ + /* Use -1 here to ensure all VMAs in the mm are unmapped */ + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(tlb, 0, end); + + /* + * Walk the list again, actually closing and freeing it, + * with preemption enabled, without holding any MM locks. + */ + while (vma) + vma = remove_vma(vma); + + BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_lock is taken here. + */ +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap_pgoff and in do_brk. + */ + if (!vma->vm_file) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + return -ENOMEM; + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, vma_pages(vma))) + return -ENOMEM; + vma_link(mm, vma, prev, rb_link, rb_parent); + return 0; +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff) +{ + struct vm_area_struct *vma = *vmap; + unsigned long vma_start = vma->vm_start; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma, *prev; + struct rb_node **rb_link, *rb_parent; + struct mempolicy *pol; + + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. + */ + if (!vma->vm_file && !vma->anon_vma) + pgoff = addr >> PAGE_SHIFT; + + find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (new_vma) { + /* + * Source vma may have been merged into new_vma + */ + if (vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end) + *vmap = new_vma; + } else { + new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (new_vma) { + *new_vma = *vma; + pol = mpol_dup(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; + } + vma_set_policy(new_vma, pol); + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (new_vma->vm_file) { + get_file(new_vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + } + } + return new_vma; +} + +/* + * Return true if the calling process may expand its vm space by the passed + * number of pages + */ +int may_expand_vm(struct mm_struct *mm, unsigned long npages) +{ + unsigned long cur = mm->total_vm; /* pages */ + unsigned long lim; + + lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + + if (cur + npages > lim) + return 0; + return 1; +} + + +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + pgoff_t pgoff; + struct page **pages; + + /* + * special mappings have no vm_file, and in that case, the mm + * uses vm_pgoff internally. So we have to subtract it from here. + * We are allowed to do this because we are the mm; do not copy + * this code into drivers! + */ + pgoff = vmf->pgoff - vma->vm_pgoff; + + for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + pgoff--; + + if (*pages) { + struct page *page = *pages; + get_page(page); + vmf->page = page; + return 0; + } + + return VM_FAULT_SIGBUS; +} + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; + +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (unlikely(vma == NULL)) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + + vma->vm_ops = &special_mapping_vmops; + vma->vm_private_data = pages; + + if (unlikely(insert_vm_struct(mm, vma))) { + kmem_cache_free(vm_area_cachep, vma); + return -ENOMEM; + } + + mm->total_vm += len >> PAGE_SHIFT; + + return 0; +} + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + /* + * We can safely modify head.next after taking the + * anon_vma->lock. If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->lock. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + } +} + +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_sem in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_sem until mm_drop_all_locks() returns. + * + * mmap_sem in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout (for example populate_range() with + * nonlinear vmas). It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We can take all the locks in random order because the VM code + * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * takes more than one of them in a row. Secondly we're protected + * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = -EINTR; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + + mutex_lock(&mm_all_locks_mutex); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(mm, vma->anon_vma); + } + + ret = 0; + +out_unlock: + if (ret) + mm_drop_all_locks(mm); + + return ret; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->head will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->lock. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + spin_unlock(&anon_vma->lock); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + spin_unlock(&mapping->i_mmap_lock); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_sem cannot be released by the caller until + * mm_drop_all_locks() returns. + */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma) + vm_unlock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 0000000..5f4ef02 --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,277 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * Copyright (C) 2008 SGI + * Christoph Lameter <clameter@sgi.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/rculist.h> +#include <linux/mmu_notifier.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> + +/* + * This function can't run concurrently against mmu_notifier_register + * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap + * runs with mm_users == 0. Other tasks may still invoke mmu notifiers + * in parallel despite there being no task using this mm any more, + * through the vmas outside of the exit_mmap context, such as with + * vmtruncate. This serializes against mmu_notifier_unregister with + * the mmu_notifier_mm->lock in addition to RCU and it serializes + * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * can't go away from under us as exit_mmap holds an mm_count pin + * itself. + */ +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + + spin_lock(&mm->mmu_notifier_mm->lock); + while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { + mn = hlist_entry(mm->mmu_notifier_mm->list.first, + struct mmu_notifier, + hlist); + /* + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than + * to wait ->release to finish and + * mmu_notifier_unregister to return. + */ + hlist_del_init_rcu(&mn->hlist); + /* + * RCU here will block mmu_notifier_unregister until + * ->release returns. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * if ->release runs before mmu_notifier_unregister it + * must be handled as it's the only way for the driver + * to flush all existing sptes and stop the driver + * from establishing any more sptes before all the + * pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + spin_lock(&mm->mmu_notifier_mm->lock); + } + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * synchronize_rcu here prevents mmu_notifier_release to + * return to exit_mmap (which would proceed freeing all pages + * in the mm) until the ->release method returns, if it was + * invoked by mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one + * mm_count is hold by exit_mmap. + */ + synchronize_rcu(); +} + +/* + * If no young bitflag is supported by the hardware, ->clear_flush_young can + * unmap the address and return 1 or 0 depending if the mapping previously + * existed or not. + */ +int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->clear_flush_young) + young |= mn->ops->clear_flush_young(mn, mm, address); + } + rcu_read_unlock(); + + return young; +} + +void __mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_start) + mn->ops->invalidate_range_start(mn, mm, start, end); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_end) + mn->ops->invalidate_range_end(mn, mm, start, end); + } + rcu_read_unlock(); +} + +static int do_mmu_notifier_register(struct mmu_notifier *mn, + struct mm_struct *mm, + int take_mmap_sem) +{ + struct mmu_notifier_mm *mmu_notifier_mm; + int ret; + + BUG_ON(atomic_read(&mm->mm_users) <= 0); + + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + + if (take_mmap_sem) + down_write(&mm->mmap_sem); + ret = mm_take_all_locks(mm); + if (unlikely(ret)) + goto out_cleanup; + + if (!mm_has_notifiers(mm)) { + INIT_HLIST_HEAD(&mmu_notifier_mm->list); + spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; + } + atomic_inc(&mm->mm_count); + + /* + * Serialize the update against mmu_notifier_unregister. A + * side note: mmu_notifier_release can't run concurrently with + * us because we hold the mm_users pin (either implicitly as + * current->mm or explicitly with get_task_mm() or similar). + * We can't race against any other mmu notifier method either + * thanks to mm_take_all_locks(). + */ + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + + mm_drop_all_locks(mm); +out_cleanup: + if (take_mmap_sem) + up_write(&mm->mmap_sem); + /* kfree() does nothing if mmu_notifier_mm is NULL */ + kfree(mmu_notifier_mm); +out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); + return ret; +} + +/* + * Must not hold mmap_sem nor any other VM related lock when calling + * this registration function. Must also ensure mm_users can't go down + * to zero while this runs to avoid races with mmu_notifier_release, + * so mm has to be current->mm or the mm should be pinned safely such + * as with get_task_mm(). If the mm is not current->mm, the mm_users + * pin should be released by calling mmput after mmu_notifier_register + * returns. mmu_notifier_unregister must be always called to + * unregister the notifier. mm_count is automatically pinned to allow + * mmu_notifier_unregister to safely run at any time later, before or + * after exit_mmap. ->release will always be called before exit_mmap + * frees the pages. + */ +int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 1); +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * Same as mmu_notifier_register but here the caller must hold the + * mmap_sem in write mode. + */ +int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 0); +} +EXPORT_SYMBOL_GPL(__mmu_notifier_register); + +/* this is called after the last mmu_notifier_unregister() returned */ +void __mmu_notifier_mm_destroy(struct mm_struct *mm) +{ + BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + kfree(mm->mmu_notifier_mm); + mm->mmu_notifier_mm = LIST_POISON1; /* debug */ +} + +/* + * This releases the mm_count pin automatically and frees the mm + * structure if it was the last user of it. It serializes against + * running mmu notifiers with RCU and against mmu_notifier_unregister + * with the unregister lock + RCU. All sptes must be dropped before + * calling mmu_notifier_unregister. ->release or any other notifier + * method may be invoked concurrently with mmu_notifier_unregister, + * and only after mmu_notifier_unregister returned we're guaranteed + * that ->release or any other method can't run anymore. + */ +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +{ + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + spin_lock(&mm->mmu_notifier_mm->lock); + if (!hlist_unhashed(&mn->hlist)) { + hlist_del_rcu(&mn->hlist); + + /* + * RCU here will force exit_mmap to wait ->release to finish + * before freeing the pages. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * exit_mmap will block in mmu_notifier_release to + * guarantee ->release is called before freeing the + * pages. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + } else + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * Wait any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. + */ + synchronize_rcu(); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 0000000..16ce8b9 --- /dev/null +++ b/mm/mmzone.c @@ -0,0 +1,74 @@ +/* + * linux/mm/mmzone.c + * + * management codes for pgdats and zones. + */ + + +#include <linux/stddef.h> +#include <linux/mmzone.h> +#include <linux/module.h> + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} + +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) +{ +#ifdef CONFIG_NUMA + return node_isset(zonelist_node_idx(zref), *nodes); +#else + return 1; +#endif /* CONFIG_NUMA */ +} + +/* Returns the next zone at or below highest_zoneidx in a zonelist */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone) +{ + /* + * Find the next suitable zone to use for the allocation. + * Only filter based on nodemask if it's set + */ + if (likely(nodes == NULL)) + while (zonelist_zone_idx(z) > highest_zoneidx) + z++; + else + while (zonelist_zone_idx(z) > highest_zoneidx || + (z->zone && !zref_in_nodemask(z, nodes))) + z++; + + *zone = zonelist_zone(z); + return z; +} diff --git a/mm/mprotect.c b/mm/mprotect.c new file mode 100644 index 0000000..2623e29 --- /dev/null +++ b/mm/mprotect.c @@ -0,0 +1,319 @@ +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code <alan@redhat.com> + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/security.h> +#include <linux/mempolicy.h> +#include <linux/personality.h> +#include <linux/syscalls.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/mmu_notifier.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#ifndef pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + return newprot; +} +#endif + +static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) +{ + pte_t *pte, oldpte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + do { + oldpte = *pte; + if (pte_present(oldpte)) { + pte_t ptent; + + ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + + /* + * Avoid taking write faults for pages we know to be + * dirty. + */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + + ptep_modify_prot_commit(mm, addr, pte, ptent); +#ifdef CONFIG_MIGRATION + } else if (!pte_file(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + set_pte_at(mm, addr, pte, + swp_entry_to_pte(entry)); + } +#endif + } + + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +} + +static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + } while (pmd++, addr = next, addr != end); +} + +static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + } while (pud++, addr = next, addr != end); +} + +static void change_protection(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long start = addr; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + } while (pgd++, addr = next, addr != end); + flush_tlb_range(vma, start, end); +} + +int +mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; + unsigned long charged = 0; + pgoff_t pgoff; + int error; + int dirty_accountable = 0; + + if (newflags == oldflags) { + *pprev = vma; + return 0; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. + */ + if (newflags & VM_WRITE) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + VM_SHARED|VM_NORESERVE))) { + charged = nrpages; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + /* + * First try to merge with previous and/or next vma. + */ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *pprev = vma_merge(mm, *pprev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (*pprev) { + vma = *pprev; + goto success; + } + + *pprev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + +success: + /* + * vm_flags and vm_page_prot are protected by the mmap_sem + * held in write mode. + */ + vma->vm_flags = newflags; + vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, + vm_get_page_prot(newflags)); + + if (vma_wants_writenotify(vma)) { + vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); + dirty_accountable = 1; + } + + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); + else + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) +{ + unsigned long vm_flags, nstart, end, tmp, reqprot; + struct vm_area_struct *vma, *prev; + int error = -EINVAL; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (!len) + return 0; + len = PAGE_ALIGN(len); + end = start + len; + if (end <= start) + return -ENOMEM; + if (!arch_validate_prot(prot)) + return -EINVAL; + + reqprot = prot; + /* + * Does the application expect PROT_READ to imply PROT_EXEC: + */ + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + + vm_flags = calc_vm_prot_bits(prot); + + down_write(¤t->mm->mmap_sem); + + vma = find_vma_prev(current->mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } + else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + unsigned long newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + + /* newflags >> 4 shift VM_MAY% in place of VM_% */ + if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { + error = -EACCES; + goto out; + } + + error = security_file_mprotect(vma, reqprot, prot); + if (error) + goto out; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + nstart = tmp; + + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + goto out; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } +out: + up_write(¤t->mm->mmap_sem); + return error; +} diff --git a/mm/mremap.c b/mm/mremap.c new file mode 100644 index 0000000..4207055 --- /dev/null +++ b/mm/mremap.c @@ -0,0 +1,433 @@ +/* + * mm/mremap.c + * + * (C) Copyright 1996 Linus Torvalds + * + * Address space accounting code <alan@redhat.com> + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/mmu_notifier.h> + +#include <asm/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#include "internal.h" + +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + pud = pud_offset(pgd, addr); + if (pud_none_or_clear_bad(pud)) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + + return pmd; +} + +static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + return NULL; + + return pmd; +} + +static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, + unsigned long old_addr, unsigned long old_end, + struct vm_area_struct *new_vma, pmd_t *new_pmd, + unsigned long new_addr) +{ + struct address_space *mapping = NULL; + struct mm_struct *mm = vma->vm_mm; + pte_t *old_pte, *new_pte, pte; + spinlock_t *old_ptl, *new_ptl; + unsigned long old_start; + + old_start = old_addr; + mmu_notifier_invalidate_range_start(vma->vm_mm, + old_start, old_end); + if (vma->vm_file) { + /* + * Subtle point from Rajesh Venkatasubramanian: before + * moving file-based ptes, we must lock vmtruncate out, + * since it might clean the dst vma before the src vma, + * and we propagate stale pages into the dst afterward. + */ + mapping = vma->vm_file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + if (new_vma->vm_truncate_count && + new_vma->vm_truncate_count != vma->vm_truncate_count) + new_vma->vm_truncate_count = 0; + } + + /* + * We don't have to worry about the ordering of src and dst + * pte locks because exclusive mmap_sem prevents deadlock. + */ + old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_ptl = pte_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); + + for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, + new_pte++, new_addr += PAGE_SIZE) { + if (pte_none(*old_pte)) + continue; + pte = ptep_clear_flush(vma, old_addr, old_pte); + pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); + set_pte_at(mm, new_addr, new_pte, pte); + } + + arch_leave_lazy_mmu_mode(); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + pte_unmap_nested(new_pte - 1); + pte_unmap_unlock(old_pte - 1, old_ptl); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); +} + +#define LATENCY_LIMIT (64 * PAGE_SIZE) + +unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len) +{ + unsigned long extent, next, old_end; + pmd_t *old_pmd, *new_pmd; + + old_end = old_addr + len; + flush_cache_range(vma, old_addr, old_end); + + for (; old_addr < old_end; old_addr += extent, new_addr += extent) { + cond_resched(); + next = (old_addr + PMD_SIZE) & PMD_MASK; + if (next - 1 > old_end) + next = old_end; + extent = next - old_addr; + old_pmd = get_old_pmd(vma->vm_mm, old_addr); + if (!old_pmd) + continue; + new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + if (!new_pmd) + break; + next = (new_addr + PMD_SIZE) & PMD_MASK; + if (extent > next - new_addr) + extent = next - new_addr; + if (extent > LATENCY_LIMIT) + extent = LATENCY_LIMIT; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, + new_vma, new_pmd, new_addr); + } + + return len + old_addr - old_end; /* how much done */ +} + +static unsigned long move_vma(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long old_len, + unsigned long new_len, unsigned long new_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma; + unsigned long vm_flags = vma->vm_flags; + unsigned long new_pgoff; + unsigned long moved_len; + unsigned long excess = 0; + unsigned long hiwater_vm; + int split = 0; + + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. + */ + if (mm->map_count >= sysctl_max_map_count - 3) + return -ENOMEM; + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + if (!new_vma) + return -ENOMEM; + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + if (moved_len < old_len) { + /* + * On error, move entries back from new area to old, + * which will succeed since page tables still there, + * and then proceed to unmap new area instead of old. + */ + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + vma = new_vma; + old_len = new_len; + old_addr = new_addr; + new_addr = -ENOMEM; + } + + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vm_flags & VM_ACCOUNT) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (old_addr > vma->vm_start && + old_addr + old_len < vma->vm_end) + split = 1; + } + + /* + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). + */ + hiwater_vm = mm->hiwater_vm; + mm->total_vm += new_len >> PAGE_SHIFT; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + + if (do_munmap(mm, old_addr, old_len) < 0) { + /* OOM: unable to split vma, just get accounts right */ + vm_unacct_memory(excess >> PAGE_SHIFT); + excess = 0; + } + mm->hiwater_vm = hiwater_vm; + + /* Restore VM_ACCOUNT if one or two pieces of vma left */ + if (excess) { + vma->vm_flags |= VM_ACCOUNT; + if (split) + vma->vm_next->vm_flags |= VM_ACCOUNT; + } + + if (vm_flags & VM_LOCKED) { + mm->locked_vm += new_len >> PAGE_SHIFT; + if (new_len > old_len) + mlock_vma_pages_range(new_vma, new_addr + old_len, + new_addr + new_len); + } + + return new_addr; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. + */ + if (!new_len) + goto out; + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + + ret = do_munmap(mm, new_addr, new_len); + if (ret) + goto out; + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + * do_munmap does all the needed commit accounting + */ + if (old_len >= new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + ret = addr; + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + old_len = new_len; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (is_vm_hugetlb_page(vma)) { + ret = -EINVAL; + goto out; + } + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + goto out; + } + if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { + ret = -ENOMEM; + goto out; + } + + if (vma->vm_flags & VM_ACCOUNT) { + charged = (new_len - old_len) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + goto out_nc; + } + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + + vma_adjust(vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL); + + mm->total_vm += pages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + if (vma->vm_flags & VM_LOCKED) { + mm->locked_vm += pages; + mlock_vma_pages_range(vma, addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. + */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + if (!(flags & MREMAP_FIXED)) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff, map_flags); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; + } + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + } + ret = move_vma(vma, addr, old_len, new_len, new_addr); + } +out: + if (ret & ~PAGE_MASK) + vm_unacct_memory(charged); +out_nc: + return ret; +} + +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} diff --git a/mm/msync.c b/mm/msync.c new file mode 100644 index 0000000..9e06d2e --- /dev/null +++ b/mm/msync.c @@ -0,0 +1,103 @@ +/* + * linux/mm/msync.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * The msync() system call. + */ +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/file.h> +#include <linux/syscalls.h> +#include <linux/sched.h> + +/* + * MS_SYNC syncs the entire file - including mappings. + * + * MS_ASYNC does not start I/O (it used to, up to 2.5.67). + * Nor does it marks the relevant pages dirty (it used to up to 2.6.17). + * Now it doesn't do anything, since dirty pages are properly tracked. + * + * The application may now run fsync() to + * write out the dirty pages and wait on the writeout and check the result. + * Or the application may run fadvise(FADV_DONTNEED) against the fd to start + * async writeout immediately. + * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * applications. + */ +SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) +{ + unsigned long end; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int unmapped_error = 0; + int error = -EINVAL; + + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + goto out; + if (start & ~PAGE_MASK) + goto out; + if ((flags & MS_ASYNC) && (flags & MS_SYNC)) + goto out; + error = -ENOMEM; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + error = 0; + if (end == start) + goto out; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -ENOMEM at the end. + */ + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + for (;;) { + struct file *file; + + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out_unlock; + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + start = vma->vm_start; + if (start >= end) + goto out_unlock; + unmapped_error = -ENOMEM; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if ((flags & MS_INVALIDATE) && + (vma->vm_flags & VM_LOCKED)) { + error = -EBUSY; + goto out_unlock; + } + file = vma->vm_file; + start = vma->vm_end; + if ((flags & MS_SYNC) && file && + (vma->vm_flags & VM_SHARED)) { + get_file(file); + up_read(&mm->mmap_sem); + error = do_fsync(file, 0); + fput(file); + if (error || start >= end) + goto out; + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + } else { + if (start >= end) { + error = 0; + goto out_unlock; + } + vma = vma->vm_next; + } + } +out_unlock: + up_read(&mm->mmap_sem); +out: + return error ? : unmapped_error; +} diff --git a/mm/nommu.c b/mm/nommu.c new file mode 100644 index 0000000..b8fade4 --- /dev/null +++ b/mm/nommu.c @@ -0,0 +1,1523 @@ +/* + * linux/mm/nommu.c + * + * Replacement code for mm functions to support CPU's that don't + * have any form of memory management unit (thus no virtual memory). + * + * See Documentation/nommu-mmap.txt + * + * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> + * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> + * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> + * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> + * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/file.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/tracehook.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/personality.h> +#include <linux/security.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +#include "internal.h" + +void *high_memory; +struct page *mem_map; +unsigned long max_mapnr; +unsigned long num_physpages; +unsigned long askedalloc, realalloc; +atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int heap_stack_gap = 0; + +EXPORT_SYMBOL(mem_map); +EXPORT_SYMBOL(num_physpages); + +/* list of shareable VMAs */ +struct rb_root nommu_vma_tree = RB_ROOT; +DECLARE_RWSEM(nommu_vma_sem); + +struct vm_operations_struct generic_file_vm_ops = { +}; + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + i_size_write(inode, offset); + + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out; + i_size_write(inode, offset); + +out_truncate: + if (inode->i_op && inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out: + return -EFBIG; +} + +EXPORT_SYMBOL(vmtruncate); + +/* + * Return the total memory allocated for this pointer, not + * just what the caller asked for. + * + * Doesn't have to be accurate, i.e. may have races. + */ +unsigned int kobjsize(const void *objp) +{ + struct page *page; + + /* + * If the object we have should not have ksize performed on it, + * return size of 0 + */ + if (!objp || !virt_addr_valid(objp)) + return 0; + + page = virt_to_head_page(objp); + + /* + * If the allocator sets PageSlab, we know the pointer came from + * kmalloc(). + */ + if (PageSlab(page)) + return ksize(objp); + + /* + * The ksize() function is only guaranteed to work for pointers + * returned by kmalloc(). So handle arbitrary pointers here. + */ + return PAGE_SIZE << compound_order(page); +} + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) +{ + struct vm_area_struct *vma; + unsigned long vm_flags; + int i; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); + + /* calculate required read or write permissions. + * - if 'force' is set, we only require the "MAY" flags. + */ + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + for (i = 0; i < len; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP) || + (!ignore && !(vm_flags & vma->vm_flags))) + goto finish_or_fault; + + if (pages) { + pages[i] = virt_to_page(start); + if (pages[i]) + page_cache_get(pages[i]); + } + if (vmas) + vmas[i] = vma; + start += PAGE_SIZE; + } + + return i; + +finish_or_fault: + return i ? : -EFAULT; +} + + +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} +EXPORT_SYMBOL(get_user_pages); + +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +void vfree(const void *addr) +{ + kfree(addr); +} +EXPORT_SYMBOL(vfree); + +void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +{ + /* + * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() + * returns only a logical address. + */ + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); +} +EXPORT_SYMBOL(__vmalloc); + +void *vmalloc_user(unsigned long size) +{ + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (ret) { + struct vm_area_struct *vma; + + down_write(¤t->mm->mmap_sem); + vma = find_vma(current->mm, (unsigned long)ret); + if (vma) + vma->vm_flags |= VM_USERMAP; + up_write(¤t->mm->mmap_sem); + } + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +struct page *vmalloc_to_page(const void *addr) +{ + return virt_to_page(addr); +} +EXPORT_SYMBOL(vmalloc_to_page); + +unsigned long vmalloc_to_pfn(const void *addr) +{ + return page_to_pfn(virt_to_page(addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); + +long vread(char *buf, char *addr, unsigned long count) +{ + memcpy(buf, addr, count); + return count; +} + +long vwrite(char *buf, char *addr, unsigned long count) +{ + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + memcpy(addr, buf, count); + return(count); +} + +/* + * vmalloc - allocate virtually continguos memory + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc); + +void *vmalloc_node(unsigned long size, int node) +{ + return vmalloc(size); +} +EXPORT_SYMBOL(vmalloc_node); + +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into continguos kernel virtual space. + */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + * + * VM_USERMAP is set on the corresponding VMA so that subsequent calls to + * remap_vmalloc_range() are permissible. + */ +void *vmalloc_32_user(unsigned long size) +{ + /* + * We'll have to sort out the ZONE_DMA bits for 64-bit, + * but for now this can simply use vmalloc_user() directly. + */ + return vmalloc_user(size); +} +EXPORT_SYMBOL(vmalloc_32_user); + +void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vmap); + +void vunmap(const void *addr) +{ + BUG(); +} +EXPORT_SYMBOL(vunmap); + +/* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. + */ +void __attribute__((weak)) vmalloc_sync_all(void) +{ +} + +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page); + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +SYSCALL_DEFINE1(brk, unsigned long, brk) +{ + struct mm_struct *mm = current->mm; + + if (brk < mm->start_brk || brk > mm->context.end_brk) + return mm->brk; + + if (mm->brk == brk) + return mm->brk; + + /* + * Always allow shrinking brk + */ + if (brk <= mm->brk) { + mm->brk = brk; + return brk; + } + + /* + * Ok, looks good - let it rip. + */ + return mm->brk = brk; +} + +#ifdef DEBUG +static void show_process_blocks(void) +{ + struct vm_list_struct *vml; + + printk("Process blocks %d:", current->pid); + + for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { + printk(" %p: %p", vml, vml->vma); + if (vml->vma) + printk(" (%d @%lx #%d)", + kobjsize((void *) vml->vma->vm_start), + vml->vma->vm_start, + atomic_read(&vml->vma->vm_usage)); + printk(vml->next ? " ->" : ".\n"); + } +} +#endif /* DEBUG */ + +/* + * add a VMA into a process's mm_struct in the appropriate place in the list + * - should be called with mm->mmap_sem held writelocked + */ +static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +{ + struct vm_list_struct **ppv; + + for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) + if ((*ppv)->vma->vm_start > vml->vma->vm_start) + break; + + vml->next = *ppv; + *ppv = vml; +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_list_struct *loop, *vml; + + /* search the vm_start ordered list */ + vml = NULL; + for (loop = mm->context.vmlist; loop; loop = loop->next) { + if (loop->vma->vm_start > addr) + break; + vml = loop; + } + + if (vml && vml->vma->vm_end > addr) + return vml->vma; + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_list_struct *vml; + + /* search the vm_start ordered list */ + for (vml = mm->context.vmlist; vml; vml = vml->next) { + if (vml->vma->vm_start == addr) + return vml->vma; + if (vml->vma->vm_start > addr) + break; + } + + return NULL; +} + +/* + * find a VMA in the global tree + */ +static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +{ + struct vm_area_struct *vma; + struct rb_node *n = nommu_vma_tree.rb_node; + + while (n) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + + if (start < vma->vm_start) + n = n->rb_left; + else if (start > vma->vm_start) + n = n->rb_right; + else + return vma; + } + + return NULL; +} + +/* + * add a VMA in the global tree + */ +static void add_nommu_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *pvma; + struct address_space *mapping; + struct rb_node **p = &nommu_vma_tree.rb_node; + struct rb_node *parent = NULL; + + /* add the VMA to the mapping */ + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + + flush_dcache_mmap_lock(mapping); + vma_prio_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } + + /* add the VMA to the master list */ + while (*p) { + parent = *p; + pvma = rb_entry(parent, struct vm_area_struct, vm_rb); + + if (vma->vm_start < pvma->vm_start) { + p = &(*p)->rb_left; + } + else if (vma->vm_start > pvma->vm_start) { + p = &(*p)->rb_right; + } + else { + /* mappings are at the same address - this can only + * happen for shared-mem chardevs and shared file + * mappings backed by ramfs/tmpfs */ + BUG_ON(!(pvma->vm_flags & VM_SHARED)); + + if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); + } + } + + rb_link_node(&vma->vm_rb, parent, p); + rb_insert_color(&vma->vm_rb, &nommu_vma_tree); +} + +/* + * delete a VMA from the global list + */ +static void delete_nommu_vma(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + /* remove the VMA from the mapping */ + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + + flush_dcache_mmap_lock(mapping); + vma_prio_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } + + /* remove from the master list */ + rb_erase(&vma->vm_rb, &nommu_vma_tree); +} + +/* + * determine whether a mapping should be permitted and, if so, what sort of + * mapping we're capable of supporting + */ +static int validate_mmap_request(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff, + unsigned long *_capabilities) +{ + unsigned long capabilities; + unsigned long reqprot = prot; + int ret; + + /* do the simple checks first */ + if (flags & MAP_FIXED || addr) { + printk(KERN_DEBUG + "%d: Can't do fixed-address/overlay mmap of RAM\n", + current->pid); + return -EINVAL; + } + + if ((flags & MAP_TYPE) != MAP_PRIVATE && + (flags & MAP_TYPE) != MAP_SHARED) + return -EINVAL; + + if (!len) + return -EINVAL; + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len || len > TASK_SIZE) + return -ENOMEM; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EOVERFLOW; + + if (file) { + /* validate file mapping requests */ + struct address_space *mapping; + + /* files must support mmap */ + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + + /* work out if what we've got could possibly be shared + * - we support chardevs that provide their own "memory" + * - we support files/blockdevs that are memory backed + */ + mapping = file->f_mapping; + if (!mapping) + mapping = file->f_path.dentry->d_inode->i_mapping; + + capabilities = 0; + if (mapping && mapping->backing_dev_info) + capabilities = mapping->backing_dev_info->capabilities; + + if (!capabilities) { + /* no explicit capabilities set, so assume some + * defaults */ + switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFBLK: + capabilities = BDI_CAP_MAP_COPY; + break; + + case S_IFCHR: + capabilities = + BDI_CAP_MAP_DIRECT | + BDI_CAP_READ_MAP | + BDI_CAP_WRITE_MAP; + break; + + default: + return -EINVAL; + } + } + + /* eliminate any capabilities that we can't support on this + * device */ + if (!file->f_op->get_unmapped_area) + capabilities &= ~BDI_CAP_MAP_DIRECT; + if (!file->f_op->read) + capabilities &= ~BDI_CAP_MAP_COPY; + + if (flags & MAP_SHARED) { + /* do checks for writing, appending and locking */ + if ((prot & PROT_WRITE) && + !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (IS_APPEND(file->f_path.dentry->d_inode) && + (file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (locks_verify_locked(file->f_path.dentry->d_inode)) + return -EAGAIN; + + if (!(capabilities & BDI_CAP_MAP_DIRECT)) + return -ENODEV; + + if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || + ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || + ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + ) { + printk("MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + + /* we mustn't privatise shared mappings */ + capabilities &= ~BDI_CAP_MAP_COPY; + } + else { + /* we're going to read the file into private memory we + * allocate */ + if (!(capabilities & BDI_CAP_MAP_COPY)) + return -ENODEV; + + /* we don't permit a private writable mapping to be + * shared with the backing device */ + if (prot & PROT_WRITE) + capabilities &= ~BDI_CAP_MAP_DIRECT; + } + + /* handle executable mappings and implied executable + * mappings */ + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (prot & PROT_EXEC) + return -EPERM; + } + else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + /* handle implication of PROT_EXEC by PROT_READ */ + if (current->personality & READ_IMPLIES_EXEC) { + if (capabilities & BDI_CAP_EXEC_MAP) + prot |= PROT_EXEC; + } + } + else if ((prot & PROT_READ) && + (prot & PROT_EXEC) && + !(capabilities & BDI_CAP_EXEC_MAP) + ) { + /* backing file is not executable, try to copy */ + capabilities &= ~BDI_CAP_MAP_DIRECT; + } + } + else { + /* anonymous mappings are always memory backed and can be + * privately mapped + */ + capabilities = BDI_CAP_MAP_COPY; + + /* handle PROT_EXEC implication by PROT_READ */ + if ((prot & PROT_READ) && + (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + } + + /* allow the security API to have its say */ + ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); + if (ret < 0) + return ret; + + /* looks okay */ + *_capabilities = capabilities; + return 0; +} + +/* + * we've determined that we can make the mapping, now translate what we + * now know into VMA flags + */ +static unsigned long determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) +{ + unsigned long vm_flags; + + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* vm_flags |= mm->def_flags; */ + + if (!(capabilities & BDI_CAP_MAP_DIRECT)) { + /* attempt to share read-only copies of mapped file chunks */ + if (file && !(prot & PROT_WRITE)) + vm_flags |= VM_MAYSHARE; + } + else { + /* overlay a shareable mapping on the backing device or inode + * if possible - used for chardevs, ramfs/tmpfs/shmfs and + * romfs/cramfs */ + if (flags & MAP_SHARED) + vm_flags |= VM_MAYSHARE | VM_SHARED; + else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) + vm_flags |= VM_MAYSHARE; + } + + /* refuse to let anyone share private mappings with this process if + * it's being traced - otherwise breakpoints set in it may interfere + * with another untraced process + */ + if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) + vm_flags &= ~VM_MAYSHARE; + + return vm_flags; +} + +/* + * set up a shared mapping on a file + */ +static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +{ + int ret; + + ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret != -ENOSYS) + return ret; + + /* getting an ENOSYS error indicates that direct mmap isn't + * possible (as opposed to tried but failed) so we'll fall + * through to making a private copy of the data and mapping + * that if we can */ + return -ENODEV; +} + +/* + * set up a private mapping or an anonymous shared mapping + */ +static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +{ + void *base; + int ret; + + /* invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory + * - VM_MAYSHARE will be set if it may attempt to share + */ + if (vma->vm_file) { + ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret != -ENOSYS) { + /* shouldn't return success if we're not sharing */ + BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); + return ret; /* success or a real error */ + } + + /* getting an ENOSYS error indicates that direct mmap isn't + * possible (as opposed to tried but failed) so we'll try to + * make a private copy of the data and map that instead */ + } + + /* allocate some memory to hold the mapping + * - note that this may not return a page-aligned address if the object + * we're allocating is smaller than a page + */ + base = kmalloc(len, GFP_KERNEL|__GFP_COMP); + if (!base) + goto enomem; + + vma->vm_start = (unsigned long) base; + vma->vm_end = vma->vm_start + len; + vma->vm_flags |= VM_MAPPED_COPY; + +#ifdef WARN_ON_SLACK + if (len + WARN_ON_SLACK <= kobjsize(result)) + printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", + len, current->pid, kobjsize(result) - len); +#endif + + if (vma->vm_file) { + /* read the contents of a file into the copy */ + mm_segment_t old_fs; + loff_t fpos; + + fpos = vma->vm_pgoff; + fpos <<= PAGE_SHIFT; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + set_fs(old_fs); + + if (ret < 0) + goto error_free; + + /* clear the last little bit */ + if (ret < len) + memset(base + ret, 0, len - ret); + + } else { + /* if it's an anonymous mapping, then just clear it */ + memset(base, 0, len); + } + + return 0; + +error_free: + kfree(base); + vma->vm_start = 0; + return ret; + +enomem: + printk("Allocation of length %lu from process %d failed\n", + len, current->pid); + show_free_areas(); + return -ENOMEM; +} + +/* + * handle mapping creation for uClinux + */ +unsigned long do_mmap_pgoff(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff) +{ + struct vm_list_struct *vml = NULL; + struct vm_area_struct *vma = NULL; + struct rb_node *rb; + unsigned long capabilities, vm_flags; + void *result; + int ret; + + if (!(flags & MAP_FIXED)) + addr = round_hint_to_min(addr); + + /* decide whether we should attempt the mapping, and if so what sort of + * mapping */ + ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, + &capabilities); + if (ret < 0) + return ret; + + /* we've determined that we can make the mapping, now translate what we + * now know into VMA flags */ + vm_flags = determine_vm_flags(file, prot, flags, capabilities); + + /* we're going to need to record the mapping if it works */ + vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); + if (!vml) + goto error_getting_vml; + + down_write(&nommu_vma_sem); + + /* if we want to share, we need to check for VMAs created by other + * mmap() calls that overlap with our proposed mapping + * - we can only share with an exact match on most regular files + * - shared mappings on character devices and memory backed files are + * permitted to overlap inexactly as far as we are concerned for in + * these cases, sharing is handled in the driver or filesystem rather + * than here + */ + if (vm_flags & VM_MAYSHARE) { + unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long vmpglen; + + /* suppress VMA sharing for shared regions */ + if (vm_flags & VM_SHARED && + capabilities & BDI_CAP_MAP_DIRECT) + goto dont_share_VMAs; + + for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + + if (!(vma->vm_flags & VM_MAYSHARE)) + continue; + + /* search for overlapping mappings on the same file */ + if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) + continue; + + if (vma->vm_pgoff >= pgoff + pglen) + continue; + + vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; + vmpglen >>= PAGE_SHIFT; + if (pgoff >= vma->vm_pgoff + vmpglen) + continue; + + /* handle inexactly overlapping matches between mappings */ + if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + if (!(capabilities & BDI_CAP_MAP_DIRECT)) + goto sharing_violation; + continue; + } + + /* we've found a VMA we can share */ + atomic_inc(&vma->vm_usage); + + vml->vma = vma; + result = (void *) vma->vm_start; + goto shared; + } + + dont_share_VMAs: + vma = NULL; + + /* obtain the address at which to make a shared mapping + * - this is the hook for quasi-memory character devices to + * tell us the location of a shared mapping + */ + if (file && file->f_op->get_unmapped_area) { + addr = file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + if (IS_ERR((void *) addr)) { + ret = addr; + if (ret != (unsigned long) -ENOSYS) + goto error; + + /* the driver refused to tell us where to site + * the mapping so we'll have to attempt to copy + * it */ + ret = (unsigned long) -ENODEV; + if (!(capabilities & BDI_CAP_MAP_COPY)) + goto error; + + capabilities &= ~BDI_CAP_MAP_DIRECT; + } + } + } + + /* we're going to need a VMA struct as well */ + vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + INIT_LIST_HEAD(&vma->anon_vma_node); + atomic_set(&vma->vm_usage, 1); + if (file) { + get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } + vma->vm_file = file; + vma->vm_flags = vm_flags; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = pgoff; + + vml->vma = vma; + + /* set up the mapping */ + if (file && vma->vm_flags & VM_SHARED) + ret = do_mmap_shared_file(vma, len); + else + ret = do_mmap_private(vma, len); + if (ret < 0) + goto error; + + /* okay... we have a mapping; now we have to register it */ + result = (void *) vma->vm_start; + + if (vma->vm_flags & VM_MAPPED_COPY) { + realalloc += kobjsize(result); + askedalloc += len; + } + + realalloc += kobjsize(vma); + askedalloc += sizeof(*vma); + + current->mm->total_vm += len >> PAGE_SHIFT; + + add_nommu_vma(vma); + + shared: + realalloc += kobjsize(vml); + askedalloc += sizeof(*vml); + + add_vma_to_mm(current->mm, vml); + + up_write(&nommu_vma_sem); + + if (prot & PROT_EXEC) + flush_icache_range((unsigned long) result, + (unsigned long) result + len); + +#ifdef DEBUG + printk("do_mmap:\n"); + show_process_blocks(); +#endif + + return (unsigned long) result; + + error: + up_write(&nommu_vma_sem); + kfree(vml); + if (vma) { + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + } + kfree(vma); + } + return ret; + + sharing_violation: + up_write(&nommu_vma_sem); + printk("Attempt to share mismatched mappings\n"); + kfree(vml); + return -EINVAL; + + error_getting_vma: + up_write(&nommu_vma_sem); + kfree(vml); + printk("Allocation of vma for %lu byte allocation from process %d failed\n", + len, current->pid); + show_free_areas(); + return -ENOMEM; + + error_getting_vml: + printk("Allocation of vml for %lu byte allocation from process %d failed\n", + len, current->pid); + show_free_areas(); + return -ENOMEM; +} +EXPORT_SYMBOL(do_mmap_pgoff); + +/* + * handle mapping disposal for uClinux + */ +static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + if (vma) { + down_write(&nommu_vma_sem); + + if (atomic_dec_and_test(&vma->vm_usage)) { + delete_nommu_vma(vma); + + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + + /* IO memory and memory shared directly out of the pagecache from + * ramfs/tmpfs mustn't be released here */ + if (vma->vm_flags & VM_MAPPED_COPY) { + realalloc -= kobjsize((void *) vma->vm_start); + askedalloc -= vma->vm_end - vma->vm_start; + kfree((void *) vma->vm_start); + } + + realalloc -= kobjsize(vma); + askedalloc -= sizeof(*vma); + + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + kfree(vma); + } + + up_write(&nommu_vma_sem); + } +} + +/* + * release a mapping + * - under NOMMU conditions the parameters must match exactly to the mapping to + * be removed + */ +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +{ + struct vm_list_struct *vml, **parent; + unsigned long end = addr + len; + +#ifdef DEBUG + printk("do_munmap:\n"); +#endif + + for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { + if ((*parent)->vma->vm_start > addr) + break; + if ((*parent)->vma->vm_start == addr && + ((len == 0) || ((*parent)->vma->vm_end == end))) + goto found; + } + + printk("munmap of non-mmaped memory by process %d (%s): %p\n", + current->pid, current->comm, (void *) addr); + return -EINVAL; + + found: + vml = *parent; + + put_vma(mm, vml->vma); + + *parent = vml->next; + realalloc -= kobjsize(vml); + askedalloc -= sizeof(*vml); + kfree(vml); + + update_hiwater_vm(mm); + mm->total_vm -= len >> PAGE_SHIFT; + +#ifdef DEBUG + show_process_blocks(); +#endif + + return 0; +} +EXPORT_SYMBOL(do_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * Release all mappings + */ +void exit_mmap(struct mm_struct * mm) +{ + struct vm_list_struct *tmp; + + if (mm) { +#ifdef DEBUG + printk("Exit_mmap:\n"); +#endif + + mm->total_vm = 0; + + while ((tmp = mm->context.vmlist)) { + mm->context.vmlist = tmp->next; + put_vma(mm, tmp->vma); + + realalloc -= kobjsize(tmp); + askedalloc -= sizeof(*tmp); + kfree(tmp); + } + +#ifdef DEBUG + show_process_blocks(); +#endif + } +} + +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + return -ENOMEM; +} + +/* + * expand (or shrink) an existing mapping, potentially moving it at the same + * time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * under NOMMU conditions, we only permit changing a mapping's size, and only + * as long as it stays within the hole allocated by the kmalloc() call in + * do_mmap_pgoff() and the block is not shareable + * + * MREMAP_FIXED is not supported under NOMMU conditions + */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + + /* insanity checks first */ + if (new_len == 0) + return (unsigned long) -EINVAL; + + if (flags & MREMAP_FIXED && new_addr != addr) + return (unsigned long) -EINVAL; + + vma = find_vma_exact(current->mm, addr); + if (!vma) + return (unsigned long) -EINVAL; + + if (vma->vm_end != vma->vm_start + old_len) + return (unsigned long) -EFAULT; + + if (vma->vm_flags & VM_MAYSHARE) + return (unsigned long) -EPERM; + + if (new_len > kobjsize((void *) addr)) + return (unsigned long) -ENOMEM; + + /* all checks complete - do it */ + vma->vm_end = vma->vm_start + new_len; + + askedalloc -= old_len; + askedalloc += new_len; + + return vma->vm_start; +} +EXPORT_SYMBOL(do_mremap); + +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) +{ + return NULL; +} + +int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long to, unsigned long size, pgprot_t prot) +{ + vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(remap_pfn_range); + +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + unsigned int size = vma->vm_end - vma->vm_start; + + if (!(vma->vm_flags & VM_USERMAP)) + return -EINVAL; + + vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); + vma->vm_end = vma->vm_start + size; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + +void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} + +unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + return -ENOMEM; +} + +void arch_unmap_area(struct mm_struct *mm, unsigned long addr) +{ +} + +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, + int even_cows) +{ +} +EXPORT_SYMBOL(unmap_mapping_range); + +/* + * ask for an unmapped area at which to create a mapping on a file + */ +unsigned long get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long (*get_area)(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + + if (!get_area) + return -ENOSYS; + + return get_area(file, addr, len, pgoff, flags); +} +EXPORT_SYMBOL(get_unmapped_area); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + unsigned long n; + + free = global_page_state(NR_FILE_PAGES); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + free -= free / 32; + + if (free > pages) + return 0; + + /* + * nr_free_pages() is very expensive on large systems, + * only call if we're about to fail. + */ + n = nr_free_pages(); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (n <= totalreserve_pages) + goto error; + else + n -= totalreserve_pages; + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + n -= n / 32; + free += n; + + if (free > pages) + return 0; + + goto error; + } + + allowed = totalram_pages * sysctl_overcommit_ratio / 100; + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + allowed -= allowed / 32; + allowed += total_swap_pages; + + /* Don't let a single process grow too big: + leave 3% of the size of this process for other processes */ + if (mm) + allowed -= mm->total_vm / 32; + + /* + * cast `allowed' as a signed long because vm_committed_space + * sometimes has a negative value + */ + if (atomic_long_read(&vm_committed_space) < (long)allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} + +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(filemap_fault); + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (vma) { + /* don't overrun this mapping */ + if (addr + len >= vma->vm_end) + len = vma->vm_end - addr; + + /* only read or write mappings where it is permitted */ + if (write && vma->vm_flags & VM_MAYWRITE) + len -= copy_to_user((void *) addr, buf, len); + else if (!write && vma->vm_flags & VM_MAYREAD) + len -= copy_from_user(buf, (void *) addr, len); + else + len = 0; + } else { + len = 0; + } + + up_read(&mm->mmap_sem); + mmput(mm); + return len; +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c new file mode 100644 index 0000000..a0a0190 --- /dev/null +++ b/mm/oom_kill.c @@ -0,0 +1,593 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from __alloc_pages() + * in mm/page_alloc.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. + */ + +#include <linux/oom.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/swap.h> +#include <linux/timex.h> +#include <linux/jiffies.h> +#include <linux/cpuset.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/memcontrol.h> +#include <linux/security.h> + +int sysctl_panic_on_oom; +int sysctl_oom_kill_allocating_task; +int sysctl_oom_dump_tasks; +static DEFINE_SPINLOCK(zone_scan_mutex); +/* #define DEBUG */ + +/** + * badness - calculate a numeric value for how bad this task has been + * @p: task struct of which task we should calculate + * @uptime: current uptime in seconds + * + * The formula used is relatively simple and documented inline in the + * function. The main rationale is that we want to select a good task + * to kill when we run out of memory. + * + * Good in this context means that: + * 1) we lose the minimum amount of work done + * 2) we recover a large amount of memory + * 3) we don't kill anything innocent of eating tons of memory + * 4) we want to kill the minimum amount of processes (one) + * 5) we try to kill the process the user expects us to kill, this + * algorithm has been meticulously tuned to meet the principle + * of least surprise ... (be careful when you change it) + */ + +unsigned long badness(struct task_struct *p, unsigned long uptime) +{ + unsigned long points, cpu_time, run_time, s; + struct mm_struct *mm; + struct task_struct *child; + + task_lock(p); + mm = p->mm; + if (!mm) { + task_unlock(p); + return 0; + } + + /* + * The memory size of the process is the basis for the badness. + */ + points = mm->total_vm; + + /* + * After this unlock we can no longer dereference local variable `mm' + */ + task_unlock(p); + + /* + * swapoff can easily use up all memory, so kill those first. + */ + if (p->flags & PF_SWAPOFF) + return ULONG_MAX; + + /* + * Processes which fork a lot of child processes are likely + * a good choice. We add half the vmsize of the children if they + * have an own mm. This prevents forking servers to flood the + * machine with an endless amount of children. In case a single + * child is eating the vast majority of memory, adding only half + * to the parents will make the child our kill candidate of choice. + */ + list_for_each_entry(child, &p->children, sibling) { + task_lock(child); + if (child->mm != mm && child->mm) + points += child->mm->total_vm/2 + 1; + task_unlock(child); + } + + /* + * CPU time is in tens of seconds and run time is in thousands + * of seconds. There is no particular reason for this other than + * that it turned out to work very well in practice. + */ + cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) + >> (SHIFT_HZ + 3); + + if (uptime >= p->start_time.tv_sec) + run_time = (uptime - p->start_time.tv_sec) >> 10; + else + run_time = 0; + + s = int_sqrt(cpu_time); + if (s) + points /= s; + s = int_sqrt(int_sqrt(run_time)); + if (s) + points /= s; + + /* + * Niced processes are most likely less important, so double + * their badness points. + */ + if (task_nice(p) > 0) + points *= 2; + + /* + * Superuser processes are usually more important, so we make it + * less likely that we kill those. + */ + if (has_capability(p, CAP_SYS_ADMIN) || + has_capability(p, CAP_SYS_RESOURCE)) + points /= 4; + + /* + * We don't want to kill a process with direct hardware access. + * Not only could that mess up the hardware, but usually users + * tend to only have this flag set on applications they think + * of as important. + */ + if (has_capability(p, CAP_SYS_RAWIO)) + points /= 4; + + /* + * If p's nodes don't overlap ours, it may still help to kill p + * because p may have allocated or otherwise mapped memory on + * this node before. However it will be less likely. + */ + if (!cpuset_mems_allowed_intersects(current, p)) + points /= 8; + + /* + * Adjust the score by oomkilladj. + */ + if (p->oomkilladj) { + if (p->oomkilladj > 0) { + if (!points) + points = 1; + points <<= p->oomkilladj; + } else + points >>= -(p->oomkilladj); + } + +#ifdef DEBUG + printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", + p->pid, p->comm, points); +#endif + return points; +} + +/* + * Determine the type of allocation constraint. + */ +static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask) +{ +#ifdef CONFIG_NUMA + struct zone *zone; + struct zoneref *z; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + nodemask_t nodes = node_states[N_HIGH_MEMORY]; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + if (cpuset_zone_allowed_softwall(zone, gfp_mask)) + node_clear(zone_to_nid(zone), nodes); + else + return CONSTRAINT_CPUSET; + + if (!nodes_empty(nodes)) + return CONSTRAINT_MEMORY_POLICY; +#endif + + return CONSTRAINT_NONE; +} + +/* + * Simple selection loop. We chose the process with the highest + * number of 'points'. We expect the caller will lock the tasklist. + * + * (not docbooked, we don't want this one cluttering up the manual) + */ +static struct task_struct *select_bad_process(unsigned long *ppoints, + struct mem_cgroup *mem) +{ + struct task_struct *g, *p; + struct task_struct *chosen = NULL; + struct timespec uptime; + *ppoints = 0; + + do_posix_clock_monotonic_gettime(&uptime); + do_each_thread(g, p) { + unsigned long points; + + /* + * skip kernel threads and tasks which have already released + * their mm. + */ + if (!p->mm) + continue; + /* skip the init task */ + if (is_global_init(p)) + continue; + if (mem && !task_in_mem_cgroup(p, mem)) + continue; + + /* + * This task already has access to memory reserves and is + * being killed. Don't allow any other task access to the + * memory reserve. + * + * Note: this may have a chance of deadlock if it gets + * blocked waiting for another task which itself is waiting + * for memory. Is there a better alternative? + */ + if (test_tsk_thread_flag(p, TIF_MEMDIE)) + return ERR_PTR(-1UL); + + /* + * This is in the process of releasing memory so wait for it + * to finish before killing some other task by mistake. + * + * However, if p is the current task, we allow the 'kill' to + * go ahead if it is exiting: this will simply set TIF_MEMDIE, + * which will allow it to gain access to memory reserves in + * the process of exiting and releasing its resources. + * Otherwise we could get an easy OOM deadlock. + */ + if (p->flags & PF_EXITING) { + if (p != current) + return ERR_PTR(-1UL); + + chosen = p; + *ppoints = ULONG_MAX; + } + + if (p->oomkilladj == OOM_DISABLE) + continue; + + points = badness(p, uptime.tv_sec); + if (points > *ppoints || !chosen) { + chosen = p; + *ppoints = points; + } + } while_each_thread(g, p); + + return chosen; +} + +/** + * dump_tasks - dump current memory state of all system tasks + * @mem: target memory controller + * + * Dumps the current memory state of all system tasks, excluding kernel threads. + * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj + * score, and name. + * + * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are + * shown. + * + * Call with tasklist_lock read-locked. + */ +static void dump_tasks(const struct mem_cgroup *mem) +{ + struct task_struct *g, *p; + + printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " + "name\n"); + do_each_thread(g, p) { + /* + * total_vm and rss sizes do not exist for tasks with a + * detached mm so there's no need to report them. + */ + if (!p->mm) + continue; + if (mem && !task_in_mem_cgroup(p, mem)) + continue; + if (!thread_group_leader(p)) + continue; + + task_lock(p); + printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", + p->pid, p->uid, p->tgid, p->mm->total_vm, + get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, + p->comm); + task_unlock(p); + } while_each_thread(g, p); +} + +/* + * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO + * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO + * set. + */ +static void __oom_kill_task(struct task_struct *p, int verbose) +{ + if (is_global_init(p)) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill init!\n"); + return; + } + + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); + return; + } + + if (verbose) + printk(KERN_ERR "Killed process %d (%s)\n", + task_pid_nr(p), p->comm); + + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... + */ + p->rt.time_slice = HZ; + set_tsk_thread_flag(p, TIF_MEMDIE); + + force_sig(SIGKILL, p); +} + +static int oom_kill_task(struct task_struct *p) +{ + struct mm_struct *mm; + struct task_struct *g, *q; + + mm = p->mm; + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) + return 1; + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + + __oom_kill_task(p, 1); + + /* + * kill all processes that share the ->mm (i.e. all threads), + * but are in a different thread group. Don't let them have access + * to memory reserves though, otherwise we might deplete all memory. + */ + do_each_thread(g, q) { + if (q->mm == mm && !same_thread_group(q, p)) + force_sig(SIGKILL, q); + } while_each_thread(g, q); + + return 0; +} + +static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned long points, struct mem_cgroup *mem, + const char *message) +{ + struct task_struct *c; + + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + dump_stack(); + show_mem(); + if (sysctl_oom_dump_tasks) + dump_tasks(mem); + } + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just set TIF_MEMDIE so it can die quickly + */ + if (p->flags & PF_EXITING) { + __oom_kill_task(p, 0); + return 0; + } + + printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", + message, task_pid_nr(p), p->comm, points); + + /* Try to kill a child first */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == p->mm) + continue; + if (!oom_kill_task(c)) + return 0; + } + return oom_kill_task(p); +} + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) +{ + unsigned long points = 0; + struct task_struct *p; + + cgroup_lock(); + read_lock(&tasklist_lock); +retry: + p = select_bad_process(&points, mem); + if (PTR_ERR(p) == -1UL) + goto out; + + if (!p) + p = current; + + if (oom_kill_process(p, gfp_mask, 0, points, mem, + "Memory cgroup out of memory")) + goto retry; +out: + read_unlock(&tasklist_lock); + cgroup_unlock(); +} +#endif + +static BLOCKING_NOTIFIER_HEAD(oom_notify_list); + +int register_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_oom_notifier); + +int unregister_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_oom_notifier); + +/* + * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero + * if a parallel OOM killing is already taking place that includes a zone in + * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. + */ +int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) +{ + struct zoneref *z; + struct zone *zone; + int ret = 1; + + spin_lock(&zone_scan_mutex); + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + if (zone_is_oom_locked(zone)) { + ret = 0; + goto out; + } + } + + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + /* + * Lock each zone in the zonelist under zone_scan_mutex so a + * parallel invocation of try_set_zone_oom() doesn't succeed + * when it shouldn't. + */ + zone_set_flag(zone, ZONE_OOM_LOCKED); + } + +out: + spin_unlock(&zone_scan_mutex); + return ret; +} + +/* + * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed + * allocation attempts with zonelists containing them may now recall the OOM + * killer, if necessary. + */ +void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +{ + struct zoneref *z; + struct zone *zone; + + spin_lock(&zone_scan_mutex); + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + zone_clear_flag(zone, ZONE_OOM_LOCKED); + } + spin_unlock(&zone_scan_mutex); +} + +/** + * out_of_memory - kill the "best" process when we run out of memory + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * + * If we run out of memory, we have the choice between either + * killing a random task (bad), letting the system crash (worse) + * OR try to be smart about which process to kill. Note that we + * don't have to be perfect here, we just have to be good. + */ +void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) +{ + struct task_struct *p; + unsigned long points = 0; + unsigned long freed = 0; + enum oom_constraint constraint; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return; + + if (sysctl_panic_on_oom == 2) + panic("out of memory. Compulsory panic_on_oom is selected.\n"); + + /* + * Check if there were limitations on the allocation (only relevant for + * NUMA) that may require different handling. + */ + constraint = constrained_alloc(zonelist, gfp_mask); + read_lock(&tasklist_lock); + + switch (constraint) { + case CONSTRAINT_MEMORY_POLICY: + oom_kill_process(current, gfp_mask, order, points, NULL, + "No available memory (MPOL_BIND)"); + break; + + case CONSTRAINT_NONE: + if (sysctl_panic_on_oom) + panic("out of memory. panic_on_oom is selected\n"); + /* Fall-through */ + case CONSTRAINT_CPUSET: + if (sysctl_oom_kill_allocating_task) { + oom_kill_process(current, gfp_mask, order, points, NULL, + "Out of memory (oom_kill_allocating_task)"); + break; + } +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); + + if (PTR_ERR(p) == -1UL) + goto out; + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); + } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; + + break; + } + +out: + read_unlock(&tasklist_lock); + + /* + * Give "p" a good chance of killing itself before we + * retry to allocate memory unless "p" is current + */ + if (!test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); +} diff --git a/mm/page-writeback.c b/mm/page-writeback.c new file mode 100644 index 0000000..5270591 --- /dev/null +++ b/mm/page-writeback.c @@ -0,0 +1,1389 @@ +/* + * mm/page-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Contains functions related to writing back dirty pages at the + * address_space level. + * + * 10Apr2002 Andrew Morton + * Initial version + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/init.h> +#include <linux/backing-dev.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/blkdev.h> +#include <linux/mpage.h> +#include <linux/rmap.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/sysctl.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/buffer_head.h> +#include <linux/pagevec.h> + +/* + * The maximum number of pages to writeout in a single bdflush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +/* + * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited + * will look to see if it needs to force writeback or throttling. + */ +static long ratelimit_pages = 32; + +/* + * When balance_dirty_pages decides that the caller needs to perform some + * non-background writeback, this is how many pages it will attempt to write. + * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * large amounts of I/O are submitted. + */ +static inline long sync_writeback_pages(void) +{ + return ratelimit_pages + ratelimit_pages / 2; +} + +/* The following parameters are exported via /proc/sys/vm */ + +/* + * Start background writeback (via pdflush) at this percentage + */ +int dirty_background_ratio = 5; + +/* + * free highmem will not be subtracted from the total free memory + * for calculating free ratios if vm_highmem_is_dirtyable is true + */ +int vm_highmem_is_dirtyable; + +/* + * The generator of dirty data starts writeback at this percentage + */ +int vm_dirty_ratio = 10; + +/* + * The interval between `kupdate'-style writebacks, in jiffies + */ +int dirty_writeback_interval = 5 * HZ; + +/* + * The longest number of jiffies for which data is allowed to remain dirty + */ +int dirty_expire_interval = 30 * HZ; + +/* + * Flag that makes the machine dump writes/reads and block dirtyings. + */ +int block_dump; + +/* + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. + */ +int laptop_mode; + +EXPORT_SYMBOL(laptop_mode); + +/* End of sysctl-exported parameters */ + + +static void background_writeout(unsigned long _min_pages); + +/* + * Scale the writeback cache size proportional to the relative writeout speeds. + * + * We do this by keeping a floating proportion between BDIs, based on page + * writeback completions [end_page_writeback()]. Those devices that write out + * pages fastest will get the larger share, while the slower will get a smaller + * share. + * + * We use page writeout completions because we are interested in getting rid of + * dirty pages. Having them written out is the primary goal. + * + * We introduce a concept of time, a period over which we measure these events, + * because demand can/will vary over time. The length of this period itself is + * measured in page writeback completions. + * + */ +static struct prop_descriptor vm_completions; +static struct prop_descriptor vm_dirties; + +/* + * couple the period to the dirty_ratio: + * + * period/2 ~ roundup_pow_of_two(dirty limit) + */ +static int calc_period_shift(void) +{ + unsigned long dirty_total; + + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + return 2 + ilog2(dirty_total - 1); +} + +/* + * update the period when the dirty ratio changes. + */ +int dirty_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_ratio = vm_dirty_ratio; + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_ratio != old_ratio) { + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); + } + return ret; +} + +/* + * Increment the BDI's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +{ + __prop_inc_percpu_max(&vm_completions, &bdi->completions, + bdi->max_prop_frac); +} + +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(bdi_writeout_inc); + +static inline void task_dirty_inc(struct task_struct *tsk) +{ + prop_inc_single(&vm_dirties, &tsk->dirties); +} + +/* + * Obtain an accurate fraction of the BDI's portion. + */ +static void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator) +{ + if (bdi_cap_writeback_dirty(bdi)) { + prop_fraction_percpu(&vm_completions, &bdi->completions, + numerator, denominator); + } else { + *numerator = 0; + *denominator = 1; + } +} + +/* + * Clip the earned share of dirty pages to that which is actually available. + * This avoids exceeding the total dirty_limit when the floating averages + * fluctuate too quickly. + */ +static void +clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +{ + long avail_dirty; + + avail_dirty = dirty - + (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_WRITEBACK) + + global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK_TEMP)); + + if (avail_dirty < 0) + avail_dirty = 0; + + avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); + + *pbdi_dirty = min(*pbdi_dirty, avail_dirty); +} + +static inline void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator) +{ + prop_fraction_single(&vm_dirties, &tsk->dirties, + numerator, denominator); +} + +/* + * scale the dirty limit + * + * task specific dirty limit: + * + * dirty -= (dirty/8) * p_{t} + */ +static void task_dirty_limit(struct task_struct *tsk, long *pdirty) +{ + long numerator, denominator; + long dirty = *pdirty; + u64 inv = dirty >> 3; + + task_dirties_fraction(tsk, &numerator, &denominator); + inv *= numerator; + do_div(inv, denominator); + + dirty -= inv; + if (dirty < *pdirty/2) + dirty = *pdirty/2; + + *pdirty = dirty; +} + +/* + * + */ +static DEFINE_SPINLOCK(bdi_lock); +static unsigned int bdi_min_ratio; + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&bdi_lock, flags); + if (min_ratio > bdi->max_ratio) { + ret = -EINVAL; + } else { + min_ratio -= bdi->min_ratio; + if (bdi_min_ratio + min_ratio < 100) { + bdi_min_ratio += min_ratio; + bdi->min_ratio += min_ratio; + } else { + ret = -EINVAL; + } + } + spin_unlock_irqrestore(&bdi_lock, flags); + + return ret; +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +{ + unsigned long flags; + int ret = 0; + + if (max_ratio > 100) + return -EINVAL; + + spin_lock_irqsave(&bdi_lock, flags); + if (bdi->min_ratio > max_ratio) { + ret = -EINVAL; + } else { + bdi->max_ratio = max_ratio; + bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + } + spin_unlock_irqrestore(&bdi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(bdi_set_max_ratio); + +/* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. + */ + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + +void +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, + struct backing_dev_info *bdi) +{ + int background_ratio; /* Percentages */ + int dirty_ratio; + long background; + long dirty; + unsigned long available_memory = determine_dirtyable_memory(); + struct task_struct *tsk; + + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + + background_ratio = dirty_background_ratio; + if (background_ratio >= dirty_ratio) + background_ratio = dirty_ratio / 2; + + background = (background_ratio * available_memory) / 100; + dirty = (dirty_ratio * available_memory) / 100; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + + if (bdi) { + u64 bdi_dirty; + long numerator, denominator; + + /* + * Calculate this BDI's share of the dirty ratio. + */ + bdi_writeout_fraction(bdi, &numerator, &denominator); + + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + *pbdi_dirty = bdi_dirty; + clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); + task_dirty_limit(current, pbdi_dirty); + } +} + +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to perform writeback if the system is over `vm_dirty_ratio'. + * If we're over `background_thresh' then pdflush is woken to perform some + * writeout. + */ +static void balance_dirty_pages(struct address_space *mapping) +{ + long nr_reclaimable, bdi_nr_reclaimable; + long nr_writeback, bdi_nr_writeback; + long background_thresh; + long dirty_thresh; + long bdi_thresh; + unsigned long pages_written = 0; + unsigned long write_chunk = sync_writeback_pages(); + + struct backing_dev_info *bdi = mapping->backing_dev_info; + + for (;;) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = write_chunk, + .range_cyclic = 1, + }; + + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + nr_writeback = global_page_state(NR_WRITEBACK); + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + break; + + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the bdi limits are ramping up. + */ + if (nr_reclaimable + nr_writeback < + (background_thresh + dirty_thresh) / 2) + break; + + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ + if (bdi_nr_reclaimable) { + writeback_inodes(&wbc); + pages_written += write_chunk - wbc.nr_to_write; + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + } + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (bdi_thresh < 2*bdi_stat_error(bdi)) { + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + } else if (bdi_nr_reclaimable) { + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + break; + if (pages_written >= write_chunk) + break; /* We've done our duty */ + + congestion_wait(WRITE, HZ/10); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && + bdi->dirty_exceeded) + bdi->dirty_exceeded = 0; + + if (writeback_in_progress(bdi)) + return; /* pdflush is already working this queue */ + + /* + * In laptop mode, we wait until hitting the higher threshold before + * starting background writeout, and then write out all the way down + * to the lower threshold. So slow writers cause minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if ((laptop_mode && pages_written) || + (!laptop_mode && (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + > background_thresh))) + pdflush_operation(background_writeout, 0); +} + +void set_page_dirty_balance(struct page *page, int page_mkwrite) +{ + if (set_page_dirty(page) || page_mkwrite) { + struct address_space *mapping = page_mapping(page); + + if (mapping) + balance_dirty_pages_ratelimited(mapping); + } +} + +/** + * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * @mapping: address_space which was dirtied + * @nr_pages_dirtied: number of pages which the caller has just dirtied + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * On really big machines, get_writeback_state is expensive, so try to avoid + * calling it too often (ratelimiting). But once we're over the dirty memory + * limit we decrease the ratelimiting by a lot, to prevent individual processes + * from overshooting the limit by (ratelimit_pages) each. + */ +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) +{ + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; + + ratelimit = ratelimit_pages; + if (mapping->backing_dev_info->dirty_exceeded) + ratelimit = 8; + + /* + * Check the rate limiting. Also, we do not want to throttle real-time + * tasks in balance_dirty_pages(). Period. + */ + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); + balance_dirty_pages(mapping); + return; + } + preempt_enable(); +} +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); + +void throttle_vm_writeout(gfp_t gfp_mask) +{ + long background_thresh; + long dirty_thresh; + + for ( ; ; ) { + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + /* + * Boost the allowable dirty threshold a bit for page + * allocators so they don't get DoS'ed by heavy writers + */ + dirty_thresh += dirty_thresh / 10; /* wheeee... */ + + if (global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK) <= dirty_thresh) + break; + congestion_wait(WRITE, HZ/10); + + /* + * The caller might hold locks which can prevent IO completion + * or progress in the filesystem. So we cannot just sit here + * waiting for IO to complete. + */ + if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) + break; + } +} + +/* + * writeback at least _min_pages, and keep writing until the amount of dirty + * memory is less than the background threshold, or until we're all clean. + */ +static void background_writeout(unsigned long _min_pages) +{ + long min_pages = _min_pages; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 0, + .nonblocking = 1, + .range_cyclic = 1, + }; + + for ( ; ; ) { + long background_thresh; + long dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) < background_thresh + && min_pages <= 0) + break; + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes(&wbc); + min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + /* Wrote less than expected */ + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else + break; + } + } +} + +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. Returns 0 if a pdflush thread was dispatched. Returns + * -1 if all pdflush threads were busy. + */ +int wakeup_pdflush(long nr_pages) +{ + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + return pdflush_operation(background_writeout, nr_pages); +} + +static void wb_timer_fn(unsigned long unused); +static void laptop_timer_fn(unsigned long unused); + +static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); +static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); + +/* + * Periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static void wb_kupdate(unsigned long arg) +{ + unsigned long oldest_jif; + unsigned long start_jif; + unsigned long next_jif; + long nr_to_write; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = &oldest_jif, + .nr_to_write = 0, + .nonblocking = 1, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + sync_supers(); + + oldest_jif = jiffies - dirty_expire_interval; + start_jif = jiffies; + next_jif = start_jif + dirty_writeback_interval; + nr_to_write = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + while (nr_to_write > 0) { + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + writeback_inodes(&wbc); + if (wbc.nr_to_write > 0) { + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else + break; /* All the old data is written */ + } + nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + } + if (time_before(next_jif, jiffies + HZ)) + next_jif = jiffies + HZ; + if (dirty_writeback_interval) + mod_timer(&wb_timer, next_jif); +} + +/* + * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs + */ +int dirty_writeback_centisecs_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + else + del_timer(&wb_timer); + return 0; +} + +static void wb_timer_fn(unsigned long unused) +{ + if (pdflush_operation(wb_kupdate, 0) < 0) + mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ +} + +static void laptop_flush(unsigned long unused) +{ + sys_sync(); +} + +static void laptop_timer_fn(unsigned long unused) +{ + pdflush_operation(laptop_flush, 0); +} + +/* + * We've spun up the disk and we're in laptop mode: schedule writeback + * of all dirty data a few seconds from now. If the flush is already scheduled + * then push it back - the user is still using the disk. + */ +void laptop_io_completion(void) +{ + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); +} + +/* + * We're in laptop mode and we've just synced. The sync's writes will have + * caused another writeback to be scheduled by laptop_io_completion. + * Nothing needs to be written back anymore, so we unschedule the writeback. + */ +void laptop_sync_completion(void) +{ + del_timer(&laptop_mode_wb_timer); +} + +/* + * If ratelimit_pages is too high then we can get into dirty-data overload + * if a large number of processes all perform writes at the same time. + * If it is too low then SMP machines will call the (expensive) + * get_writeback_state too often. + * + * Here we set ratelimit_pages to a level which ensures that when all CPUs are + * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory + * thresholds before writeback cuts in. + * + * But the limit should not be set too high. Because it also controls the + * amount of memory which the balance_dirty_pages() caller has to write back. + * If this is too large then the caller will block on the IO queue all the + * time. So limit it to four megabytes - the balance_dirty_pages() caller + * will write six megabyte chunks, max. + */ + +void writeback_set_ratelimit(void) +{ + ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); + if (ratelimit_pages < 16) + ratelimit_pages = 16; + if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) + ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; +} + +static int __cpuinit +ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) +{ + writeback_set_ratelimit(); + return NOTIFY_DONE; +} + +static struct notifier_block __cpuinitdata ratelimit_nb = { + .notifier_call = ratelimit_handler, + .next = NULL, +}; + +/* + * Called early on to tune the page writeback dirty limits. + * + * We used to scale dirty pages according to how total memory + * related to pages that could be allocated for buffers (by + * comparing nr_free_buffer_pages() to vm_total_pages. + * + * However, that was when we used "dirty_ratio" to scale with + * all memory, and we don't do that any more. "dirty_ratio" + * is now applied to total non-HIGHPAGE memory (by subtracting + * totalhigh_pages from vm_total_pages), and as such we can't + * get into the old insane situation any more where we had + * large amounts of dirty pages compared to a small amount of + * non-HIGHMEM memory. + * + * But we might still want to scale the dirty_ratio by how + * much memory the box has.. + */ +void __init page_writeback_init(void) +{ + int shift; + + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + writeback_set_ratelimit(); + register_cpu_notifier(&ratelimit_nb); + + shift = calc_period_shift(); + prop_descriptor_init(&vm_completions, shift); + prop_descriptor_init(&vm_dirties, shift); +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int cycled; + int range_whole = 0; + long nr_to_write = wbc->nr_to_write; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } +retry: + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + done = 1; + break; + } + + done_index = page->index + 1; + + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done = 1; + break; + } + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; + } + } + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!cycled && !done) { + /* + * range_cyclic: + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (!wbc->no_nrwrite_index_update) { + if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) + mapping->writeback_index = done_index; + wbc->nr_to_write = nr_to_write; + } + + return ret; +} +EXPORT_SYMBOL(write_cache_pages); + +/* + * Function used by generic_writepages to call the real writepage + * function and set the mapping flags on error + */ +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +/** + * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + return write_cache_pages(mapping, wbc, __writepage, mapping); +} + +EXPORT_SYMBOL(generic_writepages); + +int do_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + int ret; + + if (wbc->nr_to_write <= 0) + return 0; + wbc->for_writepages = 1; + if (mapping->a_ops->writepages) + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + wbc->for_writepages = 0; + return ret; +} + +/** + * write_one_page - write out a single page and optionally wait on I/O + * @page: the page to write + * @wait: if true, wait on writeout + * + * The page must be locked by the caller and will be unlocked upon return. + * + * write_one_page() returns a negative error code if I/O failed. + */ +int write_one_page(struct page *page, int wait) +{ + struct address_space *mapping = page->mapping; + int ret = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + BUG_ON(!PageLocked(page)); + + if (wait) + wait_on_page_writeback(page); + + if (clear_page_dirty_for_io(page)) { + page_cache_get(page); + ret = mapping->a_ops->writepage(page, &wbc); + if (ret == 0 && wait) { + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + page_cache_release(page); + } else { + unlock_page(page); + } + return ret; +} +EXPORT_SYMBOL(write_one_page); + +/* + * For address_spaces which do not use buffers nor write back. + */ +int __set_page_dirty_no_writeback(struct page *page) +{ + if (!PageDirty(page)) + SetPageDirty(page); + return 0; +} + +/* + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. + * + * This is also used when a single buffer is being dirtied: we want to set the + * page dirty in that case, but not all the buffers. This is a "bottom-up" + * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * + * Most callers have locked the page, which pins the address_space in memory. + * But zap_pte_range() does not lock the page, however in that case the + * mapping is pinned by the vma's ->vm_file reference. + * + * We take care to handle the case where the page was truncated from the + * mapping by re-checking page_mapping() inside tree_lock. + */ +int __set_page_dirty_nobuffers(struct page *page) +{ + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + struct address_space *mapping2; + + if (!mapping) + return 1; + + spin_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ + BUG_ON(mapping2 != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + task_io_account_write(PAGE_CACHE_SIZE); + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&mapping->tree_lock); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return 1; + } + return 0; +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +/* + * When a writepage implementation decides that it doesn't want to write this + * page for some reason, it should redirty the locked page via + * redirty_page_for_writepage() and it should then unlock the page and return 0 + */ +int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +{ + wbc->pages_skipped++; + return __set_page_dirty_nobuffers(page); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +/* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + */ +static int __set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (likely(mapping)) { + int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); + } + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } + return 0; +} + +int set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty(page); + if (ret) + task_dirty_inc(current); + return ret; +} +EXPORT_SYMBOL(set_page_dirty); + +/* + * set_page_dirty() is racy if the caller has no reference against + * page->mapping->host, and if the page is unlocked. This is because another + * CPU could truncate the page off the mapping and then free the mapping. + * + * Usually, the page _is_ locked, or the caller is a user-space process which + * holds a reference on the inode by having an open file. + * + * In other cases, the page should be locked before running set_page_dirty(). + */ +int set_page_dirty_lock(struct page *page) +{ + int ret; + + lock_page_nosync(page); + ret = set_page_dirty(page); + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(set_page_dirty_lock); + +/* + * Clear a page's dirty flag, while caring for dirty memory accounting. + * Returns true if the page was previously dirty. + * + * This is for preparing to put the page under writeout. We leave the page + * tagged as dirty in the radix tree so that a concurrent write-for-sync + * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage + * implementation will run either set_page_writeback() or set_page_dirty(), + * at which stage we bring the page's dirty flag and radix-tree dirty tag + * back into sync. + * + * This incoherency between the page's dirty flag and radix-tree tag is + * unfortunate, but it only exists while the page is locked. + */ +int clear_page_dirty_for_io(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + BUG_ON(!PageLocked(page)); + + ClearPageReclaim(page); + if (mapping && mapping_cap_account_dirty(mapping)) { + /* + * Yes, Virginia, this is indeed insane. + * + * We use this sequence to make sure that + * (a) we account for dirty stats properly + * (b) we tell the low-level filesystem to + * mark the whole page dirty if it was + * dirty in a pagetable. Only to then + * (c) clean the page again and return 1 to + * cause the writeback. + * + * This way we avoid all nasty races with the + * dirty bit in multiple places and clearing + * them concurrently from different threads. + * + * Note! Normally the "set_page_dirty(page)" + * has no effect on the actual dirty bit - since + * that will already usually be set. But we + * need the side effects, and it can help us + * avoid races. + * + * We basically use the page "master dirty bit" + * as a serialization point for all the different + * threads doing their things. + */ + if (page_mkclean(page)) + set_page_dirty(page); + /* + * We carefully synchronise fault handlers against + * installing a dirty pte and marking the page dirty + * at this point. We do this by having them hold the + * page lock at some point after installing their + * pte, but before marking the page dirty. + * Pages are always locked coming in here, so we get + * the desired exclusion. See mm/memory.c:do_wp_page() + * for more comments. + */ + if (TestClearPageDirty(page)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + return 1; + } + return 0; + } + return TestClearPageDirty(page); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +int test_clear_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestClearPageWriteback(page); + if (ret) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_account_writeback(bdi)) { + __dec_bdi_stat(bdi, BDI_WRITEBACK); + __bdi_writeout_inc(bdi); + } + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestClearPageWriteback(page); + } + if (ret) + dec_zone_page_state(page, NR_WRITEBACK); + return ret; +} + +int test_set_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestSetPageWriteback(page); + if (!ret) { + radix_tree_tag_set(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_account_writeback(bdi)) + __inc_bdi_stat(bdi, BDI_WRITEBACK); + } + if (!PageDirty(page)) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestSetPageWriteback(page); + } + if (!ret) + inc_zone_page_state(page, NR_WRITEBACK); + return ret; + +} +EXPORT_SYMBOL(test_set_page_writeback); + +/* + * Return true if any of the pages in the mapping are marked with the + * passed tag. + */ +int mapping_tagged(struct address_space *mapping, int tag) +{ + int ret; + rcu_read_lock(); + ret = radix_tree_tagged(&mapping->page_tree, tag); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c new file mode 100644 index 0000000..566ee21 --- /dev/null +++ b/mm/page_alloc.c @@ -0,0 +1,4779 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include <linux/stddef.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/interrupt.h> +#include <linux/pagemap.h> +#include <linux/jiffies.h> +#include <linux/bootmem.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/suspend.h> +#include <linux/pagevec.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/oom.h> +#include <linux/notifier.h> +#include <linux/topology.h> +#include <linux/sysctl.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/memory_hotplug.h> +#include <linux/nodemask.h> +#include <linux/vmalloc.h> +#include <linux/mempolicy.h> +#include <linux/stop_machine.h> +#include <linux/sort.h> +#include <linux/pfn.h> +#include <linux/backing-dev.h> +#include <linux/fault-inject.h> +#include <linux/page-isolation.h> +#include <linux/page_cgroup.h> +#include <linux/debugobjects.h> + +#include <asm/tlbflush.h> +#include <asm/div64.h> +#include "internal.h" + +/* + * Array of node states. + */ +nodemask_t node_states[NR_NODE_STATES] __read_mostly = { + [N_POSSIBLE] = NODE_MASK_ALL, + [N_ONLINE] = { { [0] = 1UL } }, +#ifndef CONFIG_NUMA + [N_NORMAL_MEMORY] = { { [0] = 1UL } }, +#ifdef CONFIG_HIGHMEM + [N_HIGH_MEMORY] = { { [0] = 1UL } }, +#endif + [N_CPU] = { { [0] = 1UL } }, +#endif /* NUMA */ +}; +EXPORT_SYMBOL(node_states); + +unsigned long totalram_pages __read_mostly; +unsigned long totalreserve_pages __read_mostly; +long nr_swap_pages; +int percpu_pagelist_fraction; + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE +int pageblock_order __read_mostly; +#endif + +static void __free_pages_ok(struct page *page, unsigned int order); + +/* + * results with 256, 32 in the lowmem_reserve sysctl: + * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) + * 1G machine -> (16M dma, 784M normal, 224M high) + * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA + * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL + * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation + */ +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +#ifdef CONFIG_ZONE_DMA + 256, +#endif +#ifdef CONFIG_ZONE_DMA32 + 256, +#endif +#ifdef CONFIG_HIGHMEM + 32, +#endif + 32, +}; + +EXPORT_SYMBOL(totalram_pages); + +static char * const zone_names[MAX_NR_ZONES] = { +#ifdef CONFIG_ZONE_DMA + "DMA", +#endif +#ifdef CONFIG_ZONE_DMA32 + "DMA32", +#endif + "Normal", +#ifdef CONFIG_HIGHMEM + "HighMem", +#endif + "Movable", +}; + +int min_free_kbytes = 1024; + +unsigned long __meminitdata nr_kernel_pages; +unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata dma_reserve; + +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * MAX_ACTIVE_REGIONS determines the maximum number of distinct + * ranges of memory (RAM) that may be registered with add_active_range(). + * Ranges passed to add_active_range() will be merged if possible + * so the number of times add_active_range() can be called is + * related to the number of nodes and the number of holes + */ + #ifdef CONFIG_MAX_ACTIVE_REGIONS + /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ + #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS + #else + #if MAX_NUMNODES >= 32 + /* If there can be many nodes, allow up to 50 holes per node */ + #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) + #else + /* By default, allow up to 256 distinct regions */ + #define MAX_ACTIVE_REGIONS 256 + #endif + #endif + + static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; + static int __meminitdata nr_nodemap_entries; + static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ + static unsigned long __initdata required_kernelcore; + static unsigned long __initdata required_movablecore; + static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ + int movable_zone; + EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +#if MAX_NUMNODES > 1 +int nr_node_ids __read_mostly = MAX_NUMNODES; +EXPORT_SYMBOL(nr_node_ids); +#endif + +int page_group_by_mobility_disabled __read_mostly; + +static void set_pageblock_migratetype(struct page *page, int migratetype) +{ + set_pageblock_flags_group(page, (unsigned long)migratetype, + PB_migrate, PB_migrate_end); +} + +#ifdef CONFIG_DEBUG_VM +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ + if (!pfn_valid_within(page_to_pfn(page))) + return 0; + if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) + return 1; + if (!page_is_consistent(zone, page)) + return 1; + + return 0; +} +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page) +{ + printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG + "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); + + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n"); + dump_stack(); + page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; + set_page_count(page, 0); + reset_page_mapcount(page); + page->mapping = NULL; + add_taint(TAINT_BAD_PAGE); +} + +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All pages have their ->private pointing at + * the head page (even the head page has this). + * + * The first tail page's ->lru.next holds the address of the compound page's + * put_page() function. Its ->lru.prev holds the order of allocation. + * This usage means that zero-order pages may not be compound. + */ + +static void free_compound_page(struct page *page) +{ + __free_pages_ok(page, compound_order(page)); +} + +void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + set_compound_page_dtor(page, free_compound_page); + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; + + __SetPageTail(p); + p->first_page = page; + } +} + +#ifdef CONFIG_HUGETLBFS +void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + set_compound_page_dtor(page, free_compound_page); + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} +#endif + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (unlikely(compound_order(page) != order)) + bad_page(page); + + if (unlikely(!PageHead(page))) + bad_page(page); + __ClearPageHead(page); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; + + if (unlikely(!PageTail(p) | + (p->first_page != page))) + bad_page(page); + __ClearPageTail(p); + } +} + +static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. + */ + VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + +static inline void set_page_order(struct page *page, int order) +{ + set_page_private(page, order); + __SetPageBuddy(page); +} + +static inline void rmv_page_order(struct page *page) +{ + __ClearPageBuddy(page); + set_page_private(page, 0); +} + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline struct page * +__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) +{ + unsigned long buddy_idx = page_idx ^ (1 << order); + + return page + (buddy_idx - page_idx); +} + +static inline unsigned long +__find_combined_index(unsigned long page_idx, unsigned int order) +{ + return (page_idx & ~(1 << order)); +} + +/* + * This function checks whether a page is free && is the buddy + * we can do coalesce a page and its buddy if + * (a) the buddy is not in a hole && + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. + * + * For recording whether a page is in the buddy system, we use PG_buddy. + * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * + * For recording page's order, we use page_private(page). + */ +static inline int page_is_buddy(struct page *page, struct page *buddy, + int order) +{ + if (!pfn_valid_within(page_to_pfn(buddy))) + return 0; + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + if (PageBuddy(buddy) && page_order(buddy) == order) { + BUG_ON(page_count(buddy) != 0); + return 1; + } + return 0; +} + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of continuous + * free pages of length of (1 << order) and marked with PG_buddy. Page's + * order is recorded in page_private(page) field. + * So when we are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static inline void __free_one_page(struct page *page, + struct zone *zone, unsigned int order) +{ + unsigned long page_idx; + int order_size = 1 << order; + int migratetype = get_pageblock_migratetype(page); + + if (unlikely(PageCompound(page))) + destroy_compound_page(page, order); + + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + + VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(bad_range(zone, page)); + + __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); + while (order < MAX_ORDER-1) { + unsigned long combined_idx; + struct page *buddy; + + buddy = __page_find_buddy(page, page_idx, order); + if (!page_is_buddy(page, buddy, order)) + break; + + /* Our buddy is free, merge with it and move up one order. */ + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); + list_add(&page->lru, + &zone->free_area[order].free_list[migratetype]); + zone->free_area[order].nr_free++; +} + +static inline int free_pages_check(struct page *page) +{ + free_page_mlock(page); + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) + bad_page(page); + if (PageDirty(page)) + __ClearPageDirty(page); + if (PageSwapBacked(page)) + __ClearPageSwapBacked(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. + */ +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) +{ + spin_lock(&zone->lock); + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->pages_scanned = 0; + while (count--) { + struct page *page; + + VM_BUG_ON(list_empty(list)); + page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, order); + } + spin_unlock(&zone->lock); +} + +static void free_one_page(struct zone *zone, struct page *page, int order) +{ + spin_lock(&zone->lock); + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->pages_scanned = 0; + __free_one_page(page, zone, order); + spin_unlock(&zone->lock); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; + int i; + int reserved = 0; + + for (i = 0 ; i < (1 << order) ; ++i) + reserved += free_pages_check(page + i); + if (reserved) + return; + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } + arch_free_page(page, order); + kernel_map_pages(page, 1 << order, 0); + + local_irq_save(flags); + __count_vm_events(PGFREE, 1 << order); + free_one_page(page_zone(page), page, order); + local_irq_restore(flags); +} + +/* + * permit the bootmem allocator to evade page validation on high-order frees + */ +void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +{ + if (order == 0) { + __ClearPageReserved(page); + set_page_count(page, 0); + set_page_refcounted(page); + __free_page(page); + } else { + int loop; + + prefetchw(page); + for (loop = 0; loop < BITS_PER_LONG; loop++) { + struct page *p = &page[loop]; + + if (loop + 1 < BITS_PER_LONG) + prefetchw(p + 1); + __ClearPageReserved(p); + set_page_count(p, 0); + } + + set_page_refcounted(page); + __free_pages(page, order); + } +} + + +/* + * The order of subdivision here is critical for the IO subsystem. + * Please do not alter this order without good reasons and regression + * testing. Specifically, as large blocks of memory are subdivided, + * the order in which smaller blocks are delivered depends on the order + * they're subdivided in this function. This is the primary factor + * influencing the order in which pages are delivered to the IO + * subsystem according to empirical testing, and this is also justified + * by considering the behavior of a buddy system containing a single + * large block of memory acted on by a series of small allocations. + * This behavior is a critical factor in sglist merging's success. + * + * -- wli + */ +static inline void expand(struct zone *zone, struct page *page, + int low, int high, struct free_area *area, + int migratetype) +{ + unsigned long size = 1 << high; + + while (high > low) { + area--; + high--; + size >>= 1; + VM_BUG_ON(bad_range(zone, &page[size])); + list_add(&page[size].lru, &area->free_list[migratetype]); + area->nr_free++; + set_page_order(&page[size], high); + } +} + +/* + * This page is about to be returned from the page allocator + */ +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) + bad_page(page); + + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk +#ifdef CONFIG_UNEVICTABLE_LRU + | 1 << PG_mlocked +#endif + ); + set_page_private(page, 0); + set_page_refcounted(page); + + arch_alloc_page(page, order); + kernel_map_pages(page, 1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + + return 0; +} + +/* + * Go through the free lists for the given migratetype and remove + * the smallest available page from the freelists + */ +static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + int migratetype) +{ + unsigned int current_order; + struct free_area * area; + struct page *page; + + /* Find a page of the appropriate size in the preferred list */ + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = &(zone->free_area[current_order]); + if (list_empty(&area->free_list[migratetype])) + continue; + + page = list_entry(area->free_list[migratetype].next, + struct page, lru); + list_del(&page->lru); + rmv_page_order(page); + area->nr_free--; + __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); + expand(zone, page, order, current_order, area, migratetype); + return page; + } + + return NULL; +} + + +/* + * This array describes the order lists are fallen back to when + * the free lists for the desirable migrate type are depleted + */ +static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ +}; + +/* + * Move the free pages in a range to the free lists of the requested type. + * Note that start_page and end_pages are not aligned on a pageblock + * boundary. If alignment is required, use move_freepages_block() + */ +static int move_freepages(struct zone *zone, + struct page *start_page, struct page *end_page, + int migratetype) +{ + struct page *page; + unsigned long order; + int pages_moved = 0; + +#ifndef CONFIG_HOLES_IN_ZONE + /* + * page_zone is not safe to call in this context when + * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant + * anyway as we check zone boundaries in move_freepages_block(). + * Remove at a later date when no bug reports exist related to + * grouping pages by mobility + */ + BUG_ON(page_zone(start_page) != page_zone(end_page)); +#endif + + for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + + if (!pfn_valid_within(page_to_pfn(page))) { + page++; + continue; + } + + if (!PageBuddy(page)) { + page++; + continue; + } + + order = page_order(page); + list_del(&page->lru); + list_add(&page->lru, + &zone->free_area[order].free_list[migratetype]); + page += 1 << order; + pages_moved += 1 << order; + } + + return pages_moved; +} + +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, end_pfn; + struct page *start_page, *end_page; + + start_pfn = page_to_pfn(page); + start_pfn = start_pfn & ~(pageblock_nr_pages-1); + start_page = pfn_to_page(start_pfn); + end_page = start_page + pageblock_nr_pages - 1; + end_pfn = start_pfn + pageblock_nr_pages - 1; + + /* Do not cross zone boundaries */ + if (start_pfn < zone->zone_start_pfn) + start_page = page; + if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) + return 0; + + return move_freepages(zone, start_page, end_page, migratetype); +} + +/* Remove an element from the buddy allocator from the fallback list */ +static struct page *__rmqueue_fallback(struct zone *zone, int order, + int start_migratetype) +{ + struct free_area * area; + int current_order; + struct page *page; + int migratetype, i; + + /* Find the largest possible block of pages in the other list */ + for (current_order = MAX_ORDER-1; current_order >= order; + --current_order) { + for (i = 0; i < MIGRATE_TYPES - 1; i++) { + migratetype = fallbacks[start_migratetype][i]; + + /* MIGRATE_RESERVE handled later if necessary */ + if (migratetype == MIGRATE_RESERVE) + continue; + + area = &(zone->free_area[current_order]); + if (list_empty(&area->free_list[migratetype])) + continue; + + page = list_entry(area->free_list[migratetype].next, + struct page, lru); + area->nr_free--; + + /* + * If breaking a large block of pages, move all free + * pages to the preferred allocation list. If falling + * back for a reclaimable kernel allocation, be more + * agressive about taking ownership of free pages + */ + if (unlikely(current_order >= (pageblock_order >> 1)) || + start_migratetype == MIGRATE_RECLAIMABLE) { + unsigned long pages; + pages = move_freepages_block(zone, page, + start_migratetype); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1))) + set_pageblock_migratetype(page, + start_migratetype); + + migratetype = start_migratetype; + } + + /* Remove the page from the freelists */ + list_del(&page->lru); + rmv_page_order(page); + __mod_zone_page_state(zone, NR_FREE_PAGES, + -(1UL << order)); + + if (current_order == pageblock_order) + set_pageblock_migratetype(page, + start_migratetype); + + expand(zone, page, order, current_order, area, migratetype); + return page; + } + } + + /* Use MIGRATE_RESERVE rather than fail an allocation */ + return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. + */ +static struct page *__rmqueue(struct zone *zone, unsigned int order, + int migratetype) +{ + struct page *page; + + page = __rmqueue_smallest(zone, order, migratetype); + + if (unlikely(!page)) + page = __rmqueue_fallback(zone, order, migratetype); + + return page; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list, + int migratetype) +{ + int i; + + spin_lock(&zone->lock); + for (i = 0; i < count; ++i) { + struct page *page = __rmqueue(zone, order, migratetype); + if (unlikely(page == NULL)) + break; + + /* + * Split buddy pages returned by expand() are received here + * in physical page order. The page is added to the callers and + * list and the list head then moves forward. From the callers + * perspective, the linked list is ordered by page number in + * some conditions. This is useful for IO devices that can + * merge IO requests if the physical pages are ordered + * properly. + */ + list_add(&page->lru, list); + set_page_private(page, migratetype); + list = &page->lru; + } + spin_unlock(&zone->lock); + return i; +} + +#ifdef CONFIG_NUMA +/* + * Called from the vmstat counter updater to drain pagesets of this + * currently executing processor on remote nodes after they have + * expired. + * + * Note that this function must be called with the thread pinned to + * a single processor. + */ +void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) +{ + unsigned long flags; + int to_drain; + + local_irq_save(flags); + if (pcp->count >= pcp->batch) + to_drain = pcp->batch; + else + to_drain = pcp->count; + free_pages_bulk(zone, to_drain, &pcp->list, 0); + pcp->count -= to_drain; + local_irq_restore(flags); +} +#endif + +/* + * Drain pages of the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages(unsigned int cpu) +{ + unsigned long flags; + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + if (!populated_zone(zone)) + continue; + + pset = zone_pcp(zone, cpu); + + pcp = &pset->pcp; + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void *arg) +{ + drain_pages(smp_processor_id()); +} + +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator + */ +void drain_all_pages(void) +{ + on_each_cpu(drain_local_pages, NULL, 1); +} + +#ifdef CONFIG_HIBERNATION + +void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn; + unsigned long flags; + int order, t; + struct list_head *curr; + + if (!zone->spanned_pages) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each(curr, &zone->free_area[order].free_list[t]) { + unsigned long i; + + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + for (i = 0; i < (1UL << order); i++) + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif /* CONFIG_PM */ + +/* + * Free a 0-order page + */ +static void free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + if (PageAnon(page)) + page->mapping = NULL; + if (free_pages_check(page)) + return; + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), PAGE_SIZE); + debug_check_no_obj_freed(page_address(page), PAGE_SIZE); + } + arch_free_page(page, 0); + kernel_map_pages(page, 1, 0); + + pcp = &zone_pcp(zone, get_cpu())->pcp; + local_irq_save(flags); + __count_vm_event(PGFREE); + if (cold) + list_add_tail(&page->lru, &pcp->list); + else + list_add(&page->lru, &pcp->list); + set_page_private(page, get_pageblock_migratetype(page)); + pcp->count++; + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } + local_irq_restore(flags); + put_cpu(); +} + +void free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1<<order) sub-pages: page[0..n] + * Each sub-page must be freed individually. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +void split_page(struct page *page, unsigned int order) +{ + int i; + + VM_BUG_ON(PageCompound(page)); + VM_BUG_ON(!page_count(page)); + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); +} + +/* + * Really, prep_compound_page() should be called from __rmqueue_bulk(). But + * we cheat by calling it from here, in the order > 0 path. Saves a branch + * or two. + */ +static struct page *buffered_rmqueue(struct zone *preferred_zone, + struct zone *zone, int order, gfp_t gfp_flags) +{ + unsigned long flags; + struct page *page; + int cold = !!(gfp_flags & __GFP_COLD); + int cpu; + int migratetype = allocflags_to_migratetype(gfp_flags); + +again: + cpu = get_cpu(); + if (likely(order == 0)) { + struct per_cpu_pages *pcp; + + pcp = &zone_pcp(zone, cpu)->pcp; + local_irq_save(flags); + if (!pcp->count) { + pcp->count = rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list, migratetype); + if (unlikely(!pcp->count)) + goto failed; + } + + /* Find a page of the appropriate migrate type */ + if (cold) { + list_for_each_entry_reverse(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + } else { + list_for_each_entry(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + } + + /* Allocate more to the pcp list if necessary */ + if (unlikely(&page->lru == &pcp->list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list, migratetype); + page = list_entry(pcp->list.next, struct page, lru); + } + + list_del(&page->lru); + pcp->count--; + } else { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order, migratetype); + spin_unlock(&zone->lock); + if (!page) + goto failed; + } + + __count_zone_vm_events(PGALLOC, zone, 1 << order); + zone_statistics(preferred_zone, zone); + local_irq_restore(flags); + put_cpu(); + + VM_BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order, gfp_flags)) + goto again; + return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; +} + +#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + +#ifdef CONFIG_FAIL_PAGE_ALLOC + +static struct fail_page_alloc_attr { + struct fault_attr attr; + + u32 ignore_gfp_highmem; + u32 ignore_gfp_wait; + u32 min_order; + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + + struct dentry *ignore_gfp_highmem_file; + struct dentry *ignore_gfp_wait_file; + struct dentry *min_order_file; + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, + .ignore_gfp_highmem = 1, + .min_order = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + if (order < fail_page_alloc.min_order) + return 0; + if (gfp_mask & __GFP_NOFAIL) + return 0; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return 0; + if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + return 0; + + return should_fail(&fail_page_alloc.attr, 1 << order); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + int err; + + err = init_fault_attr_dentries(&fail_page_alloc.attr, + "fail_page_alloc"); + if (err) + return err; + dir = fail_page_alloc.attr.dentries.dir; + + fail_page_alloc.ignore_gfp_wait_file = + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_wait); + + fail_page_alloc.ignore_gfp_highmem_file = + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + fail_page_alloc.min_order_file = + debugfs_create_u32("min-order", mode, dir, + &fail_page_alloc.min_order); + + if (!fail_page_alloc.ignore_gfp_wait_file || + !fail_page_alloc.ignore_gfp_highmem_file || + !fail_page_alloc.min_order_file) { + err = -ENOMEM; + debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); + debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); + debugfs_remove(fail_page_alloc.min_order_file); + cleanup_fault_attr_dentries(&fail_page_alloc.attr); + } + + return err; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAIL_PAGE_ALLOC */ + +static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return 0; +} + +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + +/* + * Return 1 if free pages are above 'mark'. This takes into account the order + * of the allocation. + */ +int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + /* free_pages my go negative - that's OK */ + long min = mark; + long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; + int o; + + if (alloc_flags & ALLOC_HIGH) + min -= min / 2; + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; + + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + return 0; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ + free_pages -= z->free_area[o].nr_free << o; + + /* Require fewer higher order pages to be free */ + min >>= 1; + + if (free_pages <= min) + return 0; + } + return 1; +} + +#ifdef CONFIG_NUMA +/* + * zlc_setup - Setup for "zonelist cache". Uses cached zone data to + * skip over zones that are not allowed by the cpuset, or that have + * been recently (in last second) found to be nearly full. See further + * comments in mmzone.h. Reduces cache footprint of zonelist scans + * that have to skip over a lot of full or unallowed zones. + * + * If the zonelist cache is present in the passed in zonelist, then + * returns a pointer to the allowed node mask (either the current + * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) + * + * If the zonelist cache is not available for this zonelist, does + * nothing and returns NULL. + * + * If the fullzones BITMAP in the zonelist cache is stale (more than + * a second since last zap'd) then we zap it out (clear its bits.) + * + * We hold off even calling zlc_setup, until after we've checked the + * first zone in the zonelist, on the theory that most allocations will + * be satisfied from that first zone, so best to examine that zone as + * quickly as we can. + */ +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + nodemask_t *allowednodes; /* zonelist_cache approximation */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return NULL; + + if (time_after(jiffies, zlc->last_full_zap + HZ)) { + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + zlc->last_full_zap = jiffies; + } + + allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? + &cpuset_current_mems_allowed : + &node_states[N_HIGH_MEMORY]; + return allowednodes; +} + +/* + * Given 'z' scanning a zonelist, run a couple of quick checks to see + * if it is worth looking at further for free memory: + * 1) Check that the zone isn't thought to be full (doesn't have its + * bit set in the zonelist_cache fullzones BITMAP). + * 2) Check that the zones node (obtained from the zonelist_cache + * z_to_n[] mapping) is allowed in the passed in allowednodes mask. + * Return true (non-zero) if zone is worth looking at further, or + * else return false (zero) if it is not. + * + * This check -ignores- the distinction between various watermarks, + * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is + * found to be full for any variation of these watermarks, it will + * be considered full for up to one second by all requests, unless + * we are so low on memory on all allowed nodes that we are forced + * into the second scan of the zonelist. + * + * In the second scan we ignore this zonelist cache and exactly + * apply the watermarks to all zones, even it is slower to do so. + * We are low on memory in the second scan, and should leave no stone + * unturned looking for a free page. + */ +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, + nodemask_t *allowednodes) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + int n; /* node that zone *z is on */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return 1; + + i = z - zonelist->_zonerefs; + n = zlc->z_to_n[i]; + + /* This zone is worth trying if it is allowed but not full */ + return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); +} + +/* + * Given 'z' scanning a zonelist, set the corresponding bit in + * zlc->fullzones, so that subsequent attempts to allocate a page + * from that zone don't waste time re-examining it. + */ +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + i = z - zonelist->_zonerefs; + + set_bit(i, zlc->fullzones); +} + +#else /* CONFIG_NUMA */ + +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + return NULL; +} + +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, + nodemask_t *allowednodes) +{ + return 1; +} + +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) +{ +} +#endif /* CONFIG_NUMA */ + +/* + * get_page_from_freelist goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, + struct zonelist *zonelist, int high_zoneidx, int alloc_flags) +{ + struct zoneref *z; + struct page *page = NULL; + int classzone_idx; + struct zone *zone, *preferred_zone; + nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ + int zlc_active = 0; /* set if using zonelist_cache */ + int did_zlc_setup = 0; /* just call zlc_setup() one time */ + + (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, + &preferred_zone); + if (!preferred_zone) + return NULL; + + classzone_idx = zone_idx(preferred_zone); + +zonelist_scan: + /* + * Scan zonelist, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) { + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed_softwall(zone, gfp_mask)) + goto try_next_zone; + + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = zone->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = zone->pages_low; + else + mark = zone->pages_high; + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { + if (!zone_reclaim_mode || + !zone_reclaim(zone, gfp_mask, order)) + goto this_zone_full; + } + } + + page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); + if (page) + break; +this_zone_full: + if (NUMA_BUILD) + zlc_mark_zone_full(zonelist, z); +try_next_zone: + if (NUMA_BUILD && !did_zlc_setup) { + /* we do zlc_setup after the first zone is tried */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; + } + } + + if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + goto zonelist_scan; + } + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zoneref *z; + struct zone *zone; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int do_retry; + int alloc_flags; + unsigned long did_some_progress; + unsigned long pages_reclaimed = 0; + + might_sleep_if(wait); + + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + +restart: + z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ + + if (unlikely(!z->zone)) { + /* + * Happens if we have an empty zonelist as a result of + * GFP_THISNODE being used on a memoryless node + */ + return NULL; + } + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) + goto got_pg; + + /* + * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and + * __GFP_NOWARN set) should not cause reclaim since the subsystem + * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim + * using a larger set of nodes after it has established that the + * allowed per node queues are empty and that nodes are + * over allocated. + */ + if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + goto nopage; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); + + /* + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + * + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + */ + alloc_flags = ALLOC_WMARK_MIN; + if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + if (wait) + alloc_flags |= ALLOC_CPUSET; + + /* + * Go through the zonelist again. Let __GFP_HIGH and allocations + * coming from realtime tasks go deeper into reserves. + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, + high_zoneidx, alloc_flags); + if (page) + goto got_pg; + + /* This allocation should allow future memory freeing. */ + +rebalance: + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) { + if (!(gfp_mask & __GFP_NOMEMALLOC)) { +nofail_alloc: + /* go through the zonelist yet again, ignoring mins */ + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + if (page) + goto got_pg; + if (gfp_mask & __GFP_NOFAIL) { + congestion_wait(WRITE, HZ/50); + goto nofail_alloc; + } + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + cpuset_update_task_memory_state(); + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (order != 0) + drain_all_pages(); + + if (likely(did_some_progress)) { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, alloc_flags); + if (page) + goto got_pg; + } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (!try_set_zone_oom(zonelist, gfp_mask)) { + schedule_timeout_uninterruptible(1); + goto restart; + } + + /* + * Go through the zonelist yet one more time, keep + * very high watermark here, this is only to catch + * a parallel oom killing, we must fail if we're still + * under heavy pressure. + */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); + if (page) { + clear_zonelist_oom(zonelist, gfp_mask); + goto got_pg; + } + + /* The OOM killer will not help higher order allocs so fail */ + if (order > PAGE_ALLOC_COSTLY_ORDER) { + clear_zonelist_oom(zonelist, gfp_mask); + goto nopage; + } + + out_of_memory(zonelist, gfp_mask, order); + clear_zonelist_oom(zonelist, gfp_mask); + goto restart; + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + * + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + pages_reclaimed += did_some_progress; + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if (order <= PAGE_ALLOC_COSTLY_ORDER) { + do_retry = 1; + } else { + if (gfp_mask & __GFP_REPEAT && + pages_reclaimed < (1 << order)) + do_retry = 1; + } + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + show_mem(); + } +got_pg: + return page; +} +EXPORT_SYMBOL(__alloc_pages_internal); + +/* + * Common helper functions. + */ +unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page * page; + page = alloc_pages(gfp_mask, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} + +EXPORT_SYMBOL(__get_free_pages); + +unsigned long get_zeroed_page(gfp_t gfp_mask) +{ + struct page * page; + + /* + * get_zeroed_page() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + + page = alloc_pages(gfp_mask | __GFP_ZERO, 0); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +EXPORT_SYMBOL(get_zeroed_page); + +void __pagevec_free(struct pagevec *pvec) +{ + int i = pagevec_count(pvec); + + while (--i >= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +void __free_pages(struct page *page, unsigned int order) +{ + if (put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } +} + +EXPORT_SYMBOL(__free_pages); + +void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_pages(virt_to_page((void *)addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/** + * alloc_pages_exact - allocate an exact number physically-contiguous pages. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * This function is similar to alloc_pages(), except that it allocates the + * minimum number of pages to satisfy the request. alloc_pages() can only + * allocate memory in power-of-two pages. + * + * This function is also limited by MAX_ORDER. + * + * Memory allocated by this function must be released by free_pages_exact(). + */ +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +{ + unsigned int order = get_order(size); + unsigned long addr; + + addr = __get_free_pages(gfp_mask, order); + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page(addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + + return (void *)addr; +} +EXPORT_SYMBOL(alloc_pages_exact); + +/** + * free_pages_exact - release memory allocated via alloc_pages_exact() + * @virt: the value returned by alloc_pages_exact. + * @size: size of allocation, same value as passed to alloc_pages_exact(). + * + * Release the memory allocated by a previous call to alloc_pages_exact. + */ +void free_pages_exact(void *virt, size_t size) +{ + unsigned long addr = (unsigned long)virt; + unsigned long end = addr + PAGE_ALIGN(size); + + while (addr < end) { + free_page(addr); + addr += PAGE_SIZE; + } +} +EXPORT_SYMBOL(free_pages_exact); + +static unsigned int nr_free_zone_pages(int offset) +{ + struct zoneref *z; + struct zone *zone; + + /* Just pick one node, since fallback list is circular */ + unsigned int sum = 0; + + struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); + + for_each_zone_zonelist(zone, z, zonelist, offset) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_USER)); +} +EXPORT_SYMBOL_GPL(nr_free_buffer_pages); + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); +} + +static inline void show_node(struct zone *zone) +{ + if (NUMA_BUILD) + printk("Node %d ", zone_to_nid(zone)); +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = global_page_state(NR_FREE_PAGES); + val->bufferram = nr_blockdev_pages(); + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = node_page_state(nid, NR_FREE_PAGES); +#ifdef CONFIG_HIGHMEM + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], + NR_FREE_PAGES); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + int cpu; + struct zone *zone; + + for_each_zone(zone) { + if (!populated_zone(zone)) + continue; + + show_node(zone); + printk("%s per-cpu:\n", zone->name); + + for_each_online_cpu(cpu) { + struct per_cpu_pageset *pageset; + + pageset = zone_pcp(zone, cpu); + + printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", + cpu, pageset->pcp.high, + pageset->pcp.batch, pageset->pcp.count); + } + } + + printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" + " inactive_file:%lu" +//TODO: check/adjust line lengths +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lu" +#endif + " dirty:%lu writeback:%lu unstable:%lu\n" + " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + global_page_state(NR_ACTIVE_ANON), + global_page_state(NR_ACTIVE_FILE), + global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_INACTIVE_FILE), +#ifdef CONFIG_UNEVICTABLE_LRU + global_page_state(NR_UNEVICTABLE), +#endif + global_page_state(NR_FILE_DIRTY), + global_page_state(NR_WRITEBACK), + global_page_state(NR_UNSTABLE_NFS), + global_page_state(NR_FREE_PAGES), + global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_FILE_MAPPED), + global_page_state(NR_PAGETABLE), + global_page_state(NR_BOUNCE)); + + for_each_zone(zone) { + int i; + + if (!populated_zone(zone)) + continue; + + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lukB" +#endif + " present:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + zone->name, + K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone_page_state(zone, NR_ACTIVE_ANON)), + K(zone_page_state(zone, NR_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ACTIVE_FILE)), + K(zone_page_state(zone, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + K(zone_page_state(zone, NR_UNEVICTABLE)), +#endif + K(zone->present_pages), + zone->pages_scanned, + (zone_is_all_unreclaimable(zone) ? "yes" : "no") + ); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->lowmem_reserve[i]); + printk("\n"); + } + + for_each_zone(zone) { + unsigned long nr[MAX_ORDER], flags, order, total = 0; + + if (!populated_zone(zone)) + continue; + + show_node(zone); + printk("%s: ", zone->name); + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr[order] = zone->free_area[order].nr_free; + total += nr[order] << order; + } + spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", nr[order], K(1UL) << order); + printk("= %lukB\n", K(total)); + } + + printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + + show_swap_cache_info(); +} + +static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) +{ + zoneref->zone = zone; + zoneref->zone_idx = zone_idx(zone); +} + +/* + * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. + */ +static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, + int nr_zones, enum zone_type zone_type) +{ + struct zone *zone; + + BUG_ON(zone_type >= MAX_NR_ZONES); + zone_type++; + + do { + zone_type--; + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { + zoneref_set_zone(zone, + &zonelist->_zonerefs[nr_zones++]); + check_highest_zone(zone_type); + } + + } while (zone_type); + return nr_zones; +} + + +/* + * zonelist_order: + * 0 = automatic detection of better ordering. + * 1 = order by ([node] distance, -zonetype) + * 2 = order by (-zonetype, [node] distance) + * + * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create + * the same zonelist. So only NUMA can configure this param. + */ +#define ZONELIST_ORDER_DEFAULT 0 +#define ZONELIST_ORDER_NODE 1 +#define ZONELIST_ORDER_ZONE 2 + +/* zonelist order in the kernel. + * set_zonelist_order() will set this to NODE or ZONE. + */ +static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; +static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; + + +#ifdef CONFIG_NUMA +/* The value user specified ....changed by config */ +static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; +/* string for sysctl */ +#define NUMA_ZONELIST_ORDER_LEN 16 +char numa_zonelist_order[16] = "default"; + +/* + * interface for configure zonelist ordering. + * command line option "numa_zonelist_order" + * = "[dD]efault - default, automatic configuration. + * = "[nN]ode - order by node locality, then by zone within node + * = "[zZ]one - order by zone, then by locality within zone + */ + +static int __parse_numa_zonelist_order(char *s) +{ + if (*s == 'd' || *s == 'D') { + user_zonelist_order = ZONELIST_ORDER_DEFAULT; + } else if (*s == 'n' || *s == 'N') { + user_zonelist_order = ZONELIST_ORDER_NODE; + } else if (*s == 'z' || *s == 'Z') { + user_zonelist_order = ZONELIST_ORDER_ZONE; + } else { + printk(KERN_WARNING + "Ignoring invalid numa_zonelist_order value: " + "%s\n", s); + return -EINVAL; + } + return 0; +} + +static __init int setup_numa_zonelist_order(char *s) +{ + if (s) + return __parse_numa_zonelist_order(s); + return 0; +} +early_param("numa_zonelist_order", setup_numa_zonelist_order); + +/* + * sysctl handler for numa_zonelist_order + */ +int numa_zonelist_order_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, + loff_t *ppos) +{ + char saved_string[NUMA_ZONELIST_ORDER_LEN]; + int ret; + + if (write) + strncpy(saved_string, (char*)table->data, + NUMA_ZONELIST_ORDER_LEN); + ret = proc_dostring(table, write, file, buffer, length, ppos); + if (ret) + return ret; + if (write) { + int oldval = user_zonelist_order; + if (__parse_numa_zonelist_order((char*)table->data)) { + /* + * bogus value. restore saved string + */ + strncpy((char*)table->data, saved_string, + NUMA_ZONELIST_ORDER_LEN); + user_zonelist_order = oldval; + } else if (oldval != user_zonelist_order) + build_all_zonelists(); + } + return 0; +} + + +#define MAX_NODE_LOAD (num_online_nodes()) +static int node_load[MAX_NUMNODES]; + +/** + * find_next_best_node - find the next node that should appear in a given node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: nodemask_t of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. + */ +static int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + node_to_cpumask_ptr(tmp, 0); + + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + node_set(node, *used_node_mask); + return node; + } + + for_each_node_state(n, N_HIGH_MEMORY) { + + /* Don't want a node to appear more than once */ + if (node_isset(n, *used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Penalize nodes under us ("prefer the next node") */ + val += (n < node); + + /* Give preference to headless and unused nodes */ + node_to_cpumask_ptr_next(tmp, n); + if (!cpus_empty(*tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + node_set(best_node, *used_node_mask); + + return best_node; +} + + +/* + * Build zonelists ordered by node and zones within node. + * This results in maximum locality--normal zone overflows into local + * DMA zone, if any--but risks exhausting DMA zone. + */ +static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) +{ + int j; + struct zonelist *zonelist; + + zonelist = &pgdat->node_zonelists[0]; + for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) + ; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; +} + +/* + * Build gfp_thisnode zonelists + */ +static void build_thisnode_zonelists(pg_data_t *pgdat) +{ + int j; + struct zonelist *zonelist; + + zonelist = &pgdat->node_zonelists[1]; + j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; +} + +/* + * Build zonelists ordered by zone and nodes within zones. + * This results in conserving DMA zone[s] until all Normal memory is + * exhausted, but results in overflowing to remote node while memory + * may still exist in local DMA zone. + */ +static int node_order[MAX_NUMNODES]; + +static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) +{ + int pos, j, node; + int zone_type; /* needs to be signed */ + struct zone *z; + struct zonelist *zonelist; + + zonelist = &pgdat->node_zonelists[0]; + pos = 0; + for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { + for (j = 0; j < nr_nodes; j++) { + node = node_order[j]; + z = &NODE_DATA(node)->node_zones[zone_type]; + if (populated_zone(z)) { + zoneref_set_zone(z, + &zonelist->_zonerefs[pos++]); + check_highest_zone(zone_type); + } + } + } + zonelist->_zonerefs[pos].zone = NULL; + zonelist->_zonerefs[pos].zone_idx = 0; +} + +static int default_zonelist_order(void) +{ + int nid, zone_type; + unsigned long low_kmem_size,total_size; + struct zone *z; + int average_size; + /* + * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. + * If they are really small and used heavily, the system can fall + * into OOM very easily. + * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + */ + /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ + low_kmem_size = 0; + total_size = 0; + for_each_online_node(nid) { + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + z = &NODE_DATA(nid)->node_zones[zone_type]; + if (populated_zone(z)) { + if (zone_type < ZONE_NORMAL) + low_kmem_size += z->present_pages; + total_size += z->present_pages; + } + } + } + if (!low_kmem_size || /* there are no DMA area. */ + low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ + return ZONELIST_ORDER_NODE; + /* + * look into each node's config. + * If there is a node whose DMA/DMA32 memory is very big area on + * local memory, NODE_ORDER may be suitable. + */ + average_size = total_size / + (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); + for_each_online_node(nid) { + low_kmem_size = 0; + total_size = 0; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + z = &NODE_DATA(nid)->node_zones[zone_type]; + if (populated_zone(z)) { + if (zone_type < ZONE_NORMAL) + low_kmem_size += z->present_pages; + total_size += z->present_pages; + } + } + if (low_kmem_size && + total_size > average_size && /* ignore small node */ + low_kmem_size > total_size * 70/100) + return ZONELIST_ORDER_NODE; + } + return ZONELIST_ORDER_ZONE; +} + +static void set_zonelist_order(void) +{ + if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) + current_zonelist_order = default_zonelist_order(); + else + current_zonelist_order = user_zonelist_order; +} + +static void build_zonelists(pg_data_t *pgdat) +{ + int j, node, load; + enum zone_type i; + nodemask_t used_mask; + int local_node, prev_node; + struct zonelist *zonelist; + int order = current_zonelist_order; + + /* initialize zonelists */ + for (i = 0; i < MAX_ZONELISTS; i++) { + zonelist = pgdat->node_zonelists + i; + zonelist->_zonerefs[0].zone = NULL; + zonelist->_zonerefs[0].zone_idx = 0; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = num_online_nodes(); + prev_node = local_node; + nodes_clear(used_mask); + + memset(node_load, 0, sizeof(node_load)); + memset(node_order, 0, sizeof(node_order)); + j = 0; + + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + if (distance != node_distance(local_node, prev_node)) + node_load[node] = load; + + prev_node = node; + load--; + if (order == ZONELIST_ORDER_NODE) + build_zonelists_in_node_order(pgdat, node); + else + node_order[j++] = node; /* remember order */ + } + + if (order == ZONELIST_ORDER_ZONE) { + /* calculate node order -- i.e., DMA last! */ + build_zonelists_in_zone_order(pgdat, j); + } + + build_thisnode_zonelists(pgdat); +} + +/* Construct the zonelist performance cache - see further mmzone.h */ +static void build_zonelist_cache(pg_data_t *pgdat) +{ + struct zonelist *zonelist; + struct zonelist_cache *zlc; + struct zoneref *z; + + zonelist = &pgdat->node_zonelists[0]; + zonelist->zlcache_ptr = zlc = &zonelist->zlcache; + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + for (z = zonelist->_zonerefs; z->zone; z++) + zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); +} + + +#else /* CONFIG_NUMA */ + +static void set_zonelist_order(void) +{ + current_zonelist_order = ZONELIST_ORDER_ZONE; +} + +static void build_zonelists(pg_data_t *pgdat) +{ + int node, local_node; + enum zone_type j; + struct zonelist *zonelist; + + local_node = pgdat->node_id; + + zonelist = &pgdat->node_zonelists[0]; + j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + } + for (node = 0; node < local_node; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + } + + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; +} + +/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ +static void build_zonelist_cache(pg_data_t *pgdat) +{ + pgdat->node_zonelists[0].zlcache_ptr = NULL; +} + +#endif /* CONFIG_NUMA */ + +/* return values int ....just for stop_machine() */ +static int __build_all_zonelists(void *dummy) +{ + int nid; + + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + build_zonelists(pgdat); + build_zonelist_cache(pgdat); + } + return 0; +} + +void build_all_zonelists(void) +{ + set_zonelist_order(); + + if (system_state == SYSTEM_BOOTING) { + __build_all_zonelists(NULL); + mminit_verify_zonelist(); + cpuset_init_current_mems_allowed(); + } else { + /* we have to stop all cpus to guarantee there is no user + of zonelist */ + stop_machine(__build_all_zonelists, NULL, NULL); + /* cpuset refresh routine should be here */ + } + vm_total_pages = nr_free_pagecache_pages(); + /* + * Disable grouping by mobility if the number of pages in the + * system is too low to allow the mechanism to work. It would be + * more accurate, but expensive to check per-zone. This check is + * made on memory-hotadd so a system can start with mobility + * disabled and enable it later + */ + if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) + page_group_by_mobility_disabled = 1; + else + page_group_by_mobility_disabled = 0; + + printk("Built %i zonelists in %s order, mobility grouping %s. " + "Total pages: %ld\n", + num_online_nodes(), + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); +#ifdef CONFIG_NUMA + printk("Policy zone: %s\n", zone_names[policy_zone]); +#endif +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} +#else +/* + * A zone's size might be changed by hot-add, so it is not possible to determine + * a suitable size for its wait_table. So we use the maximum size now. + * + * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: + * + * i386 (preemption config) : 4096 x 16 = 64Kbyte. + * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. + * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. + * + * The maximum entries are prepared when a zone's memory is (512K + 256) pages + * or more by the traditional way. (See above). It equals: + * + * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. + * ia64(16K page size) : = ( 8G + 4M)byte. + * powerpc (64K page size) : = (32G +16M)byte. + */ +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) +{ + return 4096UL; +} +#endif + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +/* + * Mark a number of pageblocks as MIGRATE_RESERVE. The number + * of blocks reserved is based on zone->pages_min. The memory within the + * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * higher will lead to a bigger reserve which will get freed as contiguous + * blocks as reclaim kicks in + */ +static void setup_zone_migrate_reserve(struct zone *zone) +{ + unsigned long start_pfn, pfn, end_pfn; + struct page *page; + unsigned long reserve, block_migratetype; + + /* Get the start pfn, end pfn and the number of blocks to reserve */ + start_pfn = zone->zone_start_pfn; + end_pfn = start_pfn + zone->spanned_pages; + reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + pageblock_order; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + + /* Watch out for overlapping nodes */ + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + + /* Blocks with reserved pages will never free, skip them. */ + if (PageReserved(page)) + continue; + + block_migratetype = get_pageblock_migratetype(page); + + /* If this block is reserved, account for it */ + if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, MIGRATE_RESERVE); + move_freepages_block(zone, page, MIGRATE_RESERVE); + reserve--; + continue; + } + + /* + * If the reserve is met and this is a previous reserved block, + * take it back + */ + if (block_migratetype == MIGRATE_RESERVE) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + move_freepages_block(zone, page, MIGRATE_MOVABLE); + } + } +} + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn, enum memmap_context context) +{ + struct page *page; + unsigned long end_pfn = start_pfn + size; + unsigned long pfn; + struct zone *z; + + z = &NODE_DATA(nid)->node_zones[zone]; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + /* + * There can be holes in boot-time mem_map[]s + * handed to this function. They do not + * exist on hotplugged memory. + */ + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + } + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); + mminit_verify_page_links(page, zone, nid, pfn); + init_page_count(page); + reset_page_mapcount(page); + SetPageReserved(page); + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. Later some blocks near + * the start are marked MIGRATE_RESERVE by + * setup_zone_migrate_reserve() + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. + */ + if ((z->zone_start_pfn <= pfn) + && (pfn < z->zone_start_pfn + z->spanned_pages) + && !(pfn & (pageblock_nr_pages - 1))) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif + } +} + +static void __meminit zone_init_free_lists(struct zone *zone) +{ + int order, t; + for_each_migratetype_order(order, t) { + INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); + zone->free_area[order].nr_free = 0; + } +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(size, nid, zone, start_pfn) \ + memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) +#endif + +static int zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/2 of a meg. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 512 * 1024) + batch = (512 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << (fls(batch + batch/2)-1)) - 1; + + return batch; +} + +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp; + + memset(p, 0, sizeof(*p)); + + pcp = &p->pcp; + pcp->count = 0; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); +} + +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp; + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + +#ifdef CONFIG_NUMA +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * Some NUMA counter updates may also be caught by the boot pagesets. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static struct per_cpu_pageset boot_pageset[NR_CPUS]; + +/* + * Dynamically allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __cpuinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + int node = cpu_to_node(cpu); + + node_set_state(node, N_CPU); /* this node has a cpu */ + + for_each_zone(zone) { + + if (!populated_zone(zone)) + continue; + + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, node); + if (!zone_pcp(zone, cpu)) + goto bad; + + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); + } + + return 0; +bad: + for_each_zone(dzone) { + if (!populated_zone(dzone)) + continue; + if (dzone == zone) + break; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + /* Free per_cpu_pageset if it is slab allocated */ + if (pset != &boot_pageset[cpu]) + kfree(pset); + zone_pcp(zone, cpu) = NULL; + } +} + +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_zone_pagesets(cpu); + break; + default: + break; + } + return ret; +} + +static struct notifier_block __cpuinitdata pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset(void) +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + +static noinline __init_refok +int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + size_t alloc_size; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_hash_nr_entries = + wait_table_hash_nr_entries(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_hash_nr_entries); + alloc_size = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + + if (!slab_is_available()) { + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, alloc_size); + } else { + /* + * This case means that a zone whose size was 0 gets new memory + * via memory hot-add. + * But it may be the case that a new node was hot-added. In + * this case vmalloc() will not be able to use this new node's + * memory - this wait_table must be initialized to use this new + * node itself as well. + * To use this new node's memory, further consideration will be + * necessary. + */ + zone->wait_table = vmalloc(alloc_size); + } + if (!zone->wait_table) + return -ENOMEM; + + for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) + init_waitqueue_head(zone->wait_table + i); + + return 0; +} + +static __meminit void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone_pcp(zone, cpu) = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + if (zone->present_pages) + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +__meminit int init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size, + enum memmap_context context) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int ret; + ret = zone_wait_table_init(zone, size); + if (ret) + return ret; + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_start_pfn = zone_start_pfn; + + mminit_dprintk(MMINIT_TRACE, "memmap_init", + "Initialising map node %d zone %lu pfns %lu -> %lu\n", + pgdat->node_id, + (unsigned long)zone_idx(zone), + zone_start_pfn, (zone_start_pfn + size)); + + zone_init_free_lists(zone); + + return 0; +} + +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * Basic iterator support. Return the first range of PFNs for a node + * Note: nid == MAX_NUMNODES returns first region regardless of node + */ +static int __meminit first_active_region_index_in_nid(int nid) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the next active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit next_active_region_index_in_nid(int index, int nid) +{ + for (index = index + 1; index < nr_nodemap_entries; index++) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + * Architectures may implement their own version but if add_active_range() + * was used and there are no special requirements, this is a convenient + * alternative + */ +int __meminit __early_pfn_to_nid(unsigned long pfn) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long start_pfn = early_node_map[i].start_pfn; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (start_pfn <= pfn && pfn < end_pfn) + return early_node_map[i].nid; + } + /* This is a memory hole */ + return -1; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; +} + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0 && nid != node) + return false; + return true; +} +#endif + +/* Basic iterator support to walk early_node_map[] */ +#define for_each_active_range_index_in_nid(i, nid) \ + for (i = first_active_region_index_in_nid(nid); i != -1; \ + i = next_active_region_index_in_nid(i, nid)) + +/** + * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. + * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling free_bootmem() manually. + */ +void __init free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) { + unsigned long size_pages = 0; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (early_node_map[i].start_pfn >= max_low_pfn) + continue; + + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + size_pages = end_pfn - early_node_map[i].start_pfn; + free_bootmem_node(NODE_DATA(early_node_map[i].nid), + PFN_PHYS(early_node_map[i].start_pfn), + size_pages << PAGE_SHIFT); + } +} + +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + int ret; + + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } +} +/** + * sparse_memory_present_with_active_regions - Call memory_present for each active range + * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * function may be used instead of calling memory_present() manually. + */ +void __init sparse_memory_present_with_active_regions(int nid) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + memory_present(early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); +} + +/** + * push_node_boundaries - Push node boundaries to at least the requested boundary + * @nid: The nid of the node to push the boundary for + * @start_pfn: The start pfn of the node + * @end_pfn: The end pfn of the node + * + * In reserve-based hot-add, mem_map is allocated that is unused until hotadd + * time. Specifically, on x86_64, SRAT will report ranges that can potentially + * be hotplugged even though no physical memory exists. This function allows + * an arch to push out the node boundaries so mem_map is allocated that can + * be used later. + */ +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + mminit_dprintk(MMINIT_TRACE, "zoneboundary", + "Entering push_node_boundaries(%u, %lu, %lu)\n", + nid, start_pfn, end_pfn); + + /* Initialise the boundary for this node if necessary */ + if (node_boundary_end_pfn[nid] == 0) + node_boundary_start_pfn[nid] = -1UL; + + /* Update the boundaries */ + if (node_boundary_start_pfn[nid] > start_pfn) + node_boundary_start_pfn[nid] = start_pfn; + if (node_boundary_end_pfn[nid] < end_pfn) + node_boundary_end_pfn[nid] = end_pfn; +} + +/* If necessary, push the node boundary out for reserve hotadd */ +static void __meminit account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + mminit_dprintk(MMINIT_TRACE, "zoneboundary", + "Entering account_node_boundary(%u, %lu, %lu)\n", + nid, *start_pfn, *end_pfn); + + /* Return if boundary information has not been provided */ + if (node_boundary_end_pfn[nid] == 0) + return; + + /* Check the boundaries and update if necessary */ + if (node_boundary_start_pfn[nid] < *start_pfn) + *start_pfn = node_boundary_start_pfn[nid]; + if (node_boundary_end_pfn[nid] > *end_pfn) + *end_pfn = node_boundary_end_pfn[nid]; +} +#else +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) {} + +static void __meminit account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) {} +#endif + + +/** + * get_pfn_range_for_nid - Return the start and end page frames for a node + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. + * @start_pfn: Passed by reference. On return, it will have the node start_pfn. + * @end_pfn: Passed by reference. On return, it will have the node end_pfn. + * + * It returns the start and end page frame of a node based on information + * provided by an arch calling add_active_range(). If called for a node + * with no available memory, a warning is printed and the start and end + * PFNs will be 0. + */ +void __meminit get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + int i; + *start_pfn = -1UL; + *end_pfn = 0; + + for_each_active_range_index_in_nid(i, nid) { + *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); + *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + } + + if (*start_pfn == -1UL) + *start_pfn = 0; + + /* Push the node boundaries out if requested */ + account_node_boundary(nid, start_pfn, end_pfn); +} + +/* + * This finds a zone that can be used for ZONE_MOVABLE pages. The + * assumption is made that zones within a node are ordered in monotonic + * increasing memory addresses so that the "highest" populated zone is used + */ +static void __init find_usable_zone_for_movable(void) +{ + int zone_index; + for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { + if (zone_index == ZONE_MOVABLE) + continue; + + if (arch_zone_highest_possible_pfn[zone_index] > + arch_zone_lowest_possible_pfn[zone_index]) + break; + } + + VM_BUG_ON(zone_index == -1); + movable_zone = zone_index; +} + +/* + * The zone ranges provided by the architecture do not include ZONE_MOVABLE + * because it is sized independant of architecture. Unlike the other zones, + * the starting point for ZONE_MOVABLE is not fixed. It may be different + * in each node depending on the size of each node and how evenly kernelcore + * is distributed. This helper function adjusts the zone ranges + * provided by the architecture for a given node by using the end of the + * highest usable zone for ZONE_MOVABLE. This preserves the assumption that + * zones within a node are in order of monotonic increases memory addresses + */ +static void __meminit adjust_zone_range_for_zone_movable(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn) +{ + /* Only adjust if ZONE_MOVABLE is on this node */ + if (zone_movable_pfn[nid]) { + /* Size ZONE_MOVABLE */ + if (zone_type == ZONE_MOVABLE) { + *zone_start_pfn = zone_movable_pfn[nid]; + *zone_end_pfn = min(node_end_pfn, + arch_zone_highest_possible_pfn[movable_zone]); + + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (*zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + + /* Check if this whole range is within ZONE_MOVABLE */ + } else if (*zone_start_pfn >= zone_movable_pfn[nid]) + *zone_start_pfn = *zone_end_pfn; + } +} + +/* + * Return the number of pages a zone spans in a node, including holes + * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() + */ +static unsigned long __meminit zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + /* Get the start and end of the node and zone */ + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); + + /* Check that this node has pages within the zone's required range */ + if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + return 0; + + /* Move the zone boundaries inside the node if necessary */ + zone_end_pfn = min(zone_end_pfn, node_end_pfn); + zone_start_pfn = max(zone_start_pfn, node_start_pfn); + + /* Return the spanned pages */ + return zone_end_pfn - zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for. + */ +static unsigned long __meminit __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + int i = 0; + unsigned long prev_end_pfn = 0, hole_pages = 0; + unsigned long start_pfn; + + /* Find the end_pfn of the first active range of pfns in the node */ + i = first_active_region_index_in_nid(nid); + if (i == -1) + return 0; + + prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + + /* Account for ranges before physical memory on this node */ + if (early_node_map[i].start_pfn > range_start_pfn) + hole_pages = prev_end_pfn - range_start_pfn; + + /* Find all holes for the zone within the node */ + for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { + + /* No need to continue if prev_end_pfn is outside the zone */ + if (prev_end_pfn >= range_end_pfn) + break; + + /* Make sure the end of the zone is not within the hole */ + start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + prev_end_pfn = max(prev_end_pfn, range_start_pfn); + + /* Update the hole size cound and move on */ + if (start_pfn > range_start_pfn) { + BUG_ON(prev_end_pfn > start_pfn); + hole_pages += start_pfn - prev_end_pfn; + } + prev_end_pfn = early_node_map[i].end_pfn; + } + + /* Account for ranges past physical memory on this node */ + if (range_end_pfn > prev_end_pfn) + hole_pages += range_end_pfn - + max(range_start_pfn, prev_end_pfn); + + return hole_pages; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * It returns the number of pages frames in memory holes within a range. + */ +unsigned long __init absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +static unsigned long __meminit zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], + node_start_pfn); + zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], + node_end_pfn); + + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); + return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); +} + +#else +static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zones_size) +{ + return zones_size[zone_type]; +} + +static inline unsigned long __meminit zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zholes_size) +{ + if (!zholes_size) + return 0; + + return zholes_size[zone_type]; +} + +#endif + +static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + enum zone_type i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, + zones_size); + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= + zone_absent_pages_in_node(pgdat->node_id, i, + zholes_size); + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, + realtotalpages); +} + +#ifndef CONFIG_SPARSEMEM +/* + * Calculate the size of the zone->blockflags rounded to an unsigned long + * Start by making sure zonesize is a multiple of pageblock_order by rounding + * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally + * round what is now in bits to nearest long in bits, then return it in + * bytes. + */ +static unsigned long __init usemap_size(unsigned long zonesize) +{ + unsigned long usemapsize; + + usemapsize = roundup(zonesize, pageblock_nr_pages); + usemapsize = usemapsize >> pageblock_order; + usemapsize *= NR_PAGEBLOCK_BITS; + usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + + return usemapsize / 8; +} + +static void __init setup_usemap(struct pglist_data *pgdat, + struct zone *zone, unsigned long zonesize) +{ + unsigned long usemapsize = usemap_size(zonesize); + zone->pageblock_flags = NULL; + if (usemapsize) { + zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); + memset(zone->pageblock_flags, 0, usemapsize); + } +} +#else +static void inline setup_usemap(struct pglist_data *pgdat, + struct zone *zone, unsigned long zonesize) {} +#endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + +/* Return a sensible default order for the pageblock size. */ +static inline int pageblock_default_order(void) +{ + if (HPAGE_SHIFT > PAGE_SHIFT) + return HUGETLB_PAGE_ORDER; + + return MAX_ORDER-1; +} + +/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ +static inline void __init set_pageblock_order(unsigned int order) +{ + /* Check that pageblock_nr_pages has not already been setup */ + if (pageblock_order) + return; + + /* + * Assume the largest contiguous order of interest is a huge page. + * This value may be variable depending on boot parameters on IA64 + */ + pageblock_order = order; +} +#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +/* + * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() + * and pageblock_default_order() are unused as pageblock_order is set + * at compile-time. See include/linux/pageblock-flags.h for the values of + * pageblock_order based on the kernel config + */ +static inline int pageblock_default_order(unsigned int order) +{ + return MAX_ORDER-1; +} +#define set_pageblock_order(x) do {} while (0) + +#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + enum zone_type j; + int nid = pgdat->node_id; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; + + pgdat_resize_init(pgdat); + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + pgdat->kswapd_max_order = 0; + pgdat_page_cgroup_init(pgdat); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize, memmap_pages; + enum lru_list l; + + size = zone_spanned_pages_in_node(nid, j, zones_size); + realsize = size - zone_absent_pages_in_node(nid, j, + zholes_size); + + /* + * Adjust realsize so that it accounts for how much memory + * is used by this zone for memmap. This affects the watermark + * and per-cpu initialisations + */ + memmap_pages = + PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; + if (realsize >= memmap_pages) { + realsize -= memmap_pages; + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds realsize %lu\n", + zone_names[j], memmap_pages, realsize); + + /* Account for reserved pages */ + if (j == 0 && realsize > dma_reserve) { + realsize -= dma_reserve; + printk(KERN_DEBUG " %s zone: %lu pages reserved\n", + zone_names[0], dma_reserve); + } + + if (!is_highmem_idx(j)) + nr_kernel_pages += realsize; + nr_all_pages += realsize; + + zone->spanned_pages = size; + zone->present_pages = realsize; +#ifdef CONFIG_NUMA + zone->node = nid; + zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) + / 100; + zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; +#endif + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); + zone->zone_pgdat = pgdat; + + zone->prev_priority = DEF_PRIORITY; + + zone_pcp_init(zone); + for_each_lru(l) { + INIT_LIST_HEAD(&zone->lru[l].list); + zone->lru[l].nr_scan = 0; + } + zone->recent_rotated[0] = 0; + zone->recent_rotated[1] = 0; + zone->recent_scanned[0] = 0; + zone->recent_scanned[1] = 0; + zap_zone_vm_stats(zone); + zone->flags = 0; + if (!size) + continue; + + set_pageblock_order(pageblock_default_order()); + setup_usemap(pgdat, zone, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, + size, MEMMAP_EARLY); + BUG_ON(ret); + memmap_init(size, nid, j, zone_start_pfn); + zone_start_pfn += size; + } +} + +static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) +{ + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + +#ifdef CONFIG_FLAT_NODE_MEM_MAP + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + unsigned long size, start, end; + struct page *map; + + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. + */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); + map = alloc_remap(pgdat->node_id, size); + if (!map) + map = alloc_bootmem_node(pgdat, size); + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + } +#ifndef CONFIG_NEED_MULTIPLE_NODES + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) { + mem_map = NODE_DATA(0)->node_mem_map; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + } +#endif +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +} + +void __paginginit free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_node_totalpages(pgdat, zones_size, zholes_size); + + alloc_node_mem_map(pgdat); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", + nid, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); +#endif + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + +#if MAX_NUMNODES > 1 +/* + * Figure out the number of possible node ids. + */ +static void __init setup_nr_node_ids(void) +{ + unsigned int node; + unsigned int highest = 0; + + for_each_node_mask(node, node_possible_map) + highest = node; + nr_node_ids = highest + 1; +} +#else +static inline void setup_nr_node_ids(void) +{ +} +#endif + +/** + * add_active_range - Register a range of PFNs backed by physical memory + * @nid: The node ID the range resides on + * @start_pfn: The start PFN of the available physical memory + * @end_pfn: The end PFN of the available physical memory + * + * These ranges are stored in an early_node_map[] and later used by + * free_area_init_nodes() to calculate zone sizes and holes. If the + * range spans a memory hole, it is up to the architecture to ensure + * the memory is not freed by the bootmem allocator. If possible + * the range being registered will be merged with existing ranges. + */ +void __init add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + + mminit_dprintk(MMINIT_TRACE, "memory_register", + "Entering add_active_range(%d, %#lx, %#lx) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); + + /* Merge with existing active regions if possible */ + for (i = 0; i < nr_nodemap_entries; i++) { + if (early_node_map[i].nid != nid) + continue; + + /* Skip if an existing region covers this new one */ + if (start_pfn >= early_node_map[i].start_pfn && + end_pfn <= early_node_map[i].end_pfn) + return; + + /* Merge forward if suitable */ + if (start_pfn <= early_node_map[i].end_pfn && + end_pfn > early_node_map[i].end_pfn) { + early_node_map[i].end_pfn = end_pfn; + return; + } + + /* Merge backward if suitable */ + if (start_pfn < early_node_map[i].end_pfn && + end_pfn >= early_node_map[i].start_pfn) { + early_node_map[i].start_pfn = start_pfn; + return; + } + } + + /* Check that early_node_map is large enough */ + if (i >= MAX_ACTIVE_REGIONS) { + printk(KERN_CRIT "More than %d memory regions, truncating\n", + MAX_ACTIVE_REGIONS); + return; + } + + early_node_map[i].nid = nid; + early_node_map[i].start_pfn = start_pfn; + early_node_map[i].end_pfn = end_pfn; + nr_nodemap_entries = i + 1; +} + +/** + * remove_active_range - Shrink an existing registered range of PFNs + * @nid: The node id the range is on that should be shrunk + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range + * + * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range. + */ +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i, j; + int removed = 0; + + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + + /* Find the old active region end and shrink */ + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { + /* clear it */ + early_node_map[i].start_pfn = 0; + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; + continue; + } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } +} + +/** + * remove_all_active_ranges - Remove all currently registered regions + * + * During discovery, it may be found that a table like SRAT is invalid + * and an alternative discovery method must be used. This function removes + * all currently registered regions. + */ +void __init remove_all_active_ranges(void) +{ + memset(early_node_map, 0, sizeof(early_node_map)); + nr_nodemap_entries = 0; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); + memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +} + +/* Compare two active node_active_regions */ +static int __init cmp_node_active_region(const void *a, const void *b) +{ + struct node_active_region *arange = (struct node_active_region *)a; + struct node_active_region *brange = (struct node_active_region *)b; + + /* Done this way to avoid overflows */ + if (arange->start_pfn > brange->start_pfn) + return 1; + if (arange->start_pfn < brange->start_pfn) + return -1; + + return 0; +} + +/* sort the node_map by start_pfn */ +static void __init sort_node_map(void) +{ + sort(early_node_map, (size_t)nr_nodemap_entries, + sizeof(struct node_active_region), + cmp_node_active_region, NULL); +} + +/* Find the lowest pfn for a node */ +static unsigned long __init find_min_pfn_for_node(int nid) +{ + int i; + unsigned long min_pfn = ULONG_MAX; + + /* Assuming a sorted map, the first range found has the starting pfn */ + for_each_active_range_index_in_nid(i, nid) + min_pfn = min(min_pfn, early_node_map[i].start_pfn); + + if (min_pfn == ULONG_MAX) { + printk(KERN_WARNING + "Could not find start_pfn for node %d\n", nid); + return 0; + } + + return min_pfn; +} + +/** + * find_min_pfn_with_active_regions - Find the minimum PFN registered + * + * It returns the minimum PFN based on information provided via + * add_active_range(). + */ +unsigned long __init find_min_pfn_with_active_regions(void) +{ + return find_min_pfn_for_node(MAX_NUMNODES); +} + +/* + * early_calculate_totalpages() + * Sum pages in active regions for movable zone. + * Populate N_HIGH_MEMORY for calculating usable_nodes. + */ +static unsigned long __init early_calculate_totalpages(void) +{ + int i; + unsigned long totalpages = 0; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long pages = early_node_map[i].end_pfn - + early_node_map[i].start_pfn; + totalpages += pages; + if (pages) + node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); + } + return totalpages; +} + +/* + * Find the PFN the Movable zone begins in each node. Kernel memory + * is spread evenly between nodes as long as the nodes have enough + * memory. When they don't, some nodes will have more kernelcore than + * others + */ +static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +{ + int i, nid; + unsigned long usable_startpfn; + unsigned long kernelcore_node, kernelcore_remaining; + unsigned long totalpages = early_calculate_totalpages(); + int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); + + /* + * If movablecore was specified, calculate what size of + * kernelcore that corresponds so that memory usable for + * any allocation type is evenly spread. If both kernelcore + * and movablecore are specified, then the value of kernelcore + * will be used for required_kernelcore if it's greater than + * what movablecore would have allowed. + */ + if (required_movablecore) { + unsigned long corepages; + + /* + * Round-up so that ZONE_MOVABLE is at least as large as what + * was requested by the user + */ + required_movablecore = + roundup(required_movablecore, MAX_ORDER_NR_PAGES); + corepages = totalpages - required_movablecore; + + required_kernelcore = max(required_kernelcore, corepages); + } + + /* If kernelcore was not specified, there is no ZONE_MOVABLE */ + if (!required_kernelcore) + return; + + /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + find_usable_zone_for_movable(); + usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; + +restart: + /* Spread kernelcore memory as evenly as possible throughout nodes */ + kernelcore_node = required_kernelcore / usable_nodes; + for_each_node_state(nid, N_HIGH_MEMORY) { + /* + * Recalculate kernelcore_node if the division per node + * now exceeds what is necessary to satisfy the requested + * amount of memory for the kernel + */ + if (required_kernelcore < kernelcore_node) + kernelcore_node = required_kernelcore / usable_nodes; + + /* + * As the map is walked, we track how much memory is usable + * by the kernel using kernelcore_remaining. When it is + * 0, the rest of the node is usable by ZONE_MOVABLE + */ + kernelcore_remaining = kernelcore_node; + + /* Go through each range of PFNs within this node */ + for_each_active_range_index_in_nid(i, nid) { + unsigned long start_pfn, end_pfn; + unsigned long size_pages; + + start_pfn = max(early_node_map[i].start_pfn, + zone_movable_pfn[nid]); + end_pfn = early_node_map[i].end_pfn; + if (start_pfn >= end_pfn) + continue; + + /* Account for what is only usable for kernelcore */ + if (start_pfn < usable_startpfn) { + unsigned long kernel_pages; + kernel_pages = min(end_pfn, usable_startpfn) + - start_pfn; + + kernelcore_remaining -= min(kernel_pages, + kernelcore_remaining); + required_kernelcore -= min(kernel_pages, + required_kernelcore); + + /* Continue if range is now fully accounted */ + if (end_pfn <= usable_startpfn) { + + /* + * Push zone_movable_pfn to the end so + * that if we have to rebalance + * kernelcore across nodes, we will + * not double account here + */ + zone_movable_pfn[nid] = end_pfn; + continue; + } + start_pfn = usable_startpfn; + } + + /* + * The usable PFN range for ZONE_MOVABLE is from + * start_pfn->end_pfn. Calculate size_pages as the + * number of pages used as kernelcore + */ + size_pages = end_pfn - start_pfn; + if (size_pages > kernelcore_remaining) + size_pages = kernelcore_remaining; + zone_movable_pfn[nid] = start_pfn + size_pages; + + /* + * Some kernelcore has been met, update counts and + * break if the kernelcore for this node has been + * satisified + */ + required_kernelcore -= min(required_kernelcore, + size_pages); + kernelcore_remaining -= size_pages; + if (!kernelcore_remaining) + break; + } + } + + /* + * If there is still required_kernelcore, we do another pass with one + * less node in the count. This will push zone_movable_pfn[nid] further + * along on the nodes that still have memory until kernelcore is + * satisified + */ + usable_nodes--; + if (usable_nodes && required_kernelcore > usable_nodes) + goto restart; + + /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + zone_movable_pfn[nid] = + roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); +} + +/* Any regular memory on that node ? */ +static void check_for_regular_memory(pg_data_t *pgdat) +{ +#ifdef CONFIG_HIGHMEM + enum zone_type zone_type; + + for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + if (zone->present_pages) + node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + } +#endif +} + +/** + * free_area_init_nodes - Initialise all pg_data_t and zone data + * @max_zone_pfn: an array of max PFNs for each zone + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by add_active_range(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. For example, ZONE_DMA32 starts + * at arch_max_dma_pfn. + */ +void __init free_area_init_nodes(unsigned long *max_zone_pfn) +{ + unsigned long nid; + int i; + + /* Sort early_node_map as initialisation assumes it is sorted */ + sort_node_map(); + + /* Record where the zone boundaries are */ + memset(arch_zone_lowest_possible_pfn, 0, + sizeof(arch_zone_lowest_possible_pfn)); + memset(arch_zone_highest_possible_pfn, 0, + sizeof(arch_zone_highest_possible_pfn)); + arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); + arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; + for (i = 1; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; + arch_zone_lowest_possible_pfn[i] = + arch_zone_highest_possible_pfn[i-1]; + arch_zone_highest_possible_pfn[i] = + max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + } + arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; + arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; + + /* Find the PFNs that ZONE_MOVABLE begins at in each node */ + memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); + find_zone_movable_pfns_for_nodes(zone_movable_pfn); + + /* Print out the zone ranges */ + printk("Zone PFN ranges:\n"); + for (i = 0; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; + printk(" %-8s %0#10lx -> %0#10lx\n", + zone_names[i], + arch_zone_lowest_possible_pfn[i], + arch_zone_highest_possible_pfn[i]); + } + + /* Print out the PFNs ZONE_MOVABLE begins at in each node */ + printk("Movable zone start PFN for each node\n"); + for (i = 0; i < MAX_NUMNODES; i++) { + if (zone_movable_pfn[i]) + printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + } + + /* Print out the early_node_map[] */ + printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); + for (i = 0; i < nr_nodemap_entries; i++) + printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); + + /* Initialise every node */ + mminit_verify_pageflags_layout(); + setup_nr_node_ids(); + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + free_area_init_node(nid, NULL, + find_min_pfn_for_node(nid), NULL); + + /* Any memory on that node */ + if (pgdat->node_present_pages) + node_set_state(nid, N_HIGH_MEMORY); + check_for_regular_memory(pgdat); + } +} + +static int __init cmdline_parse_core(char *p, unsigned long *core) +{ + unsigned long long coremem; + if (!p) + return -EINVAL; + + coremem = memparse(p, &p); + *core = coremem >> PAGE_SHIFT; + + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + + return 0; +} + +/* + * kernelcore=size sets the amount of memory for use for allocations that + * cannot be reclaimed or migrated. + */ +static int __init cmdline_parse_kernelcore(char *p) +{ + return cmdline_parse_core(p, &required_kernelcore); +} + +/* + * movablecore=size sets the amount of memory for use for allocations that + * can be reclaimed or migrated. + */ +static int __init cmdline_parse_movablecore(char *p) +{ + return cmdline_parse_core(p, &required_movablecore); +} + +early_param("kernelcore", cmdline_parse_kernelcore); +early_param("movablecore", cmdline_parse_movablecore); + +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +/** + * set_dma_reserve - set the specified number of pages reserved in the first zone + * @new_dma_reserve: The number of pages to mark reserved + * + * The per-cpu batchsize and zone watermarks are determined by present_pages. + * In the DMA zone, a significant percentage may be consumed by kernel image + * and other unfreeable allocations which can skew the watermarks badly. This + * function may optionally be used to account for unfreeable pages in the + * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and + * smaller per-cpu batchsize. + */ +void __init set_dma_reserve(unsigned long new_dma_reserve) +{ + dma_reserve = new_dma_reserve; +} + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +EXPORT_SYMBOL(contig_page_data); +#endif + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); +} + +static int page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + drain_pages(cpu); + + /* + * Spill the event counters of the dead processor + * into the current processors event counters. + * This artificially elevates the count of the current + * processor. + */ + vm_events_fold_cpu(cpu); + + /* + * Zero the differential counters of the dead processor + * so that the vm statistics are consistent. + * + * This is only okay since the processor is dead and cannot + * race with what we are doing. + */ + refresh_cpu_vm_stats(cpu); + } + return NOTIFY_OK; +} + +void __init page_alloc_init(void) +{ + hotcpu_notifier(page_alloc_cpu_notify, 0); +} + +/* + * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * or min_free_kbytes changes. + */ +static void calculate_totalreserve_pages(void) +{ + struct pglist_data *pgdat; + unsigned long reserve_pages = 0; + enum zone_type i, j; + + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { + if (zone->lowmem_reserve[j] > max) + max = zone->lowmem_reserve[j]; + } + + /* we treat pages_high as reserved pages. */ + max += zone->pages_high; + + if (max > zone->present_pages) + max = zone->present_pages; + reserve_pages += max; + } + } + totalreserve_pages = reserve_pages; +} + +/* + * setup_per_zone_lowmem_reserve - called whenever + * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * has a correct pages reserved value, so an adequate number of + * pages are left in the zone after a successful __alloc_pages(). + */ +static void setup_per_zone_lowmem_reserve(void) +{ + struct pglist_data *pgdat; + enum zone_type j, idx; + + for_each_online_pgdat(pgdat) { + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long present_pages = zone->present_pages; + + zone->lowmem_reserve[j] = 0; + + idx = j; + while (idx) { + struct zone *lower_zone; + + idx--; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) + sysctl_lowmem_reserve_ratio[idx] = 1; + + lower_zone = pgdat->node_zones + idx; + lower_zone->lowmem_reserve[j] = present_pages / + sysctl_lowmem_reserve_ratio[idx]; + present_pages += lower_zone->present_pages; + } + } + } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); +} + +/** + * setup_per_zone_pages_min - called when min_free_kbytes changes. + * + * Ensures that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. + */ +void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + } + + for_each_zone(zone) { + u64 tmp; + + spin_lock_irqsave(&zone->lock, flags); + tmp = (u64)pages_min * zone->present_pages; + do_div(tmp, lowmem_pages); + if (is_highmem(zone)) { + /* + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* + * If it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. + */ + zone->pages_min = tmp; + } + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); + setup_zone_migrate_reserve(zone); + spin_unlock_irqrestore(&zone->lock, flags); + } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); +} + +/** + * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. + * + * The inactive anon list should be small enough that the VM never has to + * do too much work, but large enough that each inactive page has a chance + * to be referenced again before it is swapped out. + * + * The inactive_anon ratio is the target ratio of ACTIVE_ANON to + * INACTIVE_ANON pages on this zone's LRU, maintained by the + * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of + * the anonymous pages are kept on the inactive list. + * + * total target max + * memory ratio inactive anon + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +void setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) { + unsigned int gb, ratio; + + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + ratio = int_sqrt(10 * gb); + if (!ratio) + ratio = 1; + + zone->inactive_ratio = ratio; + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (64MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = sqrt(lowmem_kbytes * 16) + * + * which yields + * + * 16MB: 512k + * 32MB: 724k + * 64MB: 1024k + * 128MB: 1448k + * 256MB: 2048k + * 512MB: 2896k + * 1024MB: 4096k + * 2048MB: 5792k + * 4096MB: 8192k + * 8192MB: 11584k + * 16384MB: 16384k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + setup_per_zone_pages_min(); + setup_per_zone_lowmem_reserve(); + setup_per_zone_inactive_ratio(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. + */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if (write) + setup_per_zone_pages_min(); + return 0; +} + +#ifdef CONFIG_NUMA +int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_unmapped_pages = (zone->present_pages * + sysctl_min_unmapped_ratio) / 100; + return 0; +} + +int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_slab_pages = (zone->present_pages * + sysctl_min_slab_ratio) / 100; + return 0; +} +#endif + +/* + * lowmem_reserve_ratio_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() + * whenever sysctl_lowmem_reserve_ratio changes. + * + * The reserve ratio obviously has absolutely no relation with the + * pages_min watermarks. The lowmem reserve ratio can only make sense + * if in function of the boot time zone sizes. + */ +int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + setup_per_zone_lowmem_reserve(); + return 0; +} + +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. + */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + +int hashdist = HASHDIST_DEFAULT; + +#ifdef CONFIG_NUMA +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + * - limit is the number of hash buckets, not the total allocation size + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit) +{ + unsigned long long max = limit; + unsigned long log2qty, size; + void *table = NULL; + + /* allow the kernel cmdline to have a say */ + if (!numentries) { + /* round applicable memory size up to nearest megabyte */ + numentries = nr_kernel_pages; + numentries += (1UL << (20 - PAGE_SHIFT)) - 1; + numentries >>= 20 - PAGE_SHIFT; + numentries <<= 20 - PAGE_SHIFT; + + /* limit to 1 bucket per 2^scale bytes of low memory */ + if (scale > PAGE_SHIFT) + numentries >>= (scale - PAGE_SHIFT); + else + numentries <<= (PAGE_SHIFT - scale); + + /* Make sure we've got at least a 0-order allocation.. */ + if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + numentries = PAGE_SIZE / bucketsize; + } + numentries = roundup_pow_of_two(numentries); + + /* limit allocation size to 1/16 total memory by default */ + if (max == 0) { + max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; + do_div(max, bucketsize); + } + + if (numentries > max) + numentries = max; + + log2qty = ilog2(numentries); + + do { + size = bucketsize << log2qty; + if (flags & HASH_EARLY) + table = alloc_bootmem_nopanic(size); + else if (hashdist) + table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + else { + unsigned long order = get_order(size); + table = (void*) __get_free_pages(GFP_ATOMIC, order); + /* + * If bucketsize is not a power-of-two, we may free + * some pages at the end of hash table. + */ + if (table) { + unsigned long alloc_end = (unsigned long)table + + (PAGE_SIZE << order); + unsigned long used = (unsigned long)table + + PAGE_ALIGN(size); + split_page(virt_to_page(table), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + } + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + ilog2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +struct page *pfn_to_page(unsigned long pfn) +{ + return __pfn_to_page(pfn); +} +unsigned long page_to_pfn(struct page *page) +{ + return __page_to_pfn(page); +} +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ + +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct zone *zone, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return zone->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - zone->zone_start_pfn; + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @start_bitidx: The first bit of interest to retrieve + * @end_bitidx: The last bit of interest + * returns pageblock_bits flags + */ +unsigned long get_pageblock_flags_group(struct page *page, + int start_bitidx, int end_bitidx) +{ + struct zone *zone; + unsigned long *bitmap; + unsigned long pfn, bitidx; + unsigned long flags = 0; + unsigned long value = 1; + + zone = page_zone(page); + pfn = page_to_pfn(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + + for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) + if (test_bit(bitidx + start_bitidx, bitmap)) + flags |= value; + + return flags; +} + +/** + * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @start_bitidx: The first bit of interest + * @end_bitidx: The last bit of interest + * @flags: The flags to set + */ +void set_pageblock_flags_group(struct page *page, unsigned long flags, + int start_bitidx, int end_bitidx) +{ + struct zone *zone; + unsigned long *bitmap; + unsigned long pfn, bitidx; + unsigned long value = 1; + + zone = page_zone(page); + pfn = page_to_pfn(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + VM_BUG_ON(pfn < zone->zone_start_pfn); + VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); + + for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) + if (flags & value) + __set_bit(bitidx + start_bitidx, bitmap); + else + __clear_bit(bitidx + start_bitidx, bitmap); +} + +/* + * This is designed as sub function...plz see page_isolation.c also. + * set/clear page block's type to be ISOLATE. + * page allocater never alloc memory from ISOLATE block. + */ + +int set_migratetype_isolate(struct page *page) +{ + struct zone *zone; + unsigned long flags; + int ret = -EBUSY; + + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + /* + * In future, more migrate types will be able to be isolation target. + */ + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + goto out; + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + ret = 0; +out: + spin_unlock_irqrestore(&zone->lock, flags); + if (!ret) + drain_all_pages(); + return ret; +} + +void unset_migratetype_isolate(struct page *page) +{ + struct zone *zone; + unsigned long flags; + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + goto out; + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + move_freepages_block(zone, page, MIGRATE_MOVABLE); +out: + spin_unlock_irqrestore(&zone->lock, flags); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * All pages in the range must be isolated before calling this. + */ +void +__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *page; + struct zone *zone; + int order, i; + unsigned long pfn; + unsigned long flags; + /* find the first valid pfn */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) + if (pfn_valid(pfn)) + break; + if (pfn == end_pfn) + return; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + pfn = start_pfn; + while (pfn < end_pfn) { + if (!pfn_valid(pfn)) { + pfn++; + continue; + } + page = pfn_to_page(pfn); + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); + order = page_order(page); +#ifdef CONFIG_DEBUG_VM + printk(KERN_INFO "remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); +#endif + list_del(&page->lru); + rmv_page_order(page); + zone->free_area[order].nr_free--; + __mod_zone_page_state(zone, NR_FREE_PAGES, + - (1UL << order)); + for (i = 0; i < (1 << order); i++) + SetPageReserved((page+i)); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 0000000..ab27ff7 --- /dev/null +++ b/mm/page_cgroup.c @@ -0,0 +1,275 @@ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/bit_spinlock.h> +#include <linux/page_cgroup.h> +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/memory.h> +#include <linux/vmalloc.h> +#include <linux/cgroup.h> + +static void __meminit +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +{ + pc->flags = 0; + pc->mem_cgroup = NULL; + pc->page = pfn_to_page(pfn); +} +static unsigned long total_usage; + +#if !defined(CONFIG_SPARSEMEM) + + +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + pgdat->node_page_cgroup = NULL; +} + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_cgroup *base; + + base = NODE_DATA(page_to_nid(page))->node_page_cgroup; + if (unlikely(!base)) + return NULL; + + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; + return base + offset; +} + +static int __init alloc_node_page_cgroup(int nid) +{ + struct page_cgroup *base, *pc; + unsigned long table_size; + unsigned long start_pfn, nr_pages, index; + + start_pfn = NODE_DATA(nid)->node_start_pfn; + nr_pages = NODE_DATA(nid)->node_spanned_pages; + + if (!nr_pages) + return 0; + + table_size = sizeof(struct page_cgroup) * nr_pages; + + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!base) + return -ENOMEM; + for (index = 0; index < nr_pages; index++) { + pc = base + index; + __init_page_cgroup(pc, start_pfn + index); + } + NODE_DATA(nid)->node_page_cgroup = base; + total_usage += table_size; + return 0; +} + +void __init page_cgroup_init(void) +{ + + int nid, fail; + + if (mem_cgroup_subsys.disabled) + return; + + for_each_online_node(nid) { + fail = alloc_node_page_cgroup(nid); + if (fail) + goto fail; + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you" + " don't want\n"); + return; +fail: + printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); + printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); + + return section->page_cgroup + pfn; +} + +/* __alloc_bootmem...() is protected by !slab_available() */ +int __init_refok init_section_page_cgroup(unsigned long pfn) +{ + struct mem_section *section; + struct page_cgroup *base, *pc; + unsigned long table_size; + int nid, index; + + section = __pfn_to_section(pfn); + + if (!section->page_cgroup) { + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + if (slab_is_available()) { + base = kmalloc_node(table_size, GFP_KERNEL, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + } + } else { + /* + * We don't have to allocate page_cgroup again, but + * address of memmap may be changed. So, we have to initialize + * again. + */ + base = section->page_cgroup + pfn; + table_size = 0; + /* check address of memmap is changed or not. */ + if (base->page == pfn_to_page(pfn)) + return 0; + } + + if (!base) { + printk(KERN_ERR "page cgroup allocation failure\n"); + return -ENOMEM; + } + + for (index = 0; index < PAGES_PER_SECTION; index++) { + pc = base + index; + __init_page_cgroup(pc, pfn + index); + } + + section = __pfn_to_section(pfn); + section->page_cgroup = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +void __free_page_cgroup(unsigned long pfn) +{ + struct mem_section *ms; + struct page_cgroup *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_cgroup) + return; + base = ms->page_cgroup + pfn; + if (is_vmalloc_addr(base)) { + vfree(base); + ms->page_cgroup = NULL; + } else { + struct page *page = virt_to_page(base); + if (!PageReserved(page)) { /* Is bootmem ? */ + kfree(base); + ms->page_cgroup = NULL; + } + } +} + +int __meminit online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = start_pfn & ~(PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + + return -ENOMEM; +} + +int __meminit offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = start_pfn & ~(PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + return 0; + +} + +static int __meminit page_cgroup_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_OFFLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + + return ret; +} + +#endif + +void __init page_cgroup_init(void) +{ + unsigned long pfn; + int fail = 0; + + if (mem_cgroup_subsys.disabled) + return; + + for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (fail) { + printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); + panic("Out of memory"); + } else { + hotplug_memory_notifier(page_cgroup_callback, 0); + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you don't" + " want\n"); +} + +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + return; +} + +#endif diff --git a/mm/page_io.c b/mm/page_io.c new file mode 100644 index 0000000..065c448 --- /dev/null +++ b/mm/page_io.c @@ -0,0 +1,141 @@ +/* + * linux/mm/page_io.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, + * Asynchronous swapping added 30.12.95. Stephen Tweedie + * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman + */ + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/swapops.h> +#include <linux/writeback.h> +#include <asm/pgtable.h> + +static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, + struct page *page, bio_end_io_t end_io) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, 1); + if (bio) { + struct swap_info_struct *sis; + swp_entry_t entry = { .val = index, }; + + sis = get_swap_info_struct(swp_type(entry)); + bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * + (PAGE_SIZE >> 9); + bio->bi_bdev = sis->bdev; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = PAGE_SIZE; + bio->bi_io_vec[0].bv_offset = 0; + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = PAGE_SIZE; + bio->bi_end_io = end_io; + } + return bio; +} + +static void end_swap_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate) { + SetPageError(page); + /* + * We failed to write the page out to swap-space. + * Re-dirty the page in order to avoid it being reclaimed. + * Also print a dire warning that things will go BAD (tm) + * very quickly. + * + * Also clear PG_reclaim to avoid rotate_reclaimable_page() + */ + set_page_dirty(page); + printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_sector); + ClearPageReclaim(page); + } + end_page_writeback(page); + bio_put(bio); +} + +void end_swap_bio_read(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate) { + SetPageError(page); + ClearPageUptodate(page); + printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_sector); + } else { + SetPageUptodate(page); + } + unlock_page(page); + bio_put(bio); +} + +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. + */ +int swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio; + int ret = 0, rw = WRITE; + + if (remove_exclusive_swap_page(page)) { + unlock_page(page); + goto out; + } + bio = get_swap_bio(GFP_NOIO, page_private(page), page, + end_swap_bio_write); + if (bio == NULL) { + set_page_dirty(page); + unlock_page(page); + ret = -ENOMEM; + goto out; + } + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= (1 << BIO_RW_SYNC); + count_vm_event(PSWPOUT); + set_page_writeback(page); + unlock_page(page); + submit_bio(rw, bio); +out: + return ret; +} + +int swap_readpage(struct file *file, struct page *page) +{ + struct bio *bio; + int ret = 0; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageUptodate(page)); + bio = get_swap_bio(GFP_KERNEL, page_private(page), page, + end_swap_bio_read); + if (bio == NULL) { + unlock_page(page); + ret = -ENOMEM; + goto out; + } + count_vm_event(PSWPIN); + submit_bio(READ, bio); +out: + return ret; +} diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 0000000..5e0ffd9 --- /dev/null +++ b/mm/page_isolation.c @@ -0,0 +1,142 @@ +/* + * linux/mm/page_isolation.c + */ + +#include <linux/mm.h> +#include <linux/page-isolation.h> +#include <linux/pageblock-flags.h> +#include "internal.h" + +static inline struct page * +__first_valid_page(unsigned long pfn, unsigned long nr_pages) +{ + int i; + for (i = 0; i < nr_pages; i++) + if (pfn_valid_within(pfn + i)) + break; + if (unlikely(i == nr_pages)) + return NULL; + return pfn_to_page(pfn + i); +} + +/* + * start_isolate_page_range() -- make page-allocation-type of range of pages + * to be MIGRATE_ISOLATE. + * @start_pfn: The lower PFN of the range to be isolated. + * @end_pfn: The upper PFN of the range to be isolated. + * + * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in + * the range will never be allocated. Any free pages and pages freed in the + * future will not be allocated again. + * + * start_pfn/end_pfn must be aligned to pageblock_order. + * Returns 0 on success and -EBUSY if any part of range cannot be isolated. + */ +int +start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long undo_pfn; + struct page *page; + + BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); + BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + + for (pfn = start_pfn; + pfn < end_pfn; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && set_migratetype_isolate(page)) { + undo_pfn = pfn; + goto undo; + } + } + return 0; +undo: + for (pfn = start_pfn; + pfn < undo_pfn; + pfn += pageblock_nr_pages) + unset_migratetype_isolate(pfn_to_page(pfn)); + + return -EBUSY; +} + +/* + * Make isolated pages available again. + */ +int +undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); + BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + for (pfn = start_pfn; + pfn < end_pfn; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + continue; + unset_migratetype_isolate(page); + } + return 0; +} +/* + * Test all pages in the range is free(means isolated) or not. + * all pages in [start_pfn...end_pfn) must be in the same zone. + * zone->lock must be held before call this. + * + * Returns 0 if all pages in the range is isolated. + */ +static int +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) +{ + struct page *page; + + while (pfn < end_pfn) { + if (!pfn_valid_within(pfn)) { + pfn++; + continue; + } + page = pfn_to_page(pfn); + if (PageBuddy(page)) + pfn += 1 << page_order(page); + else if (page_count(page) == 0 && + page_private(page) == MIGRATE_ISOLATE) + pfn += 1; + else + break; + } + if (pfn < end_pfn) + return 0; + return 1; +} + +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn, flags; + struct page *page; + struct zone *zone; + int ret; + + pfn = start_pfn; + /* + * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page + * is not aligned to pageblock_nr_pages. + * Then we just check pagetype fist. + */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + break; + } + page = __first_valid_page(start_pfn, end_pfn - start_pfn); + if ((pfn < end_pfn) || !page) + return -EBUSY; + /* Check all pages are free or Marked as ISOLATED */ + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + return ret ? 0 : -EBUSY; +} diff --git a/mm/pagewalk.c b/mm/pagewalk.c new file mode 100644 index 0000000..d5878be --- /dev/null +++ b/mm/pagewalk.c @@ -0,0 +1,137 @@ +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> + +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + int err = 0; + + pte = pte_offset_map(pmd, addr); + for (;;) { + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (err) + break; + addr += PAGE_SIZE; + if (addr == end) + break; + pte++; + } + + pte_unmap(pte); + return err; +} + +static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + int err = 0; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry) + err = walk->pmd_entry(pmd, addr, next, walk); + if (!err && walk->pte_entry) + err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pud_entry) + err = walk->pud_entry(pud, addr, next, walk); + if (!err && (walk->pmd_entry || walk->pte_entry)) + err = walk_pmd_range(pud, addr, next, walk); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +/** + * walk_page_range - walk a memory map's page tables with a callback + * @mm: memory map to walk + * @addr: starting address + * @end: ending address + * @walk: set of callbacks to invoke for each level of the tree + * + * Recursively walk the page table for the memory area in a VMA, + * calling supplied callbacks. Callbacks are called in-order (first + * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, + * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * + * Each callback receives an entry pointer and the start and end of the + * associated range, and a copy of the original mm_walk for access to + * the ->private or ->mm fields. + * + * No locks are taken, but the bottom level iterator will map PTE + * directories from highmem if necessary. + * + * If any callback returns a non-zero value, the walk is aborted and + * the return value is propagated back to the caller. Otherwise 0 is returned. + */ +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + if (addr >= end) + return err; + + if (!walk->mm) + return -EINVAL; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pgd_entry) + err = walk->pgd_entry(pgd, addr, next, walk); + if (!err && + (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) + err = walk_pud_range(pgd, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} diff --git a/mm/pdflush.c b/mm/pdflush.c new file mode 100644 index 0000000..a0a14c4 --- /dev/null +++ b/mm/pdflush.c @@ -0,0 +1,241 @@ +/* + * mm/pdflush.c - worker threads for writing back filesystem data + * + * Copyright (C) 2002, Linus Torvalds. + * + * 09Apr2002 Andrew Morton + * Initial version + * 29Feb2004 kaos@sgi.com + * Move worker thread creation to kthread to avoid chewing + * up stack space with nested calls to kernel_thread. + */ + +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/signal.h> +#include <linux/spinlock.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/fs.h> /* Needed by writeback.h */ +#include <linux/writeback.h> /* Prototypes pdflush_operation() */ +#include <linux/kthread.h> +#include <linux/cpuset.h> +#include <linux/freezer.h> + + +/* + * Minimum and maximum number of pdflush instances + */ +#define MIN_PDFLUSH_THREADS 2 +#define MAX_PDFLUSH_THREADS 8 + +static void start_one_pdflush_thread(void); + + +/* + * The pdflush threads are worker threads for writing back dirty data. + * Ideally, we'd like one thread per active disk spindle. But the disk + * topology is very hard to divine at this level. Instead, we take + * care in various places to prevent more than one pdflush thread from + * performing writeback against a single filesystem. pdflush threads + * have the PF_FLUSHER flag set in current->flags to aid in this. + */ + +/* + * All the pdflush threads. Protected by pdflush_lock + */ +static LIST_HEAD(pdflush_list); +static DEFINE_SPINLOCK(pdflush_lock); + +/* + * The count of currently-running pdflush threads. Protected + * by pdflush_lock. + * + * Readable by sysctl, but not writable. Published to userspace at + * /proc/sys/vm/nr_pdflush_threads. + */ +int nr_pdflush_threads = 0; + +/* + * The time at which the pdflush thread pool last went empty + */ +static unsigned long last_empty_jifs; + +/* + * The pdflush thread. + * + * Thread pool management algorithm: + * + * - The minimum and maximum number of pdflush instances are bound + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. + * + * - If there have been no idle pdflush instances for 1 second, create + * a new one. + * + * - If the least-recently-went-to-sleep pdflush thread has been asleep + * for more than one second, terminate a thread. + */ + +/* + * A structure for passing work to a pdflush thread. Also for passing + * state information between pdflush threads. Protected by pdflush_lock. + */ +struct pdflush_work { + struct task_struct *who; /* The thread */ + void (*fn)(unsigned long); /* A callback function */ + unsigned long arg0; /* An argument to the callback */ + struct list_head list; /* On pdflush_list, when idle */ + unsigned long when_i_went_to_sleep; +}; + +static int __pdflush(struct pdflush_work *my_work) +{ + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + my_work->fn = NULL; + my_work->who = current; + INIT_LIST_HEAD(&my_work->list); + + spin_lock_irq(&pdflush_lock); + nr_pdflush_threads++; + for ( ; ; ) { + struct pdflush_work *pdf; + + set_current_state(TASK_INTERRUPTIBLE); + list_move(&my_work->list, &pdflush_list); + my_work->when_i_went_to_sleep = jiffies; + spin_unlock_irq(&pdflush_lock); + schedule(); + try_to_freeze(); + spin_lock_irq(&pdflush_lock); + if (!list_empty(&my_work->list)) { + /* + * Someone woke us up, but without removing our control + * structure from the global list. swsusp will do this + * in try_to_freeze()->refrigerator(). Handle it. + */ + my_work->fn = NULL; + continue; + } + if (my_work->fn == NULL) { + printk("pdflush: bogus wakeup\n"); + continue; + } + spin_unlock_irq(&pdflush_lock); + + (*my_work->fn)(my_work->arg0); + + /* + * Thread creation: For how long have there been zero + * available threads? + */ + if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { + /* unlocked list_empty() test is OK here */ + if (list_empty(&pdflush_list)) { + /* unlocked test is OK here */ + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) + start_one_pdflush_thread(); + } + } + + spin_lock_irq(&pdflush_lock); + my_work->fn = NULL; + + /* + * Thread destruction: For how long has the sleepiest + * thread slept? + */ + if (list_empty(&pdflush_list)) + continue; + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) + continue; + pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); + if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { + /* Limit exit rate */ + pdf->when_i_went_to_sleep = jiffies; + break; /* exeunt */ + } + } + nr_pdflush_threads--; + spin_unlock_irq(&pdflush_lock); + return 0; +} + +/* + * Of course, my_work wants to be just a local in __pdflush(). It is + * separated out in this manner to hopefully prevent the compiler from + * performing unfortunate optimisations against the auto variables. Because + * these are visible to other tasks and CPUs. (No problem has actually + * been observed. This is just paranoia). + */ +static int pdflush(void *dummy) +{ + struct pdflush_work my_work; + cpumask_t cpus_allowed; + + /* + * pdflush can spend a lot of time doing encryption via dm-crypt. We + * don't want to do that at keventd's priority. + */ + set_user_nice(current, 0); + + /* + * Some configs put our parent kthread in a limited cpuset, + * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. + * Our needs are more modest - cut back to our cpusets cpus_allowed. + * This is needed as pdflush's are dynamically created and destroyed. + * The boottime pdflush's are easily placed w/o these 2 lines. + */ + cpuset_cpus_allowed(current, &cpus_allowed); + set_cpus_allowed_ptr(current, &cpus_allowed); + + return __pdflush(&my_work); +} + +/* + * Attempt to wake up a pdflush thread, and get it to do some work for you. + * Returns zero if it indeed managed to find a worker thread, and passed your + * payload to it. + */ +int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) +{ + unsigned long flags; + int ret = 0; + + BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ + + spin_lock_irqsave(&pdflush_lock, flags); + if (list_empty(&pdflush_list)) { + ret = -1; + } else { + struct pdflush_work *pdf; + + pdf = list_entry(pdflush_list.next, struct pdflush_work, list); + list_del_init(&pdf->list); + if (list_empty(&pdflush_list)) + last_empty_jifs = jiffies; + pdf->fn = fn; + pdf->arg0 = arg0; + wake_up_process(pdf->who); + } + spin_unlock_irqrestore(&pdflush_lock, flags); + + return ret; +} + +static void start_one_pdflush_thread(void) +{ + kthread_run(pdflush, NULL, "pdflush"); +} + +static int __init pdflush_init(void) +{ + int i; + + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) + start_one_pdflush_thread(); + return 0; +} + +module_init(pdflush_init); diff --git a/mm/prio_tree.c b/mm/prio_tree.c new file mode 100644 index 0000000..603ae98 --- /dev/null +++ b/mm/prio_tree.c @@ -0,0 +1,207 @@ +/* + * mm/prio_tree.c - priority search tree for mapping->i_mmap + * + * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> + * + * This file is released under the GPL v2. + * + * Based on the radix priority search tree proposed by Edward M. McCreight + * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 + * + * 02Feb2004 Initial version + */ + +#include <linux/mm.h> +#include <linux/prio_tree.h> + +/* + * See lib/prio_tree.c for details on the general radix priority search tree + * code. + */ + +/* + * The following #defines are mirrored from lib/prio_tree.c. They're only used + * for debugging, and should be removed (along with the debugging code using + * them) when switching also VMAs to the regular prio_tree code. + */ + +#define RADIX_INDEX(vma) ((vma)->vm_pgoff) +#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) +/* avoid overflow */ +#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) + +/* + * Radix priority search tree for address_space->i_mmap + * + * For each vma that map a unique set of file pages i.e., unique [radix_index, + * heap_index] value, we have a corresponding priority search tree node. If + * multiple vmas have identical [radix_index, heap_index] value, then one of + * them is used as a tree node and others are stored in a vm_set list. The tree + * node points to the first vma (head) of the list using vm_set.head. + * + * prio_tree_root + * | + * A vm_set.head + * / \ / + * L R -> H-I-J-K-M-N-O-P-Q-S + * ^ ^ <-- vm_set.list --> + * tree nodes + * + * We need some way to identify whether a vma is a tree node, head of a vm_set + * list, or just a member of a vm_set list. We cannot use vm_flags to store + * such information. The reason is, in the above figure, it is possible that + * vm_flags' of R and H are covered by the different mmap_sems. When R is + * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold + * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. + * That's why some trick involving shared.vm_set.parent is used for identifying + * tree nodes and list head nodes. + * + * vma radix priority search tree node rules: + * + * vma->shared.vm_set.parent != NULL ==> a tree node + * vma->shared.vm_set.head != NULL ==> list of others mapping same range + * vma->shared.vm_set.head == NULL ==> no others map the same range + * + * vma->shared.vm_set.parent == NULL + * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range + * vma->shared.vm_set.head == NULL ==> a list node + */ + +/* + * Add a new vma known to map the same set of pages as the old vma: + * useful for fork's dup_mmap as well as vma_prio_tree_insert below. + * Note that it just happens to work correctly on i_mmap_nonlinear too. + */ +void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) +{ + /* Leave these BUG_ONs till prio_tree patch stabilizes */ + BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); + BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); + + vma->shared.vm_set.head = NULL; + vma->shared.vm_set.parent = NULL; + + if (!old->shared.vm_set.parent) + list_add(&vma->shared.vm_set.list, + &old->shared.vm_set.list); + else if (old->shared.vm_set.head) + list_add_tail(&vma->shared.vm_set.list, + &old->shared.vm_set.head->shared.vm_set.list); + else { + INIT_LIST_HEAD(&vma->shared.vm_set.list); + vma->shared.vm_set.head = old; + old->shared.vm_set.head = vma; + } +} + +void vma_prio_tree_insert(struct vm_area_struct *vma, + struct prio_tree_root *root) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *old; + + vma->shared.vm_set.head = NULL; + + ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); + if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { + old = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + vma_prio_tree_add(vma, old); + } +} + +void vma_prio_tree_remove(struct vm_area_struct *vma, + struct prio_tree_root *root) +{ + struct vm_area_struct *node, *head, *new_head; + + if (!vma->shared.vm_set.head) { + if (!vma->shared.vm_set.parent) + list_del_init(&vma->shared.vm_set.list); + else + raw_prio_tree_remove(root, &vma->shared.prio_tree_node); + } else { + /* Leave this BUG_ON till prio_tree patch stabilizes */ + BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); + if (vma->shared.vm_set.parent) { + head = vma->shared.vm_set.head; + if (!list_empty(&head->shared.vm_set.list)) { + new_head = list_entry( + head->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&head->shared.vm_set.list); + } else + new_head = NULL; + + raw_prio_tree_replace(root, &vma->shared.prio_tree_node, + &head->shared.prio_tree_node); + head->shared.vm_set.head = new_head; + if (new_head) + new_head->shared.vm_set.head = head; + + } else { + node = vma->shared.vm_set.head; + if (!list_empty(&vma->shared.vm_set.list)) { + new_head = list_entry( + vma->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&vma->shared.vm_set.list); + node->shared.vm_set.head = new_head; + new_head->shared.vm_set.head = node; + } else + node->shared.vm_set.head = NULL; + } + } +} + +/* + * Helper function to enumerate vmas that map a given file page or a set of + * contiguous file pages. The function returns vmas that at least map a single + * page in the given range of contiguous file pages. + */ +struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, + struct prio_tree_iter *iter) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *next; + + if (!vma) { + /* + * First call is with NULL vma + */ + ptr = prio_tree_next(iter); + if (ptr) { + next = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + prefetch(next->shared.vm_set.head); + return next; + } else + return NULL; + } + + if (vma->shared.vm_set.parent) { + if (vma->shared.vm_set.head) { + next = vma->shared.vm_set.head; + prefetch(next->shared.vm_set.list.next); + return next; + } + } else { + next = list_entry(vma->shared.vm_set.list.next, + struct vm_area_struct, shared.vm_set.list); + if (!next->shared.vm_set.head) { + prefetch(next->shared.vm_set.list.next); + return next; + } + } + + ptr = prio_tree_next(iter); + if (ptr) { + next = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + prefetch(next->shared.vm_set.head); + return next; + } else + return NULL; +} diff --git a/mm/quicklist.c b/mm/quicklist.c new file mode 100644 index 0000000..8dbb680 --- /dev/null +++ b/mm/quicklist.c @@ -0,0 +1,103 @@ +/* + * Quicklist support. + * + * Quicklists are light weight lists of pages that have a defined state + * on alloc and free. Pages must be in the quicklist specific defined state + * (zero by default) when the page is freed. It seems that the initial idea + * for such lists first came from Dave Miller and then various other people + * improved on it. + * + * Copyright (C) 2007 SGI, + * Christoph Lameter <clameter@sgi.com> + * Generalized, added support for multiple lists and + * constructors / destructors. + */ +#include <linux/kernel.h> + +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/quicklist.h> + +DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; + +#define FRACTION_OF_NODE_MEM 16 + +static unsigned long max_pages(unsigned long min_pages) +{ + unsigned long node_free_pages, max; + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; + node_to_cpumask_ptr(cpumask_on_node, node); + + node_free_pages = +#ifdef CONFIG_ZONE_DMA + zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + +#endif +#ifdef CONFIG_ZONE_DMA32 + zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + +#endif + zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); + + max = node_free_pages / FRACTION_OF_NODE_MEM; + + num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + max /= num_cpus_on_node; + + return max(max, min_pages); +} + +static long min_pages_to_free(struct quicklist *q, + unsigned long min_pages, long max_free) +{ + long pages_to_free; + + pages_to_free = q->nr_pages - max_pages(min_pages); + + return min(pages_to_free, max_free); +} + +/* + * Trim down the number of pages in the quicklist + */ +void quicklist_trim(int nr, void (*dtor)(void *), + unsigned long min_pages, unsigned long max_free) +{ + long pages_to_free; + struct quicklist *q; + + q = &get_cpu_var(quicklist)[nr]; + if (q->nr_pages > min_pages) { + pages_to_free = min_pages_to_free(q, min_pages, max_free); + + while (pages_to_free > 0) { + /* + * We pass a gfp_t of 0 to quicklist_alloc here + * because we will never call into the page allocator. + */ + void *p = quicklist_alloc(nr, 0, NULL); + + if (dtor) + dtor(p); + free_page((unsigned long)p); + pages_to_free--; + } + } + put_cpu_var(quicklist); +} + +unsigned long quicklist_total_size(void) +{ + unsigned long count = 0; + int cpu; + struct quicklist *ql, *q; + + for_each_online_cpu(cpu) { + ql = per_cpu(quicklist, cpu); + for (q = ql; q < ql + CONFIG_NR_QUICK; q++) + count += q->nr_pages; + } + return count; +} + diff --git a/mm/readahead.c b/mm/readahead.c new file mode 100644 index 0000000..bec83c1 --- /dev/null +++ b/mm/readahead.c @@ -0,0 +1,483 @@ +/* + * mm/readahead.c - address_space-level file readahead. + * + * Copyright (C) 2002, Linus Torvalds + * + * 09Apr2002 Andrew Morton + * Initial version. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/pagevec.h> +#include <linux/pagemap.h> + +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; +EXPORT_SYMBOL_GPL(default_backing_dev_info); + +/* + * Initialise a struct file's readahead state. Assumes that the caller has + * memset *ra to zero. + */ +void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) +{ + ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->prev_pos = -1; +} +EXPORT_SYMBOL_GPL(file_ra_state_init); + +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) + +/** + * read_cache_pages - populate an address space with some pages & start reads against them + * @mapping: the address_space + * @pages: The address of a list_head which contains the target pages. These + * pages have their ->index populated and are otherwise uninitialised. + * @filler: callback routine for filling a single page. + * @data: private data for the callback routine. + * + * Hides the details of the LRU cache etc from the filesystems. + */ +int read_cache_pages(struct address_space *mapping, struct list_head *pages, + int (*filler)(void *, struct page *), void *data) +{ + struct page *page; + int ret = 0; + + while (!list_empty(pages)) { + page = list_to_page(pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { + page_cache_release(page); + continue; + } + page_cache_release(page); + + ret = filler(data, page); + if (unlikely(ret)) { + put_pages_list(pages); + break; + } + task_io_account_read(PAGE_CACHE_SIZE); + } + return ret; +} + +EXPORT_SYMBOL(read_cache_pages); + +static int read_pages(struct address_space *mapping, struct file *filp, + struct list_head *pages, unsigned nr_pages) +{ + unsigned page_idx; + int ret; + + if (mapping->a_ops->readpages) { + ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + /* Clean up the remaining pages */ + put_pages_list(pages); + goto out; + } + + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_to_page(pages); + list_del(&page->lru); + if (!add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { + mapping->a_ops->readpage(filp, page); + } + page_cache_release(page); + } + ret = 0; +out: + return ret; +} + +/* + * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * the pages first, then submits them all for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + * + * Returns the number of pages requested, or the maximum amount of I/O allowed. + * + * do_page_cache_readahead() returns -1 if it encountered request queue + * congestion. + */ +static int +__do_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + struct page *page; + unsigned long end_index; /* The last page we want to read */ + LIST_HEAD(page_pool); + int page_idx; + int ret = 0; + loff_t isize = i_size_read(inode); + + if (isize == 0) + goto out; + + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + /* + * Preallocate as many pages as we will need. + */ + for (page_idx = 0; page_idx < nr_to_read; page_idx++) { + pgoff_t page_offset = offset + page_idx; + + if (page_offset > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_offset); + rcu_read_unlock(); + if (page) + continue; + + page = page_cache_alloc_cold(mapping); + if (!page) + break; + page->index = page_offset; + list_add(&page->lru, &page_pool); + if (page_idx == nr_to_read - lookahead_size) + SetPageReadahead(page); + ret++; + } + + /* + * Now start the IO. We ignore I/O errors - if the page is not + * uptodate then the caller will launch readpage again, and + * will then handle the error. + */ + if (ret) + read_pages(mapping, filp, &page_pool, ret); + BUG_ON(!list_empty(&page_pool)); +out: + return ret; +} + +/* + * Chunk the readahead into 2 megabyte units, so that we don't pin too much + * memory at once. + */ +int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read) +{ + int ret = 0; + + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) + return -EINVAL; + + while (nr_to_read) { + int err; + + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; + + if (this_chunk > nr_to_read) + this_chunk = nr_to_read; + err = __do_page_cache_readahead(mapping, filp, + offset, this_chunk, 0); + if (err < 0) { + ret = err; + break; + } + ret += err; + offset += this_chunk; + nr_to_read -= this_chunk; + } + return ret; +} + +/* + * This version skips the IO if the queue is read-congested, and will tell the + * block layer to abandon the readahead if request allocation would block. + * + * force_page_cache_readahead() will ignore queue congestion and will block on + * request queues. + */ +int do_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read) +{ + if (bdi_read_congested(mapping->backing_dev_info)) + return -1; + + return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); +} + +/* + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. + */ +unsigned long max_sane_readahead(unsigned long nr) +{ + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) + + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); +} + +static int __init readahead_init(void) +{ + int err; + + err = bdi_init(&default_backing_dev_info); + if (!err) + bdi_register(&default_backing_dev_info, NULL, "default"); + + return err; +} +subsys_initcall(readahead_init); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + int actual; + + actual = __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); + + return actual; +} + +/* + * Set the initial window size, round to next power of 2 and square + * for small size, x 4 for medium, and x 2 for large + * for 128k (32 page) max ra + * 1-8 page = 32k initial, > 8 page = 128k initial + */ +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) +{ + unsigned long newsize = roundup_pow_of_two(size); + + if (newsize <= max / 32) + newsize = newsize * 4; + else if (newsize <= max / 4) + newsize = newsize * 2; + else + newsize = max; + + return newsize; +} + +/* + * Get the previous window size, ramp it up, and + * return it as the new window size. + */ +static unsigned long get_next_ra_size(struct file_ra_state *ra, + unsigned long max) +{ + unsigned long cur = ra->size; + unsigned long newsize; + + if (cur < max / 16) + newsize = 4 * cur; + else + newsize = 2 * cur; + + return min(newsize, max); +} + +/* + * On-demand readahead design. + * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * |<----- async_size ---------| + * |------------------- size -------------------->| + * |==================#===========================| + * ^start ^page marked with PG_readahead + * + * To overlap application thinking time and disk I/O time, we do + * `readahead pipelining': Do not wait until the application consumed all + * readahead pages and stalled on the missing page at readahead_index; + * Instead, submit an asynchronous readahead I/O as soon as there are + * only async_size pages left in the readahead window. Normally async_size + * will be equal to size, for maximum pipelining. + * + * In interleaved sequential reads, concurrent streams on the same fd can + * be invalidating each other's readahead state. So we flag the new readahead + * page at (start+size-async_size) with PG_readahead, and use it as readahead + * indicator. The flag won't be set on already cached pages, to avoid the + * readahead-for-nothing fuss, saving pointless page cache lookups. + * + * prev_pos tracks the last visited byte in the _previous_ read request. + * It should be maintained by the caller, and will be used for detecting + * small random reads. Note that the readahead algorithm checks loosely + * for sequential patterns. Hence interleaved reads might be served as + * sequential ones. + * + * There is a special-case: if the first page which the application tries to + * read happens to be the first page of the file, it is assumed that a linear + * read is about to happen and the window is immediately set to the initial size + * based on I/O request size and the max_readahead. + * + * The code ramps up the readahead size aggressively at first, but slow down as + * it approaches max_readhead. + */ + +/* + * A minimal readahead algorithm for trivial sequential/random reads. + */ +static unsigned long +ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t offset, + unsigned long req_size) +{ + int max = ra->ra_pages; /* max readahead pages */ + pgoff_t prev_offset; + int sequential; + + /* + * It's the expected callback offset, assume sequential access. + * Ramp up sizes, and push forward the readahead window. + */ + if (offset && (offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { + ra->start += ra->size; + ra->size = get_next_ra_size(ra, max); + ra->async_size = ra->size; + goto readit; + } + + prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; + sequential = offset - prev_offset <= 1UL || req_size > max; + + /* + * Standalone, small read. + * Read as is, and do not pollute the readahead state. + */ + if (!hit_readahead_marker && !sequential) { + return __do_page_cache_readahead(mapping, filp, + offset, req_size, 0); + } + + /* + * Hit a marked page without valid readahead state. + * E.g. interleaved reads. + * Query the pagecache for async_size, which normally equals to + * readahead size. Ramp it up and use it as the new readahead size. + */ + if (hit_readahead_marker) { + pgoff_t start; + + rcu_read_lock(); + start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + rcu_read_unlock(); + + if (!start || start - offset > max) + return 0; + + ra->start = start; + ra->size = start - offset; /* old async_size */ + ra->size = get_next_ra_size(ra, max); + ra->async_size = ra->size; + goto readit; + } + + /* + * It may be one of + * - first read on start of file + * - sequential cache miss + * - oversize random read + * Start readahead for it. + */ + ra->start = offset; + ra->size = get_init_ra_size(req_size, max); + ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; + +readit: + return ra_submit(ra, mapping, filp); +} + +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @filp: passed on to ->readpage() and ->readpages() + * @offset: start offset into @mapping, in pagecache page-sized units + * @req_size: hint: total size of the read which the caller is performing in + * pagecache pages + * + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. + */ +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + pgoff_t offset, unsigned long req_size) +{ + /* no read-ahead */ + if (!ra->ra_pages) + return; + + /* do read-ahead */ + ondemand_readahead(mapping, ra, filp, false, offset, req_size); +} +EXPORT_SYMBOL_GPL(page_cache_sync_readahead); + +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @filp: passed on to ->readpage() and ->readpages() + * @page: the page at @offset which has the PG_readahead flag set + * @offset: start offset into @mapping, in pagecache page-sized units + * @req_size: hint: total size of the read which the caller is performing in + * pagecache pages + * + * page_cache_async_ondemand() should be called when a page is used which + * has the PG_readahead flag; this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. + */ +void +page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *page, pgoff_t offset, + unsigned long req_size) +{ + /* no read-ahead */ + if (!ra->ra_pages) + return; + + /* + * Same bit is used for PG_readahead and PG_reclaim. + */ + if (PageWriteback(page)) + return; + + ClearPageReadahead(page); + + /* + * Defer asynchronous read-ahead on IO congestion. + */ + if (bdi_read_congested(mapping->backing_dev_info)) + return; + + /* do read-ahead */ + ondemand_readahead(mapping, ra, filp, true, offset, req_size); +} +EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c new file mode 100644 index 0000000..1099394 --- /dev/null +++ b/mm/rmap.c @@ -0,0 +1,1236 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel <riel@conectiva.com.br> + * Released under the General Public License (GPL). + * + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. + * + * Provides methods for unmapping each kind of mapped page: + * the anon methods track anonymous pages, and + * the file methods track pages belonging to an inode. + * + * Original design by Rik van Riel <riel@conectiva.com.br> 2001 + * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 + * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 + * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 + */ + +/* + * Lock ordering in mm: + * + * inode->i_mutex (while writing or truncating, not reading or faulting) + * inode->i_alloc_sem (vmtruncate_range) + * mm->mmap_sem + * page->flags PG_locked (lock_page) + * mapping->i_mmap_lock + * anon_vma->lock + * mm->page_table_lock or pte_lock + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * inode_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * mapping->tree_lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within inode_lock in __sync_single_inode) + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/rmap.h> +#include <linux/rcupdate.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlbflush.h> + +#include "internal.h" + +static struct kmem_cache *anon_vma_cachep; + +static inline struct anon_vma *anon_vma_alloc(void) +{ + return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); +} + +static inline void anon_vma_free(struct anon_vma *anon_vma) +{ + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +/** + * anon_vma_prepare - attach an anon_vma to a memory region + * @vma: the memory region in question + * + * This makes sure the memory mapping described by 'vma' has + * an 'anon_vma' attached to it, so that we can associate the + * anonymous pages mapped into it with that anon_vma. + * + * The common case will be that we already have one, but if + * if not we either need to find an adjacent mapping that we + * can re-use the anon_vma from (very common when the only + * reason for splitting a vma has been mprotect()), or we + * allocate a new one. + * + * Anon-vma allocations are very subtle, because we may have + * optimistically looked up an anon_vma in page_lock_anon_vma() + * and that may actually touch the spinlock even in the newly + * allocated vma (it depends on RCU to make sure that the + * anon_vma isn't actually destroyed). + * + * As a result, we need to do proper anon_vma locking even + * for the new allocation. At the same time, we do not want + * to do any locking for the common case of already having + * an anon_vma. + * + * This must be called with the mmap_sem held for reading. + */ +int anon_vma_prepare(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + might_sleep(); + if (unlikely(!anon_vma)) { + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *allocated; + + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + return -ENOMEM; + allocated = anon_vma; + } + spin_lock(&anon_vma->lock); + + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + list_add_tail(&vma->anon_vma_node, &anon_vma->head); + allocated = NULL; + } + spin_unlock(&mm->page_table_lock); + + spin_unlock(&anon_vma->lock); + if (unlikely(allocated)) + anon_vma_free(allocated); + } + return 0; +} + +void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) +{ + BUG_ON(vma->anon_vma != next->anon_vma); + list_del(&next->anon_vma_node); +} + +void __anon_vma_link(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + if (anon_vma) + list_add_tail(&vma->anon_vma_node, &anon_vma->head); +} + +void anon_vma_link(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + if (anon_vma) { + spin_lock(&anon_vma->lock); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); + spin_unlock(&anon_vma->lock); + } +} + +void anon_vma_unlink(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int empty; + + if (!anon_vma) + return; + + spin_lock(&anon_vma->lock); + list_del(&vma->anon_vma_node); + + /* We must garbage collect the anon_vma if it's empty */ + empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + + if (empty) + anon_vma_free(anon_vma); +} + +static void anon_vma_ctor(void *data) +{ + struct anon_vma *anon_vma = data; + + spin_lock_init(&anon_vma->lock); + INIT_LIST_HEAD(&anon_vma->head); +} + +void __init anon_vma_init(void) +{ + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); +} + +/* + * Getting a lock on a stable anon_vma from a page off the LRU is + * tricky: page_lock_anon_vma rely on RCU to guard against the races. + */ +struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) page->mapping; + if (!(anon_mapping & PAGE_MAPPING_ANON)) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + return anon_vma; +out: + rcu_read_unlock(); + return NULL; +} + +void page_unlock_anon_vma(struct anon_vma *anon_vma) +{ + spin_unlock(&anon_vma->lock); + rcu_read_unlock(); +} + +/* + * At what user virtual address is page expected in @vma? + * Returns virtual address or -EFAULT if page's index/offset is not + * within the range mapped the @vma. + */ +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + /* page should be within @vma mapping range */ + return -EFAULT; + } + return address; +} + +/* + * At what user virtual address is page expected in vma? checking that the + * page matches the vma: currently only used on anon pages, by unuse_vma; + */ +unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +{ + if (PageAnon(page)) { + if ((void *)vma->anon_vma != + (void *)page->mapping - PAGE_MAPPING_ANON) + return -EFAULT; + } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { + if (!vma->vm_file || + vma->vm_file->f_mapping != page->mapping) + return -EFAULT; + } else + return -EFAULT; + return vma_address(page, vma); +} + +/* + * Check that @page is mapped at @address into @mm. + * + * If @sync is false, page_check_address may perform a racy check to avoid + * the page table lock when the pte is not present (helpful when reclaiming + * highly shared pages). + * + * On success returns with pte mapped and locked. + */ +pte_t *page_check_address(struct page *page, struct mm_struct *mm, + unsigned long address, spinlock_t **ptlp, int sync) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return NULL; + + pte = pte_offset_map(pmd, address); + /* Make a quick check before getting the lock */ + if (!sync && !pte_present(*pte)) { + pte_unmap(pte); + return NULL; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { + *ptlp = ptl; + return pte; + } + pte_unmap_unlock(pte, ptl); + return NULL; +} + +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + + address = vma_address(page, vma); + if (address == -EFAULT) /* out of vma range */ + return 0; + pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); + if (!pte) /* the page is not in this mm */ + return 0; + pte_unmap_unlock(pte, ptl); + + return 1; +} + +/* + * Subfunctions of page_referenced: page_referenced_one called + * repeatedly from either page_referenced_anon or page_referenced_file. + */ +static int page_referenced_one(struct page *page, + struct vm_area_struct *vma, unsigned int *mapcount) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + int referenced = 0; + + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + /* + * Don't want to elevate referenced for mlocked page that gets this far, + * in order that it progresses to try_to_unmap and is moved to the + * unevictable list. + */ + if (vma->vm_flags & VM_LOCKED) { + *mapcount = 1; /* break early from loop */ + goto out_unmap; + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) + referenced++; + + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. */ + if (mm != current->mm && has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) + referenced++; + +out_unmap: + (*mapcount)--; + pte_unmap_unlock(pte, ptl); +out: + return referenced; +} + +static int page_referenced_anon(struct page *page, + struct mem_cgroup *mem_cont) +{ + unsigned int mapcount; + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + int referenced = 0; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return referenced; + + mapcount = page_mapcount(page); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + continue; + referenced += page_referenced_one(page, vma, &mapcount); + if (!mapcount) + break; + } + + page_unlock_anon_vma(anon_vma); + return referenced; +} + +/** + * page_referenced_file - referenced check for object-based rmap + * @page: the page we're checking references on. + * @mem_cont: target memory controller + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + */ +static int page_referenced_file(struct page *page, + struct mem_cgroup *mem_cont) +{ + unsigned int mapcount; + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int referenced = 0; + + /* + * The caller's checks on page->mapping and !PageAnon have made + * sure that this is a file page: the check for page->mapping + * excludes the case just before it gets set on an anon page. + */ + BUG_ON(PageAnon(page)); + + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_lock. + */ + BUG_ON(!PageLocked(page)); + + spin_lock(&mapping->i_mmap_lock); + + /* + * i_mmap_lock does not stabilize mapcount at all, but mapcount + * is more likely to be accurate if we note it after spinning. + */ + mapcount = page_mapcount(page); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + continue; + referenced += page_referenced_one(page, vma, &mapcount); + if (!mapcount) + break; + } + + spin_unlock(&mapping->i_mmap_lock); + return referenced; +} + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * @is_locked: caller holds lock on the page + * @mem_cont: target memory controller + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of ptes which referenced the page. + */ +int page_referenced(struct page *page, int is_locked, + struct mem_cgroup *mem_cont) +{ + int referenced = 0; + + if (TestClearPageReferenced(page)) + referenced++; + + if (page_mapped(page) && page->mapping) { + if (PageAnon(page)) + referenced += page_referenced_anon(page, mem_cont); + else if (is_locked) + referenced += page_referenced_file(page, mem_cont); + else if (!trylock_page(page)) + referenced++; + else { + if (page->mapping) + referenced += + page_referenced_file(page, mem_cont); + unlock_page(page); + } + } + + if (page_test_and_clear_young(page)) + referenced++; + + return referenced; +} + +static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + int ret = 0; + + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + pte = page_check_address(page, mm, address, &ptl, 1); + if (!pte) + goto out; + + if (pte_dirty(*pte) || pte_write(*pte)) { + pte_t entry; + + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush_notify(vma, address, pte); + entry = pte_wrprotect(entry); + entry = pte_mkclean(entry); + set_pte_at(mm, address, pte, entry); + ret = 1; + } + + pte_unmap_unlock(pte, ptl); +out: + return ret; +} + +static int page_mkclean_file(struct address_space *mapping, struct page *page) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = 0; + + BUG_ON(PageAnon(page)); + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if (vma->vm_flags & VM_SHARED) + ret += page_mkclean_one(page, vma); + } + spin_unlock(&mapping->i_mmap_lock); + return ret; +} + +int page_mkclean(struct page *page) +{ + int ret = 0; + + BUG_ON(!PageLocked(page)); + + if (page_mapped(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) { + ret = page_mkclean_file(mapping, page); + if (page_test_dirty(page)) { + page_clear_dirty(page); + ret = 1; + } + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(page_mkclean); + +/** + * __page_set_anon_rmap - setup new anonymous rmap + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + + page->index = linear_page_index(vma, address); + + /* + * nr_mapped state can be updated without turning off + * interrupts because it is not modified via interrupt. + */ + __inc_zone_page_state(page, NR_ANON_PAGES); +} + +/** + * __page_check_anon_rmap - sanity check anonymous rmap addition + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_check_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ +#ifdef CONFIG_DEBUG_VM + /* + * The page's anon-rmap details (mapping and index) are guaranteed to + * be set up correctly at this point. + * + * We have exclusion against page_add_anon_rmap because the caller + * always holds the page locked, except if called from page_dup_rmap, + * in which case the page is already known to be setup. + * + * We have exclusion against page_add_new_anon_rmap because those pages + * are initially only visible via the pagetables, and the pte is locked + * over the call to page_add_new_anon_rmap. + */ + struct anon_vma *anon_vma = vma->anon_vma; + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + BUG_ON(page->mapping != (struct address_space *)anon_vma); + BUG_ON(page->index != linear_page_index(vma, address)); +#endif +} + +/** + * page_add_anon_rmap - add pte mapping to an anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * The caller needs to hold the pte lock and the page must be locked. + */ +void page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + if (atomic_inc_and_test(&page->_mapcount)) + __page_set_anon_rmap(page, vma, address); + else + __page_check_anon_rmap(page, vma, address); +} + +/** + * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * Same as page_add_anon_rmap but must only be called on *new* pages. + * This means the inc-and-test can be bypassed. + * Page does not have to be locked. + */ +void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + __page_set_anon_rmap(page, vma, address); +} + +/** + * page_add_file_rmap - add pte mapping to a file page + * @page: the page to add the mapping to + * + * The caller needs to hold the pte lock. + */ +void page_add_file_rmap(struct page *page) +{ + if (atomic_inc_and_test(&page->_mapcount)) + __inc_zone_page_state(page, NR_FILE_MAPPED); +} + +#ifdef CONFIG_DEBUG_VM +/** + * page_dup_rmap - duplicate pte mapping to a page + * @page: the page to add the mapping to + * @vma: the vm area being duplicated + * @address: the user virtual address mapped + * + * For copy_page_range only: minimal extract from page_add_file_rmap / + * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's + * quicker. + * + * The caller needs to hold the pte lock. + */ +void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(page_mapcount(page) == 0); + if (PageAnon(page)) + __page_check_anon_rmap(page, vma, address); + atomic_inc(&page->_mapcount); +} +#endif + +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * @vma: the vm area in which the mapping is removed + * + * The caller needs to hold the pte lock. + */ +void page_remove_rmap(struct page *page, struct vm_area_struct *vma) +{ + if (atomic_add_negative(-1, &page->_mapcount)) { + if (unlikely(page_mapcount(page) < 0)) { + printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); + printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); + printk (KERN_EMERG " page->flags = %lx\n", page->flags); + printk (KERN_EMERG " page->count = %x\n", page_count(page)); + printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); + if (vma->vm_ops) { + print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); + } + if (vma->vm_file && vma->vm_file->f_op) + print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); + BUG(); + } + + /* + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. + */ + if ((!PageAnon(page) || PageSwapCache(page)) && + page_test_dirty(page)) { + page_clear_dirty(page); + set_page_dirty(page); + } + if (PageAnon(page)) + mem_cgroup_uncharge_page(page); + __dec_zone_page_state(page, + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ + } +} + +/* + * Subfunctions of try_to_unmap: try_to_unmap_one called + * repeatedly from either try_to_unmap_anon or try_to_unmap_file. + */ +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + int migration) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + spinlock_t *ptl; + int ret = SWAP_AGAIN; + + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if (!migration) { + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_MLOCK; + goto out_unmap; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + } + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, page_to_pfn(page)); + pteval = ptep_clear_flush_notify(vma, address, pte); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + + if (PageAnon(page)) { + swp_entry_t entry = { .val = page_private(page) }; + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + dec_mm_counter(mm, anon_rss); +#ifdef CONFIG_MIGRATION + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); +#endif + } + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + BUG_ON(pte_file(*pte)); + } else +#ifdef CONFIG_MIGRATION + if (migration) { + /* Establish migration entry for a file page */ + swp_entry_t entry; + entry = make_migration_entry(page, pte_write(pteval)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + } else +#endif + dec_mm_counter(mm, file_rss); + + + page_remove_rmap(page, vma); + page_cache_release(page); + +out_unmap: + pte_unmap_unlock(pte, ptl); +out: + return ret; +} + +/* + * objrmap doesn't work for nonlinear VMAs because the assumption that + * offset-into-file correlates with offset-into-virtual-addresses does not hold. + * Consequently, given a particular page and its ->index, we cannot locate the + * ptes which are mapping that page without an exhaustive linear search. + * + * So what this code does is a mini "virtual scan" of each nonlinear VMA which + * maps the file to which the target page belongs. The ->vm_private_data field + * holds the current cursor into that scan. Successive searches will circulate + * around the vma's virtual address space. + * + * So as more replacement pressure is applied to the pages in a nonlinear VMA, + * more scanning pressure is placed against them as well. Eventually pages + * will become fully unmapped and are eligible for eviction. + * + * For very sparsely populated VMAs this is a little inefficient - chances are + * there there won't be many ptes located within the scan cluster. In this case + * maybe we could scan further - to the end of the pte page, perhaps. + * + * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can + * acquire it without blocking. If vma locked, mlock the pages in the cluster, + * rather than unmapping them. If we encounter the "check_page" that vmscan is + * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. + */ +#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) +#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) + +static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, + struct vm_area_struct *vma, struct page *check_page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + spinlock_t *ptl; + struct page *page; + unsigned long address; + unsigned long end; + int ret = SWAP_AGAIN; + int locked_vma = 0; + + address = (vma->vm_start + cursor) & CLUSTER_MASK; + end = address + CLUSTER_SIZE; + if (address < vma->vm_start) + address = vma->vm_start; + if (end > vma->vm_end) + end = vma->vm_end; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return ret; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return ret; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return ret; + + /* + * MLOCK_PAGES => feature is configured. + * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * keep the sem while scanning the cluster for mlocking pages. + */ + if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + locked_vma = (vma->vm_flags & VM_LOCKED); + if (!locked_vma) + up_read(&vma->vm_mm->mmap_sem); /* don't need it */ + } + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + + for (; address < end; pte++, address += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); + + if (locked_vma) { + mlock_vma_page(page); /* no-op if already mlocked */ + if (page == check_page) + ret = SWAP_MLOCK; + continue; /* don't unmap */ + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) + continue; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pte_pfn(*pte)); + pteval = ptep_clear_flush_notify(vma, address, pte); + + /* If nonlinear, store the file page offset in the pte. */ + if (page->index != linear_page_index(vma, address)) + set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + page_remove_rmap(page, vma); + page_cache_release(page); + dec_mm_counter(mm, file_rss); + (*mapcount)--; + } + pte_unmap_unlock(pte - 1, ptl); + if (locked_vma) + up_read(&vma->vm_mm->mmap_sem); + return ret; +} + +/* + * common handling for pages mapped in VM_LOCKED vmas + */ +static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) +{ + int mlocked = 0; + + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + mlocked++; /* really mlocked the page */ + } + up_read(&vma->vm_mm->mmap_sem); + } + return mlocked; +} + +/** + * try_to_unmap_anon - unmap or unlock anonymous page using the object-based + * rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * This function is only called from try_to_unmap/try_to_munlock for + * anonymous pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. + */ +static int try_to_unmap_anon(struct page *page, int unlock, int migration) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned int mlocked = 0; + int ret = SWAP_AGAIN; + + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return ret; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (MLOCK_PAGES && unlikely(unlock)) { + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) + continue; /* must visit all unlocked vmas */ + ret = SWAP_MLOCK; /* saw at least one mlocked vma */ + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + break; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } + } + + page_unlock_anon_vma(anon_vma); + + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + + return ret; +} + +/** + * try_to_unmap_file - unmap/unlock file page using the object-based rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap/try_to_munlock for + * object-based pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. + */ +static int try_to_unmap_file(struct page *page, int unlock, int migration) +{ + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; + unsigned int mapcount; + unsigned int mlocked = 0; + + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + goto out; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } + } + + if (mlocked) + goto out; + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto out; + + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, + shared.vm_set.list) { + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; /* leave mlocked == 0 */ + goto out; /* no need to look further */ + } + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; + } + + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ + ret = SWAP_FAIL; + goto out; + } + + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? + */ + mapcount = page_mapcount(page); + if (!mapcount) + goto out; + cond_resched_lock(&mapping->i_mmap_lock); + + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, + shared.vm_set.list) { + if (!MLOCK_PAGES && !migration && + (vma->vm_flags & VM_LOCKED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + while ( cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + ret = try_to_unmap_cluster(cursor, &mapcount, + vma, page); + if (ret == SWAP_MLOCK) + mlocked = 2; /* to return below */ + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if ((int)mapcount <= 0) + goto out; + } + vma->vm_private_data = (void *) max_nl_cursor; + } + cond_resched_lock(&mapping->i_mmap_lock); + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); + + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_private_data = NULL; +out: + spin_unlock(&mapping->i_mmap_lock); + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * @migration: migration flag + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock. + * Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a mapping, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_MLOCK - page is mlocked. + */ +int try_to_unmap(struct page *page, int migration) +{ + int ret; + + BUG_ON(!PageLocked(page)); + + if (PageAnon(page)) + ret = try_to_unmap_anon(page, 0, migration); + else + ret = try_to_unmap_file(page, 0, migration); + if (ret != SWAP_MLOCK && !page_mapped(page)) + ret = SWAP_SUCCESS; + return ret; +} + +#ifdef CONFIG_UNEVICTABLE_LRU +/** + * try_to_munlock - try to munlock a page + * @page: the page to be munlocked + * + * Called from munlock code. Checks all of the VMAs mapping the page + * to make sure nobody else has this page mlocked. The page will be + * returned with PG_mlocked cleared if no other vmas have it mlocked. + * + * Return values are: + * + * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_MLOCK - page is now mlocked. + */ +int try_to_munlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + + if (PageAnon(page)) + return try_to_unmap_anon(page, 1, 0); + else + return try_to_unmap_file(page, 1, 0); +} +#endif diff --git a/mm/shmem.c b/mm/shmem.c new file mode 100644 index 0000000..0ed0752 --- /dev/null +++ b/mm/shmem.c @@ -0,0 +1,2608 @@ +/* + * Resizable virtual memory filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG + * 2002 Red Hat Inc. + * Copyright (C) 2002-2005 Hugh Dickins. + * Copyright (C) 2002-2005 VERITAS Software Corporation. + * Copyright (C) 2004 Andi Kleen, SuSE Labs + * + * Extended attribute support for tmpfs: + * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> + * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> + * + * This file is released under the GPL. + */ + +/* + * This virtual memory filesystem is heavily based on the ramfs. It + * extends ramfs by the ability to use swap and honor resource limits + * which makes it a completely usable filesystem. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/generic_acl.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/file.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/backing-dev.h> +#include <linux/shmem_fs.h> +#include <linux/mount.h> +#include <linux/writeback.h> +#include <linux/vfs.h> +#include <linux/blkdev.h> +#include <linux/security.h> +#include <linux/swapops.h> +#include <linux/mempolicy.h> +#include <linux/namei.h> +#include <linux/ctype.h> +#include <linux/migrate.h> +#include <linux/highmem.h> +#include <linux/seq_file.h> +#include <linux/magic.h> + +#include <asm/uaccess.h> +#include <asm/div64.h> +#include <asm/pgtable.h> + +#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) +#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) + +#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) + +#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) + +/* info->flags needs VM_flags to handle pagein/truncate races efficiently */ +#define SHMEM_PAGEIN VM_READ +#define SHMEM_TRUNCATE VM_WRITE + +/* Definition to limit shmem_truncate's steps between cond_rescheds */ +#define LATENCY_LIMIT 64 + +/* Pretend that each entry is of this size in directory's i_size */ +#define BOGO_DIRENT_SIZE 20 + +/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +enum sgp_type { + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ + SGP_WRITE, /* may exceed i_size, may allocate page */ +}; + +#ifdef CONFIG_TMPFS +static unsigned long shmem_default_max_blocks(void) +{ + return totalram_pages / 2; +} + +static unsigned long shmem_default_max_inodes(void) +{ + return min(totalram_pages - totalhigh_pages, totalram_pages / 2); +} +#endif + +static int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type); + +static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) +{ + /* + * The above definition of ENTRIES_PER_PAGE, and the use of + * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: + * might be reconsidered if it ever diverges from PAGE_SIZE. + * + * Mobility flags are masked out as swap vectors cannot move + */ + return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, + PAGE_CACHE_SHIFT-PAGE_SHIFT); +} + +static inline void shmem_dir_free(struct page *page) +{ + __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); +} + +static struct page **shmem_dir_map(struct page *page) +{ + return (struct page **)kmap_atomic(page, KM_USER0); +} + +static inline void shmem_dir_unmap(struct page **dir) +{ + kunmap_atomic(dir, KM_USER0); +} + +static swp_entry_t *shmem_swp_map(struct page *page) +{ + return (swp_entry_t *)kmap_atomic(page, KM_USER1); +} + +static inline void shmem_swp_balance_unmap(void) +{ + /* + * When passing a pointer to an i_direct entry, to code which + * also handles indirect entries and so will shmem_swp_unmap, + * we must arrange for the preempt count to remain in balance. + * What kmap_atomic of a lowmem page does depends on config + * and architecture, so pretend to kmap_atomic some lowmem page. + */ + (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); +} + +static inline void shmem_swp_unmap(swp_entry_t *entry) +{ + kunmap_atomic(entry, KM_USER1); +} + +static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * shmem_file_setup pre-accounts the whole fixed size of a VM object, + * for shared memory and for shared anonymous (/dev/zero) mappings + * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), + * consistent with the pre-accounting of private mappings ... + */ +static inline int shmem_acct_size(unsigned long flags, loff_t size) +{ + return (flags & VM_ACCOUNT) ? + security_vm_enough_memory_kern(VM_ACCT(size)) : 0; +} + +static inline void shmem_unacct_size(unsigned long flags, loff_t size) +{ + if (flags & VM_ACCOUNT) + vm_unacct_memory(VM_ACCT(size)); +} + +/* + * ... whereas tmpfs objects are accounted incrementally as + * pages are allocated, in order to allow huge sparse files. + * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. + */ +static inline int shmem_acct_block(unsigned long flags) +{ + return (flags & VM_ACCOUNT) ? + 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); +} + +static inline void shmem_unacct_blocks(unsigned long flags, long pages) +{ + if (!(flags & VM_ACCOUNT)) + vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); +} + +static const struct super_operations shmem_ops; +static const struct address_space_operations shmem_aops; +static const struct file_operations shmem_file_operations; +static const struct inode_operations shmem_inode_operations; +static const struct inode_operations shmem_dir_inode_operations; +static const struct inode_operations shmem_special_inode_operations; +static struct vm_operations_struct shmem_vm_ops; + +static struct backing_dev_info shmem_backing_dev_info __read_mostly = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, + .unplug_io_fn = default_unplug_io_fn, +}; + +static LIST_HEAD(shmem_swaplist); +static DEFINE_MUTEX(shmem_swaplist_mutex); + +static void shmem_free_blocks(struct inode *inode, long pages) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo->max_blocks) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks += pages; + inode->i_blocks -= pages*BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + } +} + +static int shmem_reserve_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + return 0; +} + +static void shmem_free_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + +/** + * shmem_recalc_inode - recalculate the size of an inode + * @inode: inode to recalc + * + * We have to calculate the free blocks since the mm can drop + * undirtied hole pages behind our back. + * + * But normally info->alloced == inode->i_mapping->nrpages + info->swapped + * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * It has to be called with the spinlock held. + */ +static void shmem_recalc_inode(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; + + freed = info->alloced - info->swapped - inode->i_mapping->nrpages; + if (freed > 0) { + info->alloced -= freed; + shmem_unacct_blocks(info->flags, freed); + shmem_free_blocks(inode, freed); + } +} + +/** + * shmem_swp_entry - find the swap vector position in the info structure + * @info: info structure for the inode + * @index: index of the page to find + * @page: optional page to add to the structure. Has to be preset to + * all zeros + * + * If there is no space allocated yet it will return NULL when + * page is NULL, else it will use the page for the needed block, + * setting it to NULL on return to indicate that it has been used. + * + * The swap vector is organized the following way: + * + * There are SHMEM_NR_DIRECT entries directly stored in the + * shmem_inode_info structure. So small files do not need an addional + * allocation. + * + * For pages with index > SHMEM_NR_DIRECT there is the pointer + * i_indirect which points to a page which holds in the first half + * doubly indirect blocks, in the second half triple indirect blocks: + * + * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the + * following layout (for SHMEM_NR_DIRECT == 16): + * + * i_indirect -> dir --> 16-19 + * | +-> 20-23 + * | + * +-->dir2 --> 24-27 + * | +-> 28-31 + * | +-> 32-35 + * | +-> 36-39 + * | + * +-->dir3 --> 40-43 + * +-> 44-47 + * +-> 48-51 + * +-> 52-55 + */ +static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) +{ + unsigned long offset; + struct page **dir; + struct page *subdir; + + if (index < SHMEM_NR_DIRECT) { + shmem_swp_balance_unmap(); + return info->i_direct+index; + } + if (!info->i_indirect) { + if (page) { + info->i_indirect = *page; + *page = NULL; + } + return NULL; /* need another page */ + } + + index -= SHMEM_NR_DIRECT; + offset = index % ENTRIES_PER_PAGE; + index /= ENTRIES_PER_PAGE; + dir = shmem_dir_map(info->i_indirect); + + if (index >= ENTRIES_PER_PAGE/2) { + index -= ENTRIES_PER_PAGE/2; + dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; + index %= ENTRIES_PER_PAGE; + subdir = *dir; + if (!subdir) { + if (page) { + *dir = *page; + *page = NULL; + } + shmem_dir_unmap(dir); + return NULL; /* need another page */ + } + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir); + } + + dir += index; + subdir = *dir; + if (!subdir) { + if (!page || !(subdir = *page)) { + shmem_dir_unmap(dir); + return NULL; /* need a page */ + } + *dir = subdir; + *page = NULL; + } + shmem_dir_unmap(dir); + return shmem_swp_map(subdir) + offset; +} + +static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) +{ + long incdec = value? 1: -1; + + entry->val = value; + info->swapped += incdec; + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } +} + +/** + * shmem_swp_alloc - get the position of the swap entry for the page. + * @info: info structure for the inode + * @index: index of the page to find + * @sgp: check and recheck i_size? skip allocation? + * + * If the entry does not exist, allocate it. + */ +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) +{ + struct inode *inode = &info->vfs_inode; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct page *page = NULL; + swp_entry_t *entry; + + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return ERR_PTR(-EINVAL); + + while (!(entry = shmem_swp_entry(info, index, &page))) { + if (sgp == SGP_READ) + return shmem_swp_map(ZERO_PAGE(0)); + /* + * Test free_blocks against 1 not 0, since we have 1 data + * page (and perhaps indirect index pages) yet to allocate: + * a waste to allocate index if we cannot allocate data. + */ + if (sbinfo->max_blocks) { + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks <= 1) { + spin_unlock(&sbinfo->stat_lock); + return ERR_PTR(-ENOSPC); + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + } + + spin_unlock(&info->lock); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + if (page) + set_page_private(page, 0); + spin_lock(&info->lock); + + if (!page) { + shmem_free_blocks(inode, 1); + return ERR_PTR(-ENOMEM); + } + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + entry = ERR_PTR(-EINVAL); + break; + } + if (info->next_index <= index) + info->next_index = index + 1; + } + if (page) { + /* another task gave its page, or truncated the file */ + shmem_free_blocks(inode, 1); + shmem_dir_free(page); + } + if (info->next_index <= index && !IS_ERR(entry)) + info->next_index = index + 1; + return entry; +} + +/** + * shmem_free_swp - free some swap entries in a directory + * @dir: pointer to the directory + * @edir: pointer after last entry of the directory + * @punch_lock: pointer to spinlock when needed for the holepunch case + */ +static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, + spinlock_t *punch_lock) +{ + spinlock_t *punch_unlock = NULL; + swp_entry_t *ptr; + int freed = 0; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val) { + if (unlikely(punch_lock)) { + punch_unlock = punch_lock; + punch_lock = NULL; + spin_lock(punch_unlock); + if (!ptr->val) + continue; + } + free_swap_and_cache(*ptr); + *ptr = (swp_entry_t){0}; + freed++; + } + } + if (punch_unlock) + spin_unlock(punch_unlock); + return freed; +} + +static int shmem_map_and_free_swp(struct page *subdir, int offset, + int limit, struct page ***dir, spinlock_t *punch_lock) +{ + swp_entry_t *ptr; + int freed = 0; + + ptr = shmem_swp_map(subdir); + for (; offset < limit; offset += LATENCY_LIMIT) { + int size = limit - offset; + if (size > LATENCY_LIMIT) + size = LATENCY_LIMIT; + freed += shmem_free_swp(ptr+offset, ptr+offset+size, + punch_lock); + if (need_resched()) { + shmem_swp_unmap(ptr); + if (*dir) { + shmem_dir_unmap(*dir); + *dir = NULL; + } + cond_resched(); + ptr = shmem_swp_map(subdir); + } + } + shmem_swp_unmap(ptr); + return freed; +} + +static void shmem_free_pages(struct list_head *next) +{ + struct page *page; + int freed = 0; + + do { + page = container_of(next, struct page, lru); + next = next->next; + shmem_dir_free(page); + freed++; + if (freed >= LATENCY_LIMIT) { + cond_resched(); + freed = 0; + } + } while (next); +} + +static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + unsigned long size; + unsigned long limit; + unsigned long stage; + unsigned long diroff; + struct page **dir; + struct page *topdir; + struct page *middir; + struct page *subdir; + swp_entry_t *ptr; + LIST_HEAD(pages_to_free); + long nr_pages_to_free = 0; + long nr_swaps_freed = 0; + int offset; + int freed; + int punch_hole; + spinlock_t *needs_lock; + spinlock_t *punch_lock; + unsigned long upper_limit; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (idx >= info->next_index) + return; + + spin_lock(&info->lock); + info->flags |= SHMEM_TRUNCATE; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; + upper_limit = SHMEM_MAX_INDEX; + info->next_index = idx; + needs_lock = NULL; + punch_hole = 0; + } else { + if (end + 1 >= inode->i_size) { /* we may free a little more */ + limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + upper_limit = SHMEM_MAX_INDEX; + } else { + limit = (end + 1) >> PAGE_CACHE_SHIFT; + upper_limit = limit; + } + needs_lock = &info->lock; + punch_hole = 1; + } + + topdir = info->i_indirect; + if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { + info->i_indirect = NULL; + nr_pages_to_free++; + list_add(&topdir->lru, &pages_to_free); + } + spin_unlock(&info->lock); + + if (info->swapped && idx < SHMEM_NR_DIRECT) { + ptr = info->i_direct; + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; + nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); + } + + /* + * If there are no indirect blocks or we are punching a hole + * below indirect blocks, nothing to be done. + */ + if (!topdir || limit <= SHMEM_NR_DIRECT) + goto done2; + + /* + * The truncation case has already dropped info->lock, and we're safe + * because i_size and next_index have already been lowered, preventing + * access beyond. But in the punch_hole case, we still need to take + * the lock when updating the swap directory, because there might be + * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or + * shmem_writepage. However, whenever we find we can remove a whole + * directory page (not at the misaligned start or end of the range), + * we first NULLify its pointer in the level above, and then have no + * need to take the lock when updating its contents: needs_lock and + * punch_lock (either pointing to info->lock or NULL) manage this. + */ + + upper_limit -= SHMEM_NR_DIRECT; + limit -= SHMEM_NR_DIRECT; + idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; + offset = idx % ENTRIES_PER_PAGE; + idx -= offset; + + dir = shmem_dir_map(topdir); + stage = ENTRIES_PER_PAGEPAGE/2; + if (idx < ENTRIES_PER_PAGEPAGE/2) { + middir = topdir; + diroff = idx/ENTRIES_PER_PAGE; + } else { + dir += ENTRIES_PER_PAGE/2; + dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; + while (stage <= idx) + stage += ENTRIES_PER_PAGEPAGE; + middir = *dir; + if (*dir) { + diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % + ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; + if (!diroff && !offset && upper_limit >= stage) { + if (needs_lock) { + spin_lock(needs_lock); + *dir = NULL; + spin_unlock(needs_lock); + needs_lock = NULL; + } else + *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + } + shmem_dir_unmap(dir); + dir = shmem_dir_map(middir); + } else { + diroff = 0; + offset = 0; + idx = stage; + } + } + + for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { + if (unlikely(idx == stage)) { + shmem_dir_unmap(dir); + dir = shmem_dir_map(topdir) + + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; + while (!*dir) { + dir++; + idx += ENTRIES_PER_PAGEPAGE; + if (idx >= limit) + goto done1; + } + stage = idx + ENTRIES_PER_PAGEPAGE; + middir = *dir; + if (punch_hole) + needs_lock = &info->lock; + if (upper_limit >= stage) { + if (needs_lock) { + spin_lock(needs_lock); + *dir = NULL; + spin_unlock(needs_lock); + needs_lock = NULL; + } else + *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + } + shmem_dir_unmap(dir); + cond_resched(); + dir = shmem_dir_map(middir); + diroff = 0; + } + punch_lock = needs_lock; + subdir = dir[diroff]; + if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { + if (needs_lock) { + spin_lock(needs_lock); + dir[diroff] = NULL; + spin_unlock(needs_lock); + punch_lock = NULL; + } else + dir[diroff] = NULL; + nr_pages_to_free++; + list_add(&subdir->lru, &pages_to_free); + } + if (subdir && page_private(subdir) /* has swap entries */) { + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + freed = shmem_map_and_free_swp(subdir, + offset, size, &dir, punch_lock); + if (!dir) + dir = shmem_dir_map(middir); + nr_swaps_freed += freed; + if (offset || punch_lock) { + spin_lock(&info->lock); + set_page_private(subdir, + page_private(subdir) - freed); + spin_unlock(&info->lock); + } else + BUG_ON(page_private(subdir) != freed); + } + offset = 0; + } +done1: + shmem_dir_unmap(dir); +done2: + if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { + /* + * Call truncate_inode_pages again: racing shmem_unuse_inode + * may have swizzled a page in from swap since vmtruncate or + * generic_delete_inode did it, before we lowered next_index. + * Also, though shmem_getpage checks i_size before adding to + * cache, no recheck after: so fix the narrow window there too. + * + * Recalling truncate_inode_pages_range and unmap_mapping_range + * every time for punch_hole (which never got a chance to clear + * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, + * yet hardly ever necessary: try to optimize them out later. + */ + truncate_inode_pages_range(inode->i_mapping, start, end); + if (punch_hole) + unmap_mapping_range(inode->i_mapping, start, + end - start, 1); + } + + spin_lock(&info->lock); + info->flags &= ~SHMEM_TRUNCATE; + info->swapped -= nr_swaps_freed; + if (nr_pages_to_free) + shmem_free_blocks(inode, nr_pages_to_free); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + /* + * Empty swap vector directory pages to be freed? + */ + if (!list_empty(&pages_to_free)) { + pages_to_free.prev->next = NULL; + shmem_free_pages(pages_to_free.next); + } +} + +static void shmem_truncate(struct inode *inode) +{ + shmem_truncate_range(inode, inode->i_size, (loff_t)-1); +} + +static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + int error; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + if (attr->ia_size < inode->i_size) { + /* + * If truncating down to a partial page, then + * if that page is already allocated, hold it + * in memory until the truncation is over, so + * truncate_partial_page cannnot miss it were + * it assigned to swap. + */ + if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + (void) shmem_getpage(inode, + attr->ia_size>>PAGE_CACHE_SHIFT, + &page, SGP_READ, NULL); + if (page) + unlock_page(page); + } + /* + * Reset SHMEM_PAGEIN flag so that shmem_truncate can + * detect if any pages might have been added to cache + * after truncate_inode_pages. But we needn't bother + * if it's being fully truncated to zero-length: the + * nrpages check is efficient enough in that case. + */ + if (attr->ia_size) { + struct shmem_inode_info *info = SHMEM_I(inode); + spin_lock(&info->lock); + info->flags &= ~SHMEM_PAGEIN; + spin_unlock(&info->lock); + } + } + } + + error = inode_change_ok(inode, attr); + if (!error) + error = inode_setattr(inode, attr); +#ifdef CONFIG_TMPFS_POSIX_ACL + if (!error && (attr->ia_valid & ATTR_MODE)) + error = generic_acl_chmod(inode, &shmem_acl_ops); +#endif + if (page) + page_cache_release(page); + return error; +} + +static void shmem_delete_inode(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + + if (inode->i_op->truncate == shmem_truncate) { + truncate_inode_pages(inode->i_mapping, 0); + shmem_unacct_size(info->flags, inode->i_size); + inode->i_size = 0; + shmem_truncate(inode); + if (!list_empty(&info->swaplist)) { + mutex_lock(&shmem_swaplist_mutex); + list_del_init(&info->swaplist); + mutex_unlock(&shmem_swaplist_mutex); + } + } + BUG_ON(inode->i_blocks); + shmem_free_inode(inode->i_sb); + clear_inode(inode); +} + +static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) +{ + swp_entry_t *ptr; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val == entry.val) + return ptr - dir; + } + return -1; +} + +static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +{ + struct inode *inode; + unsigned long idx; + unsigned long size; + unsigned long limit; + unsigned long stage; + struct page **dir; + struct page *subdir; + swp_entry_t *ptr; + int offset; + int error; + + idx = 0; + ptr = info->i_direct; + spin_lock(&info->lock); + if (!info->swapped) { + list_del_init(&info->swaplist); + goto lost2; + } + limit = info->next_index; + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; + offset = shmem_find_swp(entry, ptr, ptr+size); + if (offset >= 0) + goto found; + if (!info->i_indirect) + goto lost2; + + dir = shmem_dir_map(info->i_indirect); + stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; + + for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { + if (unlikely(idx == stage)) { + shmem_dir_unmap(dir-1); + if (cond_resched_lock(&info->lock)) { + /* check it has not been truncated */ + if (limit > info->next_index) { + limit = info->next_index; + if (idx >= limit) + goto lost2; + } + } + dir = shmem_dir_map(info->i_indirect) + + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; + while (!*dir) { + dir++; + idx += ENTRIES_PER_PAGEPAGE; + if (idx >= limit) + goto lost1; + } + stage = idx + ENTRIES_PER_PAGEPAGE; + subdir = *dir; + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir); + } + subdir = *dir; + if (subdir && page_private(subdir)) { + ptr = shmem_swp_map(subdir); + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + offset = shmem_find_swp(entry, ptr, ptr+size); + shmem_swp_unmap(ptr); + if (offset >= 0) { + shmem_dir_unmap(dir); + goto found; + } + } + } +lost1: + shmem_dir_unmap(dir-1); +lost2: + spin_unlock(&info->lock); + return 0; +found: + idx += offset; + inode = igrab(&info->vfs_inode); + spin_unlock(&info->lock); + + /* + * Move _head_ to start search for next from here. + * But be careful: shmem_delete_inode checks list_empty without taking + * mutex, and there's an instant in list_move_tail when info->swaplist + * would appear empty, if it were the only one on shmem_swaplist. We + * could avoid doing it if inode NULL; or use this minor optimization. + */ + if (shmem_swaplist.next != &info->swaplist) + list_move_tail(&shmem_swaplist, &info->swaplist); + mutex_unlock(&shmem_swaplist_mutex); + + error = 1; + if (!inode) + goto out; + /* Precharge page using GFP_KERNEL while we can wait */ + error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + if (error) + goto out; + error = radix_tree_preload(GFP_KERNEL); + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto out; + } + error = 1; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, idx, NULL); + if (ptr && ptr->val == entry.val) { + error = add_to_page_cache_locked(page, inode->i_mapping, + idx, GFP_NOWAIT); + /* does mem_cgroup_uncharge_cache_page on error */ + } else /* we must compensate for our precharge above */ + mem_cgroup_uncharge_cache_page(page); + + if (error == -EEXIST) { + struct page *filepage = find_get_page(inode->i_mapping, idx); + error = 1; + if (filepage) { + /* + * There might be a more uptodate page coming down + * from a stacked writepage: forget our swappage if so. + */ + if (PageUptodate(filepage)) + error = 0; + page_cache_release(filepage); + } + } + if (!error) { + delete_from_swap_cache(page); + set_page_dirty(page); + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, ptr, 0); + swap_free(entry); + error = 1; /* not an error, but entry was found */ + } + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); + radix_tree_preload_end(); +out: + unlock_page(page); + page_cache_release(page); + iput(inode); /* allows for NULL */ + return error; +} + +/* + * shmem_unuse() search for an eventually swapped out shmem page. + */ +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + struct list_head *p, *next; + struct shmem_inode_info *info; + int found = 0; + + mutex_lock(&shmem_swaplist_mutex); + list_for_each_safe(p, next, &shmem_swaplist) { + info = list_entry(p, struct shmem_inode_info, swaplist); + found = shmem_unuse_inode(info, entry, page); + cond_resched(); + if (found) + goto out; + } + mutex_unlock(&shmem_swaplist_mutex); +out: return found; /* 0 or 1 or -ENOMEM */ +} + +/* + * Move the page from the page cache to the swap cache. + */ +static int shmem_writepage(struct page *page, struct writeback_control *wbc) +{ + struct shmem_inode_info *info; + swp_entry_t *entry, swap; + struct address_space *mapping; + unsigned long index; + struct inode *inode; + + BUG_ON(!PageLocked(page)); + mapping = page->mapping; + index = page->index; + inode = mapping->host; + info = SHMEM_I(inode); + if (info->flags & VM_LOCKED) + goto redirty; + if (!total_swap_pages) + goto redirty; + + /* + * shmem_backing_dev_info's capabilities prevent regular writeback or + * sync from ever calling shmem_writepage; but a stacking filesystem + * may use the ->writepage of its underlying filesystem, in which case + * tmpfs should write out to swap only in response to memory pressure, + * and not for pdflush or sync. However, in those cases, we do still + * want to check if there's a redundant swappage to be discarded. + */ + if (wbc->for_reclaim) + swap = get_swap_page(); + else + swap.val = 0; + + spin_lock(&info->lock); + if (index >= info->next_index) { + BUG_ON(!(info->flags & SHMEM_TRUNCATE)); + goto unlock; + } + entry = shmem_swp_entry(info, index, NULL); + if (entry->val) { + /* + * The more uptodate page coming down from a stacked + * writepage should replace our old swappage. + */ + free_swap_and_cache(*entry); + shmem_swp_set(info, entry, 0); + } + shmem_recalc_inode(inode); + + if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + remove_from_page_cache(page); + shmem_swp_set(info, entry, swap.val); + shmem_swp_unmap(entry); + if (list_empty(&info->swaplist)) + inode = igrab(inode); + else + inode = NULL; + spin_unlock(&info->lock); + swap_duplicate(swap); + BUG_ON(page_mapped(page)); + page_cache_release(page); /* pagecache ref */ + set_page_dirty(page); + unlock_page(page); + if (inode) { + mutex_lock(&shmem_swaplist_mutex); + /* move instead of add in case we're racing */ + list_move_tail(&info->swaplist, &shmem_swaplist); + mutex_unlock(&shmem_swaplist_mutex); + iput(inode); + } + return 0; + } + + shmem_swp_unmap(entry); +unlock: + spin_unlock(&info->lock); + swap_free(swap); +redirty: + set_page_dirty(page); + if (wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ + unlock_page(page); + return 0; +} + +#ifdef CONFIG_NUMA +#ifdef CONFIG_TMPFS +static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ + char buffer[64]; + + if (!mpol || mpol->mode == MPOL_DEFAULT) + return; /* show nothing */ + + mpol_to_str(buffer, sizeof(buffer), mpol, 1); + + seq_printf(seq, ",mpol=%s", buffer); +} + +static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + struct mempolicy *mpol = NULL; + if (sbinfo->mpol) { + spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + mpol = sbinfo->mpol; + mpol_get(mpol); + spin_unlock(&sbinfo->stat_lock); + } + return mpol; +} +#endif /* CONFIG_TMPFS */ + +static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) +{ + struct mempolicy mpol, *spol; + struct vm_area_struct pvma; + struct page *page; + + spol = mpol_cond_copy(&mpol, + mpol_shared_policy_lookup(&info->policy, idx)); + + /* Create a pseudo vma that just contains the policy */ + pvma.vm_start = 0; + pvma.vm_pgoff = idx; + pvma.vm_ops = NULL; + pvma.vm_policy = spol; + page = swapin_readahead(entry, gfp, &pvma, 0); + return page; +} + +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) +{ + struct vm_area_struct pvma; + + /* Create a pseudo vma that just contains the policy */ + pvma.vm_start = 0; + pvma.vm_pgoff = idx; + pvma.vm_ops = NULL; + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + + /* + * alloc_page_vma() will drop the shared policy reference + */ + return alloc_page_vma(gfp, &pvma, 0); +} +#else /* !CONFIG_NUMA */ +#ifdef CONFIG_TMPFS +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) +{ +} +#endif /* CONFIG_TMPFS */ + +static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) +{ + return swapin_readahead(entry, gfp, NULL, 0); +} + +static inline struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) +{ + return alloc_page(gfp); +} +#endif /* CONFIG_NUMA */ + +#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif + +/* + * shmem_getpage - either get the page from swap or allocate a new one + * + * If we allocate a new one we do not mark it dirty. That's up to the + * vm. If we swap it in we mark it dirty since we also free the swap + * entry since a page cannot live in both the swap and page cache + */ +static int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo; + struct page *filepage = *pagep; + struct page *swappage; + swp_entry_t *entry; + swp_entry_t swap; + gfp_t gfp; + int error; + + if (idx >= SHMEM_MAX_INDEX) + return -EFBIG; + + if (type) + *type = 0; + + /* + * Normally, filepage is NULL on entry, and either found + * uptodate immediately, or allocated and zeroed, or read + * in under swappage, which is then assigned to filepage. + * But shmem_readpage (required for splice) passes in a locked + * filepage, which may be found not uptodate by other callers + * too, and may need to be copied from the swappage read in. + */ +repeat: + if (!filepage) + filepage = find_lock_page(mapping, idx); + if (filepage && PageUptodate(filepage)) + goto done; + error = 0; + gfp = mapping_gfp_mask(mapping); + if (!filepage) { + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu. + */ + error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (error) + goto failed; + radix_tree_preload_end(); + } + + spin_lock(&info->lock); + shmem_recalc_inode(inode); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); + error = PTR_ERR(entry); + goto failed; + } + swap = *entry; + + if (swap.val) { + /* Look it up and read it in.. */ + swappage = lookup_swap_cache(swap); + if (!swappage) { + shmem_swp_unmap(entry); + /* here we actually do the io */ + if (type && !(*type & VM_FAULT_MAJOR)) { + __count_vm_event(PGMAJFAULT); + *type |= VM_FAULT_MAJOR; + } + spin_unlock(&info->lock); + swappage = shmem_swapin(swap, gfp, info, idx); + if (!swappage) { + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + if (entry->val == swap.val) + error = -ENOMEM; + shmem_swp_unmap(entry); + } + spin_unlock(&info->lock); + if (error) + goto failed; + goto repeat; + } + wait_on_page_locked(swappage); + page_cache_release(swappage); + goto repeat; + } + + /* We have to do this with page locked to prevent races */ + if (!trylock_page(swappage)) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + wait_on_page_locked(swappage); + page_cache_release(swappage); + goto repeat; + } + if (PageWriteback(swappage)) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + wait_on_page_writeback(swappage); + unlock_page(swappage); + page_cache_release(swappage); + goto repeat; + } + if (!PageUptodate(swappage)) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + error = -EIO; + goto failed; + } + + if (filepage) { + shmem_swp_set(info, entry, 0); + shmem_swp_unmap(entry); + delete_from_swap_cache(swappage); + spin_unlock(&info->lock); + copy_highpage(filepage, swappage); + unlock_page(swappage); + page_cache_release(swappage); + flush_dcache_page(filepage); + SetPageUptodate(filepage); + set_page_dirty(filepage); + swap_free(swap); + } else if (!(error = add_to_page_cache_locked(swappage, mapping, + idx, GFP_NOWAIT))) { + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, entry, 0); + shmem_swp_unmap(entry); + delete_from_swap_cache(swappage); + spin_unlock(&info->lock); + filepage = swappage; + set_page_dirty(filepage); + swap_free(swap); + } else { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + if (error == -ENOMEM) { + /* allow reclaim from this memory cgroup */ + error = mem_cgroup_shrink_usage(current->mm, + gfp); + if (error) + goto failed; + } + goto repeat; + } + } else if (sgp == SGP_READ && !filepage) { + shmem_swp_unmap(entry); + filepage = find_get_page(mapping, idx); + if (filepage && + (!PageUptodate(filepage) || !trylock_page(filepage))) { + spin_unlock(&info->lock); + wait_on_page_locked(filepage); + page_cache_release(filepage); + filepage = NULL; + goto repeat; + } + spin_unlock(&info->lock); + } else { + shmem_swp_unmap(entry); + sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo->max_blocks) { + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks == 0 || + shmem_acct_block(info->flags)) { + spin_unlock(&sbinfo->stat_lock); + spin_unlock(&info->lock); + error = -ENOSPC; + goto failed; + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + } else if (shmem_acct_block(info->flags)) { + spin_unlock(&info->lock); + error = -ENOSPC; + goto failed; + } + + if (!filepage) { + int ret; + + spin_unlock(&info->lock); + filepage = shmem_alloc_page(gfp, info, idx); + if (!filepage) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + error = -ENOMEM; + goto failed; + } + SetPageSwapBacked(filepage); + + /* Precharge page while we can wait, compensate after */ + error = mem_cgroup_cache_charge(filepage, current->mm, + gfp & ~__GFP_HIGHMEM); + if (error) { + page_cache_release(filepage); + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + filepage = NULL; + goto failed; + } + + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + swap = *entry; + shmem_swp_unmap(entry); + } + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(filepage); + else + ret = add_to_page_cache_lru(filepage, mapping, + idx, GFP_NOWAIT); + /* + * At add_to_page_cache_lru() failure, uncharge will + * be done automatically. + */ + if (ret) { + spin_unlock(&info->lock); + page_cache_release(filepage); + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + filepage = NULL; + if (error) + goto failed; + goto repeat; + } + info->flags |= SHMEM_PAGEIN; + } + + info->alloced++; + spin_unlock(&info->lock); + clear_highpage(filepage); + flush_dcache_page(filepage); + SetPageUptodate(filepage); + if (sgp == SGP_DIRTY) + set_page_dirty(filepage); + } +done: + *pagep = filepage; + return 0; + +failed: + if (*pagep != filepage) { + unlock_page(filepage); + page_cache_release(filepage); + } + return error; +} + +static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + int error; + int ret; + + if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); + if (error) + return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + + mark_page_accessed(vmf->page); + return ret | VM_FAULT_LOCKED; +} + +#ifdef CONFIG_NUMA +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +{ + struct inode *i = vma->vm_file->f_path.dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); +} + +static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *i = vma->vm_file->f_path.dentry->d_inode; + unsigned long idx; + + idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); +} +#endif + +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + int retval = -ENOMEM; + + spin_lock(&info->lock); + if (lock && !(info->flags & VM_LOCKED)) { + if (!user_shm_lock(inode->i_size, user)) + goto out_nomem; + info->flags |= VM_LOCKED; + mapping_set_unevictable(file->f_mapping); + } + if (!lock && (info->flags & VM_LOCKED) && user) { + user_shm_unlock(inode->i_size, user); + info->flags &= ~VM_LOCKED; + mapping_clear_unevictable(file->f_mapping); + scan_mapping_unevictable_pages(file->f_mapping); + } + retval = 0; + +out_nomem: + spin_unlock(&info->lock); + return retval; +} + +static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +static struct inode * +shmem_get_inode(struct super_block *sb, int mode, dev_t dev) +{ + struct inode *inode; + struct shmem_inode_info *info; + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (shmem_reserve_inode(sb)) + return NULL; + + inode = new_inode(sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blocks = 0; + inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_generation = get_seconds(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + INIT_LIST_HEAD(&info->swaplist); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + } else + shmem_free_inode(sb); + return inode; +} + +#ifdef CONFIG_TMPFS +static const struct inode_operations shmem_symlink_inode_operations; +static const struct inode_operations shmem_symlink_inline_operations; + +/* + * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; + * but providing them allows a tmpfs file to be used for splice, sendfile, and + * below the loop driver, in the generic fashion that many filesystems support. + */ +static int shmem_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); + unlock_page(page); + return error; +} + +static int +shmem_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + *pagep = NULL; + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); +} + +static int +shmem_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + + unlock_page(page); + set_page_dirty(page); + page_cache_release(page); + + return copied; +} + +static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long index, offset; + enum sgp_type sgp = SGP_READ; + + /* + * Might this read be for a stacking filesystem? Then when reading + * holes of a sparse file, we actually need to allocate those pages, + * and even mark them dirty, so it cannot exceed the max_blocks limit. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + sgp = SGP_DIRTY; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page = NULL; + unsigned long end_index, nr, ret; + loff_t i_size = i_size_read(inode); + + end_index = i_size >> PAGE_CACHE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + desc->error = shmem_getpage(inode, index, &page, sgp, NULL); + if (desc->error) { + if (desc->error == -EINVAL) + desc->error = 0; + break; + } + if (page) + unlock_page(page); + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_mutex protection against truncate + */ + nr = PAGE_CACHE_SIZE; + i_size = i_size_read(inode); + end_index = i_size >> PAGE_CACHE_SHIFT; + if (index == end_index) { + nr = i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) { + if (page) + page_cache_release(page); + break; + } + } + nr -= offset; + + if (page) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the beginning. + */ + if (!offset) + mark_page_accessed(page); + } else { + page = ZERO_PAGE(0); + page_cache_get(page); + } + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret != nr || !desc->count) + break; + + cond_resched(); + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + file_accessed(filp); +} + +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_shmem_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } + return retval; +} + +static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); + + buf->f_type = TMPFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_blocks) { + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + } + if (sbinfo->max_inodes) { + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } + /* else leave those fields 0 like simple_statfs */ + spin_unlock(&sbinfo->stat_lock); + return 0; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +static int +shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); + int error = -ENOSPC; + + if (inode) { + error = security_inode_init_security(inode, dir, NULL, NULL, + NULL); + if (error) { + if (error != -EOPNOTSUPP) { + iput(inode); + return error; + } + } + error = shmem_acl_init(inode, dir); + if (error) { + iput(inode); + return error; + } + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + } + return error; +} + +static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int error; + + if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) + return error; + inc_nlink(dir); + return 0; +} + +static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return shmem_mknod(dir, dentry, mode | S_IFREG, 0); +} + +/* + * Link a file.. + */ +static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int ret; + + /* + * No ordinary (disk based) filesystem counts links as inodes; + * but each new link needs a new dentry, pinning lowmem, and + * tmpfs dentries cannot be pruned until they are unlinked. + */ + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; + + dir->i_size += BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inc_nlink(inode); + atomic_inc(&inode->i_count); /* New dentry reference */ + dget(dentry); /* Extra pinning count for the created dentry */ + d_instantiate(dentry, inode); +out: + return ret; +} + +static int shmem_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) + shmem_free_inode(inode->i_sb); + + dir->i_size -= BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + drop_nlink(inode); + dput(dentry); /* Undo the count from "create" - this does all the work */ + return 0; +} + +static int shmem_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + drop_nlink(dentry->d_inode); + drop_nlink(dir); + return shmem_unlink(dir, dentry); +} + +/* + * The VFS layer already does all the dentry stuff for rename, + * we just have to decrement the usage count for the target if + * it exists so that the VFS layer correctly free's it when it + * gets overwritten. + */ +static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int they_are_dirs = S_ISDIR(inode->i_mode); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (new_dentry->d_inode) { + (void) shmem_unlink(new_dir, new_dentry); + if (they_are_dirs) + drop_nlink(old_dir); + } else if (they_are_dirs) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } + + old_dir->i_size -= BOGO_DIRENT_SIZE; + new_dir->i_size += BOGO_DIRENT_SIZE; + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + inode->i_ctime = CURRENT_TIME; + return 0; +} + +static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int error; + int len; + struct inode *inode; + struct page *page = NULL; + char *kaddr; + struct shmem_inode_info *info; + + len = strlen(symname) + 1; + if (len > PAGE_CACHE_SIZE) + return -ENAMETOOLONG; + + inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + if (!inode) + return -ENOSPC; + + error = security_inode_init_security(inode, dir, NULL, NULL, + NULL); + if (error) { + if (error != -EOPNOTSUPP) { + iput(inode); + return error; + } + error = 0; + } + + info = SHMEM_I(inode); + inode->i_size = len-1; + if (len <= (char *)inode - (char *)info) { + /* do it inline */ + memcpy(info, symname, len); + inode->i_op = &shmem_symlink_inline_operations; + } else { + error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); + if (error) { + iput(inode); + return error; + } + unlock_page(page); + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_symlink_inode_operations; + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, len); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + page_cache_release(page); + } + if (dir->i_mode & S_ISGID) + inode->i_gid = dir->i_gid; + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); + return 0; +} + +static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); + return NULL; +} + +static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); + nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + if (page) + unlock_page(page); + return page; +} + +static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + if (!IS_ERR(nd_get_link(nd))) { + struct page *page = cookie; + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); + } +} + +static const struct inode_operations shmem_symlink_inline_operations = { + .readlink = generic_readlink, + .follow_link = shmem_follow_link_inline, +}; + +static const struct inode_operations shmem_symlink_inode_operations = { + .truncate = shmem_truncate, + .readlink = generic_readlink, + .follow_link = shmem_follow_link, + .put_link = shmem_put_link, +}; + +#ifdef CONFIG_TMPFS_POSIX_ACL +/* + * Superblocks without xattr inode operations will get security.* xattr + * support from the VFS "for free". As soon as we have any other xattrs + * like ACLs, we also need to implement the security.* handlers at + * filesystem level, though. + */ + +static size_t shmem_xattr_security_list(struct inode *inode, char *list, + size_t list_len, const char *name, + size_t name_len) +{ + return security_inode_listsecurity(inode, list, list_len); +} + +static int shmem_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return xattr_getsecurity(inode, name, buffer, size); +} + +static int shmem_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_setsecurity(inode, name, value, size, flags); +} + +static struct xattr_handler shmem_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = shmem_xattr_security_list, + .get = shmem_xattr_security_get, + .set = shmem_xattr_security_set, +}; + +static struct xattr_handler *shmem_xattr_handlers[] = { + &shmem_xattr_acl_access_handler, + &shmem_xattr_acl_default_handler, + &shmem_xattr_security_handler, + NULL +}; +#endif + +static struct dentry *shmem_get_parent(struct dentry *child) +{ + return ERR_PTR(-ESTALE); +} + +static int shmem_match(struct inode *ino, void *vfh) +{ + __u32 *fh = vfh; + __u64 inum = fh[2]; + inum = (inum << 32) | fh[1]; + return ino->i_ino == inum && fh[0] == ino->i_generation; +} + +static struct dentry *shmem_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct inode *inode; + struct dentry *dentry = NULL; + u64 inum = fid->raw[2]; + inum = (inum << 32) | fid->raw[1]; + + if (fh_len < 3) + return NULL; + + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), + shmem_match, fid->raw); + if (inode) { + dentry = d_find_alias(inode); + iput(inode); + } + + return dentry; +} + +static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + + if (*len < 3) + return 255; + + if (hlist_unhashed(&inode->i_hash)) { + /* Unfortunately insert_inode_hash is not idempotent, + * so as we hash inodes here rather than at creation + * time, we need a lock to ensure we only try + * to do it once + */ + static DEFINE_SPINLOCK(lock); + spin_lock(&lock); + if (hlist_unhashed(&inode->i_hash)) + __insert_inode_hash(inode, + inode->i_ino + inode->i_generation); + spin_unlock(&lock); + } + + fh[0] = inode->i_generation; + fh[1] = inode->i_ino; + fh[2] = ((__u64)inode->i_ino) >> 32; + + *len = 3; + return 1; +} + +static const struct export_operations shmem_export_ops = { + .get_parent = shmem_get_parent, + .encode_fh = shmem_encode_fh, + .fh_to_dentry = shmem_fh_to_dentry, +}; + +static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, + bool remount) +{ + char *this_char, *value, *rest; + + while (options != NULL) { + this_char = options; + for (;;) { + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + options = strchr(options, ','); + if (options == NULL) + break; + options++; + if (!isdigit(*options)) { + options[-1] = '\0'; + break; + } + } + if (!*this_char) + continue; + if ((value = strchr(this_char,'=')) != NULL) { + *value++ = 0; + } else { + printk(KERN_ERR + "tmpfs: No value for mount option '%s'\n", + this_char); + return 1; + } + + if (!strcmp(this_char,"size")) { + unsigned long long size; + size = memparse(value,&rest); + if (*rest == '%') { + size <<= PAGE_SHIFT; + size *= totalram_pages; + do_div(size, 100); + rest++; + } + if (*rest) + goto bad_val; + sbinfo->max_blocks = + DIV_ROUND_UP(size, PAGE_CACHE_SIZE); + } else if (!strcmp(this_char,"nr_blocks")) { + sbinfo->max_blocks = memparse(value, &rest); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"nr_inodes")) { + sbinfo->max_inodes = memparse(value, &rest); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"mode")) { + if (remount) + continue; + sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"uid")) { + if (remount) + continue; + sbinfo->uid = simple_strtoul(value, &rest, 0); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"gid")) { + if (remount) + continue; + sbinfo->gid = simple_strtoul(value, &rest, 0); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"mpol")) { + if (mpol_parse_str(value, &sbinfo->mpol, 1)) + goto bad_val; + } else { + printk(KERN_ERR "tmpfs: Bad mount option %s\n", + this_char); + return 1; + } + } + return 0; + +bad_val: + printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + value, this_char); + return 1; + +} + +static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + struct shmem_sb_info config = *sbinfo; + unsigned long blocks; + unsigned long inodes; + int error = -EINVAL; + + if (shmem_parse_options(data, &config, true)) + return error; + + spin_lock(&sbinfo->stat_lock); + blocks = sbinfo->max_blocks - sbinfo->free_blocks; + inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if (config.max_blocks < blocks) + goto out; + if (config.max_inodes < inodes) + goto out; + /* + * Those tests also disallow limited->unlimited while any are in + * use, so i_blocks will always be zero when max_blocks is zero; + * but we must separately disallow unlimited->limited, because + * in that case we have no record of how much is already in use. + */ + if (config.max_blocks && !sbinfo->max_blocks) + goto out; + if (config.max_inodes && !sbinfo->max_inodes) + goto out; + + error = 0; + sbinfo->max_blocks = config.max_blocks; + sbinfo->free_blocks = config.max_blocks - blocks; + sbinfo->max_inodes = config.max_inodes; + sbinfo->free_inodes = config.max_inodes - inodes; + + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ +out: + spin_unlock(&sbinfo->stat_lock); + return error; +} + +static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb); + + if (sbinfo->max_blocks != shmem_default_max_blocks()) + seq_printf(seq, ",size=%luk", + sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); + if (sbinfo->max_inodes != shmem_default_max_inodes()) + seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); + if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) + seq_printf(seq, ",mode=%03o", sbinfo->mode); + if (sbinfo->uid != 0) + seq_printf(seq, ",uid=%u", sbinfo->uid); + if (sbinfo->gid != 0) + seq_printf(seq, ",gid=%u", sbinfo->gid); + shmem_show_mpol(seq, sbinfo->mpol); + return 0; +} +#endif /* CONFIG_TMPFS */ + +static void shmem_put_super(struct super_block *sb) +{ + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int shmem_fill_super(struct super_block *sb, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + struct shmem_sb_info *sbinfo; + int err = -ENOMEM; + + /* Round up to L1_CACHE_BYTES to resist false sharing */ + sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), + L1_CACHE_BYTES), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + sbinfo->max_blocks = 0; + sbinfo->max_inodes = 0; + sbinfo->mode = S_IRWXUGO | S_ISVTX; + sbinfo->uid = current->fsuid; + sbinfo->gid = current->fsgid; + sbinfo->mpol = NULL; + sb->s_fs_info = sbinfo; + +#ifdef CONFIG_TMPFS + /* + * Per default we only allow half of the physical ram per + * tmpfs instance, limiting inodes to one per page of lowmem; + * but the internal instance is left unlimited. + */ + if (!(sb->s_flags & MS_NOUSER)) { + sbinfo->max_blocks = shmem_default_max_blocks(); + sbinfo->max_inodes = shmem_default_max_inodes(); + if (shmem_parse_options(data, sbinfo, false)) { + err = -EINVAL; + goto failed; + } + } + sb->s_export_op = &shmem_export_ops; +#else + sb->s_flags |= MS_NOUSER; +#endif + + spin_lock_init(&sbinfo->stat_lock); + sbinfo->free_blocks = sbinfo->max_blocks; + sbinfo->free_inodes = sbinfo->max_inodes; + + sb->s_maxbytes = SHMEM_MAX_BYTES; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = TMPFS_MAGIC; + sb->s_op = &shmem_ops; + sb->s_time_gran = 1; +#ifdef CONFIG_TMPFS_POSIX_ACL + sb->s_xattr = shmem_xattr_handlers; + sb->s_flags |= MS_POSIXACL; +#endif + + inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0); + if (!inode) + goto failed; + inode->i_uid = sbinfo->uid; + inode->i_gid = sbinfo->gid; + root = d_alloc_root(inode); + if (!root) + goto failed_iput; + sb->s_root = root; + return 0; + +failed_iput: + iput(inode); +failed: + shmem_put_super(sb); + return err; +} + +static struct kmem_cache *shmem_inode_cachep; + +static struct inode *shmem_alloc_inode(struct super_block *sb) +{ + struct shmem_inode_info *p; + p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + if (!p) + return NULL; + return &p->vfs_inode; +} + +static void shmem_destroy_inode(struct inode *inode) +{ + if ((inode->i_mode & S_IFMT) == S_IFREG) { + /* only struct inode is valid if it's an inline symlink */ + mpol_free_shared_policy(&SHMEM_I(inode)->policy); + } + shmem_acl_destroy_inode(inode); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + +static void init_once(void *foo) +{ + struct shmem_inode_info *p = (struct shmem_inode_info *) foo; + + inode_init_once(&p->vfs_inode); +#ifdef CONFIG_TMPFS_POSIX_ACL + p->i_acl = NULL; + p->i_default_acl = NULL; +#endif +} + +static int init_inodecache(void) +{ + shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", + sizeof(struct shmem_inode_info), + 0, SLAB_PANIC, init_once); + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(shmem_inode_cachep); +} + +static const struct address_space_operations shmem_aops = { + .writepage = shmem_writepage, + .set_page_dirty = __set_page_dirty_no_writeback, +#ifdef CONFIG_TMPFS + .readpage = shmem_readpage, + .write_begin = shmem_write_begin, + .write_end = shmem_write_end, +#endif + .migratepage = migrate_page, +}; + +static const struct file_operations shmem_file_operations = { + .mmap = shmem_mmap, +#ifdef CONFIG_TMPFS + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = shmem_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = simple_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +#endif +}; + +static const struct inode_operations shmem_inode_operations = { + .truncate = shmem_truncate, + .setattr = shmem_notify_change, + .truncate_range = shmem_truncate_range, +#ifdef CONFIG_TMPFS_POSIX_ACL + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif + +}; + +static const struct inode_operations shmem_dir_inode_operations = { +#ifdef CONFIG_TMPFS + .create = shmem_create, + .lookup = simple_lookup, + .link = shmem_link, + .unlink = shmem_unlink, + .symlink = shmem_symlink, + .mkdir = shmem_mkdir, + .rmdir = shmem_rmdir, + .mknod = shmem_mknod, + .rename = shmem_rename, +#endif +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_notify_change, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif +}; + +static const struct inode_operations shmem_special_inode_operations = { +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_notify_change, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif +}; + +static const struct super_operations shmem_ops = { + .alloc_inode = shmem_alloc_inode, + .destroy_inode = shmem_destroy_inode, +#ifdef CONFIG_TMPFS + .statfs = shmem_statfs, + .remount_fs = shmem_remount_fs, + .show_options = shmem_show_options, +#endif + .delete_inode = shmem_delete_inode, + .drop_inode = generic_delete_inode, + .put_super = shmem_put_super, +}; + +static struct vm_operations_struct shmem_vm_ops = { + .fault = shmem_fault, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + + +static int shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); +} + +static struct file_system_type tmpfs_fs_type = { + .owner = THIS_MODULE, + .name = "tmpfs", + .get_sb = shmem_get_sb, + .kill_sb = kill_litter_super, +}; +static struct vfsmount *shm_mnt; + +static int __init init_tmpfs(void) +{ + int error; + + error = bdi_init(&shmem_backing_dev_info); + if (error) + goto out4; + + error = init_inodecache(); + if (error) + goto out3; + + error = register_filesystem(&tmpfs_fs_type); + if (error) { + printk(KERN_ERR "Could not register tmpfs\n"); + goto out2; + } + + shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, + tmpfs_fs_type.name, NULL); + if (IS_ERR(shm_mnt)) { + error = PTR_ERR(shm_mnt); + printk(KERN_ERR "Could not kern_mount tmpfs\n"); + goto out1; + } + return 0; + +out1: + unregister_filesystem(&tmpfs_fs_type); +out2: + destroy_inodecache(); +out3: + bdi_destroy(&shmem_backing_dev_info); +out4: + shm_mnt = ERR_PTR(error); + return error; +} +module_init(init_tmpfs) + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: vm_flags + */ +struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +{ + int error; + struct file *file; + struct inode *inode; + struct dentry *dentry, *root; + struct qstr this; + + if (IS_ERR(shm_mnt)) + return (void *)shm_mnt; + + if (size < 0 || size > SHMEM_MAX_BYTES) + return ERR_PTR(-EINVAL); + + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + this.name = name; + this.len = strlen(name); + this.hash = 0; /* will go */ + root = shm_mnt->mnt_root; + dentry = d_alloc(root, &this); + if (!dentry) + goto put_memory; + + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + + error = -ENOSPC; + inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto close_file; + + SHMEM_I(inode)->flags = flags & VM_ACCOUNT; + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &shmem_file_operations); + return file; + +close_file: + put_filp(file); +put_dentry: + dput(dentry); +put_memory: + shmem_unacct_size(flags, size); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(shmem_file_setup); + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + + file = shmem_file_setup("dev/zero", size, vma->vm_flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + return 0; +} diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c new file mode 100644 index 0000000..8e5aadd --- /dev/null +++ b/mm/shmem_acl.c @@ -0,0 +1,197 @@ +/* + * mm/shmem_acl.c + * + * (C) 2005 Andreas Gruenbacher <agruen@suse.de> + * + * This file is released under the GPL. + */ + +#include <linux/fs.h> +#include <linux/shmem_fs.h> +#include <linux/xattr.h> +#include <linux/generic_acl.h> + +/** + * shmem_get_acl - generic_acl_operations->getacl() operation + */ +static struct posix_acl * +shmem_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl = NULL; + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + break; + + case ACL_TYPE_DEFAULT: + acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + break; + } + spin_unlock(&inode->i_lock); + + return acl; +} + +/** + * shmem_set_acl - generic_acl_operations->setacl() operation + */ +static void +shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *free = NULL; + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + free = SHMEM_I(inode)->i_acl; + SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + break; + + case ACL_TYPE_DEFAULT: + free = SHMEM_I(inode)->i_default_acl; + SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + break; + } + spin_unlock(&inode->i_lock); + posix_acl_release(free); +} + +struct generic_acl_operations shmem_acl_ops = { + .getacl = shmem_get_acl, + .setacl = shmem_set_acl, +}; + +/** + * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, + * shmem_xattr_acl_access_handler - plumbing code to implement the + * system.posix_acl_access xattr using the generic acl functions. + */ + +static size_t +shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, + list, list_size); +} + +static int +shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, + size); +} + +static int +shmem_set_acl_access(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, + size); +} + +struct xattr_handler shmem_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = shmem_list_acl_access, + .get = shmem_get_acl_access, + .set = shmem_set_acl_access, +}; + +/** + * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, + * shmem_xattr_acl_default_handler - plumbing code to implement the + * system.posix_acl_default xattr using the generic acl functions. + */ + +static size_t +shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, + list, list_size); +} + +static int +shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, + size); +} + +static int +shmem_set_acl_default(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, + size); +} + +struct xattr_handler shmem_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = shmem_list_acl_default, + .get = shmem_get_acl_default, + .set = shmem_set_acl_default, +}; + +/** + * shmem_acl_init - Inizialize the acl(s) of a new inode + */ +int +shmem_acl_init(struct inode *inode, struct inode *dir) +{ + return generic_acl_init(inode, dir, &shmem_acl_ops); +} + +/** + * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode + * + * This is done before destroying the actual inode. + */ + +void +shmem_acl_destroy_inode(struct inode *inode) +{ + if (SHMEM_I(inode)->i_acl) + posix_acl_release(SHMEM_I(inode)->i_acl); + SHMEM_I(inode)->i_acl = NULL; + if (SHMEM_I(inode)->i_default_acl) + posix_acl_release(SHMEM_I(inode)->i_default_acl); + SHMEM_I(inode)->i_default_acl = NULL; +} + +/** + * shmem_check_acl - check_acl() callback for generic_permission() + */ +static int +shmem_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +/** + * shmem_permission - permission() inode operation + */ +int +shmem_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, shmem_check_acl); +} diff --git a/mm/slab.c b/mm/slab.c new file mode 100644 index 0000000..0918751 --- /dev/null +++ b/mm/slab.c @@ -0,0 +1,4522 @@ +/* + * linux/mm/slab.c + * Written by Mark Hemment, 1996/97. + * (markhe@nextd.demon.co.uk) + * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * + * Major cleanup, different bufctl logic, per-cpu arrays + * (c) 2000 Manfred Spraul + * + * Cleanup, make the head arrays unconditional, preparation for NUMA + * (c) 2002 Manfred Spraul + * + * An implementation of the Slab Allocator as described in outline in; + * UNIX Internals: The New Frontiers by Uresh Vahalia + * Pub: Prentice Hall ISBN 0-13-101908-2 + * or with a little more detail in; + * The Slab Allocator: An Object-Caching Kernel Memory Allocator + * Jeff Bonwick (Sun Microsystems). + * Presented at: USENIX Summer 1994 Technical Conference + * + * The memory is organized in caches, one cache for each object type. + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) + * Each cache consists out of many slabs (they are small (usually one + * page long) and always contiguous), and each slab contains multiple + * initialized objects. + * + * This means, that your constructor is used only for newly allocated + * slabs and you must pass objects with the same initializations to + * kmem_cache_free. + * + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, + * normal). If you need a special memory type, then must create a new + * cache for that memory type. + * + * In order to reduce fragmentation, the slabs are sorted in 3 groups: + * full slabs with 0 free objects + * partial slabs + * empty slabs with no allocated objects + * + * If partial slabs exist, then new allocations come from these slabs, + * otherwise from empty slabs or new slabs are allocated. + * + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache + * during kmem_cache_destroy(). The caller must prevent concurrent allocs. + * + * Each cache has a short per-cpu head array, most allocs + * and frees go into that array, and if that array overflows, then 1/2 + * of the entries in the array are given back into the global cache. + * The head array is strictly LIFO and should improve the cache hit rates. + * On SMP, it additionally reduces the spinlock operations. + * + * The c_cpuarray may not be read with enabled local interrupts - + * it's changed with a smp_call_function(). + * + * SMP synchronization: + * constructors and destructors are called without any locking. + * Several members in struct kmem_cache and struct slab never change, they + * are accessed without any locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * and local interrupts are disabled so slab code is preempt-safe. + * The non-constant members are protected with a per-cache irq spinlock. + * + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch + * in 2000 - many ideas in the current implementation are derived from + * his patch. + * + * Further notes from the original documentation: + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the mutex 'cache_chain_mutex'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * + * At present, each engine can be growing a cache. This should be blocked. + * + * 15 March 2005. NUMA slab allocator. + * Shai Fultheim <shai@scalex86.org>. + * Shobhit Dayal <shobhit@calsoftinc.com> + * Alok N Kataria <alokk@calsoftinc.com> + * Christoph Lameter <christoph@lameter.com> + * + * Modified the slab allocator to be node aware on NUMA systems. + * Each node has its own list of partial, free and full slabs. + * All object allocations for a node occur from node specific slab lists. + */ + +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/poison.h> +#include <linux/swap.h> +#include <linux/cache.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/cpuset.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/cpu.h> +#include <linux/sysctl.h> +#include <linux/module.h> +#include <linux/rcupdate.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/nodemask.h> +#include <linux/mempolicy.h> +#include <linux/mutex.h> +#include <linux/fault-inject.h> +#include <linux/rtmutex.h> +#include <linux/reciprocal_div.h> +#include <linux/debugobjects.h> + +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/page.h> + +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define DEBUG 1 +#define STATS 1 +#define FORCED_DEBUG 1 +#else +#define DEBUG 0 +#define STATS 0 +#define FORCED_DEBUG 0 +#endif + +/* Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) +#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) + +#ifndef ARCH_KMALLOC_MINALIGN +/* + * Enforce a minimum alignment for the kmalloc caches. + * Usually, the kmalloc caches are cache_line_size() aligned, except when + * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. + * Some archs want to perform DMA into kmalloc caches and need a guaranteed + * alignment larger than the alignment of a 64-bit integer. + * ARCH_KMALLOC_MINALIGN allows that. + * Note that increasing this value may disable some debug features. + */ +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +/* + * Enforce a minimum alignment for all caches. + * Intended for archs that get misalignment faults even for BYTES_PER_WORD + * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. + * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables + * some debug features. + */ +#define ARCH_SLAB_MINALIGN 0 +#endif + +#ifndef ARCH_KMALLOC_FLAGS +#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN +#endif + +/* Legal flag mask for kmem_cache_create(). */ +#if DEBUG +# define CREATE_MASK (SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ + SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_DEBUG_OBJECTS) +#else +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_DEBUG_OBJECTS) +#endif + +/* + * kmem_bufctl_t: + * + * Bufctl's are used for linking objs within a slab + * linked offsets. + * + * This implementation relies on "struct page" for locating the cache & + * slab an object belongs to. + * This allows the bufctl structure to be small (one int), but limits + * the number of objects a slab (not a cache) can contain when off-slab + * bufctls are used. The limit is the size of the largest general cache + * that does not use off-slab slabs. + * For 32bit archs with 4 kB pages, is this 56. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. + */ + +typedef unsigned int kmem_bufctl_t; +#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) + +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; +}; + +/* + * struct slab_rcu + * + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to + * arrange for kmem_freepages to be called via RCU. This is useful if + * we need to approach a kernel structure obliquely, from its address + * obtained without the usual locking. We can lock the structure to + * stabilize it and check it's still at the given address, only if we + * can be sure that the memory has not been meanwhile reused for some + * other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. + * + * We assume struct slab_rcu can overlay struct slab when destroying. + */ +struct slab_rcu { + struct rcu_head head; + struct kmem_cache *cachep; + void *addr; +}; + +/* + * struct array_cache + * + * Purpose: + * - LIFO ordering, to hand out cache-warm objects from _alloc + * - reduce the number of linked list operations + * - reduce spinlock operations + * + * The limit is stored in the per-cpu structure to reduce the data cache + * footprint. + * + */ +struct array_cache { + unsigned int avail; + unsigned int limit; + unsigned int batchcount; + unsigned int touched; + spinlock_t lock; + void *entry[]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + */ +}; + +/* + * bootstrap: The caches do not work without cpuarrays anymore, but the + * cpuarrays are allocated from the generic caches... + */ +#define BOOT_CPUCACHE_ENTRIES 1 +struct arraycache_init { + struct array_cache cache; + void *entries[BOOT_CPUCACHE_ENTRIES]; +}; + +/* + * The slab lists for all objects. + */ +struct kmem_list3 { + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + spinlock_t list_lock; + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +}; + +/* + * Need this for bootstrapping a per node allocator. + */ +#define NUM_INIT_LISTS (3 * MAX_NUMNODES) +struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +#define CACHE_CACHE 0 +#define SIZE_AC MAX_NUMNODES +#define SIZE_L3 (2 * MAX_NUMNODES) + +static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); +static int enable_cpucache(struct kmem_cache *cachep); +static void cache_reap(struct work_struct *unused); + +/* + * This function must be completely optimized away if a constant is passed to + * it. Mostly the same as what is in linux/slab.h except it returns an index. + */ +static __always_inline int index_of(const size_t size) +{ + extern void __bad_size(void); + + if (__builtin_constant_p(size)) { + int i = 0; + +#define CACHE(x) \ + if (size <=x) \ + return i; \ + else \ + i++; +#include <linux/kmalloc_sizes.h> +#undef CACHE + __bad_size(); + } else + __bad_size(); + return 0; +} + +static int slab_early_init = 1; + +#define INDEX_AC index_of(sizeof(struct arraycache_init)) +#define INDEX_L3 index_of(sizeof(struct kmem_list3)) + +static void kmem_list3_init(struct kmem_list3 *parent) +{ + INIT_LIST_HEAD(&parent->slabs_full); + INIT_LIST_HEAD(&parent->slabs_partial); + INIT_LIST_HEAD(&parent->slabs_free); + parent->shared = NULL; + parent->alien = NULL; + parent->colour_next = 0; + spin_lock_init(&parent->list_lock); + parent->free_objects = 0; + parent->free_touched = 0; +} + +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ + } while (0) + +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ + MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ + MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +/* + * struct kmem_cache + * + * manages a cache. + */ + +struct kmem_cache { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + + unsigned int buffer_size; + u32 reciprocal_buffer_size; +/* 3) touched by every alloc & free from the backend */ + + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + gfp_t gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + struct kmem_cache *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor)(void *obj); + +/* 5) cache creation/removal */ + const char *name; + struct list_head next; + +/* 6) statistics */ +#if STATS + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + unsigned long node_overflow; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; +#endif +#if DEBUG + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; +#endif + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. + */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ +}; + +#define CFLGS_OFF_SLAB (0x80000000UL) +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) + +#define BATCHREFILL_LIMIT 16 +/* + * Optimization question: fewer reaps means less probability for unnessary + * cpucache drain/refill cycles. + * + * OTOH the cpuarrays can contain lots of objects, + * which could lock up otherwise freeable slabs. + */ +#define REAPTIMEOUT_CPUC (2*HZ) +#define REAPTIMEOUT_LIST3 (4*HZ) + +#if STATS +#define STATS_INC_ACTIVE(x) ((x)->num_active++) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_INC_ERR(x) ((x)->errors++) +#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) +#define STATS_INC_NODEFREES(x) ((x)->node_frees++) +#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#else +#define STATS_INC_ACTIVE(x) do { } while (0) +#define STATS_DEC_ACTIVE(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { } while (0) +#define STATS_SET_HIGH(x) do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) +#define STATS_INC_NODEALLOCS(x) do { } while (0) +#define STATS_INC_NODEFREES(x) do { } while (0) +#define STATS_INC_ACOVERFLOW(x) do { } while (0) +#define STATS_SET_FREEABLE(x, i) do { } while (0) +#define STATS_INC_ALLOCHIT(x) do { } while (0) +#define STATS_INC_ALLOCMISS(x) do { } while (0) +#define STATS_INC_FREEHIT(x) do { } while (0) +#define STATS_INC_FREEMISS(x) do { } while (0) +#endif + +#if DEBUG + +/* + * memory layout of objects: + * 0 : objp + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that + * the end of an object is aligned with the end of the real + * allocation. Catches writes behind the end of the allocation. + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: + * redzone word. + * cachep->obj_offset: The real object. + * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] + */ +static int obj_offset(struct kmem_cache *cachep) +{ + return cachep->obj_offset; +} + +static int obj_size(struct kmem_cache *cachep) +{ + return cachep->obj_size; +} + +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + return (unsigned long long*) (objp + obj_offset(cachep) - + sizeof(unsigned long long)); +} + +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + if (cachep->flags & SLAB_STORE_USER) + return (unsigned long long *)(objp + cachep->buffer_size - + sizeof(unsigned long long) - + REDZONE_ALIGN); + return (unsigned long long *) (objp + cachep->buffer_size - + sizeof(unsigned long long)); +} + +static void **dbg_userword(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_STORE_USER)); + return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); +} + +#else + +#define obj_offset(x) 0 +#define obj_size(cachep) (cachep->buffer_size) +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) + +#endif + +/* + * Do not go above this order unless 0 objects fit into the slab. + */ +#define BREAK_GFP_ORDER_HI 1 +#define BREAK_GFP_ORDER_LO 0 +static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; + +/* + * Functions for storing/retrieving the cachep and or slab from the page + * allocator. These are used to find the slab an obj belongs to. With kfree(), + * these are used to find the cache which an obj belongs to. + */ +static inline void page_set_cache(struct page *page, struct kmem_cache *cache) +{ + page->lru.next = (struct list_head *)cache; +} + +static inline struct kmem_cache *page_get_cache(struct page *page) +{ + page = compound_head(page); + BUG_ON(!PageSlab(page)); + return (struct kmem_cache *)page->lru.next; +} + +static inline void page_set_slab(struct page *page, struct slab *slab) +{ + page->lru.prev = (struct list_head *)slab; +} + +static inline struct slab *page_get_slab(struct page *page) +{ + BUG_ON(!PageSlab(page)); + return (struct slab *)page->lru.prev; +} + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page = virt_to_head_page(obj); + return page_get_cache(page); +} + +static inline struct slab *virt_to_slab(const void *obj) +{ + struct page *page = virt_to_head_page(obj); + return page_get_slab(page); +} + +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, + unsigned int idx) +{ + return slab->s_mem + cache->buffer_size * idx; +} + +/* + * We want to avoid an expensive divide : (offset / cache->buffer_size) + * Using the fact that buffer_size is a constant for a particular cache, + * we can replace (offset / cache->buffer_size) by + * reciprocal_divide(offset, cache->reciprocal_buffer_size) + */ +static inline unsigned int obj_to_index(const struct kmem_cache *cache, + const struct slab *slab, void *obj) +{ + u32 offset = (obj - slab->s_mem); + return reciprocal_divide(offset, cache->reciprocal_buffer_size); +} + +/* + * These are the default caches for kmalloc. Custom caches can have other sizes. + */ +struct cache_sizes malloc_sizes[] = { +#define CACHE(x) { .cs_size = (x) }, +#include <linux/kmalloc_sizes.h> + CACHE(ULONG_MAX) +#undef CACHE +}; +EXPORT_SYMBOL(malloc_sizes); + +/* Must match cache_sizes above. Out of line to keep cache footprint low. */ +struct cache_names { + char *name; + char *name_dma; +}; + +static struct cache_names __initdata cache_names[] = { +#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, +#include <linux/kmalloc_sizes.h> + {NULL,} +#undef CACHE +}; + +static struct arraycache_init initarray_cache __initdata = + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; +static struct arraycache_init initarray_generic = + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + +/* internal cache of cache description objs */ +static struct kmem_cache cache_cache = { + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .shared = 1, + .buffer_size = sizeof(struct kmem_cache), + .name = "kmem_cache", +}; + +#define BAD_ALIEN_MAGIC 0x01020304ul + +#ifdef CONFIG_LOCKDEP + +/* + * Slab sometimes uses the kmalloc slabs to store the slab headers + * for other slabs "off slab". + * The locking for this is tricky in that it nests within the locks + * of all other slabs in a few places; to deal with this special + * locking we put on-slab caches into a separate lock-class. + * + * We set lock class for alien array caches which are up during init. + * The lock annotation will be lost if all cpus of a node goes down and + * then comes back up during hotplug + */ +static struct lock_class_key on_slab_l3_key; +static struct lock_class_key on_slab_alc_key; + +static inline void init_lock_keys(void) + +{ + int q; + struct cache_sizes *s = malloc_sizes; + + while (s->cs_size != ULONG_MAX) { + for_each_node(q) { + struct array_cache **alc; + int r; + struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; + if (!l3 || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&l3->list_lock, &on_slab_l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + continue; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, + &on_slab_alc_key); + } + } + s++; + } +} +#else +static inline void init_lock_keys(void) +{ +} +#endif + +/* + * Guard access to the cache-chain. + */ +static DEFINE_MUTEX(cache_chain_mutex); +static struct list_head cache_chain; + +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. + */ +static enum { + NONE, + PARTIAL_AC, + PARTIAL_L3, + FULL +} g_cpucache_up; + +/* + * used by boot code to determine if it can use slab based allocator + */ +int slab_is_available(void) +{ + return g_cpucache_up == FULL; +} + +static DEFINE_PER_CPU(struct delayed_work, reap_work); + +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +{ + return cachep->array[smp_processor_id()]; +} + +static inline struct kmem_cache *__find_general_cachep(size_t size, + gfp_t gfpflags) +{ + struct cache_sizes *csizep = malloc_sizes; + +#if DEBUG + /* This happens if someone tries to call + * kmem_cache_create(), or __kmalloc(), before + * the generic caches are initialized. + */ + BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); +#endif + if (!size) + return ZERO_SIZE_PTR; + + while (size > csizep->cs_size) + csizep++; + + /* + * Really subtle: The last entry with cs->cs_size==ULONG_MAX + * has cs_{dma,}cachep==NULL. Thus no special case + * for large kmalloc calls required. + */ +#ifdef CONFIG_ZONE_DMA + if (unlikely(gfpflags & GFP_DMA)) + return csizep->cs_dmacachep; +#endif + return csizep->cs_cachep; +} + +static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) +{ + return __find_general_cachep(size, gfpflags); +} + +static size_t slab_mgmt_size(size_t nr_objs, size_t align) +{ + return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); +} + +/* + * Calculate the number of objects and left-over bytes for a given buffer size. + */ +static void cache_estimate(unsigned long gfporder, size_t buffer_size, + size_t align, int flags, size_t *left_over, + unsigned int *num) +{ + int nr_objs; + size_t mgmt_size; + size_t slab_size = PAGE_SIZE << gfporder; + + /* + * The slab management structure can be either off the slab or + * on it. For the latter case, the memory allocated for a + * slab is used for: + * + * - The struct slab + * - One kmem_bufctl_t for each object + * - Padding to respect alignment of @align + * - @buffer_size bytes for each object + * + * If the slab management structure is off the slab, then the + * alignment will already be calculated into the size. Because + * the slabs are all pages aligned, the objects will be at the + * correct alignment when allocated. + */ + if (flags & CFLGS_OFF_SLAB) { + mgmt_size = 0; + nr_objs = slab_size / buffer_size; + + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + } else { + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = (slab_size - sizeof(struct slab)) / + (buffer_size + sizeof(kmem_bufctl_t)); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size + > slab_size) + nr_objs--; + + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + + mgmt_size = slab_mgmt_size(nr_objs, align); + } + *num = nr_objs; + *left_over = slab_size - nr_objs*buffer_size - mgmt_size; +} + +#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) + +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) +{ + printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + function, cachep->name, msg); + dump_stack(); +} + +/* + * By default on NUMA we use alien caches to stage the freeing of + * objects allocated from other nodes. This causes massive memory + * inefficiencies when using fake NUMA setup to split memory into a + * large number of small nodes, so it can be disabled on the command + * line + */ + +static int use_alien_caches __read_mostly = 1; +static int numa_platform __read_mostly = 1; +static int __init noaliencache_setup(char *s) +{ + use_alien_caches = 0; + return 1; +} +__setup("noaliencache", noaliencache_setup); + +#ifdef CONFIG_NUMA +/* + * Special reaping functions for NUMA systems called from cache_reap(). + * These take care of doing round robin flushing of alien caches (containing + * objects freed on different nodes from which they were allocated) and the + * flushing of remote pcps by calling drain_node_pages. + */ +static DEFINE_PER_CPU(unsigned long, reap_node); + +static void init_reap_node(int cpu) +{ + int node; + + node = next_node(cpu_to_node(cpu), node_online_map); + if (node == MAX_NUMNODES) + node = first_node(node_online_map); + + per_cpu(reap_node, cpu) = node; +} + +static void next_reap_node(void) +{ + int node = __get_cpu_var(reap_node); + + node = next_node(node, node_online_map); + if (unlikely(node >= MAX_NUMNODES)) + node = first_node(node_online_map); + __get_cpu_var(reap_node) = node; +} + +#else +#define init_reap_node(cpu) do { } while (0) +#define next_reap_node(void) do { } while (0) +#endif + +/* + * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz + * via the workqueue/eventd. + * Add the CPU number into the expiration time to minimize the possibility of + * the CPUs getting into lockstep and contending for the global cache chain + * lock. + */ +static void __cpuinit start_cpu_timer(int cpu) +{ + struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + + /* + * When this gets called from do_initcalls via cpucache_init(), + * init_workqueues() has already run, so keventd will be setup + * at that time. + */ + if (keventd_up() && reap_work->work.func == NULL) { + init_reap_node(cpu); + INIT_DELAYED_WORK(reap_work, cache_reap); + schedule_delayed_work_on(cpu, reap_work, + __round_jiffies_relative(HZ, cpu)); + } +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount) +{ + int memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *nc = NULL; + + nc = kmalloc_node(memsize, GFP_KERNEL, node); + if (nc) { + nc->avail = 0; + nc->limit = entries; + nc->batchcount = batchcount; + nc->touched = 0; + spin_lock_init(&nc->lock); + } + return nc; +} + +/* + * Transfer objects in one arraycache to another. + * Locking must be handled by the caller. + * + * Return the number of entries transferred. + */ +static int transfer_objects(struct array_cache *to, + struct array_cache *from, unsigned int max) +{ + /* Figure out how many entries to transfer */ + int nr = min(min(from->avail, max), to->limit - to->avail); + + if (!nr) + return 0; + + memcpy(to->entry + to->avail, from->entry + from->avail -nr, + sizeof(void *) *nr); + + from->avail -= nr; + to->avail += nr; + to->touched = 1; + return nr; +} + +#ifndef CONFIG_NUMA + +#define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, l3) do { } while (0) + +static inline struct array_cache **alloc_alien_cache(int node, int limit) +{ + return (struct array_cache **)BAD_ALIEN_MAGIC; +} + +static inline void free_alien_cache(struct array_cache **ac_ptr) +{ +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + return 0; +} + +static inline void *alternate_node_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + return NULL; +} + +static inline void *____cache_alloc_node(struct kmem_cache *cachep, + gfp_t flags, int nodeid) +{ + return NULL; +} + +#else /* CONFIG_NUMA */ + +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t); + +static struct array_cache **alloc_alien_cache(int node, int limit) +{ + struct array_cache **ac_ptr; + int memsize = sizeof(void *) * nr_node_ids; + int i; + + if (limit > 1) + limit = 12; + ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); + if (ac_ptr) { + for_each_node(i) { + if (i == node || !node_online(i)) { + ac_ptr[i] = NULL; + continue; + } + ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); + if (!ac_ptr[i]) { + for (i--; i >= 0; i--) + kfree(ac_ptr[i]); + kfree(ac_ptr); + return NULL; + } + } + } + return ac_ptr; +} + +static void free_alien_cache(struct array_cache **ac_ptr) +{ + int i; + + if (!ac_ptr) + return; + for_each_node(i) + kfree(ac_ptr[i]); + kfree(ac_ptr); +} + +static void __drain_alien_cache(struct kmem_cache *cachep, + struct array_cache *ac, int node) +{ + struct kmem_list3 *rl3 = cachep->nodelists[node]; + + if (ac->avail) { + spin_lock(&rl3->list_lock); + /* + * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects + * into the free lists and getting them back later. + */ + if (rl3->shared) + transfer_objects(rl3->shared, ac, ac->limit); + + free_block(cachep, ac->entry, ac->avail, node); + ac->avail = 0; + spin_unlock(&rl3->list_lock); + } +} + +/* + * Called from cache_reap() to regularly drain alien caches round robin. + */ +static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +{ + int node = __get_cpu_var(reap_node); + + if (l3->alien) { + struct array_cache *ac = l3->alien[node]; + + if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { + __drain_alien_cache(cachep, ac, node); + spin_unlock_irq(&ac->lock); + } + } +} + +static void drain_alien_cache(struct kmem_cache *cachep, + struct array_cache **alien) +{ + int i = 0; + struct array_cache *ac; + unsigned long flags; + + for_each_online_node(i) { + ac = alien[i]; + if (ac) { + spin_lock_irqsave(&ac->lock, flags); + __drain_alien_cache(cachep, ac, i); + spin_unlock_irqrestore(&ac->lock, flags); + } + } +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + struct slab *slabp = virt_to_slab(objp); + int nodeid = slabp->nodeid; + struct kmem_list3 *l3; + struct array_cache *alien = NULL; + int node; + + node = numa_node_id(); + + /* + * Make sure we are not freeing a object from another node to the array + * cache on this cpu. + */ + if (likely(slabp->nodeid == node)) + return 0; + + l3 = cachep->nodelists[node]; + STATS_INC_NODEFREES(cachep); + if (l3->alien && l3->alien[nodeid]) { + alien = l3->alien[nodeid]; + spin_lock(&alien->lock); + if (unlikely(alien->avail == alien->limit)) { + STATS_INC_ACOVERFLOW(cachep); + __drain_alien_cache(cachep, alien, nodeid); + } + alien->entry[alien->avail++] = objp; + spin_unlock(&alien->lock); + } else { + spin_lock(&(cachep->nodelists[nodeid])->list_lock); + free_block(cachep, &objp, 1, nodeid); + spin_unlock(&(cachep->nodelists[nodeid])->list_lock); + } + return 1; +} +#endif + +static void __cpuinit cpuup_canceled(long cpu) +{ + struct kmem_cache *cachep; + struct kmem_list3 *l3 = NULL; + int node = cpu_to_node(cpu); + node_to_cpumask_ptr(mask, node); + + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; + + /* cpu is dead; no one can alloc from it. */ + nc = cachep->array[cpu]; + cachep->array[cpu] = NULL; + l3 = cachep->nodelists[node]; + + if (!l3) + goto free_array_cache; + + spin_lock_irq(&l3->list_lock); + + /* Free limit for this kmem_list3 */ + l3->free_limit -= cachep->batchcount; + if (nc) + free_block(cachep, nc->entry, nc->avail, node); + + if (!cpus_empty(*mask)) { + spin_unlock_irq(&l3->list_lock); + goto free_array_cache; + } + + shared = l3->shared; + if (shared) { + free_block(cachep, shared->entry, + shared->avail, node); + l3->shared = NULL; + } + + alien = l3->alien; + l3->alien = NULL; + + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + if (alien) { + drain_alien_cache(cachep, alien); + free_alien_cache(alien); + } +free_array_cache: + kfree(nc); + } + /* + * In the previous loop, all the objects were freed to + * the respective cache's slabs, now we can go ahead and + * shrink each nodelist to its limit. + */ + list_for_each_entry(cachep, &cache_chain, next) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + drain_freelist(cachep, l3, l3->free_objects); + } +} + +static int __cpuinit cpuup_prepare(long cpu) +{ + struct kmem_cache *cachep; + struct kmem_list3 *l3 = NULL; + int node = cpu_to_node(cpu); + const int memsize = sizeof(struct kmem_list3); + + /* + * We need to do this right in the beginning since + * alloc_arraycache's are going to use this list. + * kmalloc_node allows us to add the slab to the right + * kmem_list3 and not this cpu's kmem_list3 + */ + + list_for_each_entry(cachep, &cache_chain, next) { + /* + * Set up the size64 kmemlist for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + if (!cachep->nodelists[node]) { + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) + goto bad; + kmem_list3_init(l3); + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. + */ + cachep->nodelists[node] = l3; + } + + spin_lock_irq(&cachep->nodelists[node]->list_lock); + cachep->nodelists[node]->free_limit = + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&cachep->nodelists[node]->list_lock); + } + + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + struct array_cache *shared = NULL; + struct array_cache **alien = NULL; + + nc = alloc_arraycache(node, cachep->limit, + cachep->batchcount); + if (!nc) + goto bad; + if (cachep->shared) { + shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, + 0xbaadf00d); + if (!shared) { + kfree(nc); + goto bad; + } + } + if (use_alien_caches) { + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) { + kfree(shared); + kfree(nc); + goto bad; + } + } + cachep->array[cpu] = nc; + l3 = cachep->nodelists[node]; + BUG_ON(!l3); + + spin_lock_irq(&l3->list_lock); + if (!l3->shared) { + /* + * We are serialised from CPU_DEAD or + * CPU_UP_CANCELLED by the cpucontrol lock + */ + l3->shared = shared; + shared = NULL; + } +#ifdef CONFIG_NUMA + if (!l3->alien) { + l3->alien = alien; + alien = NULL; + } +#endif + spin_unlock_irq(&l3->list_lock); + kfree(shared); + free_alien_cache(alien); + } + return 0; +bad: + cpuup_canceled(cpu); + return -ENOMEM; +} + +static int __cpuinit cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + mutex_lock(&cache_chain_mutex); + err = cpuup_prepare(cpu); + mutex_unlock(&cache_chain_mutex); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + start_cpu_timer(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Shutdown cache reaper. Note that the cache_chain_mutex is + * held so that if cache_reap() is invoked it cannot do + * anything expensive but will only modify reap_work + * and reschedule the timer. + */ + cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. */ + per_cpu(reap_work, cpu).work.func = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + start_cpu_timer(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* + * Even if all the cpus of a node are down, we don't free the + * kmem_list3 of any cache. This to avoid a race between + * cpu_down, and a kmalloc allocation from another cpu for + * memory from the node of the cpu going down. The list3 + * structure is usually allocated from kmem_cache_create() and + * gets destroyed at kmem_cache_destroy(). + */ + /* fall through */ +#endif + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + mutex_lock(&cache_chain_mutex); + cpuup_canceled(cpu); + mutex_unlock(&cache_chain_mutex); + break; + } + return err ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpucache_notifier = { + &cpuup_callback, NULL, 0 +}; + +/* + * swap the static kmem_list3 with kmalloced memory + */ +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) +{ + struct kmem_list3 *ptr; + + ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); + BUG_ON(!ptr); + + local_irq_disable(); + memcpy(ptr, list, sizeof(struct kmem_list3)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->list_lock); + + MAKE_ALL_LISTS(cachep, ptr, nodeid); + cachep->nodelists[nodeid] = ptr; + local_irq_enable(); +} + +/* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ +static void __init set_up_list3s(struct kmem_cache *cachep, int index) +{ + int node; + + for_each_online_node(node) { + cachep->nodelists[node] = &initkmem_list3[index + node]; + cachep->nodelists[node]->next_reap = jiffies + + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + } +} + +/* + * Initialisation. Called after the page allocator have been initialised and + * before smp_init(). + */ +void __init kmem_cache_init(void) +{ + size_t left_over; + struct cache_sizes *sizes; + struct cache_names *names; + int i; + int order; + int node; + + if (num_possible_nodes() == 1) { + use_alien_caches = 0; + numa_platform = 0; + } + + for (i = 0; i < NUM_INIT_LISTS; i++) { + kmem_list3_init(&initkmem_list3[i]); + if (i < MAX_NUMNODES) + cache_cache.nodelists[i] = NULL; + } + set_up_list3s(&cache_cache, CACHE_CACHE); + + /* + * Fragmentation resistance on low memory - only use bigger + * page orders on machines with more than 32MB of memory. + */ + if (num_physpages > (32 << 20) >> PAGE_SHIFT) + slab_break_gfp_order = BREAK_GFP_ORDER_HI; + + /* Bootstrap is tricky, because several objects are allocated + * from caches that do not exist yet: + * 1) initialize the cache_cache cache: it contains the struct + * kmem_cache structures of all caches, except cache_cache itself: + * cache_cache is statically allocated. + * Initially an __init data area is used for the head array and the + * kmem_list3 structures, it's replaced with a kmalloc allocated + * array at the end of the bootstrap. + * 2) Create the first kmalloc cache. + * The struct kmem_cache for the new cache is allocated normally. + * An __init data area is used for the head array. + * 3) Create the remaining kmalloc caches, with minimally sized + * head arrays. + * 4) Replace the __init data head arrays for cache_cache and the first + * kmalloc cache with kmalloc allocated arrays. + * 5) Replace the __init data for kmem_list3 for cache_cache and + * the other cache's with kmalloc allocated memory. + * 6) Resize the head arrays of the kmalloc caches to their final sizes. + */ + + node = numa_node_id(); + + /* 1) create the cache_cache */ + INIT_LIST_HEAD(&cache_chain); + list_add(&cache_cache.next, &cache_chain); + cache_cache.colour_off = cache_line_size(); + cache_cache.array[smp_processor_id()] = &initarray_cache.cache; + cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; + + /* + * struct kmem_cache size depends on nr_node_ids, which + * can be less than MAX_NUMNODES. + */ + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + + nr_node_ids * sizeof(struct kmem_list3 *); +#if DEBUG + cache_cache.obj_size = cache_cache.buffer_size; +#endif + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, + cache_line_size()); + cache_cache.reciprocal_buffer_size = + reciprocal_value(cache_cache.buffer_size); + + for (order = 0; order < MAX_ORDER; order++) { + cache_estimate(order, cache_cache.buffer_size, + cache_line_size(), 0, &left_over, &cache_cache.num); + if (cache_cache.num) + break; + } + BUG_ON(!cache_cache.num); + cache_cache.gfporder = order; + cache_cache.colour = left_over / cache_cache.colour_off; + cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); + + /* 2+3) create the kmalloc caches */ + sizes = malloc_sizes; + names = cache_names; + + /* + * Initialize the caches that provide memory for the array cache and the + * kmem_list3 structures first. Without this, further allocations will + * bug. + */ + + sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL); + + if (INDEX_AC != INDEX_L3) { + sizes[INDEX_L3].cs_cachep = + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL); + } + + slab_early_init = 0; + + while (sizes->cs_size != ULONG_MAX) { + /* + * For performance, all the general caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * eliminates "false sharing". + * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. + */ + if (!sizes->cs_cachep) { + sizes->cs_cachep = kmem_cache_create(names->name, + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL); + } +#ifdef CONFIG_ZONE_DMA + sizes->cs_dmacachep = kmem_cache_create( + names->name_dma, + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + SLAB_PANIC, + NULL); +#endif + sizes++; + names++; + } + /* 4) Replace the bootstrap head arrays */ + { + struct array_cache *ptr; + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + local_irq_disable(); + BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache), + sizeof(struct arraycache_init)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->lock); + + cache_cache.array[smp_processor_id()] = ptr; + local_irq_enable(); + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + local_irq_disable(); + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) + != &initarray_generic.cache); + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), + sizeof(struct arraycache_init)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->lock); + + malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = + ptr; + local_irq_enable(); + } + /* 5) Replace the bootstrap kmem_list3's */ + { + int nid; + + for_each_online_node(nid) { + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + + init_list(malloc_sizes[INDEX_AC].cs_cachep, + &initkmem_list3[SIZE_AC + nid], nid); + + if (INDEX_AC != INDEX_L3) { + init_list(malloc_sizes[INDEX_L3].cs_cachep, + &initkmem_list3[SIZE_L3 + nid], nid); + } + } + } + + /* 6) resize the head arrays to their final sizes */ + { + struct kmem_cache *cachep; + mutex_lock(&cache_chain_mutex); + list_for_each_entry(cachep, &cache_chain, next) + if (enable_cpucache(cachep)) + BUG(); + mutex_unlock(&cache_chain_mutex); + } + + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + + + /* Done! */ + g_cpucache_up = FULL; + + /* + * Register a cpu startup notifier callback that initializes + * cpu_cache_get for all new cpus + */ + register_cpu_notifier(&cpucache_notifier); + + /* + * The reap timers are started later, with a module init call: That part + * of the kernel is not yet operational. + */ +} + +static int __init cpucache_init(void) +{ + int cpu; + + /* + * Register the timers that return unneeded pages to the page allocator + */ + for_each_online_cpu(cpu) + start_cpu_timer(cpu); + return 0; +} +__initcall(cpucache_init); + +/* + * Interface to system's page allocator. No need to hold the cache-lock. + * + * If we requested dmaable memory, we will get it. Even if we + * did not request dmaable memory, we might get it, but that + * would be relatively rare and ignorable. + */ +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) +{ + struct page *page; + int nr_pages; + int i; + +#ifndef CONFIG_MMU + /* + * Nommu uses slab's for process anonymous memory allocations, and thus + * requires __GFP_COMP to properly refcount higher order allocations + */ + flags |= __GFP_COMP; +#endif + + flags |= cachep->gfpflags; + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; + + page = alloc_pages_node(nodeid, flags, cachep->gfporder); + if (!page) + return NULL; + + nr_pages = (1 << cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + add_zone_page_state(page_zone(page), + NR_SLAB_RECLAIMABLE, nr_pages); + else + add_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_pages); + for (i = 0; i < nr_pages; i++) + __SetPageSlab(page + i); + return page_address(page); +} + +/* + * Interface to system's page release. + */ +static void kmem_freepages(struct kmem_cache *cachep, void *addr) +{ + unsigned long i = (1 << cachep->gfporder); + struct page *page = virt_to_page(addr); + const unsigned long nr_freed = i; + + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + sub_zone_page_state(page_zone(page), + NR_SLAB_RECLAIMABLE, nr_freed); + else + sub_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_freed); + while (i--) { + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); + page++; + } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += nr_freed; + free_pages((unsigned long)addr, cachep->gfporder); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slab_rcu *slab_rcu = (struct slab_rcu *)head; + struct kmem_cache *cachep = slab_rcu->cachep; + + kmem_freepages(cachep, slab_rcu->addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slab_rcu); +} + +#if DEBUG + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, + unsigned long caller) +{ + int size = obj_size(cachep); + + addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; + + if (size < 5 * sizeof(unsigned long)) + return; + + *addr++ = 0x12345678; + *addr++ = caller; + *addr++ = smp_processor_id(); + size -= 3 * sizeof(unsigned long); + { + unsigned long *sptr = &caller; + unsigned long svalue; + + while (!kstack_end(sptr)) { + svalue = *sptr++; + if (kernel_text_address(svalue)) { + *addr++ = svalue; + size -= sizeof(unsigned long); + if (size <= sizeof(unsigned long)) + break; + } + } + + } + *addr++ = 0x87654321; +} +#endif + +static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) +{ + int size = obj_size(cachep); + addr = &((char *)addr)[obj_offset(cachep)]; + + memset(addr, val, size); + *(unsigned char *)(addr + size - 1) = POISON_END; +} + +static void dump_line(char *data, int offset, int limit) +{ + int i; + unsigned char error = 0; + int bad_count = 0; + + printk(KERN_ERR "%03x:", offset); + for (i = 0; i < limit; i++) { + if (data[offset + i] != POISON_FREE) { + error = data[offset + i]; + bad_count++; + } + printk(" %02x", (unsigned char)data[offset + i]); + } + printk("\n"); + + if (bad_count == 1) { + error ^= POISON_FREE; + if (!(error & (error - 1))) { + printk(KERN_ERR "Single bit error detected. Probably " + "bad RAM.\n"); +#ifdef CONFIG_X86 + printk(KERN_ERR "Run memtest86+ or a similar memory " + "test tool.\n"); +#else + printk(KERN_ERR "Run a memory test tool.\n"); +#endif + } + } +} +#endif + +#if DEBUG + +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) +{ + int i, size; + char *realobj; + + if (cachep->flags & SLAB_RED_ZONE) { + printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + + if (cachep->flags & SLAB_STORE_USER) { + printk(KERN_ERR "Last user: [<%p>]", + *dbg_userword(cachep, objp)); + print_symbol("(%s)", + (unsigned long)*dbg_userword(cachep, objp)); + printk("\n"); + } + realobj = (char *)objp + obj_offset(cachep); + size = obj_size(cachep); + for (i = 0; i < size && lines; i += 16, lines--) { + int limit; + limit = 16; + if (i + limit > size) + limit = size - i; + dump_line(realobj, i, limit); + } +} + +static void check_poison_obj(struct kmem_cache *cachep, void *objp) +{ + char *realobj; + int size, i; + int lines = 0; + + realobj = (char *)objp + obj_offset(cachep); + size = obj_size(cachep); + + for (i = 0; i < size; i++) { + char exp = POISON_FREE; + if (i == size - 1) + exp = POISON_END; + if (realobj[i] != exp) { + int limit; + /* Mismatch ! */ + /* Print header */ + if (lines == 0) { + printk(KERN_ERR + "Slab corruption: %s start=%p, len=%d\n", + cachep->name, realobj, size); + print_objinfo(cachep, objp, 0); + } + /* Hexdump the affected line */ + i = (i / 16) * 16; + limit = 16; + if (i + limit > size) + limit = size - i; + dump_line(realobj, i, limit); + i += 16; + lines++; + /* Limit to 5 lines */ + if (lines > 5) + break; + } + } + if (lines != 0) { + /* Print some data about the neighboring objects, if they + * exist: + */ + struct slab *slabp = virt_to_slab(objp); + unsigned int objnr; + + objnr = obj_to_index(cachep, slabp, objp); + if (objnr) { + objp = index_to_obj(cachep, slabp, objnr - 1); + realobj = (char *)objp + obj_offset(cachep); + printk(KERN_ERR "Prev obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + if (objnr + 1 < cachep->num) { + objp = index_to_obj(cachep, slabp, objnr + 1); + realobj = (char *)objp + obj_offset(cachep); + printk(KERN_ERR "Next obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + } +} +#endif + +#if DEBUG +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +{ + int i; + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, slabp, i); + + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if (cachep->buffer_size % PAGE_SIZE == 0 && + OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), + cachep->buffer_size / PAGE_SIZE, 1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + } + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "start of a freed object " + "was overwritten"); + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "end of a freed object " + "was overwritten"); + } + } +} +#else +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +{ +} +#endif + +/** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * + * Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. The + * cache-lock is not held/needed. + */ +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +{ + void *addr = slabp->s_mem - slabp->colouroff; + + slab_destroy_debugcheck(cachep, slabp); + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { + struct slab_rcu *slab_rcu; + + slab_rcu = (struct slab_rcu *)slabp; + slab_rcu->cachep = cachep; + slab_rcu->addr = addr; + call_rcu(&slab_rcu->head, kmem_rcu_free); + } else { + kmem_freepages(cachep, addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); + } +} + +static void __kmem_cache_destroy(struct kmem_cache *cachep) +{ + int i; + struct kmem_list3 *l3; + + for_each_online_cpu(i) + kfree(cachep->array[i]); + + /* NUMA: free the list3 structures */ + for_each_online_node(i) { + l3 = cachep->nodelists[i]; + if (l3) { + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + } + } + kmem_cache_free(&cache_cache, cachep); +} + + +/** + * calculate_slab_order - calculate size (page order) of slabs + * @cachep: pointer to the cache that is being created + * @size: size of objects to be created in this cache. + * @align: required alignment for the objects. + * @flags: slab allocation flags + * + * Also calculates the number of objects per slab. + * + * This could be made much more intelligent. For now, try to avoid using + * high order pages for slabs. When the gfp() functions are more friendly + * towards high-order requests, this should be changed. + */ +static size_t calculate_slab_order(struct kmem_cache *cachep, + size_t size, size_t align, unsigned long flags) +{ + unsigned long offslab_limit; + size_t left_over = 0; + int gfporder; + + for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { + unsigned int num; + size_t remainder; + + cache_estimate(gfporder, size, align, flags, &remainder, &num); + if (!num) + continue; + + if (flags & CFLGS_OFF_SLAB) { + /* + * Max number of objs-per-slab for caches which + * use off-slab slabs. Needed to avoid a possible + * looping condition in cache_grow(). + */ + offslab_limit = size - sizeof(struct slab); + offslab_limit /= sizeof(kmem_bufctl_t); + + if (num > offslab_limit) + break; + } + + /* Found something acceptable - save it away */ + cachep->num = num; + cachep->gfporder = gfporder; + left_over = remainder; + + /* + * A VFS-reclaimable slab tends to have most allocations + * as GFP_NOFS and we really don't want to have to be allocating + * higher-order pages when we are unable to shrink dcache. + */ + if (flags & SLAB_RECLAIM_ACCOUNT) + break; + + /* + * Large number of objects is good, but very large slabs are + * currently bad for the gfp()s. + */ + if (gfporder >= slab_break_gfp_order) + break; + + /* + * Acceptable internal fragmentation? + */ + if (left_over * 8 <= (PAGE_SIZE << gfporder)) + break; + } + return left_over; +} + +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) +{ + if (g_cpucache_up == FULL) + return enable_cpucache(cachep); + + if (g_cpucache_up == NONE) { + /* + * Note: the first kmem_cache_create must create the cache + * that's used by kmalloc(24), otherwise the creation of + * further caches will BUG(). + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + + /* + * If the cache that's used by kmalloc(sizeof(kmem_list3)) is + * the first cache, then we need to set up all its list3s, + * otherwise the creation of further caches will BUG(). + */ + set_up_list3s(cachep, SIZE_AC); + if (INDEX_AC == INDEX_L3) + g_cpucache_up = PARTIAL_L3; + else + g_cpucache_up = PARTIAL_AC; + } else { + cachep->array[smp_processor_id()] = + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + if (g_cpucache_up == PARTIAL_AC) { + set_up_list3s(cachep, SIZE_L3); + g_cpucache_up = PARTIAL_L3; + } else { + int node; + for_each_online_node(node) { + cachep->nodelists[node] = + kmalloc_node(sizeof(struct kmem_list3), + GFP_KERNEL, node); + BUG_ON(!cachep->nodelists[node]); + kmem_list3_init(cachep->nodelists[node]); + } + } + } + cachep->nodelists[numa_node_id()]->next_reap = + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; + return 0; +} + +/** + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a int, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * @name must be valid until the cache is destroyed. This implies that + * the module calling this has to destroy the cache before getting unloaded. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ +struct kmem_cache * +kmem_cache_create (const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + size_t left_over, slab_size, ralign; + struct kmem_cache *cachep = NULL, *pc; + + /* + * Sanity checks... these are all serious usage bugs. + */ + if (!name || in_interrupt() || (size < BYTES_PER_WORD) || + size > KMALLOC_MAX_SIZE) { + printk(KERN_ERR "%s: Early error in slab %s\n", __func__, + name); + BUG(); + } + + /* + * We use cache_chain_mutex to ensure a consistent view of + * cpu_online_map as well. Please see cpuup_callback + */ + get_online_cpus(); + mutex_lock(&cache_chain_mutex); + + list_for_each_entry(pc, &cache_chain, next) { + char tmp; + int res; + + /* + * This happens when the module gets unloaded and doesn't + * destroy its slab cache and no-one else reuses the vmalloc + * area of the module. Print a warning. + */ + res = probe_kernel_address(pc->name, tmp); + if (res) { + printk(KERN_ERR + "SLAB: cache with size %d has lost its name\n", + pc->buffer_size); + continue; + } + + if (!strcmp(pc->name, name)) { + printk(KERN_ERR + "kmem_cache_create: duplicate cache %s\n", name); + dump_stack(); + goto oops; + } + } + +#if DEBUG + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ +#if FORCED_DEBUG + /* + * Enable redzoning and last user accounting, except for caches with + * large objects, if the increased size would increase the object size + * above the next power of two: caches with object sizes just above a + * power of two have a significant amount of internal fragmentation. + */ + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + + 2 * sizeof(unsigned long long))) + flags |= SLAB_RED_ZONE | SLAB_STORE_USER; + if (!(flags & SLAB_DESTROY_BY_RCU)) + flags |= SLAB_POISON; +#endif + if (flags & SLAB_DESTROY_BY_RCU) + BUG_ON(flags & SLAB_POISON); +#endif + /* + * Always checks flags, a caller might be expecting debug support which + * isn't available. + */ + BUG_ON(flags & ~CREATE_MASK); + + /* + * Check that size is in terms of words. This is needed to avoid + * unaligned accesses for some archs when redzoning is used, and makes + * sure any on-slab bufctl's are also correctly aligned. + */ + if (size & (BYTES_PER_WORD - 1)) { + size += (BYTES_PER_WORD - 1); + size &= ~(BYTES_PER_WORD - 1); + } + + /* calculate the final buffer alignment: */ + + /* 1) arch recommendation: can be overridden for debug */ + if (flags & SLAB_HWCACHE_ALIGN) { + /* + * Default alignment: as specified by the arch code. Except if + * an object is really small, then squeeze multiple objects into + * one cacheline. + */ + ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + } else { + ralign = BYTES_PER_WORD; + } + + /* + * Redzoning and user store require word alignment or possibly larger. + * Note this will be overridden by architecture or caller mandated + * alignment if either is greater than BYTES_PER_WORD. + */ + if (flags & SLAB_STORE_USER) + ralign = BYTES_PER_WORD; + + if (flags & SLAB_RED_ZONE) { + ralign = REDZONE_ALIGN; + /* If redzoning, ensure that the second redzone is suitably + * aligned, by adjusting the object size accordingly. */ + size += REDZONE_ALIGN - 1; + size &= ~(REDZONE_ALIGN - 1); + } + + /* 2) arch mandated alignment */ + if (ralign < ARCH_SLAB_MINALIGN) { + ralign = ARCH_SLAB_MINALIGN; + } + /* 3) caller mandated alignment */ + if (ralign < align) { + ralign = align; + } + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); + /* + * 4) Store it. + */ + align = ralign; + + /* Get cache's description obj. */ + cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); + if (!cachep) + goto oops; + +#if DEBUG + cachep->obj_size = size; + + /* + * Both debugging options require word-alignment which is calculated + * into align above. + */ + if (flags & SLAB_RED_ZONE) { + /* add space for red zone words */ + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); + } + if (flags & SLAB_STORE_USER) { + /* user store requires one word storage behind the end of + * the real object. But if the second red zone needs to be + * aligned to 64 bits, we must allow that much space. + */ + if (flags & SLAB_RED_ZONE) + size += REDZONE_ALIGN; + else + size += BYTES_PER_WORD; + } +#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) + if (size >= malloc_sizes[INDEX_L3 + 1].cs_size + && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; + size = PAGE_SIZE; + } +#endif +#endif + + /* + * Determine if the slab management is 'on' or 'off' slab. + * (bootstrapping cannot cope with offslab caches so don't do + * it too early on.) + */ + if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + flags |= CFLGS_OFF_SLAB; + + size = ALIGN(size, align); + + left_over = calculate_slab_order(cachep, size, align, flags); + + if (!cachep->num) { + printk(KERN_ERR + "kmem_cache_create: couldn't create cache %s.\n", name); + kmem_cache_free(&cache_cache, cachep); + cachep = NULL; + goto oops; + } + slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), align); + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { + flags &= ~CFLGS_OFF_SLAB; + left_over -= slab_size; + } + + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. No need for manual alignment */ + slab_size = + cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + } + + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < align) + cachep->colour_off = align; + cachep->colour = left_over / cachep->colour_off; + cachep->slab_size = slab_size; + cachep->flags = flags; + cachep->gfpflags = 0; + if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) + cachep->gfpflags |= GFP_DMA; + cachep->buffer_size = size; + cachep->reciprocal_buffer_size = reciprocal_value(size); + + if (flags & CFLGS_OFF_SLAB) { + cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); + /* + * This is a possibility for one of the malloc_sizes caches. + * But since we go off slab only for object size greater than + * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, + * this should not happen at all. + * But leave a BUG_ON for some lucky dude. + */ + BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); + } + cachep->ctor = ctor; + cachep->name = name; + + if (setup_cpu_cache(cachep)) { + __kmem_cache_destroy(cachep); + cachep = NULL; + goto oops; + } + + /* cache setup completed, link it into the list */ + list_add(&cachep->next, &cache_chain); +oops: + if (!cachep && (flags & SLAB_PANIC)) + panic("kmem_cache_create(): failed to create slab `%s'\n", + name); + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); + return cachep; +} +EXPORT_SYMBOL(kmem_cache_create); + +#if DEBUG +static void check_irq_off(void) +{ + BUG_ON(!irqs_disabled()); +} + +static void check_irq_on(void) +{ + BUG_ON(irqs_disabled()); +} + +static void check_spinlock_acquired(struct kmem_cache *cachep) +{ +#ifdef CONFIG_SMP + check_irq_off(); + assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); +#endif +} + +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) +{ +#ifdef CONFIG_SMP + check_irq_off(); + assert_spin_locked(&cachep->nodelists[node]->list_lock); +#endif +} + +#else +#define check_irq_off() do { } while(0) +#define check_irq_on() do { } while(0) +#define check_spinlock_acquired(x) do { } while(0) +#define check_spinlock_acquired_node(x, y) do { } while(0) +#endif + +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, + int force, int node); + +static void do_drain(void *arg) +{ + struct kmem_cache *cachep = arg; + struct array_cache *ac; + int node = numa_node_id(); + + check_irq_off(); + ac = cpu_cache_get(cachep); + spin_lock(&cachep->nodelists[node]->list_lock); + free_block(cachep, ac->entry, ac->avail, node); + spin_unlock(&cachep->nodelists[node]->list_lock); + ac->avail = 0; +} + +static void drain_cpu_caches(struct kmem_cache *cachep) +{ + struct kmem_list3 *l3; + int node; + + on_each_cpu(do_drain, cachep, 1); + check_irq_on(); + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (l3 && l3->alien) + drain_alien_cache(cachep, l3->alien); + } + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (l3) + drain_array(cachep, l3, l3->shared, 1, node); + } +} + +/* + * Remove slabs from the list of free slabs. + * Specify the number of slabs to drain in tofree. + * + * Returns the actual number of slabs released. + */ +static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree) +{ + struct list_head *p; + int nr_freed; + struct slab *slabp; + + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { + + spin_lock_irq(&l3->list_lock); + p = l3->slabs_free.prev; + if (p == &l3->slabs_free) { + spin_unlock_irq(&l3->list_lock); + goto out; + } + + slabp = list_entry(p, struct slab, list); +#if DEBUG + BUG_ON(slabp->inuse); +#endif + list_del(&slabp->list); + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. + */ + l3->free_objects -= cache->num; + spin_unlock_irq(&l3->list_lock); + slab_destroy(cache, slabp); + nr_freed++; + } +out: + return nr_freed; +} + +/* Called with cache_chain_mutex held to protect against cpu hotplug */ +static int __cache_shrink(struct kmem_cache *cachep) +{ + int ret = 0, i = 0; + struct kmem_list3 *l3; + + drain_cpu_caches(cachep); + + check_irq_on(); + for_each_online_node(i) { + l3 = cachep->nodelists[i]; + if (!l3) + continue; + + drain_freelist(cachep, l3, l3->free_objects); + + ret += !list_empty(&l3->slabs_full) || + !list_empty(&l3->slabs_partial); + } + return (ret ? 1 : 0); +} + +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. + */ +int kmem_cache_shrink(struct kmem_cache *cachep) +{ + int ret; + BUG_ON(!cachep || in_interrupt()); + + get_online_cpus(); + mutex_lock(&cache_chain_mutex); + ret = __cache_shrink(cachep); + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL(kmem_cache_shrink); + +/** + * kmem_cache_destroy - delete a cache + * @cachep: the cache to destroy + * + * Remove a &struct kmem_cache object from the slab cache. + * + * It is expected this function will be called by a module when it is + * unloaded. This will remove the cache completely, and avoid a duplicate + * cache being allocated each time a module is loaded and unloaded, if the + * module doesn't have persistent in-kernel storage across loads and unloads. + * + * The cache must be empty before calling this function. + * + * The caller must guarantee that noone will allocate memory from the cache + * during the kmem_cache_destroy(). + */ +void kmem_cache_destroy(struct kmem_cache *cachep) +{ + BUG_ON(!cachep || in_interrupt()); + + /* Find the cache in the chain of caches. */ + get_online_cpus(); + mutex_lock(&cache_chain_mutex); + /* + * the chain is never empty, cache_cache is never destroyed + */ + list_del(&cachep->next); + if (__cache_shrink(cachep)) { + slab_error(cachep, "Can't free all objects"); + list_add(&cachep->next, &cache_chain); + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); + return; + } + + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + synchronize_rcu(); + + __kmem_cache_destroy(cachep); + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/* + * Get the memory for a slab management obj. + * For a slab cache when the slab descriptor is off-slab, slab descriptors + * always come from malloc_sizes caches. The slab descriptor cannot + * come from the same cache which is getting created because, + * when we are searching for an appropriate cache for these + * descriptors in kmem_cache_create, we search through the malloc_sizes array. + * If we are creating a malloc_sizes cache here it would not be visible to + * kmem_find_general_cachep till the initialization is complete. + * Hence we cannot have slabp_cache same as the original cache. + */ +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, + int colour_off, gfp_t local_flags, + int nodeid) +{ + struct slab *slabp; + + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. */ + slabp = kmem_cache_alloc_node(cachep->slabp_cache, + local_flags & ~GFP_THISNODE, nodeid); + if (!slabp) + return NULL; + } else { + slabp = objp + colour_off; + colour_off += cachep->slab_size; + } + slabp->inuse = 0; + slabp->colouroff = colour_off; + slabp->s_mem = objp + colour_off; + slabp->nodeid = nodeid; + slabp->free = 0; + return slabp; +} + +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +{ + return (kmem_bufctl_t *) (slabp + 1); +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct slab *slabp) +{ + int i; + + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, slabp, i); +#if DEBUG + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) + poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = NULL; + + if (cachep->flags & SLAB_RED_ZONE) { + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + /* + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. + */ + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + cachep->ctor(objp + obj_offset(cachep)); + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the" + " end of an object"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the" + " start of an object"); + } + if ((cachep->buffer_size % PAGE_SIZE) == 0 && + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) + kernel_map_pages(virt_to_page(objp), + cachep->buffer_size / PAGE_SIZE, 0); +#else + if (cachep->ctor) + cachep->ctor(objp); +#endif + slab_bufctl(slabp)[i] = i + 1; + } + slab_bufctl(slabp)[i - 1] = BUFCTL_END; +} + +static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) +{ + if (CONFIG_ZONE_DMA_FLAG) { + if (flags & GFP_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); + } +} + +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, + int nodeid) +{ + void *objp = index_to_obj(cachep, slabp, slabp->free); + kmem_bufctl_t next; + + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; +#if DEBUG + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + WARN_ON(slabp->nodeid != nodeid); +#endif + slabp->free = next; + + return objp; +} + +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, + void *objp, int nodeid) +{ + unsigned int objnr = obj_to_index(cachep, slabp, objp); + +#if DEBUG + /* Verify that the slab belongs to the intended node */ + WARN_ON(slabp->nodeid != nodeid); + + if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } +#endif + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + slabp->inuse--; +} + +/* + * Map pages beginning at addr to the given cache and slab. This is required + * for the slab allocator to be able to lookup the cache and slab of a + * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. + */ +static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, + void *addr) +{ + int nr_pages; + struct page *page; + + page = virt_to_page(addr); + + nr_pages = 1; + if (likely(!PageCompound(page))) + nr_pages <<= cache->gfporder; + + do { + page_set_cache(page, cache); + page_set_slab(page, slab); + page++; + } while (--nr_pages); +} + +/* + * Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. + */ +static int cache_grow(struct kmem_cache *cachep, + gfp_t flags, int nodeid, void *objp) +{ + struct slab *slabp; + size_t offset; + gfp_t local_flags; + struct kmem_list3 *l3; + + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). + */ + BUG_ON(flags & GFP_SLAB_BUG_MASK); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + + /* Take the l3 list lock to change the colour_next on this node */ + check_irq_off(); + l3 = cachep->nodelists[nodeid]; + spin_lock(&l3->list_lock); + + /* Get colour for the slab, and cal the next value. */ + offset = l3->colour_next; + l3->colour_next++; + if (l3->colour_next >= cachep->colour) + l3->colour_next = 0; + spin_unlock(&l3->list_lock); + + offset *= cachep->colour_off; + + if (local_flags & __GFP_WAIT) + local_irq_enable(); + + /* + * The test for missing atomic flag is performed here, rather than + * the more obvious place, simply to reduce the critical path length + * in kmem_cache_alloc(). If a caller is seriously mis-behaving they + * will eventually be caught here (where it matters). + */ + kmem_flagcheck(cachep, flags); + + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. + */ + if (!objp) + objp = kmem_getpages(cachep, local_flags, nodeid); + if (!objp) + goto failed; + + /* Get slab management. */ + slabp = alloc_slabmgmt(cachep, objp, offset, + local_flags & ~GFP_CONSTRAINT_MASK, nodeid); + if (!slabp) + goto opps1; + + slab_map_pages(cachep, slabp, objp); + + cache_init_objs(cachep, slabp); + + if (local_flags & __GFP_WAIT) + local_irq_disable(); + check_irq_off(); + spin_lock(&l3->list_lock); + + /* Make slab active. */ + list_add_tail(&slabp->list, &(l3->slabs_free)); + STATS_INC_GROWN(cachep); + l3->free_objects += cachep->num; + spin_unlock(&l3->list_lock); + return 1; +opps1: + kmem_freepages(cachep, objp); +failed: + if (local_flags & __GFP_WAIT) + local_irq_disable(); + return 0; +} + +#if DEBUG + +/* + * Perform extra freeing checks: + * - detect bad pointers. + * - POISON/RED_ZONE checking + */ +static void kfree_debugcheck(const void *objp) +{ + if (!virt_addr_valid(objp)) { + printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + (unsigned long)objp); + BUG(); + } +} + +static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) +{ + unsigned long long redzone1, redzone2; + + redzone1 = *dbg_redzone1(cache, obj); + redzone2 = *dbg_redzone2(cache, obj); + + /* + * Redzone is ok. + */ + if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) + return; + + if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) + slab_error(cache, "double free detected"); + else + slab_error(cache, "memory outside object was overwritten"); + + printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", + obj, redzone1, redzone2); +} + +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, + void *caller) +{ + struct page *page; + unsigned int objnr; + struct slab *slabp; + + BUG_ON(virt_to_cache(objp) != cachep); + + objp -= obj_offset(cachep); + kfree_debugcheck(objp); + page = virt_to_head_page(objp); + + slabp = page_get_slab(page); + + if (cachep->flags & SLAB_RED_ZONE) { + verify_redzone_free(cachep, objp); + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = caller; + + objnr = obj_to_index(cachep, slabp, objp); + + BUG_ON(objnr >= cachep->num); + BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); + +#ifdef CONFIG_DEBUG_SLAB_LEAK + slab_bufctl(slabp)[objnr] = BUFCTL_FREE; +#endif + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { + store_stackinfo(cachep, objp, (unsigned long)caller); + kernel_map_pages(virt_to_page(objp), + cachep->buffer_size / PAGE_SIZE, 0); + } else { + poison_obj(cachep, objp, POISON_FREE); + } +#else + poison_obj(cachep, objp, POISON_FREE); +#endif + } + return objp; +} + +static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) +{ + kmem_bufctl_t i; + int entries = 0; + + /* Check slab's freelist to see if this obj is there. */ + for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { + entries++; + if (entries > cachep->num || i >= cachep->num) + goto bad; + } + if (entries != cachep->num - slabp->inuse) { +bad: + printk(KERN_ERR "slab: Internal list corruption detected in " + "cache '%s'(%d), slabp %p(%d). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); + for (i = 0; + i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); + i++) { + if (i % 16 == 0) + printk("\n%03x:", i); + printk(" %02x", ((unsigned char *)slabp)[i]); + } + printk("\n"); + BUG(); + } +} +#else +#define kfree_debugcheck(x) do { } while(0) +#define cache_free_debugcheck(x,objp,z) (objp) +#define check_slabp(x,y) do { } while(0) +#endif + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +{ + int batchcount; + struct kmem_list3 *l3; + struct array_cache *ac; + int node; + +retry: + check_irq_off(); + node = numa_node_id(); + ac = cpu_cache_get(cachep); + batchcount = ac->batchcount; + if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { + /* + * If there was little recent activity on this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. + */ + batchcount = BATCHREFILL_LIMIT; + } + l3 = cachep->nodelists[node]; + + BUG_ON(ac->avail > 0 || !l3); + spin_lock(&l3->list_lock); + + /* See if we can refill from the shared array */ + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + goto alloc_done; + + while (batchcount > 0) { + struct list_head *entry; + struct slab *slabp; + /* Get slab alloc is to come from. */ + entry = l3->slabs_partial.next; + if (entry == &l3->slabs_partial) { + l3->free_touched = 1; + entry = l3->slabs_free.next; + if (entry == &l3->slabs_free) + goto must_grow; + } + + slabp = list_entry(entry, struct slab, list); + check_slabp(cachep, slabp); + check_spinlock_acquired(cachep); + + /* + * The slab was either on partial or free list so + * there must be at least one object available for + * allocation. + */ + BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + + while (slabp->inuse < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, + node); + } + check_slabp(cachep, slabp); + + /* move slabp to correct slabp list: */ + list_del(&slabp->list); + if (slabp->free == BUFCTL_END) + list_add(&slabp->list, &l3->slabs_full); + else + list_add(&slabp->list, &l3->slabs_partial); + } + +must_grow: + l3->free_objects -= ac->avail; +alloc_done: + spin_unlock(&l3->list_lock); + + if (unlikely(!ac->avail)) { + int x; + x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + + /* cache_grow can reenable interrupts, then ac could change. */ + ac = cpu_cache_get(cachep); + if (!x && ac->avail == 0) /* no objects in sight? abort */ + return NULL; + + if (!ac->avail) /* objects refilled by interrupt? */ + goto retry; + } + ac->touched = 1; + return ac->entry[--ac->avail]; +} + +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) +{ + might_sleep_if(flags & __GFP_WAIT); +#if DEBUG + kmem_flagcheck(cachep, flags); +#endif +} + +#if DEBUG +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, void *caller) +{ + if (!objp) + return objp; + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), + cachep->buffer_size / PAGE_SIZE, 1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + poison_obj(cachep, objp, POISON_INUSE); + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = caller; + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR + "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + *dbg_redzone1(cachep, objp) = RED_ACTIVE; + *dbg_redzone2(cachep, objp) = RED_ACTIVE; + } +#ifdef CONFIG_DEBUG_SLAB_LEAK + { + struct slab *slabp; + unsigned objnr; + + slabp = page_get_slab(virt_to_head_page(objp)); + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; + } +#endif + objp += obj_offset(cachep); + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp); +#if ARCH_SLAB_MINALIGN + if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + objp, ARCH_SLAB_MINALIGN); + } +#endif + return objp; +} +#else +#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) +#endif + +#ifdef CONFIG_FAILSLAB + +static struct failslab_attr { + + struct fault_attr attr; + + u32 ignore_gfp_wait; +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + struct dentry *ignore_gfp_wait_file; +#endif + +} failslab = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, +}; + +static int __init setup_failslab(char *str) +{ + return setup_fault_attr(&failslab.attr, str); +} +__setup("failslab=", setup_failslab); + +static int should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + if (cachep == &cache_cache) + return 0; + if (flags & __GFP_NOFAIL) + return 0; + if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) + return 0; + + return should_fail(&failslab.attr, obj_size(cachep)); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init failslab_debugfs(void) +{ + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + int err; + + err = init_fault_attr_dentries(&failslab.attr, "failslab"); + if (err) + return err; + dir = failslab.attr.dentries.dir; + + failslab.ignore_gfp_wait_file = + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_wait); + + if (!failslab.ignore_gfp_wait_file) { + err = -ENOMEM; + debugfs_remove(failslab.ignore_gfp_wait_file); + cleanup_fault_attr_dentries(&failslab.attr); + } + + return err; +} + +late_initcall(failslab_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAILSLAB */ + +static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + return 0; +} + +#endif /* CONFIG_FAILSLAB */ + +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + void *objp; + struct array_cache *ac; + + check_irq_off(); + + ac = cpu_cache_get(cachep); + if (likely(ac->avail)) { + STATS_INC_ALLOCHIT(cachep); + ac->touched = 1; + objp = ac->entry[--ac->avail]; + } else { + STATS_INC_ALLOCMISS(cachep); + objp = cache_alloc_refill(cachep, flags); + } + return objp; +} + +#ifdef CONFIG_NUMA +/* + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt() || (flags & __GFP_THISNODE)) + return NULL; + nid_alloc = nid_here = numa_node_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_mem_spread_node(); + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); + if (nid_alloc != nid_here) + return ____cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* + * Fallback function if there was no memory available and no objects on a + * certain node and fall back is permitted. First we scan all the + * available nodelists for available objects. If that fails then we + * perform an allocation without specifying a node. This allows the page + * allocator to do its reclaim / fallback magic. We then insert the + * slab into the proper nodelist and then allocate from it. + */ +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +{ + struct zonelist *zonelist; + gfp_t local_flags; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); + void *obj = NULL; + int nid; + + if (flags & __GFP_THISNODE) + return NULL; + + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + +retry: + /* + * Look through allowed nodes for objects available + * from existing per node queues. + */ + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + nid = zone_to_nid(zone); + + if (cpuset_zone_allowed_hardwall(zone, flags) && + cache->nodelists[nid] && + cache->nodelists[nid]->free_objects) { + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid); + if (obj) + break; + } + } + + if (!obj) { + /* + * This allocation will be performed within the constraints + * of the current cpuset / memory policy requirements. + * We may trigger various forms of reclaim on the allowed + * set and go into memory reserves if necessary. + */ + if (local_flags & __GFP_WAIT) + local_irq_enable(); + kmem_flagcheck(cache, flags); + obj = kmem_getpages(cache, local_flags, -1); + if (local_flags & __GFP_WAIT) + local_irq_disable(); + if (obj) { + /* + * Insert into the appropriate per node queues + */ + nid = page_to_nid(virt_to_page(obj)); + if (cache_grow(cache, flags, nid, obj)) { + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid); + if (!obj) + /* + * Another processor may allocate the + * objects in the slab since we are + * not holding any locks. + */ + goto retry; + } else { + /* cache_grow already freed obj */ + obj = NULL; + } + } + } + return obj; +} + +/* + * A interface to enable slab creation on nodeid + */ +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) +{ + struct list_head *entry; + struct slab *slabp; + struct kmem_list3 *l3; + void *obj; + int x; + + l3 = cachep->nodelists[nodeid]; + BUG_ON(!l3); + +retry: + check_irq_off(); + spin_lock(&l3->list_lock); + entry = l3->slabs_partial.next; + if (entry == &l3->slabs_partial) { + l3->free_touched = 1; + entry = l3->slabs_free.next; + if (entry == &l3->slabs_free) + goto must_grow; + } + + slabp = list_entry(entry, struct slab, list); + check_spinlock_acquired_node(cachep, nodeid); + check_slabp(cachep, slabp); + + STATS_INC_NODEALLOCS(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + BUG_ON(slabp->inuse == cachep->num); + + obj = slab_get_obj(cachep, slabp, nodeid); + check_slabp(cachep, slabp); + l3->free_objects--; + /* move slabp to correct slabp list: */ + list_del(&slabp->list); + + if (slabp->free == BUFCTL_END) + list_add(&slabp->list, &l3->slabs_full); + else + list_add(&slabp->list, &l3->slabs_partial); + + spin_unlock(&l3->list_lock); + goto done; + +must_grow: + spin_unlock(&l3->list_lock); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + if (x) + goto retry; + + return fallback_alloc(cachep, flags); + +done: + return obj; +} + +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * @caller: return address of caller, used for debug information + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + */ +static __always_inline void * +__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *caller) +{ + unsigned long save_flags; + void *ptr; + + if (should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + + if (unlikely(nodeid == -1)) + nodeid = numa_node_id(); + + if (unlikely(!cachep->nodelists[nodeid])) { + /* Node not bootstrapped yet */ + ptr = fallback_alloc(cachep, flags); + goto out; + } + + if (nodeid == numa_node_id()) { + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ + ptr = ____cache_alloc(cachep, flags); + if (ptr) + goto out; + } + /* ___cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + out: + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + + if (unlikely((flags & __GFP_ZERO) && ptr)) + memset(ptr, 0, obj_size(cachep)); + + return ptr; +} + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +{ + void *objp; + + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + objp = alternate_node_alloc(cache, flags); + if (objp) + goto out; + } + objp = ____cache_alloc(cache, flags); + + /* + * We may just have run out of memory on the local node. + * ____cache_alloc_node() knows how to locate memory on other nodes + */ + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_node_id()); + + out: + return objp; +} +#else + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return ____cache_alloc(cachep, flags); +} + +#endif /* CONFIG_NUMA */ + +static __always_inline void * +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +{ + unsigned long save_flags; + void *objp; + + if (should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + objp = __do_cache_alloc(cachep, flags); + local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + prefetchw(objp); + + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); + + return objp; +} + +/* + * Caller needs to acquire correct kmem_list's list_lock + */ +static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, + int node) +{ + int i; + struct kmem_list3 *l3; + + for (i = 0; i < nr_objects; i++) { + void *objp = objpp[i]; + struct slab *slabp; + + slabp = virt_to_slab(objp); + l3 = cachep->nodelists[node]; + list_del(&slabp->list); + check_spinlock_acquired_node(cachep, node); + check_slabp(cachep, slabp); + slab_put_obj(cachep, slabp, objp, node); + STATS_DEC_ACTIVE(cachep); + l3->free_objects++; + check_slabp(cachep, slabp); + + /* fixup slab chains */ + if (slabp->inuse == 0) { + if (l3->free_objects > l3->free_limit) { + l3->free_objects -= cachep->num; + /* No need to drop any previously held + * lock here, even if we have a off-slab slab + * descriptor it is guaranteed to come from + * a different cache, refer to comments before + * alloc_slabmgmt. + */ + slab_destroy(cachep, slabp); + } else { + list_add(&slabp->list, &l3->slabs_free); + } + } else { + /* Unconditionally move a slab to the end of the + * partial list on free - maximum time for the + * other objects to be freed, too. + */ + list_add_tail(&slabp->list, &l3->slabs_partial); + } + } +} + +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +{ + int batchcount; + struct kmem_list3 *l3; + int node = numa_node_id(); + + batchcount = ac->batchcount; +#if DEBUG + BUG_ON(!batchcount || batchcount > ac->avail); +#endif + check_irq_off(); + l3 = cachep->nodelists[node]; + spin_lock(&l3->list_lock); + if (l3->shared) { + struct array_cache *shared_array = l3->shared; + int max = shared_array->limit - shared_array->avail; + if (max) { + if (batchcount > max) + batchcount = max; + memcpy(&(shared_array->entry[shared_array->avail]), + ac->entry, sizeof(void *) * batchcount); + shared_array->avail += batchcount; + goto free_done; + } + } + + free_block(cachep, ac->entry, batchcount, node); +free_done: +#if STATS + { + int i = 0; + struct list_head *p; + + p = l3->slabs_free.next; + while (p != &(l3->slabs_free)) { + struct slab *slabp; + + slabp = list_entry(p, struct slab, list); + BUG_ON(slabp->inuse); + + i++; + p = p->next; + } + STATS_SET_FREEABLE(cachep, i); + } +#endif + spin_unlock(&l3->list_lock); + ac->avail -= batchcount; + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); +} + +/* + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. + */ +static inline void __cache_free(struct kmem_cache *cachep, void *objp) +{ + struct array_cache *ac = cpu_cache_get(cachep); + + check_irq_off(); + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. + */ + if (numa_platform && cache_free_alien(cachep, objp)) + return; + + if (likely(ac->avail < ac->limit)) { + STATS_INC_FREEHIT(cachep); + ac->entry[ac->avail++] = objp; + return; + } else { + STATS_INC_FREEMISS(cachep); + cache_flusharray(cachep, ac); + ac->entry[ac->avail++] = objp; + } +} + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. The flags are only relevant + * if the cache has no available objects. + */ +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +/** + * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. + * @cachep: the cache we're checking against + * @ptr: pointer to validate + * + * This verifies that the untrusted pointer looks sane; + * it is _not_ a guarantee that the pointer is actually + * part of the slab cache in question, but it at least + * validates that the pointer can be dereferenced and + * looks half-way sane. + * + * Currently only used for dentry validation. + */ +int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) +{ + unsigned long addr = (unsigned long)ptr; + unsigned long min_addr = PAGE_OFFSET; + unsigned long align_mask = BYTES_PER_WORD - 1; + unsigned long size = cachep->buffer_size; + struct page *page; + + if (unlikely(addr < min_addr)) + goto out; + if (unlikely(addr > (unsigned long)high_memory - size)) + goto out; + if (unlikely(addr & align_mask)) + goto out; + if (unlikely(!kern_addr_valid(addr))) + goto out; + if (unlikely(!kern_addr_valid(addr + size - 1))) + goto out; + page = virt_to_page(ptr); + if (unlikely(!PageSlab(page))) + goto out; + if (unlikely(page_get_cache(page) != cachep)) + goto out; + return 1; +out: + return 0; +} + +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) +{ + struct kmem_cache *cachep; + + cachep = kmem_find_general_cachep(size, flags); + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; + return kmem_cache_alloc_node(cachep, flags, node); +} + +#ifdef CONFIG_DEBUG_SLAB +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, void *caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); +#else +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, NULL); +} +EXPORT_SYMBOL(__kmalloc_node); +#endif /* CONFIG_DEBUG_SLAB */ +#endif /* CONFIG_NUMA */ + +/** + * __do_kmalloc - allocate memory + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). + * @caller: function caller for debug tracking of the caller + */ +static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, + void *caller) +{ + struct kmem_cache *cachep; + + /* If you want to save a few bytes .text space: replace + * __ with kmem_. + * Then kmalloc uses the uninlined functions instead of the inline + * functions. + */ + cachep = __find_general_cachep(size, flags); + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; + return __cache_alloc(cachep, flags, caller); +} + + +#ifdef CONFIG_DEBUG_SLAB +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc(size, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) +{ + return __do_kmalloc(size, flags, caller); +} +EXPORT_SYMBOL(__kmalloc_track_caller); + +#else +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc(size, flags, NULL); +} +EXPORT_SYMBOL(__kmalloc); +#endif + +/** + * kmem_cache_free - Deallocate an object + * @cachep: The cache the allocation was from. + * @objp: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ +void kmem_cache_free(struct kmem_cache *cachep, void *objp) +{ + unsigned long flags; + + local_irq_save(flags); + debug_check_no_locks_freed(objp, obj_size(cachep)); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, obj_size(cachep)); + __cache_free(cachep, objp); + local_irq_restore(flags); +} +EXPORT_SYMBOL(kmem_cache_free); + +/** + * kfree - free previously allocated memory + * @objp: pointer returned by kmalloc. + * + * If @objp is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *objp) +{ + struct kmem_cache *c; + unsigned long flags; + + if (unlikely(ZERO_OR_NULL_PTR(objp))) + return; + local_irq_save(flags); + kfree_debugcheck(objp); + c = virt_to_cache(objp); + debug_check_no_locks_freed(objp, obj_size(c)); + debug_check_no_obj_freed(objp, obj_size(c)); + __cache_free(c, (void *)objp); + local_irq_restore(flags); +} +EXPORT_SYMBOL(kfree); + +unsigned int kmem_cache_size(struct kmem_cache *cachep) +{ + return obj_size(cachep); +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *cachep) +{ + return cachep->name; +} +EXPORT_SYMBOL_GPL(kmem_cache_name); + +/* + * This initializes kmem_list3 or resizes various caches for all nodes. + */ +static int alloc_kmemlist(struct kmem_cache *cachep) +{ + int node; + struct kmem_list3 *l3; + struct array_cache *new_shared; + struct array_cache **new_alien = NULL; + + for_each_online_node(node) { + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) + goto fail; + } + + new_shared = NULL; + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared*cachep->batchcount, + 0xbaadf00d); + if (!new_shared) { + free_alien_cache(new_alien); + goto fail; + } + } + + l3 = cachep->nodelists[node]; + if (l3) { + struct array_cache *shared = l3->shared; + + spin_lock_irq(&l3->list_lock); + + if (shared) + free_block(cachep, shared->entry, + shared->avail, node); + + l3->shared = new_shared; + if (!l3->alien) { + l3->alien = new_alien; + new_alien = NULL; + } + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&l3->list_lock); + kfree(shared); + free_alien_cache(new_alien); + continue; + } + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + if (!l3) { + free_alien_cache(new_alien); + kfree(new_shared); + goto fail; + } + + kmem_list3_init(l3); + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + l3->shared = new_shared; + l3->alien = new_alien; + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + cachep->nodelists[node] = l3; + } + return 0; + +fail: + if (!cachep->next.next) { + /* Cache is not active yet. Roll back what we did */ + node--; + while (node >= 0) { + if (cachep->nodelists[node]) { + l3 = cachep->nodelists[node]; + + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + cachep->nodelists[node] = NULL; + } + node--; + } + } + return -ENOMEM; +} + +struct ccupdate_struct { + struct kmem_cache *cachep; + struct array_cache *new[NR_CPUS]; +}; + +static void do_ccupdate_local(void *info) +{ + struct ccupdate_struct *new = info; + struct array_cache *old; + + check_irq_off(); + old = cpu_cache_get(new->cachep); + + new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; + new->new[smp_processor_id()] = old; +} + +/* Always called with the cache_chain_mutex held */ +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared) +{ + struct ccupdate_struct *new; + int i; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + + for_each_online_cpu(i) { + new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + batchcount); + if (!new->new[i]) { + for (i--; i >= 0; i--) + kfree(new->new[i]); + kfree(new); + return -ENOMEM; + } + } + new->cachep = cachep; + + on_each_cpu(do_ccupdate_local, (void *)new, 1); + + check_irq_on(); + cachep->batchcount = batchcount; + cachep->limit = limit; + cachep->shared = shared; + + for_each_online_cpu(i) { + struct array_cache *ccold = new->new[i]; + if (!ccold) + continue; + spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); + spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + kfree(ccold); + } + kfree(new); + return alloc_kmemlist(cachep); +} + +/* Called with cache_chain_mutex held always */ +static int enable_cpucache(struct kmem_cache *cachep) +{ + int err; + int limit, shared; + + /* + * The head array serves three purposes: + * - create a LIFO ordering, i.e. return objects that are cache-warm + * - reduce the number of spinlock operations. + * - reduce the number of linked list operations on the slab and + * bufctl chains: array operations are cheaper. + * The numbers are guessed, we should auto-tune as described by + * Bonwick. + */ + if (cachep->buffer_size > 131072) + limit = 1; + else if (cachep->buffer_size > PAGE_SIZE) + limit = 8; + else if (cachep->buffer_size > 1024) + limit = 24; + else if (cachep->buffer_size > 256) + limit = 54; + else + limit = 120; + + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound + * allocation behaviour: Most allocs on one cpu, most free operations + * on another cpu. For these cases, an efficient object passing between + * cpus is necessary. This is provided by a shared array. The array + * replaces Bonwick's magazine layer. + * On uniprocessor, it's functionally equivalent (but less efficient) + * to a larger limit. Thus disabled by default. + */ + shared = 0; + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) + shared = 8; + +#if DEBUG + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount + */ + if (limit > 32) + limit = 32; +#endif + err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); + if (err) + printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + cachep->name, -err); + return err; +} + +/* + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. Note that the l3 listlock also protects the array_cache + * if drain_array() is used on the shared array. + */ +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) +{ + int tofree; + + if (!ac || !ac->avail) + return; + if (ac->touched && !force) { + ac->touched = 0; + } else { + spin_lock_irq(&l3->list_lock); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + free_block(cachep, ac->entry, tofree, node); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); + } + spin_unlock_irq(&l3->list_lock); + } +} + +/** + * cache_reap - Reclaim memory from caches. + * @w: work descriptor + * + * Called from workqueue/eventd every few seconds. + * Purpose: + * - clear the per-cpu caches for this CPU. + * - return freeable pages to the main free memory pool. + * + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. + */ +static void cache_reap(struct work_struct *w) +{ + struct kmem_cache *searchp; + struct kmem_list3 *l3; + int node = numa_node_id(); + struct delayed_work *work = + container_of(w, struct delayed_work, work); + + if (!mutex_trylock(&cache_chain_mutex)) + /* Give up. Setup the next iteration. */ + goto out; + + list_for_each_entry(searchp, &cache_chain, next) { + check_irq_on(); + + /* + * We only take the l3 lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ + l3 = searchp->nodelists[node]; + + reap_alien(searchp, l3); + + drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. + */ + if (time_after(l3->next_reap, jiffies)) + goto next; + + l3->next_reap = jiffies + REAPTIMEOUT_LIST3; + + drain_array(searchp, l3, l3->shared, 0, node); + + if (l3->free_touched) + l3->free_touched = 0; + else { + int freed; + + freed = drain_freelist(searchp, l3, (l3->free_limit + + 5 * searchp->num - 1) / (5 * searchp->num)); + STATS_ADD_REAPED(searchp, freed); + } +next: + cond_resched(); + } + check_irq_on(); + mutex_unlock(&cache_chain_mutex); + next_reap_node(); +out: + /* Set up the next iteration */ + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); +} + +#ifdef CONFIG_SLABINFO + +static void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. + */ +#if STATS + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name <active_objs> <num_objs> <objsize> " + "<objperslab> <pagesperslab>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); + seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); +#if STATS + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " + "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); + seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); +#endif + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&cache_chain_mutex); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&cache_chain, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &cache_chain, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&cache_chain_mutex); +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); + struct slab *slabp; + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs = 0; + unsigned long num_slabs, free_objects = 0, shared_avail = 0; + const char *name; + char *error = NULL; + int node; + struct kmem_list3 *l3; + + active_objs = 0; + num_slabs = 0; + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + check_irq_on(); + spin_lock_irq(&l3->list_lock); + + list_for_each_entry(slabp, &l3->slabs_full, list) { + if (slabp->inuse != cachep->num && !error) + error = "slabs_full accounting error"; + active_objs += cachep->num; + active_slabs++; + } + list_for_each_entry(slabp, &l3->slabs_partial, list) { + if (slabp->inuse == cachep->num && !error) + error = "slabs_partial inuse accounting error"; + if (!slabp->inuse && !error) + error = "slabs_partial/inuse accounting error"; + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each_entry(slabp, &l3->slabs_free, list) { + if (slabp->inuse && !error) + error = "slabs_free/inuse accounting error"; + num_slabs++; + } + free_objects += l3->free_objects; + if (l3->shared) + shared_avail += l3->shared->avail; + + spin_unlock_irq(&l3->list_lock); + } + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; + if (num_objs - active_objs != free_objects && !error) + error = "free_objects accounting error"; + + name = cachep->name; + if (error) + printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + name, active_objs, num_objs, cachep->buffer_size, + cachep->num, (1 << cachep->gfporder)); + seq_printf(m, " : tunables %4u %4u %4u", + cachep->limit, cachep->batchcount, cachep->shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + active_slabs, num_slabs, shared_avail); +#if STATS + { /* list3 stats */ + unsigned long high = cachep->high_mark; + unsigned long allocs = cachep->num_allocations; + unsigned long grown = cachep->grown; + unsigned long reaped = cachep->reaped; + unsigned long errors = cachep->errors; + unsigned long max_freeable = cachep->max_freeable; + unsigned long node_allocs = cachep->node_allocs; + unsigned long node_frees = cachep->node_frees; + unsigned long overflows = cachep->node_overflow; + + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ + %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees, overflows); + } + /* cpu stats */ + { + unsigned long allochit = atomic_read(&cachep->allochit); + unsigned long allocmiss = atomic_read(&cachep->allocmiss); + unsigned long freehit = atomic_read(&cachep->freehit); + unsigned long freemiss = atomic_read(&cachep->freemiss); + + seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", + allochit, allocmiss, freehit, freemiss); + } +#endif + seq_putc(m, '\n'); + return 0; +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ + +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +#define MAX_SLABINFO_WRITE 128 +/** + * slabinfo_write - Tuning for the slab allocator + * @file: unused + * @buffer: user buffer + * @count: data length + * @ppos: unused + */ +ssize_t slabinfo_write(struct file *file, const char __user * buffer, + size_t count, loff_t *ppos) +{ + char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; + int limit, batchcount, shared, res; + struct kmem_cache *cachep; + + if (count > MAX_SLABINFO_WRITE) + return -EINVAL; + if (copy_from_user(&kbuf, buffer, count)) + return -EFAULT; + kbuf[MAX_SLABINFO_WRITE] = '\0'; + + tmp = strchr(kbuf, ' '); + if (!tmp) + return -EINVAL; + *tmp = '\0'; + tmp++; + if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) + return -EINVAL; + + /* Find the cache in the chain of caches. */ + mutex_lock(&cache_chain_mutex); + res = -EINVAL; + list_for_each_entry(cachep, &cache_chain, next) { + if (!strcmp(cachep->name, kbuf)) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { + res = 0; + } else { + res = do_tune_cpucache(cachep, limit, + batchcount, shared); + } + break; + } + } + mutex_unlock(&cache_chain_mutex); + if (res >= 0) + res = count; + return res; +} + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void *leaks_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&cache_chain_mutex); + return seq_list_start(&cache_chain, *pos); +} + +static inline int add_caller(unsigned long *n, unsigned long v) +{ + unsigned long *p; + int l; + if (!v) + return 1; + l = n[1]; + p = n + 2; + while (l) { + int i = l/2; + unsigned long *q = p + 2 * i; + if (*q == v) { + q[1]++; + return 1; + } + if (*q > v) { + l = i; + } else { + p = q + 2; + l -= i + 1; + } + } + if (++n[1] == n[0]) + return 0; + memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); + p[0] = v; + p[1] = 1; + return 1; +} + +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +{ + void *p; + int i; + if (n[0] == n[1]) + return; + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { + if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + return; + } +} + +static void show_symbol(struct seq_file *m, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset, size; + char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; + + if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { + seq_printf(m, "%s+%#lx/%#lx", name, offset, size); + if (modname[0]) + seq_printf(m, " [%s]", modname); + return; + } +#endif + seq_printf(m, "%p", (void *)address); +} + +static int leaks_show(struct seq_file *m, void *p) +{ + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); + struct slab *slabp; + struct kmem_list3 *l3; + const char *name; + unsigned long *n = m->private; + int node; + int i; + + if (!(cachep->flags & SLAB_STORE_USER)) + return 0; + if (!(cachep->flags & SLAB_RED_ZONE)) + return 0; + + /* OK, we can do it */ + + n[1] = 0; + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + check_irq_on(); + spin_lock_irq(&l3->list_lock); + + list_for_each_entry(slabp, &l3->slabs_full, list) + handle_slab(n, cachep, slabp); + list_for_each_entry(slabp, &l3->slabs_partial, list) + handle_slab(n, cachep, slabp); + spin_unlock_irq(&l3->list_lock); + } + name = cachep->name; + if (n[0] == n[1]) { + /* Increase the buffer size */ + mutex_unlock(&cache_chain_mutex); + m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + if (!m->private) { + /* Too bad, we are really out */ + m->private = n; + mutex_lock(&cache_chain_mutex); + return -ENOMEM; + } + *(unsigned long *)m->private = n[0] * 2; + kfree(n); + mutex_lock(&cache_chain_mutex); + /* Now make sure this entry will be retried */ + m->count = m->size; + return 0; + } + for (i = 0; i < n[1]; i++) { + seq_printf(m, "%s: %lu ", name, n[2*i+3]); + show_symbol(m, n[2*i+2]); + seq_putc(m, '\n'); + } + + return 0; +} + +static const struct seq_operations slabstats_op = { + .start = leaks_start, + .next = s_next, + .stop = s_stop, + .show = leaks_show, +}; + +static int slabstats_open(struct inode *inode, struct file *file) +{ + unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); + int ret = -ENOMEM; + if (n) { + ret = seq_open(file, &slabstats_op); + if (!ret) { + struct seq_file *m = file->private_data; + *n = PAGE_SIZE / (2 * sizeof(unsigned long)); + m->private = n; + n = NULL; + } + kfree(n); + } + return ret; +} + +static const struct file_operations proc_slabstats_operations = { + .open = slabstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); +#ifdef CONFIG_DEBUG_SLAB_LEAK + proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); +#endif + return 0; +} +module_init(slab_proc_init); +#endif + +/** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object + * + * kmalloc may internally round up allocations and return more memory + * than requested. ksize() can be used to determine the actual amount of + * memory allocated. The caller may use this additional memory, even though + * a smaller amount of memory was initially specified with the kmalloc call. + * The caller must guarantee that objp points to a valid object previously + * allocated with either kmalloc() or kmem_cache_alloc(). The object + * must not be freed during the duration of the call. + */ +size_t ksize(const void *objp) +{ + BUG_ON(!objp); + if (unlikely(objp == ZERO_SIZE_PTR)) + return 0; + + return obj_size(virt_to_cache(objp)); +} diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 0000000..bf7e8fc --- /dev/null +++ b/mm/slob.c @@ -0,0 +1,647 @@ +/* + * SLOB Allocator: Simple List Of Blocks + * + * Matt Mackall <mpm@selenic.com> 12/30/03 + * + * NUMA support by Paul Mundt, 2007. + * + * How SLOB works: + * + * The core of SLOB is a traditional K&R style heap allocator, with + * support for returning aligned objects. The granularity of this + * allocator is as little as 2 bytes, however typically most architectures + * will require 4 bytes on 32-bit and 8 bytes on 64-bit. + * + * The slob heap is a set of linked list of pages from alloc_pages(), + * and within each page, there is a singly-linked list of free blocks + * (slob_t). The heap is grown on demand. To reduce fragmentation, + * heap pages are segregated into three lists, with objects less than + * 256 bytes, objects less than 1024 bytes, and all other objects. + * + * Allocation from heap involves first searching for a page with + * sufficient free blocks (using a next-fit-like approach) followed by + * a first-fit scan of the page. Deallocation inserts objects back + * into the free list in address order, so this is effectively an + * address-ordered first fit. + * + * Above this is an implementation of kmalloc/kfree. Blocks returned + * from kmalloc are prepended with a 4-byte header with the kmalloc size. + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * alloc_pages() directly, allocating compound pages so the page order + * does not have to be separately tracked, and also stores the exact + * allocation size in page->private so that it can be used to accurately + * provide ksize(). These objects are detected in kfree() because slob_page() + * is false for them. + * + * SLAB is emulated on top of SLOB by simply calling constructors and + * destructors for every SLAB allocation. Objects are returned with the + * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which + * case the low-level allocator will fragment blocks to create the proper + * alignment. Again, objects of page-size or greater are allocated by + * calling alloc_pages(). As SLAB objects know their size, no separate + * size bookkeeping is necessary and there is essentially no allocation + * space overhead, and compound pages aren't needed for multi-page + * allocations. + * + * NUMA support in SLOB is fairly simplistic, pushing most of the real + * logic down to the page allocator, and simply doing the node accounting + * on the upper levels. In the event that a node id is explicitly + * provided, alloc_pages_node() with the specified node id is used + * instead. The common case (or when the node id isn't explicitly provided) + * will default to the current node, as per numa_node_id(). + * + * Node aware pages are still inserted in to the global freelist, and + * these are scanned for by matching against the node id encoded in the + * page flags. As a result, block allocations that can be satisfied from + * the freelist will only be done so on pages residing on the same node, + * in order to prevent random node placement. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <asm/atomic.h> + +/* + * slob_block has a field 'units', which indicates size of block if +ve, + * or offset of next block if -ve (in SLOB_UNITs). + * + * Free blocks of size 1 unit simply contain the offset of the next block. + * Those with larger size contain their size in the first SLOB_UNIT of + * memory, and the offset of the next free block in the second SLOB_UNIT. + */ +#if PAGE_SIZE <= (32767 * 2) +typedef s16 slobidx_t; +#else +typedef s32 slobidx_t; +#endif + +struct slob_block { + slobidx_t units; +}; +typedef struct slob_block slob_t; + +/* + * We use struct page fields to manage some slob allocation aspects, + * however to avoid the horrible mess in include/linux/mm_types.h, we'll + * just define our own struct page type variant here. + */ +struct slob_page { + union { + struct { + unsigned long flags; /* mandatory */ + atomic_t _count; /* mandatory */ + slobidx_t units; /* free units left in page */ + unsigned long pad[2]; + slob_t *free; /* first free slob_t in page */ + struct list_head list; /* linked list of free pages */ + }; + struct page page; + }; +}; +static inline void struct_slob_page_wrong_size(void) +{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } + +/* + * free_slob_page: call before a slob_page is returned to the page allocator. + */ +static inline void free_slob_page(struct slob_page *sp) +{ + reset_page_mapcount(&sp->page); + sp->page.mapping = NULL; +} + +/* + * All partially free slob pages go on these lists. + */ +#define SLOB_BREAK1 256 +#define SLOB_BREAK2 1024 +static LIST_HEAD(free_slob_small); +static LIST_HEAD(free_slob_medium); +static LIST_HEAD(free_slob_large); + +/* + * slob_page: True for all slob pages (false for bigblock pages) + */ +static inline int slob_page(struct slob_page *sp) +{ + return PageSlobPage((struct page *)sp); +} + +static inline void set_slob_page(struct slob_page *sp) +{ + __SetPageSlobPage((struct page *)sp); +} + +static inline void clear_slob_page(struct slob_page *sp) +{ + __ClearPageSlobPage((struct page *)sp); +} + +/* + * slob_page_free: true for pages on free_slob_pages list. + */ +static inline int slob_page_free(struct slob_page *sp) +{ + return PageSlobFree((struct page *)sp); +} + +static void set_slob_page_free(struct slob_page *sp, struct list_head *list) +{ + list_add(&sp->list, list); + __SetPageSlobFree((struct page *)sp); +} + +static inline void clear_slob_page_free(struct slob_page *sp) +{ + list_del(&sp->list); + __ClearPageSlobFree((struct page *)sp); +} + +#define SLOB_UNIT sizeof(slob_t) +#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) +#define SLOB_ALIGN L1_CACHE_BYTES + +/* + * struct slob_rcu is inserted at the tail of allocated slob blocks, which + * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free + * the block using call_rcu. + */ +struct slob_rcu { + struct rcu_head head; + int size; +}; + +/* + * slob_lock protects all slob allocator structures. + */ +static DEFINE_SPINLOCK(slob_lock); + +/* + * Encode the given size and next info into a free slob block s. + */ +static void set_slob(slob_t *s, slobidx_t size, slob_t *next) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t offset = next - base; + + if (size > 1) { + s[0].units = size; + s[1].units = offset; + } else + s[0].units = -offset; +} + +/* + * Return the size of a slob block. + */ +static slobidx_t slob_units(slob_t *s) +{ + if (s->units > 0) + return s->units; + return 1; +} + +/* + * Return the next free slob block pointer after this one. + */ +static slob_t *slob_next(slob_t *s) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t next; + + if (s[0].units < 0) + next = -s[0].units; + else + next = s[1].units; + return base+next; +} + +/* + * Returns true if s is the last free block in its page. + */ +static int slob_last(slob_t *s) +{ + return !((unsigned long)slob_next(s) & ~PAGE_MASK); +} + +static void *slob_new_page(gfp_t gfp, int order, int node) +{ + void *page; + +#ifdef CONFIG_NUMA + if (node != -1) + page = alloc_pages_node(node, gfp, order); + else +#endif + page = alloc_pages(gfp, order); + + if (!page) + return NULL; + + return page_address(page); +} + +/* + * Allocate a slob block within a given slob_page sp. + */ +static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) +{ + slob_t *prev, *cur, *aligned = 0; + int delta = 0, units = SLOB_UNITS(size); + + for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { + slobidx_t avail = slob_units(cur); + + if (align) { + aligned = (slob_t *)ALIGN((unsigned long)cur, align); + delta = aligned - cur; + } + if (avail >= units + delta) { /* room enough? */ + slob_t *next; + + if (delta) { /* need to fragment head to align? */ + next = slob_next(cur); + set_slob(aligned, avail - delta, next); + set_slob(cur, delta, aligned); + prev = cur; + cur = aligned; + avail = slob_units(cur); + } + + next = slob_next(cur); + if (avail == units) { /* exact fit? unlink. */ + if (prev) + set_slob(prev, slob_units(prev), next); + else + sp->free = next; + } else { /* fragment */ + if (prev) + set_slob(prev, slob_units(prev), cur + units); + else + sp->free = cur + units; + set_slob(cur + units, avail - units, next); + } + + sp->units -= units; + if (!sp->units) + clear_slob_page_free(sp); + return cur; + } + if (slob_last(cur)) + return NULL; + } +} + +/* + * slob_alloc: entry point into the slob allocator. + */ +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +{ + struct slob_page *sp; + struct list_head *prev; + struct list_head *slob_list; + slob_t *b = NULL; + unsigned long flags; + + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + + spin_lock_irqsave(&slob_lock, flags); + /* Iterate through each partially free page, try to find room */ + list_for_each_entry(sp, slob_list, list) { +#ifdef CONFIG_NUMA + /* + * If there's a node specification, search for a partial + * page with a matching node id in the freelist. + */ + if (node != -1 && page_to_nid(&sp->page) != node) + continue; +#endif + /* Enough room on this page? */ + if (sp->units < SLOB_UNITS(size)) + continue; + + /* Attempt to alloc */ + prev = sp->list.prev; + b = slob_page_alloc(sp, size, align); + if (!b) + continue; + + /* Improve fragment distribution and reduce our average + * search time by starting our next search here. (see + * Knuth vol 1, sec 2.5, pg 449) */ + if (prev != slob_list->prev && + slob_list->next != prev->next) + list_move_tail(slob_list, prev->next); + break; + } + spin_unlock_irqrestore(&slob_lock, flags); + + /* Not enough space: must allocate a new page */ + if (!b) { + b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); + if (!b) + return 0; + sp = (struct slob_page *)virt_to_page(b); + set_slob_page(sp); + + spin_lock_irqsave(&slob_lock, flags); + sp->units = SLOB_UNITS(PAGE_SIZE); + sp->free = b; + INIT_LIST_HEAD(&sp->list); + set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); + set_slob_page_free(sp, slob_list); + b = slob_page_alloc(sp, size, align); + BUG_ON(!b); + spin_unlock_irqrestore(&slob_lock, flags); + } + if (unlikely((gfp & __GFP_ZERO) && b)) + memset(b, 0, size); + return b; +} + +/* + * slob_free: entry point into the slob allocator. + */ +static void slob_free(void *block, int size) +{ + struct slob_page *sp; + slob_t *prev, *next, *b = (slob_t *)block; + slobidx_t units; + unsigned long flags; + + if (unlikely(ZERO_OR_NULL_PTR(block))) + return; + BUG_ON(!size); + + sp = (struct slob_page *)virt_to_page(block); + units = SLOB_UNITS(size); + + spin_lock_irqsave(&slob_lock, flags); + + if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { + /* Go directly to page allocator. Do not pass slob allocator */ + if (slob_page_free(sp)) + clear_slob_page_free(sp); + clear_slob_page(sp); + free_slob_page(sp); + free_page((unsigned long)b); + goto out; + } + + if (!slob_page_free(sp)) { + /* This slob page is about to become partially free. Easy! */ + sp->units = units; + sp->free = b; + set_slob(b, units, + (void *)((unsigned long)(b + + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); + set_slob_page_free(sp, &free_slob_small); + goto out; + } + + /* + * Otherwise the page is already partially free, so find reinsertion + * point. + */ + sp->units += units; + + if (b < sp->free) { + if (b + units == sp->free) { + units += slob_units(sp->free); + sp->free = slob_next(sp->free); + } + set_slob(b, units, sp->free); + sp->free = b; + } else { + prev = sp->free; + next = slob_next(prev); + while (b > next) { + prev = next; + next = slob_next(prev); + } + + if (!slob_last(prev) && b + units == next) { + units += slob_units(next); + set_slob(b, units, slob_next(next)); + } else + set_slob(b, units, next); + + if (prev + slob_units(prev) == b) { + units = slob_units(b) + slob_units(prev); + set_slob(prev, units, slob_next(b)); + } else + set_slob(prev, slob_units(prev), b); + } +out: + spin_unlock_irqrestore(&slob_lock, flags); +} + +/* + * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. + */ + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) +#endif + +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + unsigned int *m; + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + + if (size < PAGE_SIZE - align) { + if (!size) + return ZERO_SIZE_PTR; + + m = slob_alloc(size + align, gfp, align, node); + if (!m) + return NULL; + *m = size; + return (void *)m + align; + } else { + void *ret; + + ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + if (ret) { + struct page *page; + page = virt_to_page(ret); + page->private = size; + } + return ret; + } +} +EXPORT_SYMBOL(__kmalloc_node); + +void kfree(const void *block) +{ + struct slob_page *sp; + + if (unlikely(ZERO_OR_NULL_PTR(block))) + return; + + sp = (struct slob_page *)virt_to_page(block); + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + slob_free(m, *m + align); + } else + put_page(&sp->page); +} +EXPORT_SYMBOL(kfree); + +/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ +size_t ksize(const void *block) +{ + struct slob_page *sp; + + BUG_ON(!block); + if (unlikely(block == ZERO_SIZE_PTR)) + return 0; + + sp = (struct slob_page *)virt_to_page(block); + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; + } else + return sp->page.private; +} + +struct kmem_cache { + unsigned int size, align; + unsigned long flags; + const char *name; + void (*ctor)(void *); +}; + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *c; + + c = slob_alloc(sizeof(struct kmem_cache), + GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); + + if (c) { + c->name = name; + c->size = size; + if (flags & SLAB_DESTROY_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); + } + c->flags = flags; + c->ctor = ctor; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; + if (c->align < ARCH_SLAB_MINALIGN) + c->align = ARCH_SLAB_MINALIGN; + if (c->align < align) + c->align = align; + } else if (flags & SLAB_PANIC) + panic("Cannot create slab cache %s\n", name); + + return c; +} +EXPORT_SYMBOL(kmem_cache_create); + +void kmem_cache_destroy(struct kmem_cache *c) +{ + slob_free(c, sizeof(struct kmem_cache)); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +{ + void *b; + + if (c->size < PAGE_SIZE) + b = slob_alloc(c->size, flags, c->align, node); + else + b = slob_new_page(flags, get_order(c->size), node); + + if (c->ctor) + c->ctor(b); + + return b; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +static void __kmem_cache_free(void *b, int size) +{ + if (size < PAGE_SIZE) + slob_free(b, size); + else + free_pages((unsigned long)b, get_order(size)); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slob_rcu *slob_rcu = (struct slob_rcu *)head; + void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu)); + + __kmem_cache_free(b, slob_rcu->size); +} + +void kmem_cache_free(struct kmem_cache *c, void *b) +{ + if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + struct slob_rcu *slob_rcu; + slob_rcu = b + (c->size - sizeof(struct slob_rcu)); + INIT_RCU_HEAD(&slob_rcu->head); + slob_rcu->size = c->size; + call_rcu(&slob_rcu->head, kmem_rcu_free); + } else { + __kmem_cache_free(b, c->size); + } +} +EXPORT_SYMBOL(kmem_cache_free); + +unsigned int kmem_cache_size(struct kmem_cache *c) +{ + return c->size; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *c) +{ + return c->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +int kmem_cache_shrink(struct kmem_cache *d) +{ + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + +int kmem_ptr_validate(struct kmem_cache *a, const void *b) +{ + return 0; +} + +static unsigned int slob_ready __read_mostly; + +int slab_is_available(void) +{ + return slob_ready; +} + +void __init kmem_cache_init(void) +{ + slob_ready = 1; +} diff --git a/mm/slub.c b/mm/slub.c new file mode 100644 index 0000000..a2cd47d --- /dev/null +++ b/mm/slub.c @@ -0,0 +1,4515 @@ +/* + * SLUB: A slab allocator that limits cache line use instead of queuing + * objects in per cpu and per node lists. + * + * The allocator synchronizes using per slab locks and only + * uses a centralized lock to manage a pool of partial slabs. + * + * (C) 2007 SGI, Christoph Lameter + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/bit_spinlock.h> +#include <linux/interrupt.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/mempolicy.h> +#include <linux/ctype.h> +#include <linux/debugobjects.h> +#include <linux/kallsyms.h> +#include <linux/memory.h> +#include <linux/math64.h> + +/* + * Lock order: + * 1. slab_lock(page) + * 2. slab->list_lock + * + * The slab_lock protects operations on the object of a particular + * slab and its metadata in the page struct. If the slab lock + * has been taken then no allocations nor frees can be performed + * on the objects in the slab nor can the slab be added or removed + * from the partial or full lists since this would mean modifying + * the page_struct of the slab. + * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists nor make the number of partial slabs be modified. + * (Note that the total number of slabs is an atomic value that may be + * modified without taking the list lock). + * + * The list_lock is a centralized lock and thus we avoid taking it as + * much as possible. As long as SLUB does not have to handle partial + * slabs, operations can continue without any centralized lock. F.e. + * allocating a long series of objects that fill up slabs does not require + * the list lock. + * + * The lock order is sometimes inverted when we are trying to get a slab + * off a list. We take the list_lock and then look for a page on the list + * to use. While we do that objects in the slabs may be freed. We can + * only operate on the slab if we have also taken the slab_lock. So we use + * a slab_trylock() on the slab. If trylock was successful then no frees + * can occur anymore and we can use the slab for allocations etc. If the + * slab_trylock() does not succeed then frees are in progress in the slab and + * we must stay away from it for a while since we may cause a bouncing + * cacheline if we try to acquire the lock. So go onto the next slab. + * If all pages are busy then we may allocate a new slab instead of reusing + * a partial slab. A new slab has noone operating on it and thus there is + * no danger of cacheline contention. + * + * Interrupts are disabled during allocation and deallocation in order to + * make the slab allocator safe to use in the context of an irq. In addition + * interrupts are disabled to ensure that the processor does not change + * while handling per_cpu slabs, due to kernel preemption. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. + * + * Slabs with free elements are kept on a partial list and during regular + * operations no list for full slabs is used. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * We track full slabs for debugging purposes though because otherwise we + * cannot scan all objects. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocs. + * + * Overloading of page flags that are otherwise used for LRU management. + * + * PageActive The slab is frozen and exempt from list processing. + * This means that the slab is dedicated to a purpose + * such as satisfying allocations for a specific + * processor. Objects may be freed in the slab while + * it is frozen but slab_free will then skip the usual + * list operations. It is up to the processor holding + * the slab to integrate the slab into the slab lists + * when the slab is no longer needed. + * + * One use of this flag is to mark slabs that are + * used for allocations. Then such a slab becomes a cpu + * slab. The cpu slab may be equipped with an additional + * freelist that allows lockless access to + * free objects in addition to the regular freelist + * that requires the slab lock. + * + * PageError Slab requires special handling due to debug + * options set. This moves slab handling out of + * the fast path and disables lockless freelists. + */ + +#ifdef CONFIG_SLUB_DEBUG +#define SLABDEBUG 1 +#else +#define SLABDEBUG 0 +#endif + +/* + * Issues still to be resolved: + * + * - Support PAGE_ALLOC_DEBUG. Should be easy to do. + * + * - Variable sizing of the per node arrays + */ + +/* Enable to test recovery from slab corruption on boot */ +#undef SLUB_RESILIENCY_TEST + +/* + * Mininum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ +#define MIN_PARTIAL 5 + +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in the. + */ +#define MAX_PARTIAL 10 + +#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_STORE_USER) + +/* + * Set of flags that will prevent slab merging + */ +#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU) + +#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA) + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + +/* Internal SLUB flags */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ + +static int kmem_size = sizeof(struct kmem_cache); + +#ifdef CONFIG_SMP +static struct notifier_block slab_notifier; +#endif + +static enum { + DOWN, /* No slab functionality available */ + PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + UP, /* Everything works but does not show up in sysfs */ + SYSFS /* Sysfs up */ +} slab_state = DOWN; + +/* A list of all slab caches on the system */ +static DECLARE_RWSEM(slub_lock); +static LIST_HEAD(slab_caches); + +/* + * Tracking user of a slab. + */ +struct track { + void *addr; /* Called from address */ + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; + +#ifdef CONFIG_SLUB_DEBUG +static int sysfs_slab_add(struct kmem_cache *); +static int sysfs_slab_alias(struct kmem_cache *, const char *); +static void sysfs_slab_remove(struct kmem_cache *); + +#else +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) + { return 0; } +static inline void sysfs_slab_remove(struct kmem_cache *s) +{ + kfree(s); +} + +#endif + +static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) +{ +#ifdef CONFIG_SLUB_STATS + c->stat[si]++; +#endif +} + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +int slab_is_available(void) +{ + return slab_state >= UP; +} + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ +#ifdef CONFIG_NUMA + return s->node[node]; +#else + return &s->local_node; +#endif +} + +static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) +{ +#ifdef CONFIG_SMP + return s->cpu_slab[cpu]; +#else + return &s->cpu_slab; +#endif +} + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, const void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +/* + * Slow version of get and set free pointer. + * + * This version requires touching the cache lines of kmem_cache which + * we avoid to do in the fast alloc free paths. There we obtain the offset + * from the page struct. + */ +static inline void *get_freepointer(struct kmem_cache *s, void *object) +{ + return *(void **)(object + s->offset); +} + +static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + *(void **)(object + s->offset) = fp; +} + +/* Loop over all objects in a slab */ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ + __p += (__s)->size) + +/* Scan freelist */ +#define for_each_free_object(__p, __s, __free) \ + for (__p = (__free); __p; __p = get_freepointer((__s), __p)) + +/* Determine object index from a given position */ +static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +{ + return (p - addr) / s->size; +} + +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size) +{ + struct kmem_cache_order_objects x = { + (order << 16) + (PAGE_SIZE << order) / size + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> 16; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & ((1 << 16) - 1); +} + +#ifdef CONFIG_SLUB_DEBUG +/* + * Debug settings: + */ +#ifdef CONFIG_SLUB_DEBUG_ON +static int slub_debug = DEBUG_DEFAULT_FLAGS; +#else +static int slub_debug; +#endif + +static char *slub_debug_slabs; + +/* + * Object debugging + */ +static void print_section(char *text, u8 *addr, unsigned int length) +{ + int i, offset; + int newline = 1; + char ascii[17]; + + ascii[16] = 0; + + for (i = 0; i < length; i++) { + if (newline) { + printk(KERN_ERR "%8s 0x%p: ", text, addr + i); + newline = 0; + } + printk(KERN_CONT " %02x", addr[i]); + offset = i % 16; + ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; + if (offset == 15) { + printk(KERN_CONT " %s\n", ascii); + newline = 1; + } + } + if (!newline) { + i %= 16; + while (i < 16) { + printk(KERN_CONT " "); + ascii[i] = ' '; + i++; + } + printk(KERN_CONT " %s\n", ascii); + } +} + +static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + return p + alloc; +} + +static void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, void *addr) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + p += alloc; + if (addr) { + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + p->when = jiffies; + } else + memset(p, 0, sizeof(struct track)); +} + +static void init_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); +} + +static void print_track(const char *s, struct track *t) +{ + if (!t->addr) + return; + + printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, t->addr, jiffies - t->when, t->cpu, t->pid); +} + +static void print_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + print_track("Allocated", get_track(s, object, TRACK_ALLOC)); + print_track("Freed", get_track(s, object, TRACK_FREE)); +} + +static void print_page_info(struct page *page) +{ + printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); + +} + +static void slab_bug(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "========================================" + "=====================================\n"); + printk(KERN_ERR "BUG %s: %s\n", s->name, buf); + printk(KERN_ERR "----------------------------------------" + "-------------------------------------\n\n"); +} + +static void slab_fix(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "FIX %s: %s\n", s->name, buf); +} + +static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned int off; /* Offset of last byte */ + u8 *addr = page_address(page); + + print_tracking(s, p); + + print_page_info(page); + + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); + + if (p > addr + 16) + print_section("Bytes b4", p - 16, 16); + + print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone", p + s->objsize, + s->inuse - s->objsize); + + if (s->offset) + off = s->offset + sizeof(void *); + else + off = s->inuse; + + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + + if (off != s->size) + /* Beginning of the filler is the free pointer */ + print_section("Padding", p + off, s->size - off); + + dump_stack(); +} + +static void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason) +{ + slab_bug(s, "%s", reason); + print_trailer(s, page, object); +} + +static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + slab_bug(s, "%s", buf); + print_page_info(page); + dump_stack(); +} + +static void init_object(struct kmem_cache *s, void *object, int active) +{ + u8 *p = object; + + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->objsize - 1); + p[s->objsize - 1] = POISON_END; + } + + if (s->flags & SLAB_RED_ZONE) + memset(p + s->objsize, + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, + s->inuse - s->objsize); +} + +static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) +{ + while (bytes) { + if (*start != (u8)value) + return start; + start++; + bytes--; + } + return NULL; +} + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + memset(from, data, to - from); +} + +static int check_bytes_and_report(struct kmem_cache *s, struct page *page, + u8 *object, char *what, + u8 *start, unsigned int value, unsigned int bytes) +{ + u8 *fault; + u8 *end; + + fault = check_bytes(start, value, bytes); + if (!fault) + return 1; + + end = start + bytes; + while (end > fault && end[-1] == value) + end--; + + slab_bug(s, "%s overwritten", what); + printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault[0], value); + print_trailer(s, page, object); + + restore_bytes(s, what, value, fault, end); + return 0; +} + +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is the first word of the object. + * + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->objsize + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended by another word if Redzoning is enabled and + * objsize == inuse. + * + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * Meta data starts here. + * + * A. Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Padding to reach required alignment boundary or at mininum + * one word if debugging is on to be able to detect writes + * before the word boundary. + * + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * Nothing is used beyond s->size. + * + * If slabcaches are merged then the objsize and inuse boundaries are mostly + * ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. + */ + +static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned long off = s->inuse; /* The end of info */ + + if (s->offset) + /* Freepointer is placed after the object. */ + off += sizeof(void *); + + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); + + if (s->size == off) + return 1; + + return check_bytes_and_report(s, page, p, "Object padding", + p + off, POISON_INUSE, s->size - off); +} + +/* Check the pad bytes at the end of a slab page */ +static int slab_pad_check(struct kmem_cache *s, struct page *page) +{ + u8 *start; + u8 *fault; + u8 *end; + int length; + int remainder; + + if (!(s->flags & SLAB_POISON)) + return 1; + + start = page_address(page); + length = (PAGE_SIZE << compound_order(page)); + end = start + length; + remainder = length % s->size; + if (!remainder) + return 1; + + fault = check_bytes(end - remainder, POISON_INUSE, remainder); + if (!fault) + return 1; + while (end > fault && end[-1] == POISON_INUSE) + end--; + + slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); + print_section("Padding", end - remainder, remainder); + + restore_bytes(s, "slab padding", POISON_INUSE, start, end); + return 0; +} + +static int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) +{ + u8 *p = object; + u8 *endobject = object + s->objsize; + + if (s->flags & SLAB_RED_ZONE) { + unsigned int red = + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; + + if (!check_bytes_and_report(s, page, object, "Redzone", + endobject, red, s->inuse - s->objsize)) + return 0; + } else { + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { + check_bytes_and_report(s, page, p, "Alignment padding", + endobject, POISON_INUSE, s->inuse - s->objsize); + } + } + + if (s->flags & SLAB_POISON) { + if (!active && (s->flags & __OBJECT_POISON) && + (!check_bytes_and_report(s, page, p, "Poison", p, + POISON_FREE, s->objsize - 1) || + !check_bytes_and_report(s, page, p, "Poison", + p + s->objsize - 1, POISON_END, 1))) + return 0; + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, page, p); + } + + if (!s->offset && active) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; + + /* Check free pointer validity */ + if (!check_valid_pointer(s, page, get_freepointer(s, p))) { + object_err(s, page, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus loose the remainder + * of the free objects in this slab. May cause + * another error because the object count is now wrong. + */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} + +static int check_slab(struct kmem_cache *s, struct page *page) +{ + int maxobj; + + VM_BUG_ON(!irqs_disabled()); + + if (!PageSlab(page)) { + slab_err(s, page, "Not a valid slab page"); + return 0; + } + + maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { + slab_err(s, page, "inuse %u > max %u", + s->name, page->inuse, page->objects); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, page); + return 1; +} + +/* + * Determine if a certain object on a page is on the freelist. Must hold the + * slab lock to guarantee that the chains are in a consistent state. + */ +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +{ + int nr = 0; + void *fp = page->freelist; + void *object = NULL; + unsigned long max_objects; + + while (fp && nr <= page->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, page, fp)) { + if (object) { + object_err(s, page, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + break; + } else { + slab_err(s, page, "Freepointer corrupt"); + page->freelist = NULL; + page->inuse = page->objects; + slab_fix(s, "Freelist cleared"); + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } + + max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + if (max_objects > 65535) + max_objects = 65535; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } + if (page->inuse != page->objects - nr) { + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; + slab_fix(s, "Object count adjusted."); + } + return search == NULL; +} + +static void trace(struct kmem_cache *s, struct page *page, void *object, + int alloc) +{ + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + s->name, + alloc ? "alloc" : "free", + object, page->inuse, + page->freelist); + + if (!alloc) + print_section("Object", (void *)object, s->objsize); + + dump_stack(); + } +} + +/* + * Tracking of fully allocated slabs for debugging purposes. + */ +static void add_full(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + list_add(&page->lru, &n->full); + spin_unlock(&n->list_lock); +} + +static void remove_full(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n; + + if (!(s->flags & SLAB_STORE_USER)) + return; + + n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + spin_unlock(&n->list_lock); +} + +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). + */ + if (!NUMA_BUILD || n) { + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); +} + +/* Object debug checks for alloc/free paths */ +static void setup_object_debug(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + return; + + init_object(s, object, 0); + init_tracking(s, object); +} + +static int alloc_debug_processing(struct kmem_cache *s, struct page *page, + void *object, void *addr) +{ + if (!check_slab(s, page)) + goto bad; + + if (!on_freelist(s, page, object)) { + object_err(s, page, object, "Object already allocated"); + goto bad; + } + + if (!check_valid_pointer(s, page, object)) { + object_err(s, page, object, "Freelist Pointer check fails"); + goto bad; + } + + if (!check_object(s, page, object, 0)) + goto bad; + + /* Success perform special debug activities for allocs */ + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_ALLOC, addr); + trace(s, page, object, 1); + init_object(s, object, 1); + return 1; + +bad: + if (PageSlab(page)) { + /* + * If this is a slab page then lets do the best we can + * to avoid issues in the future. Marking all objects + * as used avoids touching the remaining objects. + */ + slab_fix(s, "Marking all objects used"); + page->inuse = page->objects; + page->freelist = NULL; + } + return 0; +} + +static int free_debug_processing(struct kmem_cache *s, struct page *page, + void *object, void *addr) +{ + if (!check_slab(s, page)) + goto fail; + + if (!check_valid_pointer(s, page, object)) { + slab_err(s, page, "Invalid object pointer 0x%p", object); + goto fail; + } + + if (on_freelist(s, page, object)) { + object_err(s, page, object, "Object already free"); + goto fail; + } + + if (!check_object(s, page, object, 1)) + return 0; + + if (unlikely(s != page->slab)) { + if (!PageSlab(page)) { + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); + } else if (!page->slab) { + printk(KERN_ERR + "SLUB <none>: no slab for object 0x%p.\n", + object); + dump_stack(); + } else + object_err(s, page, object, + "page slab pointer corrupt."); + goto fail; + } + + /* Special debug activities for freeing objects */ + if (!PageSlubFrozen(page) && !page->freelist) + remove_full(s, page); + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_FREE, addr); + trace(s, page, object, 0); + init_object(s, object, 0); + return 1; + +fail: + slab_fix(s, "Object at 0x%p not freed", object); + return 0; +} + +static int __init setup_slub_debug(char *str) +{ + slub_debug = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + if (*str == ',') + /* + * No options but restriction on slabs. This means full + * debugging for slabs matching a pattern. + */ + goto check_slabs; + + slub_debug = 0; + if (*str == '-') + /* + * Switch off all debugging measures. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for (; *str && *str != ','; str++) { + switch (tolower(*str)) { + case 'f': + slub_debug |= SLAB_DEBUG_FREE; + break; + case 'z': + slub_debug |= SLAB_RED_ZONE; + break; + case 'p': + slub_debug |= SLAB_POISON; + break; + case 'u': + slub_debug |= SLAB_STORE_USER; + break; + case 't': + slub_debug |= SLAB_TRACE; + break; + default: + printk(KERN_ERR "slub_debug option '%c' " + "unknown. skipped\n", *str); + } + } + +check_slabs: + if (*str == ',') + slub_debug_slabs = str + 1; +out: + return 1; +} + +__setup("slub_debug", setup_slub_debug); + +static unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + /* + * Enable debugging if selected on the kernel commandline. + */ + if (slub_debug && (!slub_debug_slabs || + strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) + flags |= slub_debug; + + return flags; +} +#else +static inline void setup_object_debug(struct kmem_cache *s, + struct page *page, void *object) {} + +static inline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, void *object, void *addr) { return 0; } + +static inline int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, void *addr) { return 0; } + +static inline int slab_pad_check(struct kmem_cache *s, struct page *page) + { return 1; } +static inline int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) { return 1; } +static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} +#define slub_debug 0 + +static inline unsigned long slabs_node(struct kmem_cache *s, int node) + { return 0; } +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} +#endif + +/* + * Slab allocation and freeing + */ +static inline struct page *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + int order = oo_order(oo); + + if (node == -1) + return alloc_pages(flags, order); + else + return alloc_pages_node(node, flags, order); +} + +static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + struct kmem_cache_order_objects oo = s->oo; + + flags |= s->allocflags; + + page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, + oo); + if (unlikely(!page)) { + oo = s->min; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(flags, node, oo); + if (!page) + return NULL; + + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } + page->objects = oo_objects(oo); + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + + return page; +} + +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) + s->ctor(object); +} + +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + void *start; + void *last; + void *p; + + BUG_ON(flags & GFP_SLAB_BUG_MASK); + + page = allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); + if (!page) + goto out; + + inc_slabs_node(s, page_to_nid(page), page->objects); + page->slab = s; + page->flags |= 1 << PG_slab; + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_TRACE)) + __SetPageSlubDebug(page); + + start = page_address(page); + + if (unlikely(s->flags & SLAB_POISON)) + memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + + last = start; + for_each_object(p, s, start, page->objects) { + setup_object(s, page, last); + set_freepointer(s, last, p); + last = p; + } + setup_object(s, page, last); + set_freepointer(s, last, NULL); + + page->freelist = start; + page->inuse = 0; +out: + return page; +} + +static void __free_slab(struct kmem_cache *s, struct page *page) +{ + int order = compound_order(page); + int pages = 1 << order; + + if (unlikely(SLABDEBUG && PageSlubDebug(page))) { + void *p; + + slab_pad_check(s, page); + for_each_object(p, s, page_address(page), + page->objects) + check_object(s, page, p, 0); + __ClearPageSlubDebug(page); + } + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + -pages); + + __ClearPageSlab(page); + reset_page_mapcount(page); + __free_pages(page, order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct page *page; + + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); +} + +static void free_slab(struct kmem_cache *s, struct page *page) +{ + if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + /* + * RCU free overloads the RCU head over the LRU + */ + struct rcu_head *head = (void *)&page->lru; + + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); +} + +static void discard_slab(struct kmem_cache *s, struct page *page) +{ + dec_slabs_node(s, page_to_nid(page), page->objects); + free_slab(s, page); +} + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +static __always_inline int slab_trylock(struct page *page) +{ + int rc = 1; + + rc = bit_spin_trylock(PG_locked, &page->flags); + return rc; +} + +/* + * Management of partially allocated slabs + */ +static void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + if (tail) + list_add_tail(&page->lru, &n->partial); + else + list_add(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void remove_partial(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + n->nr_partial--; + spin_unlock(&n->list_lock); +} + +/* + * Lock slab and remove from the partial list. + * + * Must hold list_lock. + */ +static inline int lock_and_freeze_slab(struct kmem_cache_node *n, + struct page *page) +{ + if (slab_trylock(page)) { + list_del(&page->lru); + n->nr_partial--; + __SetPageSlubFrozen(page); + return 1; + } + return 0; +} + +/* + * Try to allocate a partial slab from a specific node. + */ +static struct page *get_partial_node(struct kmem_cache_node *n) +{ + struct page *page; + + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab and there is none available then get_partials() + * will return NULL. + */ + if (!n || !n->nr_partial) + return NULL; + + spin_lock(&n->list_lock); + list_for_each_entry(page, &n->partial, lru) + if (lock_and_freeze_slab(n, page)) + goto out; + page = NULL; +out: + spin_unlock(&n->list_lock); + return page; +} + +/* + * Get a page from somewhere. Search in increasing NUMA distances. + */ +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +{ +#ifdef CONFIG_NUMA + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); + struct page *page; + + /* + * The defrag ratio allows a configuration of the tradeoffs between + * inter node defragmentation and node local allocations. A lower + * defrag_ratio increases the tendency to do local allocations + * instead of attempting to obtain partial slabs from other nodes. + * + * If the defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If the ratio is higher then kmalloc() + * may return off node objects because partial slabs are obtained + * from other nodes and filled up. + * + * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes + * defrag_ratio = 1000) then every (well almost) allocation will + * first attempt to defrag slab caches on other nodes. This means + * scanning over all nodes to look for partial slabs which may be + * expensive if we do it every time we are trying to find a slab + * with available objects. + */ + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return NULL; + + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > n->min_partial) { + page = get_partial_node(n); + if (page) + return page; + } + } +#endif + return NULL; +} + +/* + * Get a partial page, lock it and return it. + */ +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + int searchnode = (node == -1) ? numa_node_id() : node; + + page = get_partial_node(get_node(s, searchnode)); + if (page || (flags & __GFP_THISNODE)) + return page; + + return get_any_partial(s, flags); +} + +/* + * Move a page back to the lists. + * + * Must be called with the slab lock held. + * + * On exit the slab lock will have been dropped. + */ +static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); + + __ClearPageSlubFrozen(page); + if (page->inuse) { + + if (page->freelist) { + add_partial(n, page, tail); + stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + } else { + stat(c, DEACTIVATE_FULL); + if (SLABDEBUG && PageSlubDebug(page) && + (s->flags & SLAB_STORE_USER)) + add_full(n, page); + } + slab_unlock(page); + } else { + stat(c, DEACTIVATE_EMPTY); + if (n->nr_partial < n->min_partial) { + /* + * Adding an empty slab to the partial slabs in order + * to avoid page allocator overhead. This slab needs + * to come after the other slabs with objects in + * so that the others get filled first. That way the + * size of the partial list stays small. + * + * kmem_cache_shrink can reclaim any empty slabs from + * the partial list. + */ + add_partial(n, page, 1); + slab_unlock(page); + } else { + slab_unlock(page); + stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); + discard_slab(s, page); + } + } +} + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +{ + struct page *page = c->page; + int tail = 1; + + if (page->freelist) + stat(c, DEACTIVATE_REMOTE_FREES); + /* + * Merge cpu freelist into slab freelist. Typically we get here + * because both freelists are empty. So this is unlikely + * to occur. + */ + while (unlikely(c->freelist)) { + void **object; + + tail = 0; /* Hot objects. Put the slab first */ + + /* Retrieve object from cpu_freelist */ + object = c->freelist; + c->freelist = c->freelist[c->offset]; + + /* And put onto the regular freelist */ + object[c->offset] = page->freelist; + page->freelist = object; + page->inuse--; + } + c->page = NULL; + unfreeze_slab(s, page, tail); +} + +static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +{ + stat(c, CPUSLAB_FLUSH); + slab_lock(c->page); + deactivate_slab(s, c); +} + +/* + * Flush cpu slab. + * + * Called from IPI handler with interrupts disabled. + */ +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (likely(c && c->page)) + flush_slab(s, c); +} + +static void flush_cpu_slab(void *d) +{ + struct kmem_cache *s = d; + + __flush_cpu_slab(s, smp_processor_id()); +} + +static void flush_all(struct kmem_cache *s) +{ + on_each_cpu(flush_cpu_slab, s, 1); +} + +/* + * Check if the objects in a per cpu structure fit numa + * locality expectations. + */ +static inline int node_match(struct kmem_cache_cpu *c, int node) +{ +#ifdef CONFIG_NUMA + if (node != -1 && c->node != node) + return 0; +#endif + return 1; +} + +/* + * Slow path. The lockless freelist is empty or we need to perform + * debugging duties. + * + * Interrupts are disabled. + * + * Processing is still very fast if new objects have been freed to the + * regular freelist. In that case we simply take over the regular freelist + * as the lockless freelist and zap the regular freelist. + * + * If that is not working then we fall back to the partial lists. We take the + * first element of the freelist as the object to allocate now and move the + * rest of the freelist to the lockless freelist. + * + * And if we were unable to get a new slab from the partial slab lists then + * we need to allocate a new slab. This is the slowest path since it involves + * a call to the page allocator and the setup of a new slab. + */ +static void *__slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +{ + void **object; + struct page *new; + + /* We handle __GFP_ZERO in the caller */ + gfpflags &= ~__GFP_ZERO; + + if (!c->page) + goto new_slab; + + slab_lock(c->page); + if (unlikely(!node_match(c, node))) + goto another_slab; + + stat(c, ALLOC_REFILL); + +load_freelist: + object = c->page->freelist; + if (unlikely(!object)) + goto another_slab; + if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) + goto debug; + + c->freelist = object[c->offset]; + c->page->inuse = c->page->objects; + c->page->freelist = NULL; + c->node = page_to_nid(c->page); +unlock_out: + slab_unlock(c->page); + stat(c, ALLOC_SLOWPATH); + return object; + +another_slab: + deactivate_slab(s, c); + +new_slab: + new = get_partial(s, gfpflags, node); + if (new) { + c->page = new; + stat(c, ALLOC_FROM_PARTIAL); + goto load_freelist; + } + + if (gfpflags & __GFP_WAIT) + local_irq_enable(); + + new = new_slab(s, gfpflags, node); + + if (gfpflags & __GFP_WAIT) + local_irq_disable(); + + if (new) { + c = get_cpu_slab(s, smp_processor_id()); + stat(c, ALLOC_SLAB); + if (c->page) + flush_slab(s, c); + slab_lock(new); + __SetPageSlubFrozen(new); + c->page = new; + goto load_freelist; + } + return NULL; +debug: + if (!alloc_debug_processing(s, c->page, object, addr)) + goto another_slab; + + c->page->inuse++; + c->page->freelist = object[c->offset]; + c->node = -1; + goto unlock_out; +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr) +{ + void **object; + struct kmem_cache_cpu *c; + unsigned long flags; + unsigned int objsize; + + local_irq_save(flags); + c = get_cpu_slab(s, smp_processor_id()); + objsize = c->objsize; + if (unlikely(!c->freelist || !node_match(c, node))) + + object = __slab_alloc(s, gfpflags, node, addr, c); + + else { + object = c->freelist; + c->freelist = object[c->offset]; + stat(c, ALLOC_FASTPATH); + } + local_irq_restore(flags); + + if (unlikely((gfpflags & __GFP_ZERO) && object)) + memset(object, 0, objsize); + + return object; +} + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node); +#endif + +/* + * Slow patch handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. + * + * So we still attempt to reduce cache line usage. Just take the slab + * lock and free the item. If there is no additional partial page + * handling required then we can return immediately. + */ +static void __slab_free(struct kmem_cache *s, struct page *page, + void *x, void *addr, unsigned int offset) +{ + void *prior; + void **object = (void *)x; + struct kmem_cache_cpu *c; + + c = get_cpu_slab(s, raw_smp_processor_id()); + stat(c, FREE_SLOWPATH); + slab_lock(page); + + if (unlikely(SLABDEBUG && PageSlubDebug(page))) + goto debug; + +checks_ok: + prior = object[offset] = page->freelist; + page->freelist = object; + page->inuse--; + + if (unlikely(PageSlubFrozen(page))) { + stat(c, FREE_FROZEN); + goto out_unlock; + } + + if (unlikely(!page->inuse)) + goto slab_empty; + + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) { + add_partial(get_node(s, page_to_nid(page)), page, 1); + stat(c, FREE_ADD_PARTIAL); + } + +out_unlock: + slab_unlock(page); + return; + +slab_empty: + if (prior) { + /* + * Slab still on the partial list. + */ + remove_partial(s, page); + stat(c, FREE_REMOVE_PARTIAL); + } + slab_unlock(page); + stat(c, FREE_SLAB); + discard_slab(s, page); + return; + +debug: + if (!free_debug_processing(s, page, x, addr)) + goto out_unlock; + goto checks_ok; +} + +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. + */ +static __always_inline void slab_free(struct kmem_cache *s, + struct page *page, void *x, void *addr) +{ + void **object = (void *)x; + struct kmem_cache_cpu *c; + unsigned long flags; + + local_irq_save(flags); + c = get_cpu_slab(s, smp_processor_id()); + debug_check_no_locks_freed(object, c->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); + if (likely(page == c->page && c->node >= 0)) { + object[c->offset] = c->freelist; + c->freelist = object; + stat(c, FREE_FASTPATH); + } else + __slab_free(s, page, x, addr, c->offset); + + local_irq_restore(flags); +} + +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + struct page *page; + + page = virt_to_head_page(x); + + slab_free(s, page, x, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_free); + +/* Figure out on which slab object the object resides */ +static struct page *get_object_page(const void *x) +{ + struct page *page = virt_to_head_page(x); + + if (!PageSlab(page)) + return NULL; + + return page; +} + +/* + * Object placement in a slab is made very easy because we always start at + * offset 0. If we tune the size of the object to the alignment then we can + * get the required alignment by putting one properly sized object after + * another. + * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor has always one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and is therefore a factor in + * locking overhead. + */ + +/* + * Mininum / Maximum order of slab pages. This influences locking overhead + * and slab fragmentation. A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + */ +static int slub_min_order; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects; + +/* + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) + */ +static int slub_nomerge; + +/* + * Calculate the order of allocation given an slab object size. + * + * The order of allocation has significant impact on performance and other + * system components. Generally order 0 allocations should be preferred since + * order 0 does not cause fragmentation in the page allocator. Larger objects + * be problematic to put into order 0 slabs because there may be too much + * unused space left. We go to a higher order if more than 1/16th of the slab + * would be wasted. + * + * In order to reach satisfactory performance we must ensure that a minimum + * number of objects is in one slab. Otherwise we may generate too much + * activity on the partial lists which requires taking the list_lock. This is + * less a concern for large slabs though which are rarely used. + * + * slub_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slub_max_order then + * we try to keep the page order as low as possible. So we accept more waste + * of space in favor of a small page order. + * + * Higher order allocations also allow the placement of more objects in a + * slab and thereby reduce object handling overhead. If the user has + * requested a higher mininum order then we start with that one instead of + * the smallest order which will fit the object. + */ +static inline int slab_order(int size, int min_objects, + int max_order, int fract_leftover) +{ + int order; + int rem; + int min_order = slub_min_order; + + if ((PAGE_SIZE << min_order) / size > 65535) + return get_order(size * 65535) - 1; + + for (order = max(min_order, + fls(min_objects * size - 1) - PAGE_SHIFT); + order <= max_order; order++) { + + unsigned long slab_size = PAGE_SIZE << order; + + if (slab_size < min_objects * size) + continue; + + rem = slab_size % size; + + if (rem <= slab_size / fract_leftover) + break; + + } + + return order; +} + +static inline int calculate_order(int size) +{ + int order; + int min_objects; + int fraction; + + /* + * Attempt to find best configuration for a slab. This + * works by first attempting to generate a layout with + * the best configuration and backing off gradually. + * + * First we reduce the acceptable waste in a slab. Then + * we reduce the minimum objects required in a slab. + */ + min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); + while (min_objects > 1) { + fraction = 16; + while (fraction >= 4) { + order = slab_order(size, min_objects, + slub_max_order, fraction); + if (order <= slub_max_order) + return order; + fraction /= 2; + } + min_objects /= 2; + } + + /* + * We were unable to place multiple objects in a slab. Now + * lets see if we can place a single object there. + */ + order = slab_order(size, 1, slub_max_order, 1); + if (order <= slub_max_order) + return order; + + /* + * Doh this slab cannot be placed using slub_max_order. + */ + order = slab_order(size, 1, MAX_ORDER, 1); + if (order <= MAX_ORDER) + return order; + return -ENOSYS; +} + +/* + * Figure out what the alignment of the objects will be. + */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +static void init_kmem_cache_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + c->page = NULL; + c->freelist = NULL; + c->node = 0; + c->offset = s->offset / sizeof(void *); + c->objsize = s->objsize; +#ifdef CONFIG_SLUB_STATS + memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); +#endif +} + +static void +init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) +{ + n->nr_partial = 0; + + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + n->min_partial = ilog2(s->size); + if (n->min_partial < MIN_PARTIAL) + n->min_partial = MIN_PARTIAL; + else if (n->min_partial > MAX_PARTIAL) + n->min_partial = MAX_PARTIAL; + + spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); +#ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); + INIT_LIST_HEAD(&n->full); +#endif +} + +#ifdef CONFIG_SMP +/* + * Per cpu array for per cpu structures. + * + * The per cpu array places all kmem_cache_cpu structures from one processor + * close together meaning that it becomes possible that multiple per cpu + * structures are contained in one cacheline. This may be particularly + * beneficial for the kmalloc caches. + * + * A desktop system typically has around 60-80 slabs. With 100 here we are + * likely able to get per cpu structures for all caches from the array defined + * here. We must be able to cover all kmalloc caches during bootstrap. + * + * If the per cpu array is exhausted then fall back to kmalloc + * of individual cachelines. No sharing is possible then. + */ +#define NR_KMEM_CACHE_CPU 100 + +static DEFINE_PER_CPU(struct kmem_cache_cpu, + kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; + +static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); +static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; + +static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, + int cpu, gfp_t flags) +{ + struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); + + if (c) + per_cpu(kmem_cache_cpu_free, cpu) = + (void *)c->freelist; + else { + /* Table overflow: So allocate ourselves */ + c = kmalloc_node( + ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), + flags, cpu_to_node(cpu)); + if (!c) + return NULL; + } + + init_kmem_cache_cpu(s, c); + return c; +} + +static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) +{ + if (c < per_cpu(kmem_cache_cpu, cpu) || + c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + kfree(c); + return; + } + c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); + per_cpu(kmem_cache_cpu_free, cpu) = c; +} + +static void free_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) { + s->cpu_slab[cpu] = NULL; + free_kmem_cache_cpu(c, cpu); + } + } +} + +static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) + continue; + + c = alloc_kmem_cache_cpu(s, cpu, flags); + if (!c) { + free_kmem_cache_cpus(s); + return 0; + } + s->cpu_slab[cpu] = c; + } + return 1; +} + +/* + * Initialize the per cpu array. + */ +static void init_alloc_cpu_cpu(int cpu) +{ + int i; + + if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) + return; + + for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) + free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); + + cpu_set(cpu, kmem_cach_cpu_free_init_once); +} + +static void __init init_alloc_cpu(void) +{ + int cpu; + + for_each_online_cpu(cpu) + init_alloc_cpu_cpu(cpu); + } + +#else +static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} +static inline void init_alloc_cpu(void) {} + +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + init_kmem_cache_cpu(s, &s->cpu_slab); + return 1; +} +#endif + +#ifdef CONFIG_NUMA +/* + * No kmalloc_node yet so do it by hand. We know that this is the first + * slab on the node for this slabcache. There are no concurrent accesses + * possible. + * + * Note that this function only works on the kmalloc_node_cache + * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * memory on a fresh node that has no slab structures yet. + */ +static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, + int node) +{ + struct page *page; + struct kmem_cache_node *n; + unsigned long flags; + + BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + + page = new_slab(kmalloc_caches, gfpflags, node); + + BUG_ON(!page); + if (page_to_nid(page) != node) { + printk(KERN_ERR "SLUB: Unable to allocate memory from " + "node %d\n", node); + printk(KERN_ERR "SLUB: Allocating a useless per node structure " + "in order to be able to continue\n"); + } + + n = page->freelist; + BUG_ON(!n); + page->freelist = get_freepointer(kmalloc_caches, n); + page->inuse++; + kmalloc_caches->node[node] = n; +#ifdef CONFIG_SLUB_DEBUG + init_object(kmalloc_caches, n, 1); + init_tracking(kmalloc_caches, n); +#endif + init_kmem_cache_node(n, kmalloc_caches); + inc_slabs_node(kmalloc_caches, node, page->objects); + + /* + * lockdep requires consistent irq usage for each lock + * so even though there cannot be a race this early in + * the boot sequence, we still disable irqs. + */ + local_irq_save(flags); + add_partial(n, page, 0); + local_irq_restore(flags); + return n; +} + +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = s->node[node]; + if (n && n != &s->local_node) + kmem_cache_free(kmalloc_caches, n); + s->node[node] = NULL; + } +} + +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + int node; + int local_node; + + if (slab_state >= UP) + local_node = page_to_nid(virt_to_page(s)); + else + local_node = 0; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n; + + if (local_node == node) + n = &s->local_node; + else { + if (slab_state == DOWN) { + n = early_kmem_cache_node_alloc(gfpflags, + node); + continue; + } + n = kmem_cache_alloc_node(kmalloc_caches, + gfpflags, node); + + if (!n) { + free_kmem_cache_nodes(s); + return 0; + } + + } + s->node[node] = n; + init_kmem_cache_node(n, s); + } + return 1; +} +#else +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ +} + +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + init_kmem_cache_node(&s->local_node, s); + return 1; +} +#endif + +/* + * calculate_sizes() determines the order and the distribution of data within + * a slab object. + */ +static int calculate_sizes(struct kmem_cache *s, int forced_order) +{ + unsigned long flags = s->flags; + unsigned long size = s->objsize; + unsigned long align = s->align; + int order; + + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + +#ifdef CONFIG_SLUB_DEBUG + /* + * Determine if we can poison the object itself. If the user of + * the slab may touch the object after free or before allocation + * then we should never poison the object itself. + */ + if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + !s->ctor) + s->flags |= __OBJECT_POISON; + else + s->flags &= ~__OBJECT_POISON; + + + /* + * If we are Redzoning then check if there is some space between the + * end of the object and the free pointer. If not then add an + * additional word to have some bytes to store Redzone information. + */ + if ((flags & SLAB_RED_ZONE) && size == s->objsize) + size += sizeof(void *); +#endif + + /* + * With that we have determined the number of bytes in actual use + * by the object. This is the potential offset to the free pointer. + */ + s->inuse = size; + + if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + s->ctor)) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor or + * destructor or are poisoning the objects. + */ + s->offset = size; + size += sizeof(void *); + } + +#ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* + * Need to store information about allocs and frees after + * the object. + */ + size += 2 * sizeof(struct track); + + if (flags & SLAB_RED_ZONE) + /* + * Add some empty padding so that we can catch + * overwrites from earlier objects rather than let + * tracking information or the free pointer be + * corrupted if an user writes before the start + * of the object. + */ + size += sizeof(void *); +#endif + + /* + * Determine the alignment based on various parameters that the + * user specified and the dynamic determination of cache line size + * on bootup. + */ + align = calculate_alignment(flags, align, s->objsize); + + /* + * SLUB stores one object immediately after another beginning from + * offset 0. In order to align the objects we have to simply size + * each object to conform to the alignment. + */ + size = ALIGN(size, align); + s->size = size; + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size); + + if (order < 0) + return 0; + + s->allocflags = 0; + if (order) + s->allocflags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + s->allocflags |= SLUB_DMA; + + if (s->flags & SLAB_RECLAIM_ACCOUNT) + s->allocflags |= __GFP_RECLAIMABLE; + + /* + * Determine the number of objects per slab + */ + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; + + return !!oo_objects(s->oo); + +} + +static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, + const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *)) +{ + memset(s, 0, kmem_size); + s->name = name; + s->ctor = ctor; + s->objsize = size; + s->align = align; + s->flags = kmem_cache_flags(size, flags, name, ctor); + + if (!calculate_sizes(s, -1)) + goto error; + + s->refcount = 1; +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + goto error; + + if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) + return 1; + free_kmem_cache_nodes(s); +error: + if (flags & SLAB_PANIC) + panic("Cannot create slab %s size=%lu realsize=%u " + "order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)size, s->size, oo_order(s->oo), + s->offset, flags); + return 0; +} + +/* + * Check if a given pointer is valid + */ +int kmem_ptr_validate(struct kmem_cache *s, const void *object) +{ + struct page *page; + + page = get_object_page(object); + + if (!page || s != page->slab) + /* No slab or wrong slab */ + return 0; + + if (!check_valid_pointer(s, page, object)) + return 0; + + /* + * We could also check if the object is on the slabs freelist. + * But this would be too expensive and it seems that the main + * purpose of kmem_ptr_valid() is to check if the object belongs + * to a certain slab. + */ + return 1; +} +EXPORT_SYMBOL(kmem_ptr_validate); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->objsize; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *s) +{ + return s->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + DECLARE_BITMAP(map, page->objects); + + bitmap_zero(map, page->objects); + slab_err(s, page, "%s", text); + slab_lock(page); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) { + + if (!test_bit(slab_index(p, s, addr), map)) { + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", + p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); +#endif +} + +/* + * Attempt to free all partial slabs on a node. + */ +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) +{ + unsigned long flags; + struct page *page, *h; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(page, h, &n->partial, lru) { + if (!page->inuse) { + list_del(&page->lru); + discard_slab(s, page); + n->nr_partial--; + } else { + list_slab_objects(s, page, + "Objects remaining on kmem_cache_close()"); + } + } + spin_unlock_irqrestore(&n->list_lock, flags); +} + +/* + * Release all resources used by a slab cache. + */ +static inline int kmem_cache_close(struct kmem_cache *s) +{ + int node; + + flush_all(s); + + /* Attempt to free all objects */ + free_kmem_cache_cpus(s); + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) + return 1; + } + free_kmem_cache_nodes(s); + return 0; +} + +/* + * Close a cache and release the kmem_cache structure + * (must be used for caches created using kmem_cache_create) + */ +void kmem_cache_destroy(struct kmem_cache *s) +{ + down_write(&slub_lock); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + up_write(&slub_lock); + if (kmem_cache_close(s)) { + printk(KERN_ERR "SLUB %s: %s called for cache that " + "still has objects.\n", s->name, __func__); + dump_stack(); + } + sysfs_slab_remove(s); + } else + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ + +struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +EXPORT_SYMBOL(kmalloc_caches); + +static int __init setup_slub_min_order(char *str) +{ + get_option(&str, &slub_min_order); + + return 1; +} + +__setup("slub_min_order=", setup_slub_min_order); + +static int __init setup_slub_max_order(char *str) +{ + get_option(&str, &slub_max_order); + + return 1; +} + +__setup("slub_max_order=", setup_slub_max_order); + +static int __init setup_slub_min_objects(char *str) +{ + get_option(&str, &slub_min_objects); + + return 1; +} + +__setup("slub_min_objects=", setup_slub_min_objects); + +static int __init setup_slub_nomerge(char *str) +{ + slub_nomerge = 1; + return 1; +} + +__setup("slub_nomerge", setup_slub_nomerge); + +static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, + const char *name, int size, gfp_t gfp_flags) +{ + unsigned int flags = 0; + + if (gfp_flags & SLUB_DMA) + flags = SLAB_CACHE_DMA; + + down_write(&slub_lock); + if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + flags, NULL)) + goto panic; + + list_add(&s->list, &slab_caches); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto panic; + return s; + +panic: + panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); +} + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; + +static void sysfs_add_func(struct work_struct *w) +{ + struct kmem_cache *s; + + down_write(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + if (s->flags & __SYSFS_ADD_DEFERRED) { + s->flags &= ~__SYSFS_ADD_DEFERRED; + sysfs_slab_add(s); + } + } + up_write(&slub_lock); +} + +static DECLARE_WORK(sysfs_add_work, sysfs_add_func); + +static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) +{ + struct kmem_cache *s; + char *text; + size_t realsize; + + s = kmalloc_caches_dma[index]; + if (s) + return s; + + /* Dynamically create dma cache */ + if (flags & __GFP_WAIT) + down_write(&slub_lock); + else { + if (!down_write_trylock(&slub_lock)) + goto out; + } + + if (kmalloc_caches_dma[index]) + goto unlock_out; + + realsize = kmalloc_caches[index].objsize; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); + s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + if (!s || !text || !kmem_cache_open(s, flags, text, + realsize, ARCH_KMALLOC_MINALIGN, + SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + kfree(s); + kfree(text); + goto unlock_out; + } + + list_add(&s->list, &slab_caches); + kmalloc_caches_dma[index] = s; + + schedule_work(&sysfs_add_work); + +unlock_out: + up_write(&slub_lock); +out: + return kmalloc_caches_dma[index]; +} +#endif + +/* + * Conversion table for small slabs sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. + */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index; + + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; + + index = size_index[(size - 1) / 8]; + } else + index = fls(size - 1); + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & SLUB_DMA))) + return dma_kmalloc_cache(index, flags); + +#endif + return &kmalloc_caches[index]; +} + +void *__kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE)) + return kmalloc_large(size, flags); + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + return slab_alloc(s, flags, -1, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kmalloc); + +static void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page = alloc_pages_node(node, flags | __GFP_COMP, + get_order(size)); + + if (page) + return page_address(page); + else + return NULL; +} + +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE)) + return kmalloc_large_node(size, flags, node); + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + return slab_alloc(s, flags, node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kmalloc_node); +#endif + +size_t ksize(const void *object) +{ + struct page *page; + struct kmem_cache *s; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { + WARN_ON(!PageCompound(page)); + return PAGE_SIZE << compound_order(page); + } + s = page->slab; + +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +void kfree(const void *x) +{ + struct page *page; + void *object = (void *)x; + + if (unlikely(ZERO_OR_NULL_PTR(x))) + return; + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + put_page(page); + return; + } + slab_free(page->slab, page, object, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kfree); + +/* + * kmem_cache_shrink removes empty slabs from the partial lists and sorts + * the remaining slabs by the number of items in use. The slabs with the + * most items in use come first. New allocations will then fill those up + * and thus they can be removed from the partial lists. + * + * The slabs with the least items are placed last. This results in them + * being allocated from last increasing the chance that the last objects + * are freed in them. + */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + int i; + struct kmem_cache_node *n; + struct page *page; + struct page *t; + int objects = oo_objects(s->max); + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + unsigned long flags; + + if (!slabs_by_inuse) + return -ENOMEM; + + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) { + n = get_node(s, node); + + if (!n->nr_partial) + continue; + + for (i = 0; i < objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists indexed by the items in use in each slab. + * + * Note that concurrent frees may occur while we hold the + * list_lock. page->inuse here is the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + if (!page->inuse && slab_trylock(page)) { + /* + * Must hold slab lock here because slab_free + * may have freed the last object and be + * waiting to release the slab. + */ + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + } else { + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + } + + /* + * Rebuild the partial list with the slabs filled up most + * first and the least used slabs at the end. + */ + for (i = objects - 1; i >= 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); + + spin_unlock_irqrestore(&n->list_lock, flags); + } + + kfree(slabs_by_inuse); + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +static int slab_mem_going_offline_callback(void *arg) +{ + struct kmem_cache *s; + + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) + kmem_cache_shrink(s); + up_read(&slub_lock); + + return 0; +} + +static void slab_mem_offline_callback(void *arg) +{ + struct kmem_cache_node *n; + struct kmem_cache *s; + struct memory_notify *marg = arg; + int offline_node; + + offline_node = marg->status_change_nid; + + /* + * If the node still has available memory. we need kmem_cache_node + * for it yet. + */ + if (offline_node < 0) + return; + + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + n = get_node(s, offline_node); + if (n) { + /* + * if n->nr_slabs > 0, slabs still exist on the node + * that is going down. We were unable to free them, + * and offline_pages() function shoudn't call this + * callback. So, we must fail. + */ + BUG_ON(slabs_node(s, offline_node)); + + s->node[offline_node] = NULL; + kmem_cache_free(kmalloc_caches, n); + } + } + up_read(&slub_lock); +} + +static int slab_mem_going_online_callback(void *arg) +{ + struct kmem_cache_node *n; + struct kmem_cache *s; + struct memory_notify *marg = arg; + int nid = marg->status_change_nid; + int ret = 0; + + /* + * If the node's memory is already available, then kmem_cache_node is + * already created. Nothing to do. + */ + if (nid < 0) + return 0; + + /* + * We are bringing a node online. No memory is available yet. We must + * allocate a kmem_cache_node structure in order to bring the node + * online. + */ + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + /* + * XXX: kmem_cache_alloc_node will fallback to other nodes + * since memory is not yet available from the node that + * is brought up. + */ + n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + init_kmem_cache_node(n, s); + s->node[nid] = n; + } +out: + up_read(&slub_lock); + return ret; +} + +static int slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = slab_mem_going_online_callback(arg); + break; + case MEM_GOING_OFFLINE: + ret = slab_mem_going_offline_callback(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + slab_mem_offline_callback(arg); + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + return ret; +} + +#endif /* CONFIG_MEMORY_HOTPLUG */ + +/******************************************************************** + * Basic setup of slabs + *******************************************************************/ + +void __init kmem_cache_init(void) +{ + int i; + int caches = 0; + + init_alloc_cpu(); + +#ifdef CONFIG_NUMA + /* + * Must first have the slab cache available for the allocations of the + * struct kmem_cache_node's. There is special bootstrap code in + * kmem_cache_open for slab_state == DOWN. + */ + create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", + sizeof(struct kmem_cache_node), GFP_KERNEL); + kmalloc_caches[0].refcount = -1; + caches++; + + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +#endif + + /* Able to allocate the per node structures */ + slab_state = PARTIAL; + + /* Caches that are not of the two-to-the-power-of size */ + if (KMALLOC_MIN_SIZE <= 64) { + create_kmalloc_cache(&kmalloc_caches[1], + "kmalloc-96", 96, GFP_KERNEL); + caches++; + create_kmalloc_cache(&kmalloc_caches[2], + "kmalloc-192", 192, GFP_KERNEL); + caches++; + } + + for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + create_kmalloc_cache(&kmalloc_caches[i], + "kmalloc", 1 << i, GFP_KERNEL); + caches++; + } + + + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) + size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + + if (KMALLOC_MIN_SIZE == 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[(i - 1) / 8] = 8; + } + + slab_state = UP; + + /* Provide the correct kmalloc names now that the caches are up */ + for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + kmalloc_caches[i]. name = + kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + +#ifdef CONFIG_SMP + register_cpu_notifier(&slab_notifier); + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#else + kmem_size = sizeof(struct kmem_cache); +#endif + + printk(KERN_INFO + "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + " CPUs=%d, Nodes=%d\n", + caches, cache_line_size(), + slub_min_order, slub_max_order, slub_min_objects, + nr_cpu_ids, nr_node_ids); +} + +/* + * Find a mergeable slab cache + */ +static int slab_unmergeable(struct kmem_cache *s) +{ + if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) + return 1; + + if (s->ctor) + return 1; + + /* + * We may have set a slab to be unmergeable during bootstrap. + */ + if (s->refcount < 0) + return 1; + + return 0; +} + +static struct kmem_cache *find_mergeable(size_t size, + size_t align, unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + struct kmem_cache *s; + + if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) + return NULL; + + if (ctor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); + + list_for_each_entry(s, &slab_caches, list) { + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. + * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align - 1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + return s; + } + return NULL; +} + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s; + + down_write(&slub_lock); + s = find_mergeable(size, align, flags, name, ctor); + if (s) { + int cpu; + + s->refcount++; + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + s->objsize = max(s->objsize, (int)size); + + /* + * And then we need to update the object size in the + * per cpu structures + */ + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->objsize = s->objsize; + + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + up_write(&slub_lock); + + if (sysfs_slab_alias(s, name)) + goto err; + return s; + } + + s = kmalloc(kmem_size, GFP_KERNEL); + if (s) { + if (kmem_cache_open(s, GFP_KERNEL, name, + size, align, flags, ctor)) { + list_add(&s->list, &slab_caches); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto err; + return s; + } + kfree(s); + } + up_write(&slub_lock); + +err: + if (flags & SLAB_PANIC) + panic("Cannot create slabcache %s\n", name); + else + s = NULL; + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. + */ +static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct kmem_cache *s; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_alloc_cpu_cpu(cpu); + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) + s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, + GFP_KERNEL); + up_read(&slub_lock); + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + free_kmem_cache_cpu(c, cpu); + s->cpu_slab[cpu] = NULL; + } + up_read(&slub_lock); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata slab_notifier = { + .notifier_call = slab_cpuup_callback +}; + +#endif + +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +{ + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE)) + return kmalloc_large(size, gfpflags); + + s = get_slab(size, gfpflags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + return slab_alloc(s, gfpflags, -1, caller); +} + +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, void *caller) +{ + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE)) + return kmalloc_large_node(size, gfpflags, node); + + s = get_slab(size, gfpflags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + return slab_alloc(s, gfpflags, node, caller); +} + +#ifdef CONFIG_SLUB_DEBUG +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += get_count(page); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; +} + +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} + +static int validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) +{ + void *p; + void *addr = page_address(page); + + if (!check_slab(s, page) || + !on_freelist(s, page, NULL)) + return 0; + + /* Now we know that a valid freelist exists */ + bitmap_zero(map, page->objects); + + for_each_free_object(p, s, page->freelist) { + set_bit(slab_index(p, s, addr), map); + if (!check_object(s, page, p, 0)) + return 0; + } + + for_each_object(p, s, addr, page->objects) + if (!test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, 1)) + return 0; + return 1; +} + +static void validate_slab_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) +{ + if (slab_trylock(page)) { + validate_slab(s, page, map); + slab_unlock(page); + } else + printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", + s->name, page); + + if (s->flags & DEBUG_DEFAULT_FLAGS) { + if (!PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug not set " + "on slab 0x%p\n", s->name, page); + } else { + if (PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug set on " + "slab 0x%p\n", s->name, page); + } +} + +static int validate_slab_node(struct kmem_cache *s, + struct kmem_cache_node *n, unsigned long *map) +{ + unsigned long count = 0; + struct page *page; + unsigned long flags; + + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, lru) { + validate_slab_slab(s, page, map); + count++; + } + if (count != n->nr_partial) + printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " + "counter=%ld\n", s->name, count, n->nr_partial); + + if (!(s->flags & SLAB_STORE_USER)) + goto out; + + list_for_each_entry(page, &n->full, lru) { + validate_slab_slab(s, page, map); + count++; + } + if (count != atomic_long_read(&n->nr_slabs)) + printk(KERN_ERR "SLUB: %s %ld slabs counted but " + "counter=%ld\n", s->name, count, + atomic_long_read(&n->nr_slabs)); + +out: + spin_unlock_irqrestore(&n->list_lock, flags); + return count; +} + +static long validate_slab_cache(struct kmem_cache *s) +{ + int node; + unsigned long count = 0; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); + + if (!map) + return -ENOMEM; + + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + count += validate_slab_node(s, n, map); + } + kfree(map); + return count; +} + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches + 4); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches + 5); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches + 6); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 7); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", + p); + validate_slab_cache(kmalloc_caches + 8); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 9); +} +#else +static void resiliency_test(void) {}; +#endif + +/* + * Generate lists of code addresses where slabcache objects are allocated + * and freed. + */ + +struct location { + unsigned long count; + void *addr; + long long sum_time; + long min_time; + long max_time; + long min_pid; + long max_pid; + cpumask_t cpus; + nodemask_t nodes; +}; + +struct loc_track { + unsigned long max; + unsigned long count; + struct location *loc; +}; + +static void free_loc_track(struct loc_track *t) +{ + if (t->max) + free_pages((unsigned long)t->loc, + get_order(sizeof(struct location) * t->max)); +} + +static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) +{ + struct location *l; + int order; + + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); + if (!l) + return 0; + + if (t->count) { + memcpy(l, t->loc, sizeof(struct location) * t->count); + free_loc_track(t); + } + t->max = max; + t->loc = l; + return 1; +} + +static int add_location(struct loc_track *t, struct kmem_cache *s, + const struct track *track) +{ + long start, end, pos; + struct location *l; + void *caddr; + unsigned long age = jiffies - track->when; + + start = -1; + end = t->count; + + for ( ; ; ) { + pos = start + (end - start + 1) / 2; + + /* + * There is nothing at "end". If we end up there + * we need to add something to before end. + */ + if (pos == end) + break; + + caddr = t->loc[pos].addr; + if (track->addr == caddr) { + + l = &t->loc[pos]; + l->count++; + if (track->when) { + l->sum_time += age; + if (age < l->min_time) + l->min_time = age; + if (age > l->max_time) + l->max_time = age; + + if (track->pid < l->min_pid) + l->min_pid = track->pid; + if (track->pid > l->max_pid) + l->max_pid = track->pid; + + cpu_set(track->cpu, l->cpus); + } + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; + } + + if (track->addr < caddr) + end = pos; + else + start = pos; + } + + /* + * Not found. Insert new tracking element. + */ + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) + return 0; + + l = t->loc + pos; + if (pos < t->count) + memmove(l + 1, l, + (t->count - pos) * sizeof(struct location)); + t->count++; + l->count = 1; + l->addr = track->addr; + l->sum_time = age; + l->min_time = age; + l->max_time = age; + l->min_pid = track->pid; + l->max_pid = track->pid; + cpus_clear(l->cpus); + cpu_set(track->cpu, l->cpus); + nodes_clear(l->nodes); + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; +} + +static void process_slab(struct loc_track *t, struct kmem_cache *s, + struct page *page, enum track_item alloc) +{ + void *addr = page_address(page); + DECLARE_BITMAP(map, page->objects); + void *p; + + bitmap_zero(map, page->objects); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) + if (!test_bit(slab_index(p, s, addr), map)) + add_location(t, s, get_track(s, p, alloc)); +} + +static int list_locations(struct kmem_cache *s, char *buf, + enum track_item alloc) +{ + int len = 0; + unsigned long i; + struct loc_track t = { 0, 0, NULL }; + int node; + + if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) + return sprintf(buf, "Out of memory\n"); + + /* Push back cpu slabs */ + flush_all(s); + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long flags; + struct page *page; + + if (!atomic_long_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + process_slab(&t, s, page, alloc); + list_for_each_entry(page, &n->full, lru) + process_slab(&t, s, page, alloc); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { + struct location *l = &t.loc[i]; + + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) + break; + len += sprintf(buf + len, "%7ld ", l->count); + + if (l->addr) + len += sprint_symbol(buf + len, (unsigned long)l->addr); + else + len += sprintf(buf + len, "<not-available>"); + + if (l->sum_time != l->min_time) { + len += sprintf(buf + len, " age=%ld/%ld/%ld", + l->min_time, + (long)div_u64(l->sum_time, l->count), + l->max_time); + } else + len += sprintf(buf + len, " age=%ld", + l->min_time); + + if (l->min_pid != l->max_pid) + len += sprintf(buf + len, " pid=%ld-%ld", + l->min_pid, l->max_pid); + else + len += sprintf(buf + len, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && + len < PAGE_SIZE - 60) { + len += sprintf(buf + len, " cpus="); + len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, + l->cpus); + } + + if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + len < PAGE_SIZE - 60) { + len += sprintf(buf + len, " nodes="); + len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, + l->nodes); + } + + len += sprintf(buf + len, "\n"); + } + + free_loc_track(&t); + if (!t.count) + len += sprintf(buf, "No data\n"); + return len; +} + +enum slab_stat_type { + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ +}; + +#define SO_ALL (1 << SL_ALL) +#define SO_PARTIAL (1 << SL_PARTIAL) +#define SO_CPU (1 << SL_CPU) +#define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) + +static ssize_t show_slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) +{ + unsigned long total = 0; + int node; + int x; + unsigned long *nodes; + unsigned long *per_cpu; + + nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + if (!nodes) + return -ENOMEM; + per_cpu = nodes + nr_node_ids; + + if (flags & SO_CPU) { + int cpu; + + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (!c || c->node < 0) + continue; + + if (c->page) { + if (flags & SO_TOTAL) + x = c->page->objects; + else if (flags & SO_OBJECTS) + x = c->page->inuse; + else + x = 1; + + total += x; + nodes[c->node] += x; + } + per_cpu[c->node]++; + } + } + + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); + + else + x = atomic_long_read(&n->nr_slabs); + total += x; + nodes[node] += x; + } + + } else if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); + else + x = n->nr_partial; + total += x; + nodes[node] += x; + } + } + x = sprintf(buf, "%lu", total); +#ifdef CONFIG_NUMA + for_each_node_state(node, N_NORMAL_MEMORY) + if (nodes[node]) + x += sprintf(buf + x, " N%d=%lu", + node, nodes[node]); +#endif + kfree(nodes); + return x + sprintf(buf + x, "\n"); +} + +static int any_slab_objects(struct kmem_cache *s) +{ + int node; + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + if (!n) + continue; + + if (atomic_long_read(&n->total_objects)) + return 1; + } + return 0; +} + +#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab(n) container_of(n, struct kmem_cache, kobj); + +struct slab_attribute { + struct attribute attr; + ssize_t (*show)(struct kmem_cache *s, char *buf); + ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); +}; + +#define SLAB_ATTR_RO(_name) \ + static struct slab_attribute _name##_attr = __ATTR_RO(_name) + +#define SLAB_ATTR(_name) \ + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t slab_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->size); +} +SLAB_ATTR_RO(slab_size); + +static ssize_t align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->align); +} +SLAB_ATTR_RO(align); + +static ssize_t object_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->objsize); +} +SLAB_ATTR_RO(object_size); + +static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", oo_objects(s->oo)); +} +SLAB_ATTR_RO(objs_per_slab); + +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long order; + int err; + + err = strict_strtoul(buf, 10, &order); + if (err) + return err; + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + +static ssize_t order_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", oo_order(s->oo)); +} +SLAB_ATTR(order); + +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (s->ctor) { + int n = sprint_symbol(buf, (unsigned long)s->ctor); + + return n + sprintf(buf + n, "\n"); + } + return 0; +} +SLAB_ATTR_RO(ctor); + +static ssize_t aliases_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->refcount - 1); +} +SLAB_ATTR_RO(aliases); + +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL); +} +SLAB_ATTR_RO(slabs); + +static ssize_t partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL); +} +SLAB_ATTR_RO(partial); + +static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_CPU); +} +SLAB_ATTR_RO(cpu_slabs); + +static ssize_t objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects); + +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); +} + +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') + s->flags |= SLAB_DEBUG_FREE; + return length; +} +SLAB_ATTR(sanity_checks); + +static ssize_t trace_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); +} + +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') + s->flags |= SLAB_TRACE; + return length; +} +SLAB_ATTR(trace); + +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); +} + +static ssize_t reclaim_account_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_RECLAIM_ACCOUNT; + if (buf[0] == '1') + s->flags |= SLAB_RECLAIM_ACCOUNT; + return length; +} +SLAB_ATTR(reclaim_account); + +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); +} +SLAB_ATTR_RO(hwcache_align); + +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); +} +SLAB_ATTR_RO(cache_dma); +#endif + +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); +} +SLAB_ATTR_RO(destroy_by_rcu); + +static ssize_t red_zone_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); +} + +static ssize_t red_zone_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_RED_ZONE; + if (buf[0] == '1') + s->flags |= SLAB_RED_ZONE; + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(red_zone); + +static ssize_t poison_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); +} + +static ssize_t poison_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_POISON; + if (buf[0] == '1') + s->flags |= SLAB_POISON; + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(poison); + +static ssize_t store_user_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); +} + +static ssize_t store_user_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_STORE_USER; + if (buf[0] == '1') + s->flags |= SLAB_STORE_USER; + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(store_user); + +static ssize_t validate_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t validate_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = -EINVAL; + + if (buf[0] == '1') { + ret = validate_slab_cache(s); + if (ret >= 0) + ret = length; + } + return ret; +} +SLAB_ATTR(validate); + +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') { + int rc = kmem_cache_shrink(s); + + if (rc) + return rc; + } else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); + +#ifdef CONFIG_NUMA +static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); +} + +static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long ratio; + int err; + + err = strict_strtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio <= 100) + s->remote_node_defrag_ratio = ratio * 10; + + return length; +} +SLAB_ATTR(remote_node_defrag_ratio); +#endif + +#ifdef CONFIG_SLUB_STATS +static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) +{ + unsigned long sum = 0; + int cpu; + int len; + int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + for_each_online_cpu(cpu) { + unsigned x = get_cpu_slab(s, cpu)->stat[si]; + + data[cpu] = x; + sum += x; + } + + len = sprintf(buf, "%lu", sum); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + if (data[cpu] && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); + } +#endif + kfree(data); + return len + sprintf(buf + len, "\n"); +} + +#define STAT_ATTR(si, text) \ +static ssize_t text##_show(struct kmem_cache *s, char *buf) \ +{ \ + return show_stat(s, buf, si); \ +} \ +SLAB_ATTR_RO(text); \ + +STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); +STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_FASTPATH, free_fastpath); +STAT_ATTR(FREE_SLOWPATH, free_slowpath); +STAT_ATTR(FREE_FROZEN, free_frozen); +STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); +STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); +STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); +STAT_ATTR(ALLOC_SLAB, alloc_slab); +STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(FREE_SLAB, free_slab); +STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); +STAT_ATTR(DEACTIVATE_FULL, deactivate_full); +STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); +STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); +STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); +STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(ORDER_FALLBACK, order_fallback); +#endif + +static struct attribute *slab_attrs[] = { + &slab_size_attr.attr, + &object_size_attr.attr, + &objs_per_slab_attr.attr, + &order_attr.attr, + &objects_attr.attr, + &objects_partial_attr.attr, + &total_objects_attr.attr, + &slabs_attr.attr, + &partial_attr.attr, + &cpu_slabs_attr.attr, + &ctor_attr.attr, + &aliases_attr.attr, + &align_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, + &hwcache_align_attr.attr, + &reclaim_account_attr.attr, + &destroy_by_rcu_attr.attr, + &red_zone_attr.attr, + &poison_attr.attr, + &store_user_attr.attr, + &validate_attr.attr, + &shrink_attr.attr, + &alloc_calls_attr.attr, + &free_calls_attr.attr, +#ifdef CONFIG_ZONE_DMA + &cache_dma_attr.attr, +#endif +#ifdef CONFIG_NUMA + &remote_node_defrag_ratio_attr.attr, +#endif +#ifdef CONFIG_SLUB_STATS + &alloc_fastpath_attr.attr, + &alloc_slowpath_attr.attr, + &free_fastpath_attr.attr, + &free_slowpath_attr.attr, + &free_frozen_attr.attr, + &free_add_partial_attr.attr, + &free_remove_partial_attr.attr, + &alloc_from_partial_attr.attr, + &alloc_slab_attr.attr, + &alloc_refill_attr.attr, + &free_slab_attr.attr, + &cpuslab_flush_attr.attr, + &deactivate_full_attr.attr, + &deactivate_empty_attr.attr, + &deactivate_to_head_attr.attr, + &deactivate_to_tail_attr.attr, + &deactivate_remote_frees_attr.attr, + &order_fallback_attr.attr, +#endif + NULL +}; + +static struct attribute_group slab_attr_group = { + .attrs = slab_attrs, +}; + +static ssize_t slab_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->show) + return -EIO; + + err = attribute->show(s, buf); + + return err; +} + +static ssize_t slab_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->store) + return -EIO; + + err = attribute->store(s, buf, len); + + return err; +} + +static void kmem_cache_release(struct kobject *kobj) +{ + struct kmem_cache *s = to_slab(kobj); + + kfree(s); +} + +static struct sysfs_ops slab_sysfs_ops = { + .show = slab_attr_show, + .store = slab_attr_store, +}; + +static struct kobj_type slab_ktype = { + .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release +}; + +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &slab_ktype) + return 1; + return 0; +} + +static struct kset_uevent_ops slab_uevent_ops = { + .filter = uevent_filter, +}; + +static struct kset *slab_kset; + +#define ID_STR_LENGTH 64 + +/* Create a unique string id for a slab cache: + * + * Format :[flags-]size + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + + BUG_ON(!name); + + *p++ = ':'; + /* + * First flags affecting slabcache operations. We will only + * get here for aliasable slabs so we do not need to support + * too many flags. The flags here must cover all flags that + * are matched during merging to guarantee that the id is + * unique. + */ + if (s->flags & SLAB_CACHE_DMA) + *p++ = 'd'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *p++ = 'a'; + if (s->flags & SLAB_DEBUG_FREE) + *p++ = 'F'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p, "%07d", s->size); + BUG_ON(p > name + ID_STR_LENGTH - 1); + return name; +} + +static int sysfs_slab_add(struct kmem_cache *s) +{ + int err; + const char *name; + int unmergeable; + + if (slab_state < SYSFS) + /* Defer until later */ + return 0; + + unmergeable = slab_unmergeable(s); + if (unmergeable) { + /* + * Slabcache can never be merged so we can use the name proper. + * This is typically the case for debug situations. In that + * case we can catch duplicate names easily. + */ + sysfs_remove_link(&slab_kset->kobj, s->name); + name = s->name; + } else { + /* + * Create a unique name for the slab as a target + * for the symlinks. + */ + name = create_unique_id(s); + } + + s->kobj.kset = slab_kset; + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); + if (err) { + kobject_put(&s->kobj); + return err; + } + + err = sysfs_create_group(&s->kobj, &slab_attr_group); + if (err) + return err; + kobject_uevent(&s->kobj, KOBJ_ADD); + if (!unmergeable) { + /* Setup first alias */ + sysfs_slab_alias(s, s->name); + kfree(name); + } + return 0; +} + +static void sysfs_slab_remove(struct kmem_cache *s) +{ + kobject_uevent(&s->kobj, KOBJ_REMOVE); + kobject_del(&s->kobj); + kobject_put(&s->kobj); +} + +/* + * Need to buffer aliases during bootup until sysfs becomes + * available lest we loose that information. + */ +struct saved_alias { + struct kmem_cache *s; + const char *name; + struct saved_alias *next; +}; + +static struct saved_alias *alias_list; + +static int sysfs_slab_alias(struct kmem_cache *s, const char *name) +{ + struct saved_alias *al; + + if (slab_state == SYSFS) { + /* + * If we have a leftover link then remove it. + */ + sysfs_remove_link(&slab_kset->kobj, name); + return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); + } + + al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); + if (!al) + return -ENOMEM; + + al->s = s; + al->name = name; + al->next = alias_list; + alias_list = al; + return 0; +} + +static int __init slab_sysfs_init(void) +{ + struct kmem_cache *s; + int err; + + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + if (!slab_kset) { + printk(KERN_ERR "Cannot register slab subsystem.\n"); + return -ENOSYS; + } + + slab_state = SYSFS; + + list_for_each_entry(s, &slab_caches, list) { + err = sysfs_slab_add(s); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab %s" + " to sysfs\n", s->name); + } + + while (alias_list) { + struct saved_alias *al = alias_list; + + alias_list = alias_list->next; + err = sysfs_slab_alias(al->s, al->name); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab alias" + " %s to sysfs\n", s->name); + kfree(al); + } + + resiliency_test(); + return 0; +} + +__initcall(slab_sysfs_init); +#endif + +/* + * The /proc/slabinfo ABI + */ +#ifdef CONFIG_SLABINFO +static void print_slabinfo_header(struct seq_file *m) +{ + seq_puts(m, "slabinfo - version: 2.1\n"); + seq_puts(m, "# name <active_objs> <num_objs> <objsize> " + "<objperslab> <pagesperslab>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); + seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + down_read(&slub_lock); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + up_read(&slub_lock); +} + +static int s_show(struct seq_file *m, void *p) +{ + unsigned long nr_partials = 0; + unsigned long nr_slabs = 0; + unsigned long nr_inuse = 0; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; + struct kmem_cache *s; + int node; + + s = list_entry(p, struct kmem_cache, list); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + if (!n) + continue; + + nr_partials += n->nr_partial; + nr_slabs += atomic_long_read(&n->nr_slabs); + nr_objs += atomic_long_read(&n->total_objects); + nr_free += count_partial(n, count_free); + } + + nr_inuse = nr_objs - nr_free; + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, + nr_objs, s->size, oo_objects(s->oo), + (1 << oo_order(s->oo))); + seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); + seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, + 0UL); + seq_putc(m, '\n'); + return 0; +} + +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); +#endif /* CONFIG_SLABINFO */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 0000000..a13ea64 --- /dev/null +++ b/mm/sparse-vmemmap.c @@ -0,0 +1,159 @@ +/* + * Virtual Memory Map support + * + * (C) 2007 sgi. Christoph Lameter. + * + * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, + * virt_to_page, page_address() to be implemented as a base offset + * calculation without memory access. + * + * However, virtual mappings need a page table and TLBs. Many Linux + * architectures already map their physical space using 1-1 mappings + * via TLBs. For those arches the virtual memmory map is essentially + * for free if we use the same page size as the 1-1 mappings. In that + * case the overhead consists of a few additional pages that are + * allocated to create a view of memory for vmemmap. + * + * The architecture is expected to provide a vmemmap_populate() function + * to instantiate the mapping. + */ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/sched.h> +#include <asm/dma.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +/* + * Allocate a block of memory to be used to back the virtual memory map + * or to back the page tables that are used to create the mapping. + * Uses the main allocators if they are available, else bootmem. + */ + +static void * __init_refok __earlyonly_bootmem_alloc(int node, + unsigned long size, + unsigned long align, + unsigned long goal) +{ + return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); +} + + +void * __meminit vmemmap_alloc_block(unsigned long size, int node) +{ + /* If the main allocator is up use that, fallback to bootmem. */ + if (slab_is_available()) { + struct page *page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO, get_order(size)); + if (page) + return page_address(page); + return NULL; + } else + return __earlyonly_bootmem_alloc(node, size, size, + __pa(MAX_DMA_ADDRESS)); +} + +void __meminit vmemmap_verify(pte_t *pte, int node, + unsigned long start, unsigned long end) +{ + unsigned long pfn = pte_pfn(*pte); + int actual_node = early_pfn_to_nid(pfn); + + if (node_distance(actual_node, node) > LOCAL_DISTANCE) + printk(KERN_WARNING "[%lx-%lx] potential offnode " + "page_structs\n", start, end - 1); +} + +pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + pte_t entry; + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } + return pte; +} + +pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +{ + pmd_t *pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + pmd_populate_kernel(&init_mm, pmd, p); + } + return pmd; +} + +pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) +{ + pud_t *pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + pud_populate(&init_mm, pud, p); + } + return pud; +} + +pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +{ + pgd_t *pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + pgd_populate(&init_mm, pgd, p); + } + return pgd; +} + +int __meminit vmemmap_populate_basepages(struct page *start_page, + unsigned long size, int node) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (; addr < end; addr += PAGE_SIZE) { + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_populate(pud, addr, node); + if (!pmd) + return -ENOMEM; + pte = vmemmap_pte_populate(pmd, addr, node); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + } + + return 0; +} + +struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) +{ + struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); + int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); + if (error) + return NULL; + + return map; +} diff --git a/mm/sparse.c b/mm/sparse.c new file mode 100644 index 0000000..083f5b6 --- /dev/null +++ b/mm/sparse.c @@ -0,0 +1,636 @@ +/* + * sparse memory mappings. + */ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include "internal.h" +#include <asm/dma.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +/* + * Permanent SPARSEMEM data: + * + * 1) mem_section - memory sections, mem_map's for valid memory + */ +#ifdef CONFIG_SPARSEMEM_EXTREME +struct mem_section *mem_section[NR_SECTION_ROOTS] + ____cacheline_internodealigned_in_smp; +#else +struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_internodealigned_in_smp; +#endif +EXPORT_SYMBOL(mem_section); + +#ifdef NODE_NOT_IN_PAGE_FLAGS +/* + * If we did not store the node number in the page then we have to + * do a lookup in the section_to_node_table in order to find which + * node the page belongs to. + */ +#if MAX_NUMNODES <= 256 +static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#else +static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#endif + +int page_to_nid(struct page *page) +{ + return section_to_node_table[page_to_section(page)]; +} +EXPORT_SYMBOL(page_to_nid); + +static void set_section_nid(unsigned long section_nr, int nid) +{ + section_to_node_table[section_nr] = nid; +} +#else /* !NODE_NOT_IN_PAGE_FLAGS */ +static inline void set_section_nid(unsigned long section_nr, int nid) +{ +} +#endif + +#ifdef CONFIG_SPARSEMEM_EXTREME +static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) +{ + struct mem_section *section = NULL; + unsigned long array_size = SECTIONS_PER_ROOT * + sizeof(struct mem_section); + + if (slab_is_available()) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = alloc_bootmem_node(NODE_DATA(nid), array_size); + + if (section) + memset(section, 0, array_size); + + return section; +} + +static int __meminit sparse_index_init(unsigned long section_nr, int nid) +{ + static DEFINE_SPINLOCK(index_init_lock); + unsigned long root = SECTION_NR_TO_ROOT(section_nr); + struct mem_section *section; + int ret = 0; + + if (mem_section[root]) + return -EEXIST; + + section = sparse_index_alloc(nid); + if (!section) + return -ENOMEM; + /* + * This lock keeps two different sections from + * reallocating for the same index + */ + spin_lock(&index_init_lock); + + if (mem_section[root]) { + ret = -EEXIST; + goto out; + } + + mem_section[root] = section; +out: + spin_unlock(&index_init_lock); + return ret; +} +#else /* !SPARSEMEM_EXTREME */ +static inline int sparse_index_init(unsigned long section_nr, int nid) +{ + return 0; +} +#endif + +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case because + * NR_SECTION_ROOTS==NR_MEM_SECTIONS. + */ +int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { + root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); + if (!root) + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) + break; + } + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); +} + +/* + * During early boot, before section_mem_map is used for an actual + * mem_map, we use section_mem_map to store the section's NUMA + * node. This keeps us from having to use another data structure. The + * node information is cleared just before we store the real mem_map. + */ +static inline unsigned long sparse_encode_early_nid(int nid) +{ + return (nid << SECTION_NID_SHIFT); +} + +static inline int sparse_early_nid(struct mem_section *section) +{ + return (section->section_mem_map >> SECTION_NID_SHIFT); +} + +/* Validate the physical addressing limitations of the model */ +void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); + + /* + * Sanity checks - do not allow an architecture to pass + * in larger pfns than the maximum scope of sparsemem: + */ + if (*start_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *start_pfn = max_sparsemem_pfn; + *end_pfn = max_sparsemem_pfn; + } + + if (*end_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *end_pfn = max_sparsemem_pfn; + } +} + +/* Record a memory area against a node. */ +void __init memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + start &= PAGE_SECTION_MASK; + mminit_validate_memmodel_limits(&start, &end); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { + unsigned long section = pfn_to_section_nr(pfn); + struct mem_section *ms; + + sparse_index_init(section, nid); + set_section_nid(section, nid); + + ms = __nr_to_section(section); + if (!ms->section_mem_map) + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_MARKED_PRESENT; + } +} + +/* + * Only used by the i386 NUMA architecures, but relatively + * generic code. + */ +unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long nr_pages = 0; + + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + if (nid != early_pfn_to_nid(pfn)) + continue; + + if (pfn_present(pfn)) + nr_pages += PAGES_PER_SECTION; + } + + return nr_pages * sizeof(struct page); +} + +/* + * Subtle, we encode the real pfn into the mem_map such that + * the identity pfn - section_mem_map will return the actual + * physical page frame number. + */ +static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) +{ + return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); +} + +/* + * Decode mem_map from the coded memmap + */ +struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) +{ + /* mask off the extra low bits of information */ + coded_mem_map &= SECTION_MAP_MASK; + return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); +} + +static int __meminit sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map, + unsigned long *pageblock_bitmap) +{ + if (!present_section(ms)) + return -EINVAL; + + ms->section_mem_map &= ~SECTION_MAP_MASK; + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | + SECTION_HAS_MEM_MAP; + ms->pageblock_flags = pageblock_bitmap; + + return 1; +} + +unsigned long usemap_size(void) +{ + unsigned long size_bytes; + size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; + size_bytes = roundup(size_bytes, sizeof(unsigned long)); + return size_bytes; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long *__kmalloc_section_usemap(void) +{ + return kmalloc(usemap_size(), GFP_KERNEL); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static unsigned long * __init +sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +{ + unsigned long section_nr; + + /* + * A page may contain usemaps for other sections preventing the + * page being freed and making a section unremovable while + * other sections referencing the usemap retmain active. Similarly, + * a pgdat can prevent a section being removed. If section A + * contains a pgdat and section B contains the usemap, both + * sections become inter-dependent. This allocates usemaps + * from the same section as the pgdat where possible to avoid + * this problem. + */ + section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + return alloc_bootmem_section(usemap_size(), section_nr); +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ + unsigned long usemap_snr, pgdat_snr; + static unsigned long old_usemap_snr = NR_MEM_SECTIONS; + static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) + return; + + if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) + /* skip redundant message */ + return; + + old_usemap_snr = usemap_snr; + old_pgdat_snr = pgdat_snr; + + usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); + if (usemap_nid != nid) { + printk(KERN_INFO + "node %d must be removed before remove section %ld\n", + nid, usemap_snr); + return; + } + /* + * There is a circular dependency. + * Some platforms allow un-removable section because they will just + * gather other removable sections for dynamic partitioning. + * Just notify un-removable section's number here. + */ + printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, + pgdat_snr, nid); + printk(KERN_CONT + " have a circular dependency on usemap and pgdat allocations\n"); +} +#else +static unsigned long * __init +sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +{ + return NULL; +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) +{ + unsigned long *usemap; + struct mem_section *ms = __nr_to_section(pnum); + int nid = sparse_early_nid(ms); + + usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); + if (usemap) + return usemap; + + usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + if (usemap) { + check_usemap_section_nr(nid, usemap); + return usemap; + } + + /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ + nid = 0; + + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return NULL; +} + +#ifndef CONFIG_SPARSEMEM_VMEMMAP +struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) +{ + struct page *map; + + map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); + if (map) + return map; + + map = alloc_bootmem_pages_node(NODE_DATA(nid), + PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); + return map; +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +{ + struct page *map; + struct mem_section *ms = __nr_to_section(pnum); + int nid = sparse_early_nid(ms); + + map = sparse_mem_map_populate(pnum, nid); + if (map) + return map; + + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + return NULL; +} + +void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +{ +} +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum; + struct page *map; + unsigned long *usemap; + unsigned long **usemap_map; + int size; + + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less one page (aka 24 bytes) + * so alloc 2M (with 2M align) and 24 bytes in turn will + * make next 2M slip to one more 2M later. + * then in big system, the memory will have a lot of holes... + * here try to allocate 2M pages continously. + * + * powerpc need to call sparse_init_one_section right after each + * sparse_early_mem_map_alloc, so allocate usemap_map at first. + */ + size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + usemap_map = alloc_bootmem(size); + if (!usemap_map) + panic("can not allocate usemap_map\n"); + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = sparse_early_usemap_alloc(pnum); + } + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!present_section_nr(pnum)) + continue; + + usemap = usemap_map[pnum]; + if (!usemap) + continue; + + map = sparse_early_mem_map_alloc(pnum); + if (!map) + continue; + + sparse_init_one_section(__nr_to_section(pnum), pnum, map, + usemap); + } + + vmemmap_populate_print_last(); + + free_bootmem(__pa(usemap_map), size); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, + unsigned long nr_pages) +{ + /* This will make the necessary allocations eventually. */ + return sparse_mem_map_populate(pnum, nid); +} +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + return; /* XXX: Not implemented yet */ +} +static void free_map_bootmem(struct page *page, unsigned long nr_pages) +{ +} +#else +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, + unsigned long nr_pages) +{ + return __kmalloc_section_memmap(nr_pages); +} + +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + if (is_vmalloc_addr(memmap)) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); +} + +static void free_map_bootmem(struct page *page, unsigned long nr_pages) +{ + unsigned long maps_section_nr, removing_section_nr, i; + int magic; + + for (i = 0; i < nr_pages; i++, page++) { + magic = atomic_read(&page->_mapcount); + + BUG_ON(magic == NODE_INFO); + + maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); + removing_section_nr = page->private; + + /* + * When this function is called, the removing section is + * logical offlined state. This means all pages are isolated + * from page allocator. If removing section's memmap is placed + * on the same section, it must not be freed. + * If it is freed, page allocator may allocate it which will + * be removed physically soon. + */ + if (maps_section_nr != removing_section_nr) + put_page_bootmem(page); + } +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + struct page *usemap_page; + unsigned long nr_pages; + + if (!usemap) + return; + + usemap_page = virt_to_page(usemap); + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(usemap_page)) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap, PAGES_PER_SECTION); + return; + } + + /* + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. + */ + + if (memmap) { + struct page *memmap_page; + memmap_page = virt_to_page(memmap); + + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + free_map_bootmem(memmap_page, nr_pages); + } +} + +/* + * returns the number of sections whose mem_maps were properly + * set. If this is <=0, then that means that the passed-in + * map was not consumed and must be freed. + */ +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + int nr_pages) +{ + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct pglist_data *pgdat = zone->zone_pgdat; + struct mem_section *ms; + struct page *memmap; + unsigned long *usemap; + unsigned long flags; + int ret; + + /* + * no locking for this, because it does its own + * plus, it does a kmalloc + */ + ret = sparse_index_init(section_nr, pgdat->node_id); + if (ret < 0 && ret != -EEXIST) + return ret; + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); + if (!memmap) + return -ENOMEM; + usemap = __kmalloc_section_usemap(); + if (!usemap) { + __kfree_section_memmap(memmap, nr_pages); + return -ENOMEM; + } + + pgdat_resize_lock(pgdat, &flags); + + ms = __pfn_to_section(start_pfn); + if (ms->section_mem_map & SECTION_MARKED_PRESENT) { + ret = -EEXIST; + goto out; + } + + ms->section_mem_map |= SECTION_MARKED_PRESENT; + + ret = sparse_init_one_section(ms, section_nr, memmap, usemap); + +out: + pgdat_resize_unlock(pgdat, &flags); + if (ret <= 0) { + kfree(usemap); + __kfree_section_memmap(memmap, nr_pages); + } + return ret; +} + +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +{ + struct page *memmap = NULL; + unsigned long *usemap = NULL; + + if (ms->section_mem_map) { + usemap = ms->pageblock_flags; + memmap = sparse_decode_mem_map(ms->section_mem_map, + __section_nr(ms)); + ms->section_mem_map = 0; + ms->pageblock_flags = NULL; + } + + free_section_usemap(memmap, usemap); +} +#endif diff --git a/mm/swap.c b/mm/swap.c new file mode 100644 index 0000000..b135ec9 --- /dev/null +++ b/mm/swap.c @@ -0,0 +1,606 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file contains the default values for the operation of the + * Linux VM subsystem. Fine-tuning documentation can be found in + * Documentation/sysctl/vm.txt. + * Started 18.12.91 + * Swap aging added 23.2.95, Stephen Tweedie. + * Buffermem limits added 12.3.98, Rik van Riel. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm_inline.h> +#include <linux/buffer_head.h> /* for try_to_release_page() */ +#include <linux/percpu_counter.h> +#include <linux/percpu.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/backing-dev.h> +#include <linux/memcontrol.h> + +#include "internal.h" + +/* How many pages do we try to swap or page in/out together? */ +int page_cluster; + +static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); + +/* + * This path almost never happens for VM activity - pages are normally + * freed via pagevecs. But it gets used by networking. + */ +static void __page_cache_release(struct page *page) +{ + if (PageLRU(page)) { + unsigned long flags; + struct zone *zone = page_zone(page); + + spin_lock_irqsave(&zone->lru_lock, flags); + VM_BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); + del_page_from_lru(zone, page); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + free_hot_page(page); +} + +static void put_compound_page(struct page *page) +{ + page = compound_head(page); + if (put_page_testzero(page)) { + compound_page_dtor *dtor; + + dtor = get_compound_page_dtor(page); + (*dtor)(page); + } +} + +void put_page(struct page *page) +{ + if (unlikely(PageCompound(page))) + put_compound_page(page); + else if (put_page_testzero(page)) + __page_cache_release(page); +} +EXPORT_SYMBOL(put_page); + +/** + * put_pages_list() - release a list of pages + * @pages: list of pages threaded on page->lru + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int i; + int pgmoved = 0; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock(&zone->lru_lock); + zone = pagezone; + spin_lock(&zone->lru_lock); + } + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_is_file_cache(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + pgmoved++; + } + } + if (zone) + spin_unlock(&zone->lru_lock); + __count_vm_events(PGROTATED, pgmoved); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. + */ +void rotate_reclaimable_page(struct page *page) +{ + if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + !PageUnevictable(page) && PageLRU(page)) { + struct pagevec *pvec; + unsigned long flags; + + page_cache_get(page); + local_irq_save(flags); + pvec = &__get_cpu_var(lru_rotate_pvecs); + if (!pagevec_add(pvec, page)) + pagevec_move_tail(pvec); + local_irq_restore(flags); + } +} + +/* + * FIXME: speed this up? + */ +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = LRU_BASE + file; + del_page_from_lru_list(zone, page, lru); + + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); + __count_vm_event(PGACTIVATE); + mem_cgroup_move_lists(page, lru); + + zone->recent_rotated[!!file]++; + zone->recent_scanned[!!file]++; + } + spin_unlock_irq(&zone->lru_lock); +} + +/* + * Mark a page as having seen activity. + * + * inactive,unreferenced -> inactive,referenced + * inactive,referenced -> active,unreferenced + * active,unreferenced -> active,referenced + */ +void mark_page_accessed(struct page *page) +{ + if (!PageActive(page) && !PageUnevictable(page) && + PageReferenced(page) && PageLRU(page)) { + activate_page(page); + ClearPageReferenced(page); + } else if (!PageReferenced(page)) { + SetPageReferenced(page); + } +} + +EXPORT_SYMBOL(mark_page_accessed); + +void __lru_cache_add(struct page *page, enum lru_list lru) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + ____pagevec_lru_add(pvec, lru); + put_cpu_var(lru_add_pvecs); +} + +/** + * lru_cache_add_lru - add a page to a page list + * @page: the page to be added to the LRU. + * @lru: the LRU list to which the page is added. + */ +void lru_cache_add_lru(struct page *page, enum lru_list lru) +{ + if (PageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); + ClearPageActive(page); + } else if (PageUnevictable(page)) { + VM_BUG_ON(PageActive(page)); + ClearPageUnevictable(page); + } + + VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); + __lru_cache_add(page, lru); +} + +/** + * add_page_to_unevictable_list - add a page to the unevictable list + * @page: the page to be added to the unevictable list + * + * Add page directly to its zone's unevictable list. To avoid races with + * tasks that might be making the page evictable, through eg. munlock, + * munmap or exit, while it's not on the lru, we want to add the page + * while it's locked or otherwise "invisible" to other tasks. This is + * difficult to do when using the pagevec cache, so bypass that. + */ +void add_page_to_unevictable_list(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + SetPageUnevictable(page); + SetPageLRU(page); + add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + spin_unlock_irq(&zone->lru_lock); +} + +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * place @page on active or unevictable LRU list, depending on + * page_evictable(). Note that if the page is not evictable, + * it goes directly back onto it's zone's unevictable list. It does + * NOT use a per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + if (page_evictable(page, vma)) + lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); + else + add_page_to_unevictable_list(page); +} + +/* + * Drain pages out of the cpu's pagevecs. + * Either "cpu" is the current CPU, and preemption has already been + * disabled; or "cpu" is being hot-unplugged, and is already dead. + */ +static void drain_cpu_pagevecs(int cpu) +{ + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); + struct pagevec *pvec; + int lru; + + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + ____pagevec_lru_add(pvec, lru); + } + + pvec = &per_cpu(lru_rotate_pvecs, cpu); + if (pagevec_count(pvec)) { + unsigned long flags; + + /* No harm done if a racing interrupt already did this */ + local_irq_save(flags); + pagevec_move_tail(pvec); + local_irq_restore(flags); + } +} + +void lru_add_drain(void) +{ + drain_cpu_pagevecs(get_cpu()); + put_cpu(); +} + +static void lru_add_drain_per_cpu(struct work_struct *dummy) +{ + lru_add_drain(); +} + +/* + * Returns 0 for success + */ +int lru_add_drain_all(void) +{ + return schedule_on_each_cpu(lru_add_drain_per_cpu); +} + +/* + * Batched page_cache_release(). Decrement the reference count on all the + * passed pages. If it fell to zero then remove the page from the LRU and + * free it. + * + * Avoid taking zone->lru_lock if possible, but if it is taken, retain it + * for the remainder of the operation. + * + * The locking in this function is against shrink_inactive_list(): we recheck + * the page count inside the lock to see whether shrink_inactive_list() + * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() + * will free it. + */ +void release_pages(struct page **pages, int nr, int cold) +{ + int i; + struct pagevec pages_to_free; + struct zone *zone = NULL; + unsigned long uninitialized_var(flags); + + pagevec_init(&pages_to_free, cold); + for (i = 0; i < nr; i++) { + struct page *page = pages[i]; + + if (unlikely(PageCompound(page))) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + put_compound_page(page); + continue; + } + + if (!put_page_testzero(page)) + continue; + + if (PageLRU(page)) { + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, + flags); + zone = pagezone; + spin_lock_irqsave(&zone->lru_lock, flags); + } + VM_BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); + del_page_from_lru(zone, page); + } + + if (!pagevec_add(&pages_to_free, page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); + } + } + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, flags); + + pagevec_free(&pages_to_free); +} + +/* + * The pages which we're about to release may be in the deferred lru-addition + * queues. That would prevent them from really being freed right now. That's + * OK from a correctness point of view but is inefficient - those pages may be + * cache-warm and we want to give them back to the page allocator ASAP. + * + * So __pagevec_release() will drain those queues here. __pagevec_lru_add() + * and __pagevec_lru_add_active() call release_pages() directly to avoid + * mutual recursion. + */ +void __pagevec_release(struct pagevec *pvec) +{ + lru_add_drain(); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + pagevec_reinit(pvec); +} + +EXPORT_SYMBOL(__pagevec_release); + +/* + * pagevec_release() for pages which are known to not be on the LRU + * + * This function reinitialises the caller's pagevec. + */ +void __pagevec_release_nonlru(struct pagevec *pvec) +{ + int i; + struct pagevec pages_to_free; + + pagevec_init(&pages_to_free, pvec->cold); + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + VM_BUG_ON(PageLRU(page)); + if (put_page_testzero(page)) + pagevec_add(&pages_to_free, page); + } + pagevec_free(&pages_to_free); + pagevec_reinit(pvec); +} + +/* + * Add the passed pages to the LRU, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. + */ +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +{ + int i; + struct zone *zone = NULL; + VM_BUG_ON(is_unevictable_lru(lru)); + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + int file; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + file = is_file_lru(lru); + zone->recent_scanned[file]++; + if (is_active_lru(lru)) { + SetPageActive(page); + zone->recent_rotated[file]++; + } + add_page_to_lru_list(zone, page, lru); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +EXPORT_SYMBOL(____pagevec_lru_add); + +/* + * Try to drop buffers from the pages in a pagevec + */ +void pagevec_strip(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (PagePrivate(page) && trylock_page(page)) { + if (PagePrivate(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } +} + +/** + * pagevec_swap_free - try to free swap space from the pages in a pagevec + * @pvec: pagevec with swapcache pages to free the swap space of + * + * The caller needs to hold an extra reference to each page and + * not hold the page lock on the pages. This function uses a + * trylock on the page lock so it may not always free the swap + * space associated with a page. + */ +void pagevec_swap_free(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (PageSwapCache(page) && trylock_page(page)) { + if (PageSwapCache(page)) + remove_exclusive_swap_page_ref(page); + unlock_page(page); + } + } +} + +/** + * pagevec_lookup - gang pagecache lookup + * @pvec: Where the resulting pages are placed + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * + * pagevec_lookup() will search for and return a group of up to @nr_pages pages + * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * reference against the pages in @pvec. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * pagevec_lookup() returns the number of pages which were found. + */ +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages) +{ + pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + return pagevec_count(pvec); +} + +EXPORT_SYMBOL(pagevec_lookup); + +unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, + pgoff_t *index, int tag, unsigned nr_pages) +{ + pvec->nr = find_get_pages_tag(mapping, index, tag, + nr_pages, pvec->pages); + return pagevec_count(pvec); +} + +EXPORT_SYMBOL(pagevec_lookup_tag); + +#ifdef CONFIG_SMP +/* + * We tolerate a little inaccuracy to avoid ping-ponging the counter between + * CPUs + */ +#define ACCT_THRESHOLD max(16, NR_CPUS * 2) + +static DEFINE_PER_CPU(long, committed_space); + +void vm_acct_memory(long pages) +{ + long *local; + + preempt_disable(); + local = &__get_cpu_var(committed_space); + *local += pages; + if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { + atomic_long_add(*local, &vm_committed_space); + *local = 0; + } + preempt_enable(); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Drop the CPU's cached committed space back into the central pool. */ +static int cpu_swap_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long *committed; + + committed = &per_cpu(committed_space, (long)hcpu); + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + atomic_long_add(*committed, &vm_committed_space); + *committed = 0; + drain_cpu_pagevecs((long)hcpu); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + +/* + * Perform any setup for the swap system + */ +void __init swap_setup(void) +{ + unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + +#ifdef CONFIG_SWAP + bdi_init(swapper_space.backing_dev_info); +#endif + + /* Use a smaller cluster for small-memory machines */ + if (megs < 16) + page_cluster = 2; + else + page_cluster = 3; + /* + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ +#ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); +#endif +} diff --git a/mm/swap_state.c b/mm/swap_state.c new file mode 100644 index 0000000..3353c90 --- /dev/null +++ b/mm/swap_state.c @@ -0,0 +1,372 @@ +/* + * linux/mm/swap_state.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> +#include <linux/migrate.h> + +#include <asm/pgtable.h> + +/* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_page_list, to make sync_page look nicer, and to allow + * future use of radix_tree tags in the swap cache. + */ +static const struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .sync_page = block_sync_page, + .set_page_dirty = __set_page_dirty_nobuffers, + .migratepage = migrate_page, +}; + +static struct backing_dev_info swap_backing_dev_info = { + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, + .unplug_io_fn = swap_unplug_io_fn, +}; + +struct address_space swapper_space = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), + .a_ops = &swap_aops, + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), + .backing_dev_info = &swap_backing_dev_info, +}; + +#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) + +static struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; +} swap_cache_info; + +void show_swap_cache_info(void) +{ + printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", + swap_cache_info.add_total, swap_cache_info.del_total, + swap_cache_info.find_success, swap_cache_info.find_total); + printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); +} + +/* + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * but sets SwapCache flag and private instead of mapping and index. + */ +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +{ + int error; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageSwapCache(page)); + BUG_ON(PagePrivate(page)); + BUG_ON(!PageSwapBacked(page)); + error = radix_tree_preload(gfp_mask); + if (!error) { + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); + error = radix_tree_insert(&swapper_space.page_tree, + entry.val, page); + if (likely(!error)) { + total_swapcache_pages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + } + spin_unlock_irq(&swapper_space.tree_lock); + radix_tree_preload_end(); + + if (unlikely(error)) { + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } + } + return error; +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache. + */ +void __delete_from_swap_cache(struct page *page) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(!PageSwapCache(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(PagePrivate(page)); + + radix_tree_delete(&swapper_space.page_tree, page_private(page)); + set_page_private(page, 0); + ClearPageSwapCache(page); + total_swapcache_pages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(del_total); +} + +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * @gfp_mask: memory allocation flags + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page, gfp_t gfp_mask) +{ + swp_entry_t entry; + int err; + + BUG_ON(!PageLocked(page)); + BUG_ON(!PageUptodate(page)); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + + /* + * Radix-tree node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache and mark it dirty + */ + err = add_to_swap_cache(page, entry, + gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); + + switch (err) { + case 0: /* Success */ + SetPageDirty(page); + return 1; + case -EEXIST: + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + continue; + default: + /* -ENOMEM radix-tree allocation failure */ + swap_free(entry); + return 0; + } + } +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache and locked. + * It will never put the page into the free list, + * the caller has a reference on the page. + */ +void delete_from_swap_cache(struct page *page) +{ + swp_entry_t entry; + + entry.val = page_private(page); + + spin_lock_irq(&swapper_space.tree_lock); + __delete_from_swap_cache(page); + spin_unlock_irq(&swapper_space.tree_lock); + + swap_free(entry); + page_cache_release(page); +} + +/* + * If we are the only user, then try to free up the swap cache. + * + * Its ok to check for PageSwapCache without the page lock + * here because we are going to recheck again inside + * exclusive_swap_page() _with_ the lock. + * - Marcelo + */ +static inline void free_swap_cache(struct page *page) +{ + if (PageSwapCache(page) && trylock_page(page)) { + remove_exclusive_swap_page(page); + unlock_page(page); + } +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. + */ +void free_page_and_swap_cache(struct page *page) +{ + free_swap_cache(page); + page_cache_release(page); +} + +/* + * Passed an array of pages, drop them all from swapcache and then release + * them. They are removed from the LRU and freed if this is their last use. + */ +void free_pages_and_swap_cache(struct page **pages, int nr) +{ + struct page **pagep = pages; + + lru_add_drain(); + while (nr) { + int todo = min(nr, PAGEVEC_SIZE); + int i; + + for (i = 0; i < todo; i++) + free_swap_cache(pagep[i]); + release_pages(pagep, todo, 0); + pagep += todo; + nr -= todo; + } +} + +/* + * Lookup a swap entry in the swap cache. A found page will be returned + * unlocked and with its refcount incremented - we rely on the kernel + * lock getting page table operations atomic even if we drop the page + * lock before returning. + */ +struct page * lookup_swap_cache(swp_entry_t entry) +{ + struct page *page; + + page = find_get_page(&swapper_space, entry.val); + + if (page) + INC_CACHE_INFO(find_success); + + INC_CACHE_INFO(find_total); + return page; +} + +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. + * A failure return means that either the page allocation failed or that + * the swap entry is no longer in use. + */ +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + struct page *found_page, *new_page = NULL; + int err; + + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics. + */ + found_page = find_get_page(&swapper_space, entry.val); + if (found_page) + break; + + /* + * Get a new page to read into from swap. + */ + if (!new_page) { + new_page = alloc_page_vma(gfp_mask, vma, addr); + if (!new_page) + break; /* Out of memory */ + } + + /* + * Swap entry may have been freed since our caller observed it. + */ + if (!swap_duplicate(entry)) + break; + + /* + * Associate the page with swap entry in the swap cache. + * May fail (-EEXIST) if there is already a page associated + * with this entry in the swap cache: added by a racing + * read_swap_cache_async, or add_to_swap or shmem_writepage + * re-using the just freed swap entry for an existing page. + * May fail (-ENOMEM) if radix-tree node allocation failed. + */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); + if (likely(!err)) { + /* + * Initiate read into locked page and return. + */ + lru_cache_add_anon(new_page); + swap_readpage(NULL, new_page); + return new_page; + } + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); + swap_free(entry); + } while (err != -ENOMEM); + + if (new_page) + page_cache_release(new_page); + return found_page; +} + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vma: user vma this address belongs to + * @addr: target address for mempolicy + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + int nr_pages; + struct page *page; + unsigned long offset; + unsigned long end_offset; + + /* + * Get starting offset for readaround, and number of pages to read. + * Adjust starting address by readbehind (for NUMA interleave case)? + * No, it's very unlikely that swap layout would follow vma layout, + * more likely that neighbouring swap pages came from the same node: + * so use the same "addr" to choose the same node for each swap read. + */ + nr_pages = valid_swaphandles(entry, &offset); + for (end_offset = offset + nr_pages; offset < end_offset; offset++) { + /* Ok, do the async read-ahead now */ + page = read_swap_cache_async(swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr); + if (!page) + break; + page_cache_release(page); + } + lru_add_drain(); /* Push any new pages onto the LRU now */ + return read_swap_cache_async(entry, gfp_mask, vma, addr); +} diff --git a/mm/swapfile.c b/mm/swapfile.c new file mode 100644 index 0000000..d06896b --- /dev/null +++ b/mm/swapfile.c @@ -0,0 +1,1870 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/namei.h> +#include <linux/shm.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/rmap.h> +#include <linux/security.h> +#include <linux/backing-dev.h> +#include <linux/mutex.h> +#include <linux/capability.h> +#include <linux/syscalls.h> +#include <linux/memcontrol.h> + +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <linux/swapops.h> + +static DEFINE_SPINLOCK(swap_lock); +static unsigned int nr_swapfiles; +long total_swap_pages; +static int swap_overflow; +static int least_priority; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +static struct swap_list_t swap_list = {-1, -1}; + +static struct swap_info_struct swap_info[MAX_SWAPFILES]; + +static DEFINE_MUTEX(swapon_mutex); + +/* + * We need this because the bdev->unplug_fn can sleep and we cannot + * hold swap_lock while calling the unplug_fn. And swap_lock + * cannot be turned into a mutex. + */ +static DECLARE_RWSEM(swap_unplug_sem); + +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) +{ + swp_entry_t entry; + + down_read(&swap_unplug_sem); + entry.val = page_private(page); + if (PageSwapCache(page)) { + struct block_device *bdev = swap_info[swp_type(entry)].bdev; + struct backing_dev_info *bdi; + + /* + * If the page is removed from swapcache from under us (with a + * racy try_to_unuse/swapoff) we need an additional reference + * count to avoid reading garbage from page_private(page) above. + * If the WARN_ON triggers during a swapoff it maybe the race + * condition and it's harmless. However if it triggers without + * swapoff it signals a problem. + */ + WARN_ON(page_count(page) <= 1); + + bdi = bdev->bd_inode->i_mapping->backing_dev_info; + blk_run_backing_dev(bdi, page); + } + up_read(&swap_unplug_sem); +} + +#define SWAPFILE_CLUSTER 256 +#define LATENCY_LIMIT 256 + +static inline unsigned long scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset, last_in_cluster; + int latency_ration = LATENCY_LIMIT; + + /* + * We try to cluster swap pages by allocating them sequentially + * in swap. Once we've allocated SWAPFILE_CLUSTER pages this + * way, however, we resort to first-free allocation, starting + * a new cluster. This prevents us from scattering swap pages + * all over the entire swap partition, so that we reduce + * overall disk seek times between swap pages. -- sct + * But we do now try to find an empty cluster. -Andrea + */ + + si->flags += SWP_SCANNING; + if (unlikely(!si->cluster_nr)) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) + goto lowest; + spin_unlock(&swap_lock); + + offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster <= si->highest_bit; offset++) { + if (si->swap_map[offset]) + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + spin_lock(&swap_lock); + si->cluster_next = offset-SWAPFILE_CLUSTER+1; + goto cluster; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + spin_lock(&swap_lock); + goto lowest; + } + + si->cluster_nr--; +cluster: + offset = si->cluster_next; + if (offset > si->highest_bit) +lowest: offset = si->lowest_bit; +checks: if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) + goto no_page; + if (!si->swap_map[offset]) { + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + si->inuse_pages++; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; + return offset; + } + + spin_unlock(&swap_lock); + while (++offset <= si->highest_bit) { + if (!si->swap_map[offset]) { + spin_lock(&swap_lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + spin_lock(&swap_lock); + goto lowest; + +no_page: + si->flags -= SWP_SCANNING; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct *si; + pgoff_t offset; + int type, next; + int wrapped = 0; + + spin_lock(&swap_lock); + if (nr_swap_pages <= 0) + goto noswap; + nr_swap_pages--; + + for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { + si = swap_info + type; + next = si->next; + if (next < 0 || + (!wrapped && si->prio != swap_info[next].prio)) { + next = swap_list.head; + wrapped++; + } + + if (!si->highest_bit) + continue; + if (!(si->flags & SWP_WRITEOK)) + continue; + + swap_list.next = next; + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + next = swap_list.next; + } + + nr_swap_pages++; +noswap: + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info + type; + if (si->flags & SWP_WRITEOK) { + nr_swap_pages--; + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = swp_offset(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + spin_lock(&swap_lock); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, swp_offset(entry)); + spin_unlock(&swap_lock); + } +} + +/* + * How many references to page are currently swapped out? + */ +static inline int page_swapcount(struct page *page) +{ + int count = 0; + struct swap_info_struct *p; + swp_entry_t entry; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p) { + /* Subtract the 1 for the swap cache itself */ + count = p->swap_map[swp_offset(entry)] - 1; + spin_unlock(&swap_lock); + } + return count; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + */ +int can_share_swap_page(struct page *page) +{ + int count; + + BUG_ON(!PageLocked(page)); + count = page_mapcount(page); + if (count <= 1 && PageSwapCache(page)) + count += page_swapcount(page); + return count == 1; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +static int remove_exclusive_swap_page_count(struct page *page, int count) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + + if (!PageSwapCache(page)) + return 0; + if (PageWriteback(page)) + return 0; + if (page_count(page) != count) /* us + cache + ptes */ + return 0; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[swp_offset(entry)] == 1) { + /* Recheck the page count with the swapcache lock held.. */ + spin_lock_irq(&swapper_space.tree_lock); + if ((page_count(page) == count) && !PageWriteback(page)) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock_irq(&swapper_space.tree_lock); + } + spin_unlock(&swap_lock); + + if (retval) { + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Most of the time the page should have two references: one for the + * process and one for the swap cache. + */ +int remove_exclusive_swap_page(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2); +} + +/* + * The pageout code holds an extra reference to the page. That raises + * the reference count to test for to 2 for a page that is only in the + * swap cache plus 1 for each process that maps the page. + */ +int remove_exclusive_swap_page_ref(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + if (is_migration_entry(entry)) + return; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, swp_offset(entry)) == 1) { + page = find_get_page(&swapper_space, entry.val); + if (page && !trylock_page(page)) { + page_cache_release(page); + page = NULL; + } + } + spin_unlock(&swap_lock); + } + if (page) { + int one_user; + + BUG_ON(PagePrivate(page)); + one_user = (page_count(page) == 2); + /* Only cache user (+us), or swap space full? Free it! */ + /* Also recheck PageSwapCache after page is locked (above) */ + if (PageSwapCache(page) && !PageWriteback(page) && + (one_user || vm_swap_full())) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + unlock_page(page); + page_cache_release(page); + } +} + +#ifdef CONFIG_HIBERNATION +/* + * Find the swap type that corresponds to given device (if any). + * + * @offset - number of the PAGE_SIZE-sized block of the device, starting + * from 0, in which the swap header is expected to be located. + * + * This is needed for the suspend to disk (aka swsusp). + */ +int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) +{ + struct block_device *bdev = NULL; + int i; + + if (device) + bdev = bdget(device); + + spin_lock(&swap_lock); + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *sis = swap_info + i; + + if (!(sis->flags & SWP_WRITEOK)) + continue; + + if (!bdev) { + if (bdev_p) + *bdev_p = sis->bdev; + + spin_unlock(&swap_lock); + return i; + } + if (bdev == sis->bdev) { + struct swap_extent *se; + + se = list_entry(sis->extent_list.next, + struct swap_extent, list); + if (se->start_block == offset) { + if (bdev_p) + *bdev_p = sis->bdev; + + spin_unlock(&swap_lock); + bdput(bdev); + return i; + } + } + } + spin_unlock(&swap_lock); + if (bdev) + bdput(bdev); + + return -ENODEV; +} + +/* + * Return either the total number of swap pages of given type, or the number + * of free pages of that type (depending on @free) + * + * This is needed for software suspend + */ +unsigned int count_swap_pages(int type, int free) +{ + unsigned int n = 0; + + if (type < nr_swapfiles) { + spin_lock(&swap_lock); + if (swap_info[type].flags & SWP_WRITEOK) { + n = swap_info[type].pages; + if (free) + n -= swap_info[type].inuse_pages; + } + spin_unlock(&swap_lock); + } + return n; +} +#endif + +/* + * No need to decide whether this PTE shares the swap entry with others, + * just let do_wp_page work it out if a write is requested later - to + * force COW, vm_page_prot omits write permission from any private vma. + */ +static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, swp_entry_t entry, struct page *page) +{ + spinlock_t *ptl; + pte_t *pte; + int ret = 1; + + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + ret = -ENOMEM; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (ret > 0) + mem_cgroup_uncharge_page(page); + ret = 0; + goto out; + } + + inc_mm_counter(vma->vm_mm, anon_rss); + get_page(page); + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_anon_rmap(page, vma, addr); + swap_free(entry); + /* + * Move the page to the active list so it is not + * immediately swapped out again after swapon. + */ + activate_page(page); +out: + pte_unmap_unlock(pte, ptl); + return ret; +} + +static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + pte_t swp_pte = swp_entry_to_pte(entry); + pte_t *pte; + int ret = 0; + + /* + * We don't actually need pte lock while scanning for swp_pte: since + * we hold page lock and mmap_sem, swp_pte cannot be inserted into the + * page table while we're scanning; though it could get zapped, and on + * some architectures (e.g. x86_32 with PAE) we might catch a glimpse + * of unmatched parts which look like swp_pte, so unuse_pte must + * recheck under pte lock. Scanning without pte lock lets it be + * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + */ + pte = pte_offset_map(pmd, addr); + do { + /* + * swapoff spends a _lot_ of time in this loop! + * Test inline before going to call unuse_pte. + */ + if (unlikely(pte_same(*pte, swp_pte))) { + pte_unmap(pte); + ret = unuse_pte(vma, pmd, addr, entry, page); + if (ret) + goto out; + pte = pte_offset_map(pmd, addr); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); +out: + return ret; +} + +static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + pmd_t *pmd; + unsigned long next; + int ret; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + if (ret) + return ret; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + pud_t *pud; + unsigned long next; + int ret; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + return 0; +} + +static int unuse_vma(struct vm_area_struct *vma, + swp_entry_t entry, struct page *page) +{ + pgd_t *pgd; + unsigned long addr, end, next; + int ret; + + if (page->mapping) { + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + return 0; + else + end = addr + PAGE_SIZE; + } else { + addr = vma->vm_start; + end = vma->vm_end; + } + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + return 0; +} + +static int unuse_mm(struct mm_struct *mm, + swp_entry_t entry, struct page *page) +{ + struct vm_area_struct *vma; + int ret = 0; + + if (!down_read_trylock(&mm->mmap_sem)) { + /* + * Activate page so shrink_inactive_list is unlikely to unmap + * its ptes while lock is dropped, so swapoff can make progress. + */ + activate_page(page); + unlock_page(page); + down_read(&mm->mmap_sem); + lock_page(page); + } + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) + break; + } + up_read(&mm->mmap_sem); + return (ret < 0)? ret: 0; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static unsigned int find_next_to_unuse(struct swap_info_struct *si, + unsigned int prev) +{ + unsigned int max = si->max; + unsigned int i = prev; + int count; + + /* + * No need for swap_lock here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_lock). + */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. + */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + unsigned int i = 0; + int retval = 0; + int reset_overflow = 0; + int shmem; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering, which clusters forked mms + * together, child after parent. If we race with dup_mmap(), we + * prefer to resolve parent before child, lest we miss entries + * duplicated after we scanned child: using last mm would invert + * that. Though it's only a serious concern when an overflowed + * swap count is reset from SWAP_MAP_MAX, preventing a rescan. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * there are races when an instance of an entry might be missed. + */ + while ((i = find_next_to_unuse(si, i)) != 0) { + if (signal_pending(current)) { + retval = -EINTR; + break; + } + + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = swp_entry(type, i); + page = read_swap_cache_async(entry, + GFP_HIGHUSER_MOVABLE, NULL, 0); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page_locked" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page_locked(page); + wait_on_page_writeback(page); + lock_page(page); + wait_on_page_writeback(page); + + /* + * Remove all references to entry. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. + */ + shmem = 0; + swcount = *swap_map; + if (swcount > 1) { + if (start_mm == &init_mm) + shmem = shmem_unuse(entry, page); + else + retval = unuse_mm(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *prev_mm = start_mm; + struct mm_struct *mm; + + atomic_inc(&new_start_mm->mm_users); + atomic_inc(&prev_mm->mm_users); + spin_lock(&mmlist_lock); + while (*swap_map > 1 && !retval && !shmem && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + if (!atomic_inc_not_zero(&mm->mm_users)) + continue; + spin_unlock(&mmlist_lock); + mmput(prev_mm); + prev_mm = mm; + + cond_resched(); + + swcount = *swap_map; + if (swcount <= 1) + ; + else if (mm == &init_mm) { + set_start_mm = 1; + shmem = shmem_unuse(entry, page); + } else + retval = unuse_mm(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + mmput(new_start_mm); + atomic_inc(&mm->mm_users); + new_start_mm = mm; + set_start_mm = 0; + } + spin_lock(&mmlist_lock); + } + spin_unlock(&mmlist_lock); + mmput(prev_mm); + mmput(start_mm); + start_mm = new_start_mm; + } + if (shmem) { + /* page has already been unlocked and released */ + if (shmem > 0) + continue; + retval = shmem; + break; + } + if (retval) { + unlock_page(page); + page_cache_release(page); + break; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + spin_lock(&swap_lock); + *swap_map = 1; + spin_unlock(&swap_lock); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_unmap could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + */ + if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + swap_writepage(page, &wbc); + lock_page(page); + wait_on_page_writeback(page); + } + if (PageSwapCache(page)) + delete_from_swap_cache(page); + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so shrink_page_list will preserve it. + */ + SetPageDirty(page); + unlock_page(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. + */ + cond_resched(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +/* + * After a successful try_to_unuse, if no swap is now in use, we know + * we can empty the mmlist. swap_lock must be held on entry and exit. + * Note that mmlist_lock nests inside swap_lock, and an mm must be + * added to the mmlist just after page_duplicate - before would be racy. + */ +static void drain_mmlist(void) +{ + struct list_head *p, *next; + unsigned int i; + + for (i = 0; i < nr_swapfiles; i++) + if (swap_info[i].inuse_pages) + return; + spin_lock(&mmlist_lock); + list_for_each_safe(p, next, &init_mm.mmlist) + list_del_init(p); + spin_unlock(&mmlist_lock); +} + +/* + * Use this swapdev's extent info to locate the (PAGE_SIZE) block which + * corresponds to page offset `offset'. + */ +sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +{ + struct swap_extent *se = sis->curr_swap_extent; + struct swap_extent *start_se = se; + + for ( ; ; ) { + struct list_head *lh; + + if (se->start_page <= offset && + offset < (se->start_page + se->nr_pages)) { + return se->start_block + (offset - se->start_page); + } + lh = se->list.next; + if (lh == &sis->extent_list) + lh = lh->next; + se = list_entry(lh, struct swap_extent, list); + sis->curr_swap_extent = se; + BUG_ON(se == start_se); /* It *must* be present */ + } +} + +#ifdef CONFIG_HIBERNATION +/* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). + */ +sector_t swapdev_block(int swap_type, pgoff_t offset) +{ + struct swap_info_struct *sis; + + if (swap_type >= nr_swapfiles) + return 0; + + sis = swap_info + swap_type; + return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; +} +#endif /* CONFIG_HIBERNATION */ + +/* + * Free all of a swapdev's extent information + */ +static void destroy_swap_extents(struct swap_info_struct *sis) +{ + while (!list_empty(&sis->extent_list)) { + struct swap_extent *se; + + se = list_entry(sis->extent_list.next, + struct swap_extent, list); + list_del(&se->list); + kfree(se); + } +} + +/* + * Add a block range (and the corresponding page range) into this swapdev's + * extent list. The extent list is kept sorted in page order. + * + * This function rather assumes that it is called in ascending page order. + */ +static int +add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, + unsigned long nr_pages, sector_t start_block) +{ + struct swap_extent *se; + struct swap_extent *new_se; + struct list_head *lh; + + lh = sis->extent_list.prev; /* The highest page extent */ + if (lh != &sis->extent_list) { + se = list_entry(lh, struct swap_extent, list); + BUG_ON(se->start_page + se->nr_pages != start_page); + if (se->start_block + se->nr_pages == start_block) { + /* Merge it */ + se->nr_pages += nr_pages; + return 0; + } + } + + /* + * No merge. Insert a new extent, preserving ordering. + */ + new_se = kmalloc(sizeof(*se), GFP_KERNEL); + if (new_se == NULL) + return -ENOMEM; + new_se->start_page = start_page; + new_se->nr_pages = nr_pages; + new_se->start_block = start_block; + + list_add_tail(&new_se->list, &sis->extent_list); + return 1; +} + +/* + * A `swap extent' is a simple thing which maps a contiguous range of pages + * onto a contiguous range of disk blocks. An ordered list of swap extents + * is built at swapon time and is then used at swap_writepage/swap_readpage + * time for locating where on disk a page belongs. + * + * If the swapfile is an S_ISBLK block device, a single extent is installed. + * This is done so that the main operating code can treat S_ISBLK and S_ISREG + * swap files identically. + * + * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap + * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK + * swapfiles are handled *identically* after swapon time. + * + * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks + * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If + * some stray blocks are found which do not fall within the PAGE_SIZE alignment + * requirements, they are simply tossed out - we will never use those blocks + * for swapping. + * + * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This + * prevents root from shooting her foot off by ftruncating an in-use swapfile, + * which will scribble on the fs. + * + * The amount of disk space which a single swap extent represents varies. + * Typically it is in the 1-4 megabyte range. So we can have hundreds of + * extents in the list. To avoid much list walking, we cache the previous + * search location in `curr_swap_extent', and start new searches from there. + * This is extremely effective. The average number of iterations in + * map_swap_page() has been measured at about 0.3 per page. - akpm. + */ +static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) +{ + struct inode *inode; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + sector_t lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; + int ret; + + inode = sis->swap_file->f_mapping->host; + if (S_ISBLK(inode->i_mode)) { + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + goto done; + } + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. + */ + probe_block = 0; + page_no = 0; + last_block = i_size_read(inode) >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + first_block = bmap(inode, probe_block); + if (first_block == 0) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = bmap(inode, probe_block + block_in_page); + if (block == 0) + goto bad_bmap; + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +done: + sis->curr_swap_extent = list_entry(sis->extent_list.prev, + struct swap_extent, list); + goto out; +bad_bmap: + printk(KERN_ERR "swapon: swapfile has holes\n"); + ret = -EINVAL; +out: + return ret; +} + +#if 0 /* We don't need this yet */ +#include <linux/backing-dev.h> +int page_queue_congested(struct page *page) +{ + struct backing_dev_info *bdi; + + BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ + + if (PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + struct swap_info_struct *sis; + + sis = get_swap_info_struct(swp_type(entry)); + bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; + } else + bdi = page->mapping->backing_dev_info; + return bdi_write_congested(bdi); +} +#endif + +SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct file *swap_file, *victim; + struct address_space *mapping; + struct inode *inode; + char * pathname; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + pathname = getname(specialfile); + err = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + + victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); + putname(pathname); + err = PTR_ERR(victim); + if (IS_ERR(victim)) + goto out; + + mapping = victim->f_mapping; + prev = -1; + spin_lock(&swap_lock); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { + if (p->swap_file->f_mapping == mapping) + break; + } + prev = type; + } + if (type < 0) { + err = -EINVAL; + spin_unlock(&swap_lock); + goto out_dput; + } + if (!security_vm_enough_memory(p->pages)) + vm_unacct_memory(p->pages); + else { + err = -ENOMEM; + spin_unlock(&swap_lock); + goto out_dput; + } + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + if (p->prio < 0) { + for (i = p->next; i >= 0; i = swap_info[i].next) + swap_info[i].prio = p->prio--; + least_priority++; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; + spin_unlock(&swap_lock); + + current->flags |= PF_SWAPOFF; + err = try_to_unuse(type); + current->flags &= ~PF_SWAPOFF; + + if (err) { + /* re-insert swap space back into swap_list */ + spin_lock(&swap_lock); + if (p->prio < 0) + p->prio = --least_priority; + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) + break; + prev = i; + } + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags |= SWP_WRITEOK; + spin_unlock(&swap_lock); + goto out_dput; + } + + /* wait for any unplug function to finish */ + down_write(&swap_unplug_sem); + up_write(&swap_unplug_sem); + + destroy_swap_extents(p); + mutex_lock(&swapon_mutex); + spin_lock(&swap_lock); + drain_mmlist(); + + /* wait for anyone still in scan_swap_map */ + p->highest_bit = 0; /* cuts scans short */ + while (p->flags >= SWP_SCANNING) { + spin_unlock(&swap_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&swap_lock); + } + + swap_file = p->swap_file; + p->swap_file = NULL; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + spin_unlock(&swap_lock); + mutex_unlock(&swapon_mutex); + vfree(swap_map); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); + set_blocksize(bdev, p->old_block_size); + bd_release(bdev); + } else { + mutex_lock(&inode->i_mutex); + inode->i_flags &= ~S_SWAPFILE; + mutex_unlock(&inode->i_mutex); + } + filp_close(swap_file, NULL); + err = 0; + +out_dput: + filp_close(victim, NULL); +out: + return err; +} + +#ifdef CONFIG_PROC_FS +/* iterator */ +static void *swap_start(struct seq_file *swap, loff_t *pos) +{ + struct swap_info_struct *ptr = swap_info; + int i; + loff_t l = *pos; + + mutex_lock(&swapon_mutex); + + if (!l) + return SEQ_START_TOKEN; + + for (i = 0; i < nr_swapfiles; i++, ptr++) { + if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + continue; + if (!--l) + return ptr; + } + + return NULL; +} + +static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) +{ + struct swap_info_struct *ptr; + struct swap_info_struct *endptr = swap_info + nr_swapfiles; + + if (v == SEQ_START_TOKEN) + ptr = swap_info; + else { + ptr = v; + ptr++; + } + + for (; ptr < endptr; ptr++) { + if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + continue; + ++*pos; + return ptr; + } + + return NULL; +} + +static void swap_stop(struct seq_file *swap, void *v) +{ + mutex_unlock(&swapon_mutex); +} + +static int swap_show(struct seq_file *swap, void *v) +{ + struct swap_info_struct *ptr = v; + struct file *file; + int len; + + if (ptr == SEQ_START_TOKEN) { + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + return 0; + } + + file = ptr->swap_file; + len = seq_path(swap, &file->f_path, " \t\n\\"); + seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + len < 40 ? 40 - len : 1, " ", + S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + "partition" : "file\t", + ptr->pages << (PAGE_SHIFT - 10), + ptr->inuse_pages << (PAGE_SHIFT - 10), + ptr->prio); + return 0; +} + +static const struct seq_operations swaps_op = { + .start = swap_start, + .next = swap_next, + .stop = swap_stop, + .show = swap_show +}; + +static int swaps_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &swaps_op); +} + +static const struct file_operations proc_swaps_operations = { + .open = swaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init procswaps_init(void) +{ + proc_create("swaps", 0, NULL, &proc_swaps_operations); + return 0; +} +__initcall(procswaps_init); +#endif /* CONFIG_PROC_FS */ + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. + * + * The swapon system call + */ +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +{ + struct swap_info_struct * p; + char *name = NULL; + struct block_device *bdev = NULL; + struct file *swap_file = NULL; + struct address_space *mapping; + unsigned int type; + int i, prev; + int error; + union swap_header *swap_header = NULL; + int swap_header_version; + unsigned int nr_good_pages = 0; + int nr_extents = 0; + sector_t span; + unsigned long maxpages = 1; + int swapfilesize; + unsigned short *swap_map = NULL; + struct page *page = NULL; + struct inode *inode = NULL; + int did_down = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + spin_lock(&swap_lock); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + if (type >= MAX_SWAPFILES) { + spin_unlock(&swap_lock); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->extent_list); + p->flags = SWP_USED; + p->next = -1; + spin_unlock(&swap_lock); + name = getname(specialfile); + error = PTR_ERR(name); + if (IS_ERR(name)) { + name = NULL; + goto bad_swap_2; + } + swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); + error = PTR_ERR(swap_file); + if (IS_ERR(swap_file)) { + swap_file = NULL; + goto bad_swap_2; + } + + p->swap_file = swap_file; + mapping = swap_file->f_mapping; + inode = mapping->host; + + error = -EBUSY; + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *q = &swap_info[i]; + + if (i == type || !q->swap_file) + continue; + if (mapping == q->swap_file->f_mapping) + goto bad_swap; + } + + error = -EINVAL; + if (S_ISBLK(inode->i_mode)) { + bdev = I_BDEV(inode); + error = bd_claim(bdev, sys_swapon); + if (error < 0) { + bdev = NULL; + error = -EINVAL; + goto bad_swap; + } + p->old_block_size = block_size(bdev); + error = set_blocksize(bdev, PAGE_SIZE); + if (error < 0) + goto bad_swap; + p->bdev = bdev; + } else if (S_ISREG(inode->i_mode)) { + p->bdev = inode->i_sb->s_bdev; + mutex_lock(&inode->i_mutex); + did_down = 1; + if (IS_SWAPFILE(inode)) { + error = -EBUSY; + goto bad_swap; + } + } else { + goto bad_swap; + } + + swapfilesize = i_size_read(inode) >> PAGE_SHIFT; + + /* + * Read the swap header. + */ + if (!mapping->a_ops->readpage) { + error = -EINVAL; + goto bad_swap; + } + page = read_mapping_page(mapping, 0, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto bad_swap; + } + kmap(page); + swap_header = page_address(page); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk(KERN_ERR "Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + printk(KERN_ERR "version 0 swap is no longer supported. " + "Use mkswap -v1 %s\n", name); + error = -EINVAL; + goto bad_swap; + case 2: + /* swap partition endianess hack... */ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } + /* Check the swap header's sub-version and the size of + the swap file and bad block lists */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); + error = -EINVAL; + goto bad_swap; + } + + p->lowest_bit = 1; + p->cluster_next = 1; + + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in the a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; + + error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swapfilesize && maxpages > swapfilesize) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + + /* OK, set up the swap map and apply the bad block list */ + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } + + error = 0; + memset(swap_map, 0, maxpages * sizeof(short)); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) + error = -EINVAL; + else + swap_map[page_nr] = SWAP_MAP_BAD; + } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; + if (error) + goto bad_swap; + } + + if (nr_good_pages) { + swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; + p->pages = nr_good_pages; + nr_extents = setup_swap_extents(p, &span); + if (nr_extents < 0) { + error = nr_extents; + goto bad_swap; + } + nr_good_pages = p->pages; + } + if (!nr_good_pages) { + printk(KERN_WARNING "Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + + mutex_lock(&swapon_mutex); + spin_lock(&swap_lock); + if (swap_flags & SWAP_FLAG_PREFER) + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + else + p->prio = --least_priority; + p->swap_map = swap_map; + p->flags = SWP_ACTIVE; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; + + printk(KERN_INFO "Adding %uk swap on %s. " + "Priority:%d extents:%d across:%lluk\n", + nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + spin_unlock(&swap_lock); + mutex_unlock(&swapon_mutex); + error = 0; + goto out; +bad_swap: + if (bdev) { + set_blocksize(bdev, p->old_block_size); + bd_release(bdev); + } + destroy_swap_extents(p); +bad_swap_2: + spin_lock(&swap_lock); + p->swap_file = NULL; + p->flags = 0; + spin_unlock(&swap_lock); + vfree(swap_map); + if (swap_file) + filp_close(swap_file, NULL); +out: + if (page && !IS_ERR(page)) { + kunmap(page); + page_cache_release(page); + } + if (name) + putname(name); + if (did_down) { + if (!error) + inode->i_flags |= S_SWAPFILE; + mutex_unlock(&inode->i_mutex); + } + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + for (i = 0; i < nr_swapfiles; i++) { + if (!(swap_info[i].flags & SWP_USED) || + (swap_info[i].flags & SWP_WRITEOK)) + continue; + nr_to_be_unused += swap_info[i].inuse_pages; + } + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); +} + +/* + * Verify that a swap entry is valid and increment its swap map count. + * + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as + * "permanent", but will be reclaimed by the next swapoff. + */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + if (is_migration_entry(entry)) + return 1; + + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = swp_offset(entry); + + spin_lock(&swap_lock); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + spin_unlock(&swap_lock); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +struct swap_info_struct * +get_swap_info_struct(unsigned type) +{ + return &swap_info[type]; +} + +/* + * swap_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. + */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + struct swap_info_struct *si; + int our_page_cluster = page_cluster; + pgoff_t target, toff; + pgoff_t base, end; + int nr_pages = 0; + + if (!our_page_cluster) /* no readahead */ + return 0; + + si = &swap_info[swp_type(entry)]; + target = swp_offset(entry); + base = (target >> our_page_cluster) << our_page_cluster; + end = base + (1 << our_page_cluster); + if (!base) /* first page is swap header */ + base++; + + spin_lock(&swap_lock); + if (end > si->max) /* don't go beyond end of map */ + end = si->max; + + /* Count contiguous allocated slots above our target */ + for (toff = target; ++toff < end; nr_pages++) { + /* Don't read in free or bad pages */ + if (!si->swap_map[toff]) + break; + if (si->swap_map[toff] == SWAP_MAP_BAD) + break; + } + /* Count contiguous allocated slots below our target */ + for (toff = target; --toff >= base; nr_pages++) { + /* Don't read in free or bad pages */ + if (!si->swap_map[toff]) + break; + if (si->swap_map[toff] == SWAP_MAP_BAD) + break; + } + spin_unlock(&swap_lock); + + /* + * Indicate starting offset, and return number of pages to get: + * if only 1, say 0, since there's then no readahead to be done. + */ + *offset = ++toff; + return nr_pages? ++nr_pages: 0; +} diff --git a/mm/thrash.c b/mm/thrash.c new file mode 100644 index 0000000..c4c5205 --- /dev/null +++ b/mm/thrash.c @@ -0,0 +1,79 @@ +/* + * mm/thrash.c + * + * Copyright (C) 2004, Red Hat, Inc. + * Copyright (C) 2004, Rik van Riel <riel@redhat.com> + * Released under the GPL, see the file COPYING for details. + * + * Simple token based thrashing protection, using the algorithm + * described in: http://www.cs.wm.edu/~sjiang/token.pdf + * + * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> + * Improved algorithm to pass token: + * Each task has a priority which is incremented if it contended + * for the token in an interval less than its previous attempt. + * If the token is acquired, that task's priority is boosted to prevent + * the token from bouncing around too often and to let the task make + * some progress in its execution. + */ + +#include <linux/jiffies.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/swap.h> + +static DEFINE_SPINLOCK(swap_token_lock); +struct mm_struct *swap_token_mm; +static unsigned int global_faults; + +void grab_swap_token(void) +{ + int current_interval; + + global_faults++; + + current_interval = global_faults - current->mm->faultstamp; + + if (!spin_trylock(&swap_token_lock)) + return; + + /* First come first served */ + if (swap_token_mm == NULL) { + current->mm->token_priority = current->mm->token_priority + 2; + swap_token_mm = current->mm; + goto out; + } + + if (current->mm != swap_token_mm) { + if (current_interval < current->mm->last_interval) + current->mm->token_priority++; + else { + if (likely(current->mm->token_priority > 0)) + current->mm->token_priority--; + } + /* Check if we deserve the token */ + if (current->mm->token_priority > + swap_token_mm->token_priority) { + current->mm->token_priority += 2; + swap_token_mm = current->mm; + } + } else { + /* Token holder came in again! */ + current->mm->token_priority += 2; + } + +out: + current->mm->faultstamp = global_faults; + current->mm->last_interval = current_interval; + spin_unlock(&swap_token_lock); +return; +} + +/* Called on process exit. */ +void __put_swap_token(struct mm_struct *mm) +{ + spin_lock(&swap_token_lock); + if (likely(mm == swap_token_mm)) + swap_token_mm = NULL; + spin_unlock(&swap_token_lock); +} diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c new file mode 100644 index 0000000..3e67d57 --- /dev/null +++ b/mm/tiny-shmem.c @@ -0,0 +1,134 @@ +/* + * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code + * + * Matt Mackall <mpm@selenic.com> January, 2004 + * derived from mm/shmem.c and fs/ramfs/inode.c + * + * This is intended for small system where the benefits of the full + * shmem code (swap-backed and resource-limited) are outweighed by + * their complexity. On systems without swap this code should be + * effectively equivalent, but much lighter weight. + */ + +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/vfs.h> +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/ramfs.h> + +static struct file_system_type tmpfs_fs_type = { + .name = "tmpfs", + .get_sb = ramfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static struct vfsmount *shm_mnt; + +static int __init init_tmpfs(void) +{ + BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + + shm_mnt = kern_mount(&tmpfs_fs_type); + BUG_ON(IS_ERR(shm_mnt)); + + return 0; +} +module_init(init_tmpfs) + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: vm_flags + */ +struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +{ + int error; + struct file *file; + struct inode *inode; + struct dentry *dentry, *root; + struct qstr this; + + if (IS_ERR(shm_mnt)) + return (void *)shm_mnt; + + error = -ENOMEM; + this.name = name; + this.len = strlen(name); + this.hash = 0; /* will go */ + root = shm_mnt->mnt_root; + dentry = d_alloc(root, &this); + if (!dentry) + goto put_memory; + + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + + error = -ENOSPC; + inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto close_file; + + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &ramfs_file_operations); + +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) + goto close_file; +#endif + return file; + +close_file: + put_filp(file); +put_dentry: + dput(dentry); +put_memory: + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(shmem_file_setup); + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + + file = shmem_file_setup("dev/zero", size, vma->vm_flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + return 0; +} + +#ifndef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif diff --git a/mm/truncate.c b/mm/truncate.c new file mode 100644 index 0000000..1229211 --- /dev/null +++ b/mm/truncate.c @@ -0,0 +1,473 @@ +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 Andrew Morton + * Initial version. + */ + +#include <linux/kernel.h> +#include <linux/backing-dev.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/pagevec.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/buffer_head.h> /* grr. try_to_release_page, + do_invalidatepage */ +#include "internal.h" + + +/** + * do_invalidatepage - invalidate part or all of a page + * @page: the page which is affected + * @offset: the index of the truncation point + * + * do_invalidatepage() is called when all or part of the page has become + * invalidated by a truncate operation. + * + * do_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void do_invalidatepage(struct page *page, unsigned long offset) +{ + void (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; +#ifdef CONFIG_BLOCK + if (!invalidatepage) + invalidatepage = block_invalidatepage; +#endif + if (invalidatepage) + (*invalidatepage)(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + zero_user_segment(page, partial, PAGE_CACHE_SIZE); + if (PagePrivate(page)) + do_invalidatepage(page, partial); +} + +/* + * This cancels just the dirty bit on the kernel page itself, it + * does NOT actually remove dirty bits on any mmap's that may be + * around. It also leaves the page tagged dirty, so any sync + * activity will still find it on the dirty lists, and in particular, + * clear_page_dirty_for_io() will still look at the dirty bits in + * the VM. + * + * Doing this should *normally* only ever be done when a page + * is truncated, and is not actually mapped anywhere at all. However, + * fs/buffer.c does this when it notices that somebody has cleaned + * out all the buffers on a page without actually doing it through + * the VM. Can you say "ext3 is horribly ugly"? Tought you could. + */ +void cancel_dirty_page(struct page *page, unsigned int account_size) +{ + if (TestClearPageDirty(page)) { + struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + if (account_size) + task_io_account_cancelled_write(account_size); + } + } +} +EXPORT_SYMBOL(cancel_dirty_page); + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes orphaned. It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_fault(). + * + * We need to bale out if page->mapping is no longer equal to the original + * mapping. This happens a) when the VM reclaimed the page while we waited on + * its lock, b) when a concurrent invalidate_mapping_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. + */ +static void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + do_invalidatepage(page, 0); + + cancel_dirty_page(page, PAGE_CACHE_SIZE); + + clear_page_mlock(page); + remove_from_page_cache(page); + ClearPageMappedToDisk(page); + page_cache_release(page); /* pagecache ref */ +} + +/* + * This is for invalidate_mapping_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can + * be marked dirty at any time too, so use remove_mapping which safely + * discards clean, unused pages. + * + * Returns non-zero if the page was successfully invalidated. + */ +static int +invalidate_complete_page(struct address_space *mapping, struct page *page) +{ + int ret; + + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + clear_page_mlock(page); + ret = remove_mapping(mapping, page); + + return ret; +} + +/** + * truncate_inode_pages - truncate range of pages specified by start & end byte offsets + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * @lend: offset to which to truncate + * + * Truncate the page cache, removing the pages that are between + * specified offsets (and zeroing out partial page + * (if lstart is not page aligned)). + * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * When looking at page->index outside the page lock we need to be careful to + * copy it into a local to avoid races (it could change at any time). + * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. + */ +void truncate_inode_pages_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + pgoff_t end; + const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + struct pagevec pvec; + pgoff_t next; + int i; + + if (mapping->nrpages == 0) + return; + + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + end = (lend >> PAGE_CACHE_SHIFT); + + pagevec_init(&pvec, 0); + next = start; + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + if (page_index > end) { + next = page_index; + break; + } + + if (page_index > next) + next = page_index; + next++; + if (!trylock_page(page)) + continue; + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page_index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + + if (partial) { + struct page *page = find_lock_page(mapping, start - 1); + if (page) { + wait_on_page_writeback(page); + truncate_partial_page(page, partial); + unlock_page(page); + page_cache_release(page); + } + } + + next = start; + for ( ; ; ) { + cond_resched(); + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + if (pvec.pages[0]->index > end) { + pagevec_release(&pvec); + break; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + lock_page(page); + wait_on_page_writeback(page); + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + if (page->index > next) + next = page->index; + next++; + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + } +} +EXPORT_SYMBOL(truncate_inode_pages_range); + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Called under (and serialised by) inode->i_mutex. + */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + truncate_inode_pages_range(mapping, lstart, (loff_t)-1); +} +EXPORT_SYMBOL(truncate_inode_pages); + +unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, bool be_atomic) +{ + struct pagevec pvec; + pgoff_t next = start; + unsigned long ret = 0; + int i; + + pagevec_init(&pvec, 0); + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t index; + int lock_failed; + + lock_failed = !trylock_page(page); + + /* + * We really shouldn't be looking at the ->index of an + * unlocked page. But we're not allowed to lock these + * pages. So we rely upon nobody altering the ->index + * of this (pinned-by-us) page. + */ + index = page->index; + if (index > next) + next = index; + next++; + if (lock_failed) + continue; + + if (PageDirty(page) || PageWriteback(page)) + goto unlock; + if (page_mapped(page)) + goto unlock; + ret += invalidate_complete_page(mapping, page); +unlock: + unlock_page(page); + if (next > end) + break; + } + pagevec_release(&pvec); + if (likely(!be_atomic)) + cond_resched(); + } + return ret; +} + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, false); +} +EXPORT_SYMBOL(invalidate_mapping_pages); + +/* + * This is like invalidate_complete_page(), except it ignores the page's + * refcount. We do this because invalidate_inode_pages2() needs stronger + * invalidation guarantees, and cannot afford to leave pages behind because + * shrink_page_list() has a temp ref on them, or because they're transiently + * sitting in the lru_cache_add() pagevecs. + */ +static int +invalidate_complete_page2(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + return 0; + + spin_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) + goto failed; + + clear_page_mlock(page); + BUG_ON(PagePrivate(page)); + __remove_from_page_cache(page); + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); /* pagecache ref */ + return 1; +failed: + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + +/** + * invalidate_inode_pages2_range - remove range of pages from an address_space + * @mapping: the address_space + * @start: the page offset 'from' which to invalidate + * @end: the page offset 'to' which to invalidate (inclusive) + * + * Any pages which are found to be mapped into pagetables are unmapped prior to + * invalidation. + * + * Returns -EBUSY if any pages could not be invalidated. + */ +int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + pgoff_t next; + int i; + int ret = 0; + int ret2 = 0; + int did_range_unmap = 0; + int wrapped = 0; + + pagevec_init(&pvec, 0); + next = start; + while (next <= end && !wrapped && + pagevec_lookup(&pvec, mapping, next, + min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index; + + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + page_index = page->index; + next = page_index + 1; + if (next == 0) + wrapped = 1; + if (page_index > end) { + unlock_page(page); + break; + } + wait_on_page_writeback(page); + if (page_mapped(page)) { + if (!did_range_unmap) { + /* + * Zap the rest of the file in one hit. + */ + unmap_mapping_range(mapping, + (loff_t)page_index<<PAGE_CACHE_SHIFT, + (loff_t)(end - page_index + 1) + << PAGE_CACHE_SHIFT, + 0); + did_range_unmap = 1; + } else { + /* + * Just zap this page + */ + unmap_mapping_range(mapping, + (loff_t)page_index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + } + BUG_ON(page_mapped(page)); + ret2 = do_launder_page(mapping, page); + if (ret2 == 0) { + if (!invalidate_complete_page2(mapping, page)) + ret2 = -EBUSY; + } + if (ret2 < 0) + ret = ret2; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} +EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); + +/** + * invalidate_inode_pages2 - remove all pages from an address_space + * @mapping: the address_space + * + * Any pages which are found to be mapped into pagetables are unmapped prior to + * invalidation. + * + * Returns -EIO if any pages could not be invalidated. + */ +int invalidate_inode_pages2(struct address_space *mapping) +{ + return invalidate_inode_pages2_range(mapping, 0, -1); +} +EXPORT_SYMBOL_GPL(invalidate_inode_pages2); diff --git a/mm/util.c b/mm/util.c new file mode 100644 index 0000000..cb00b74 --- /dev/null +++ b/mm/util.c @@ -0,0 +1,188 @@ +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <asm/uaccess.h> + +/** + * kstrdup - allocate space for and copy an existing string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrdup(const char *s, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strlen(s) + 1; + buf = kmalloc_track_caller(len, gfp); + if (buf) + memcpy(buf, s, len); + return buf; +} +EXPORT_SYMBOL(kstrdup); + +/** + * kstrndup - allocate space for and copy an existing string + * @s: the string to duplicate + * @max: read at most @max chars from @s + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrndup(const char *s, size_t max, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strnlen(s, max); + buf = kmalloc_track_caller(len+1, gfp); + if (buf) { + memcpy(buf, s, len); + buf[len] = '\0'; + } + return buf; +} +EXPORT_SYMBOL(kstrndup); + +/** + * kmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + */ +void *kmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = kmalloc_track_caller(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kmemdup); + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. + */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/* + * strndup_user - duplicate an existing string from user space + * @s: The string to duplicate + * @n: Maximum number of bytes to copy, including the trailing NUL. + */ +char *strndup_user(const char __user *s, long n) +{ + char *p; + long length; + + length = strnlen_user(s, n); + + if (!length) + return ERR_PTR(-EFAULT); + + if (length > n) + return ERR_PTR(-EINVAL); + + p = kmalloc(length, GFP_KERNEL); + + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, s, length)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + p[length - 1] = '\0'; + + return p; +} +EXPORT_SYMBOL(strndup_user); + +#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} +#endif + +int __attribute__((weak)) get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, nr_pages, + write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/mm/vmalloc.c b/mm/vmalloc.c new file mode 100644 index 0000000..4172ce4 --- /dev/null +++ b/mm/vmalloc.c @@ -0,0 +1,1812 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 + * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 + * Numa awareness, Christoph Lameter, SGI, June 2005 + */ + +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/debugobjects.h> +#include <linux/kallsyms.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/radix-tree.h> +#include <linux/rcupdate.h> +#include <linux/bootmem.h> + +#include <asm/atomic.h> +#include <asm/uaccess.h> +#include <asm/tlbflush.h> + + +/*** Page table manipulation functions ***/ + +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); + WARN_ON(!pte_none(ptent) && !pte_present(ptent)); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + vunmap_pte_range(pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + vunmap_pmd_range(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void vunmap_page_range(unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + vunmap_pud_range(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + pte_t *pte; + + /* + * nr is a running index into the array which helps higher level + * callers keep track of where we're up to. + */ + + pte = pte_alloc_kernel(pmd, addr); + if (!pte) + return -ENOMEM; + do { + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) + return -ENOMEM; + set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); + (*nr)++; + } while (pte++, addr += PAGE_SIZE, addr != end); + return 0; +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(&init_mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/* + * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and + * will have pfns corresponding to the "pages" array. + * + * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + pgd_t *pgd; + unsigned long next; + unsigned long addr = start; + int err = 0; + int nr = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); + if (err) + break; + } while (pgd++, addr = next, addr != end); + flush_cache_vmap(start, end); + + if (unlikely(err)) + return err; + return nr; +} + +static inline int is_vmalloc_or_module_addr(const void *x) +{ + /* + * ARM, x86-64 and sparc64 put modules in a special place, + * and fall back on vmalloc() if that fails. Others + * just put it in the vmalloc space. + */ +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + unsigned long addr = (unsigned long)x; + if (addr >= MODULES_VADDR && addr < MODULES_END) + return 1; +#endif + return is_vmalloc_addr(x); +} + +/* + * Walk a vmap address to the struct page it maps. + */ +struct page *vmalloc_to_page(const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + + /* + * XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space + */ + VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); + + if (!pgd_none(*pgd)) { + pud_t *pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd_t *pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + pte_t *ptep, pte; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } + } + } + return page; +} +EXPORT_SYMBOL(vmalloc_to_page); + +/* + * Map a vmalloc()-space virtual address to the physical page frame number. + */ +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); + + +/*** Global kva allocator ***/ + +#define VM_LAZY_FREE 0x01 +#define VM_LAZY_FREEING 0x02 +#define VM_VM_AREA 0x04 + +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + void *private; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(vmap_area_lock); +static struct rb_root vmap_area_root = RB_ROOT; +static LIST_HEAD(vmap_area_list); + +static struct vmap_area *__find_vmap_area(unsigned long addr) +{ + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr > va->va_start) + n = n->rb_right; + else + return va; + } + + return NULL; +} + +static void __insert_vmap_area(struct vmap_area *va) +{ + struct rb_node **p = &vmap_area_root.rb_node; + struct rb_node *parent = NULL; + struct rb_node *tmp; + + while (*p) { + struct vmap_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (va->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&va->rb_node, parent, p); + rb_insert_color(&va->rb_node, &vmap_area_root); + + /* address-sort this list so it is usable like the vmlist */ + tmp = rb_prev(&va->rb_node); + if (tmp) { + struct vmap_area *prev; + prev = rb_entry(tmp, struct vmap_area, rb_node); + list_add_rcu(&va->list, &prev->list); + } else + list_add_rcu(&va->list, &vmap_area_list); +} + +static void purge_vmap_area_lazy(void); + +/* + * Allocate a region of KVA of the specified size and alignment, within the + * vstart and vend. + */ +static struct vmap_area *alloc_vmap_area(unsigned long size, + unsigned long align, + unsigned long vstart, unsigned long vend, + int node, gfp_t gfp_mask) +{ + struct vmap_area *va; + struct rb_node *n; + unsigned long addr; + int purged = 0; + + BUG_ON(!size); + BUG_ON(size & ~PAGE_MASK); + + va = kmalloc_node(sizeof(struct vmap_area), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + +retry: + addr = ALIGN(vstart, align); + + spin_lock(&vmap_area_lock); + if (addr + size - 1 < addr) + goto overflow; + + /* XXX: could have a last_hole cache */ + n = vmap_area_root.rb_node; + if (n) { + struct vmap_area *first = NULL; + + do { + struct vmap_area *tmp; + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + + while (addr + size > first->va_start && addr + size <= vend) { + addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + } +found: + if (addr + size > vend) { +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING "vmap allocation failed: " + "use vmalloc=<size> to increase size.\n"); + return ERR_PTR(-EBUSY); + } + + BUG_ON(addr & (align-1)); + + va->va_start = addr; + va->va_end = addr + size; + va->flags = 0; + __insert_vmap_area(va); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void rcu_free_va(struct rcu_head *head) +{ + struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); + + kfree(va); +} + +static void __free_vmap_area(struct vmap_area *va) +{ + BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + rb_erase(&va->rb_node, &vmap_area_root); + RB_CLEAR_NODE(&va->rb_node); + list_del_rcu(&va->list); + + call_rcu(&va->rcu_head, rcu_free_va); +} + +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + spin_lock(&vmap_area_lock); + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); +} + +/* + * Clear the pagetable entries of a given vmap_area + */ +static void unmap_vmap_area(struct vmap_area *va) +{ + vunmap_page_range(va->va_start, va->va_end); +} + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); + +/* + * Purges all lazily-freed vmap areas. + * + * If sync is 0 then don't purge if there is already a purge in progress. + * If force_flush is 1, then flush kernel TLBs between *start and *end even + * if we found no lazy vmap areas to unmap (callers can use this to optimise + * their own TLB flushing). + * Returns with *start = min(*start, lowest purged address) + * *end = max(*end, highest purged address) + */ +static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, + int sync, int force_flush) +{ + static DEFINE_SPINLOCK(purge_lock); + LIST_HEAD(valist); + struct vmap_area *va; + struct vmap_area *n_va; + int nr = 0; + + /* + * If sync is 0 but force_flush is 1, we'll go sync anyway but callers + * should not expect such behaviour. This just simplifies locking for + * the case that isn't actually used at the moment anyway. + */ + if (!sync && !force_flush) { + if (!spin_trylock(&purge_lock)) + return; + } else + spin_lock(&purge_lock); + + rcu_read_lock(); + list_for_each_entry_rcu(va, &vmap_area_list, list) { + if (va->flags & VM_LAZY_FREE) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + unmap_vmap_area(va); + list_add_tail(&va->purge_list, &valist); + va->flags |= VM_LAZY_FREEING; + va->flags &= ~VM_LAZY_FREE; + } + } + rcu_read_unlock(); + + if (nr) { + BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + atomic_sub(nr, &vmap_lazy_nr); + } + + if (nr || force_flush) + flush_tlb_kernel_range(*start, *end); + + if (nr) { + spin_lock(&vmap_area_lock); + list_for_each_entry_safe(va, n_va, &valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + } + spin_unlock(&purge_lock); +} + +/* + * Kick off a purge of the outstanding lazy areas. Don't bother if somebody + * is already purging. + */ +static void try_purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + +/* + * Kick off a purge of the outstanding lazy areas. + */ +static void purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 1, 0); +} + +/* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. + */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + va->flags |= VM_LAZY_FREE; + atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); + if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + try_purge_vmap_area_lazy(); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + flush_cache_vunmap(va->va_start, va->va_end); + free_unmap_vmap_area_noflush(va); +} + +static struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void free_unmap_vmap_area_addr(unsigned long addr) +{ + struct vmap_area *va; + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); +} + + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / NR_CPUS / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +static bool vmap_initialized __read_mostly = false; + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; + struct list_head dirty; + unsigned int nr_dirty; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + struct vmap_block_queue *vbq; + unsigned long free, dirty; + DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); + DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); + union { + struct { + struct list_head free_list; + struct list_head dirty_list; + }; + struct rcu_head rcu_head; + }; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_SPINLOCK(vmap_block_tree_lock); +static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. + */ + +static unsigned long addr_to_vb_idx(unsigned long addr) +{ + addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); + addr /= VMAP_BLOCK_SIZE; + return addr; +} + +static struct vmap_block *new_vmap_block(gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; + int node, err; + + node = numa_node_id(); + + vb = kmalloc_node(sizeof(struct vmap_block), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!vb)) + return ERR_PTR(-ENOMEM); + + va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, + VMALLOC_START, VMALLOC_END, + node, gfp_mask); + if (unlikely(IS_ERR(va))) { + kfree(vb); + return ERR_PTR(PTR_ERR(va)); + } + + err = radix_tree_preload(gfp_mask); + if (unlikely(err)) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } + + spin_lock_init(&vb->lock); + vb->va = va; + vb->free = VMAP_BBMAP_BITS; + vb->dirty = 0; + bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); + INIT_LIST_HEAD(&vb->free_list); + INIT_LIST_HEAD(&vb->dirty_list); + + vb_idx = addr_to_vb_idx(va->va_start); + spin_lock(&vmap_block_tree_lock); + err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(err); + radix_tree_preload_end(); + + vbq = &get_cpu_var(vmap_block_queue); + vb->vbq = vbq; + spin_lock(&vbq->lock); + list_add(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); + put_cpu_var(vmap_cpu_blocks); + + return vb; +} + +static void rcu_free_vb(struct rcu_head *head) +{ + struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); + + kfree(vb); +} + +static void free_vmap_block(struct vmap_block *vb) +{ + struct vmap_block *tmp; + unsigned long vb_idx; + + spin_lock(&vb->vbq->lock); + if (!list_empty(&vb->free_list)) + list_del(&vb->free_list); + if (!list_empty(&vb->dirty_list)) + list_del(&vb->dirty_list); + spin_unlock(&vb->vbq->lock); + + vb_idx = addr_to_vb_idx(vb->va->va_start); + spin_lock(&vmap_block_tree_lock); + tmp = radix_tree_delete(&vmap_block_tree, vb_idx); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(tmp != vb); + + free_unmap_vmap_area_noflush(vb->va); + call_rcu(&vb->rcu_head, rcu_free_vb); +} + +static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + unsigned long addr = 0; + unsigned int order; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + +again: + rcu_read_lock(); + vbq = &get_cpu_var(vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = bitmap_find_free_region(vb->alloc_map, + VMAP_BBMAP_BITS, order); + + if (i >= 0) { + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_init(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; + } + spin_unlock(&vb->lock); + } + put_cpu_var(vmap_cpu_blocks); + rcu_read_unlock(); + + if (!addr) { + vb = new_vmap_block(gfp_mask); + if (IS_ERR(vb)) + return vb; + goto again; + } + + return (void *)addr; +} + +static void vb_free(const void *addr, unsigned long size) +{ + unsigned long offset; + unsigned long vb_idx; + unsigned int order; + struct vmap_block *vb; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + + flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + + order = get_order(size); + + offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); + + vb_idx = addr_to_vb_idx((unsigned long)addr); + rcu_read_lock(); + vb = radix_tree_lookup(&vmap_block_tree, vb_idx); + rcu_read_unlock(); + BUG_ON(!vb); + + spin_lock(&vb->lock); + bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + if (!vb->dirty) { + spin_lock(&vb->vbq->lock); + list_add(&vb->dirty_list, &vb->vbq->dirty); + spin_unlock(&vb->vbq->lock); + } + vb->dirty += 1UL << order; + if (vb->dirty == VMAP_BBMAP_BITS) { + BUG_ON(vb->free || !list_empty(&vb->free_list)); + spin_unlock(&vb->lock); + free_vmap_block(vb); + } else + spin_unlock(&vb->lock); +} + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int cpu; + int flush = 0; + + if (unlikely(!vmap_initialized)) + return; + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); + while (i < VMAP_BBMAP_BITS) { + unsigned long s, e; + int j; + j = find_next_zero_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + + s = vb->va->va_start + (i << PAGE_SHIFT); + e = vb->va->va_start + (j << PAGE_SHIFT); + vunmap_page_range(s, e); + flush = 1; + + if (s < start) + start = s; + if (e > end) + end = e; + + i = j; + i = find_next_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + __purge_vmap_area_lazy(&start, &end, 1, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr = (unsigned long)mem; + + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(addr & (PAGE_SIZE-1)); + + debug_check_no_locks_freed(mem, size); + + if (likely(count <= VMAP_MAX_ALLOC)) + vb_free(mem, size); + else + free_unmap_vmap_area_addr(addr); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * @prot: memory protection to use. PAGE_KERNEL for regular RAM + * + * Returns: a pointer to the address that has been mapped, or %NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + INIT_LIST_HEAD(&vbq->dirty); + vbq->nr_dirty = 0; + } + + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = alloc_bootmem(sizeof(struct vmap_area)); + va->flags = tmp->flags | VM_VM_AREA; + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + __insert_vmap_area(va); + } + vmap_initialized = true; +} + +void unmap_kernel_range(unsigned long addr, unsigned long size) +{ + unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); + vunmap_page_range(addr, end); + flush_tlb_kernel_range(addr, end); +} + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long end = addr + area->size - PAGE_SIZE; + int err; + + err = vmap_page_range(addr, end, prot, *pages); + if (err > 0) { + *pages += err; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(map_vm_area); + +/*** Old vmalloc interfaces ***/ +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +static struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long flags, unsigned long start, unsigned long end, + int node, gfp_t gfp_mask, void *caller) +{ + static struct vmap_area *va; + struct vm_struct *area; + struct vm_struct *tmp, **p; + unsigned long align = 1; + + BUG_ON(in_interrupt()); + if (flags & VM_IOREMAP) { + int bit = fls(size); + + if (bit > IOREMAP_MAX_ORDER) + bit = IOREMAP_MAX_ORDER; + else if (bit < PAGE_SHIFT) + bit = PAGE_SHIFT; + + align = 1ul << bit; + } + + size = PAGE_ALIGN(size); + if (unlikely(!size)) + return NULL; + + area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!area)) + return NULL; + + /* + * We always allocate a guard page. + */ + size += PAGE_SIZE; + + va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + if (IS_ERR(va)) { + kfree(area); + return NULL; + } + + area->flags = flags; + area->addr = (void *)va->va_start; + area->size = size; + area->pages = NULL; + area->nr_pages = 0; + area->phys_addr = 0; + area->caller = caller; + va->private = area; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= area->addr) + break; + } + area->next = *p; + *p = area; + write_unlock(&vmlist_lock); + + return area; +} + +struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end) +{ + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(__get_vm_area); + +/** + * get_vm_area - reserve a contiguous kernel virtual area + * @size: size of the area + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC + * + * Search an area of @size in the kernel virtual mapping area, + * and reserved it for out purposes. Returns the area descriptor + * on success or %NULL on failure. + */ +struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, __builtin_return_address(0)); +} + +struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + void *caller) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, caller); +} + +struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, + int node, gfp_t gfp_mask) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, + gfp_mask, __builtin_return_address(0)); +} + +static struct vm_struct *find_vm_area(const void *addr) +{ + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) + return va->private; + + return NULL; +} + +/** + * remove_vm_area - find and remove a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and remove it. + * This function returns the found VM area, but using it is NOT safe + * on SMP machines, except for its size or flags. + */ +struct vm_struct *remove_vm_area(const void *addr) +{ + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) { + struct vm_struct *vm = va->private; + struct vm_struct *tmp, **p; + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + + return vm; + } + return NULL; +} + +static void __vunmap(const void *addr, int deallocate_pages) +{ + struct vm_struct *area; + + if (!addr) + return; + + if ((PAGE_SIZE-1) & (unsigned long)addr) { + WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + return; + } + + area = remove_vm_area(addr); + if (unlikely(!area)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + debug_check_no_locks_freed(addr, area->size); + debug_check_no_obj_freed(addr, area->size); + + if (deallocate_pages) { + int i; + + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + + BUG_ON(!page); + __free_page(page); + } + + if (area->flags & VM_VPAGES) + vfree(area->pages); + else + kfree(area->pages); + } + + kfree(area); + return; +} + +/** + * vfree - release memory allocated by vmalloc() + * @addr: memory base address + * + * Free the virtually continuous memory area starting at @addr, as + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is + * NULL, no operation is performed. + * + * Must not be called in interrupt context. + */ +void vfree(const void *addr) +{ + BUG_ON(in_interrupt()); + __vunmap(addr, 1); +} +EXPORT_SYMBOL(vfree); + +/** + * vunmap - release virtual mapping obtained by vmap() + * @addr: memory base address + * + * Free the virtually contiguous memory area starting at @addr, + * which was created from the page array passed to vmap(). + * + * Must not be called in interrupt context. + */ +void vunmap(const void *addr) +{ + BUG_ON(in_interrupt()); + __vunmap(addr, 0); +} +EXPORT_SYMBOL(vunmap); + +/** + * vmap - map an array of pages into virtually contiguous space + * @pages: array of page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. + */ +void *vmap(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + + if (count > num_physpages) + return NULL; + + area = get_vm_area_caller((count << PAGE_SHIFT), flags, + __builtin_return_address(0)); + if (!area) + return NULL; + + if (map_vm_area(area, prot, &pages)) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} +EXPORT_SYMBOL(vmap); + +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node, void *caller); +static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) +{ + struct page **pages; + unsigned int nr_pages, array_size, i; + + nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + array_size = (nr_pages * sizeof(struct page *)); + + area->nr_pages = nr_pages; + /* Please note that the recursion is strictly bounded. */ + if (array_size > PAGE_SIZE) { + pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + PAGE_KERNEL, node, caller); + area->flags |= VM_VPAGES; + } else { + pages = kmalloc_node(array_size, + (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, + node); + } + area->pages = pages; + area->caller = caller; + if (!area->pages) { + remove_vm_area(area->addr); + kfree(area); + return NULL; + } + + for (i = 0; i < area->nr_pages; i++) { + struct page *page; + + if (node < 0) + page = alloc_page(gfp_mask); + else + page = alloc_pages_node(node, gfp_mask, 0); + + if (unlikely(!page)) { + /* Successfully allocated i pages, free them in __vunmap() */ + area->nr_pages = i; + goto fail; + } + area->pages[i] = page; + } + + if (map_vm_area(area, prot, &pages)) + goto fail; + return area->addr; + +fail: + vfree(area->addr); + return NULL; +} + +void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) +{ + return __vmalloc_area_node(area, gfp_mask, prot, -1, + __builtin_return_address(0)); +} + +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + struct vm_struct *area; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > num_physpages) + return NULL; + + area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, + node, gfp_mask, caller); + + if (!area) + return NULL; + + return __vmalloc_area_node(area, gfp_mask, prot, node, caller); +} + +void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +{ + return __vmalloc_node(size, gfp_mask, prot, -1, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__vmalloc); + +/** + * vmalloc - allocate virtually contiguous memory + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc(unsigned long size) +{ + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + -1, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc); + +/** + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace + * @size: allocation size + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + */ +void *vmalloc_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_node); + +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + +#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) +#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL +#else +#define GFP_VMALLOC32 GFP_KERNEL +#endif + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. + */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + */ +void *vmalloc_32_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_32_user); + +long vread(char *buf, char *addr, unsigned long count) +{ + struct vm_struct *tmp; + char *vaddr, *buf_start = buf; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + read_lock(&vmlist_lock); + for (tmp = vmlist; tmp; tmp = tmp->next) { + vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + *buf = '\0'; + buf++; + addr++; + count--; + } + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { + if (count == 0) + goto finished; + *buf = *addr; + buf++; + addr++; + count--; + } while (--n > 0); + } +finished: + read_unlock(&vmlist_lock); + return buf - buf_start; +} + +long vwrite(char *buf, char *addr, unsigned long count) +{ + struct vm_struct *tmp; + char *vaddr, *buf_start = buf; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + read_lock(&vmlist_lock); + for (tmp = vmlist; tmp; tmp = tmp->next) { + vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + buf++; + addr++; + count--; + } + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { + if (count == 0) + goto finished; + *addr = *buf; + buf++; + addr++; + count--; + } while (--n > 0); + } +finished: + read_unlock(&vmlist_lock); + return buf - buf_start; +} + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + struct vm_struct *area; + unsigned long uaddr = vma->vm_start; + unsigned long usize = vma->vm_end - vma->vm_start; + + if ((PAGE_SIZE-1) & (unsigned long)addr) + return -EINVAL; + + area = find_vm_area(addr); + if (!area) + return -EINVAL; + + if (!(area->flags & VM_USERMAP)) + return -EINVAL; + + if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + return -EINVAL; + + addr += pgoff << PAGE_SHIFT; + do { + struct page *page = vmalloc_to_page(addr); + int ret; + + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PAGE_SIZE; + addr += PAGE_SIZE; + usize -= PAGE_SIZE; + } while (usize > 0); + + /* Prevent "things" like memory migration? VM_flags need a cleanup... */ + vma->vm_flags |= VM_RESERVED; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + +/* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. + */ +void __attribute__((weak)) vmalloc_sync_all(void) +{ +} + + +static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) +{ + /* apply_to_page_range() does all the hard work. */ + return 0; +} + +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + struct vm_struct *area; + + area = get_vm_area_caller(size, VM_IOREMAP, + __builtin_return_address(0)); + if (area == NULL) + return NULL; + + /* + * This ensures that page tables are constructed for this region + * of kernel virtual address space and mapped into init_mm. + */ + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + area->size, f, NULL)) { + free_vm_area(area); + return NULL; + } + + /* Make sure the pagetables are constructed in process kernel + mappings */ + vmalloc_sync_all(); + + return area; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + struct vm_struct *ret; + ret = remove_vm_area(area->addr); + BUG_ON(ret != area); + kfree(area); +} +EXPORT_SYMBOL_GPL(free_vm_area); + + +#ifdef CONFIG_PROC_FS +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct vm_struct *v; + + read_lock(&vmlist_lock); + v = vmlist; + while (n > 0 && v) { + n--; + v = v->next; + } + if (!n) + return v; + + return NULL; + +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct vm_struct *v = p; + + ++*pos; + return v->next; +} + +static void s_stop(struct seq_file *m, void *p) +{ + read_unlock(&vmlist_lock); +} + +static void show_numa_info(struct seq_file *m, struct vm_struct *v) +{ + if (NUMA_BUILD) { + unsigned int nr, *counters = m->private; + + if (!counters) + return; + + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + + for (nr = 0; nr < v->nr_pages; nr++) + counters[page_to_nid(v->pages[nr])]++; + + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); + } +} + +static int s_show(struct seq_file *m, void *p) +{ + struct vm_struct *v = p; + + seq_printf(m, "0x%p-0x%p %7ld", + v->addr, v->addr + v->size, v->size); + + if (v->caller) { + char buff[KSYM_SYMBOL_LEN]; + + seq_putc(m, ' '); + sprint_symbol(buff, (unsigned long)v->caller); + seq_puts(m, buff); + } + + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); + + if (v->phys_addr) + seq_printf(m, " phys=%lx", v->phys_addr); + + if (v->flags & VM_IOREMAP) + seq_printf(m, " ioremap"); + + if (v->flags & VM_ALLOC) + seq_printf(m, " vmalloc"); + + if (v->flags & VM_MAP) + seq_printf(m, " vmap"); + + if (v->flags & VM_USERMAP) + seq_printf(m, " user"); + + if (v->flags & VM_VPAGES) + seq_printf(m, " vpages"); + + show_numa_info(m, v); + seq_putc(m, '\n'); + return 0; +} + +static const struct seq_operations vmalloc_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int vmalloc_open(struct inode *inode, struct file *file) +{ + unsigned int *ptr = NULL; + int ret; + + if (NUMA_BUILD) + ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + ret = seq_open(file, &vmalloc_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ptr; + } else + kfree(ptr); + return ret; +} + +static const struct file_operations proc_vmalloc_operations = { + .open = vmalloc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init proc_vmalloc_init(void) +{ + proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); + return 0; +} +module_init(proc_vmalloc_init); +#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c new file mode 100644 index 0000000..62e7f62 --- /dev/null +++ b/mm/vmscan.c @@ -0,0 +1,2592 @@ +/* + * linux/mm/vmscan.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. + * kswapd added: 7.1.96 sct + * Removed kswapd_ctl limits, and swap out as many pages as needed + * to bring the system back to freepages.high: 2.4.97, Rik van Riel. + * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). + * Multiqueue VM started 5.8.00, Rik van Riel. + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/vmstat.h> +#include <linux/file.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> /* for try_to_release_page(), + buffer_heads_over_limit */ +#include <linux/mm_inline.h> +#include <linux/pagevec.h> +#include <linux/backing-dev.h> +#include <linux/rmap.h> +#include <linux/topology.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/notifier.h> +#include <linux/rwsem.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/memcontrol.h> +#include <linux/delayacct.h> +#include <linux/sysctl.h> + +#include <asm/tlbflush.h> +#include <asm/div64.h> + +#include <linux/swapops.h> + +#include "internal.h" + +struct scan_control { + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + int may_writepage; + + /* Can pages be swapped as part of reclaim? */ + int may_swap; + + /* This context's SWAP_CLUSTER_MAX. If freeing memory for + * suspend, we effectively ignore SWAP_CLUSTER_MAX. + * In this context, it doesn't matter that we scan the + * whole list at once. */ + int swap_cluster_max; + + int swappiness; + + int all_unreclaimable; + + int order; + + /* Which cgroup do we reclaim from */ + struct mem_cgroup *mem_cgroup; + + /* Pluggable isolate pages callback */ + unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, + unsigned long *scanned, int order, int mode, + struct zone *z, struct mem_cgroup *mem_cont, + int active, int file); +}; + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +#ifdef ARCH_HAS_PREFETCH +#define prefetch_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetch(&prev->_field); \ + } \ + } while (0) +#else +#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +#ifdef ARCH_HAS_PREFETCHW +#define prefetchw_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetchw(&prev->_field); \ + } \ + } while (0) +#else +#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +/* + * From 0 .. 100. Higher means more swappy. + */ +int vm_swappiness = 60; +long vm_total_pages; /* The total number of pages which the VM controls */ + +static LIST_HEAD(shrinker_list); +static DECLARE_RWSEM(shrinker_rwsem); + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#define scan_global_lru(sc) (!(sc)->mem_cgroup) +#else +#define scan_global_lru(sc) (1) +#endif + +/* + * Add a shrinker callback to be called from the vm + */ +void register_shrinker(struct shrinker *shrinker) +{ + shrinker->nr = 0; + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(register_shrinker); + +/* + * Remove one + */ +void unregister_shrinker(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(unregister_shrinker); + +#define SHRINK_BATCH 128 +/* + * Call the shrink functions to age shrinkable caches + * + * Here we assume it costs one seek to replace a lru page and that it also + * takes a seek to recreate a cache object. With this in mind we age equal + * percentages of the lru and ageable caches. This should balance the seeks + * generated by these structures. + * + * If the vm encountered mapped pages on the LRU it increase the pressure on + * slab to avoid swapping. + * + * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * + * `lru_pages' represents the number of on-LRU pages in all the zones which + * are eligible for the caller's allocation attempt. It is used for balancing + * slab reclaim versus page reclaim. + * + * Returns the number of slab objects which we shrunk. + */ +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) +{ + struct shrinker *shrinker; + unsigned long ret = 0; + + if (scanned == 0) + scanned = SWAP_CLUSTER_MAX; + + if (!down_read_trylock(&shrinker_rwsem)) + return 1; /* Assume we'll be able to shrink next time */ + + list_for_each_entry(shrinker, &shrinker_list, list) { + unsigned long long delta; + unsigned long total_scan; + unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + + delta = (4 * scanned) / shrinker->seeks; + delta *= max_pass; + do_div(delta, lru_pages + 1); + shrinker->nr += delta; + if (shrinker->nr < 0) { + printk(KERN_ERR "%s: nr=%ld\n", + __func__, shrinker->nr); + shrinker->nr = max_pass; + } + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (shrinker->nr > max_pass * 2) + shrinker->nr = max_pass * 2; + + total_scan = shrinker->nr; + shrinker->nr = 0; + + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; + int shrink_ret; + int nr_before; + + nr_before = (*shrinker->shrink)(0, gfp_mask); + shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + if (shrink_ret == -1) + break; + if (shrink_ret < nr_before) + ret += nr_before - shrink_ret; + count_vm_events(SLABS_SCANNED, this_scan); + total_scan -= this_scan; + + cond_resched(); + } + + shrinker->nr += total_scan; + } + up_read(&shrinker_rwsem); + return ret; +} + +/* Called without lock on whether page is mapped, so answer is unstable */ +static inline int page_mapping_inuse(struct page *page) +{ + struct address_space *mapping; + + /* Page is in somebody's page tables. */ + if (page_mapped(page)) + return 1; + + /* Be more reluctant to reclaim swapcache than pagecache */ + if (PageSwapCache(page)) + return 1; + + mapping = page_mapping(page); + if (!mapping) + return 0; + + /* File is mmap'd by somebody? */ + return mapping_mapped(mapping); +} + +static inline int is_page_cache_freeable(struct page *page) +{ + return page_count(page) - !!PagePrivate(page) == 2; +} + +static int may_write_to_queue(struct backing_dev_info *bdi) +{ + if (current->flags & PF_SWAPWRITE) + return 1; + if (!bdi_write_congested(bdi)) + return 1; + if (bdi == current->backing_dev_info) + return 1; + return 0; +} + +/* + * We detected a synchronous write error writing a page out. Probably + * -ENOSPC. We need to propagate that into the address_space for a subsequent + * fsync(), msync() or close(). + * + * The tricky part is that after writepage we cannot touch the mapping: nothing + * prevents it from being freed up. But we have a ref on the page and once + * that page is locked, the mapping is pinned. + * + * We're allowed to run sleeping lock_page() here because we know the caller has + * __GFP_FS. + */ +static void handle_write_error(struct address_space *mapping, + struct page *page, int error) +{ + lock_page(page); + if (page_mapping(page) == mapping) + mapping_set_error(mapping, error); + unlock_page(page); +} + +/* Request for sync pageout. */ +enum pageout_io { + PAGEOUT_IO_ASYNC, + PAGEOUT_IO_SYNC, +}; + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +/* + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). + */ +static pageout_t pageout(struct page *page, struct address_space *mapping, + enum pageout_io sync_writeback) +{ + /* + * If the page is dirty, only perform writeback if that write + * will be non-blocking. To prevent this allocation from being + * stalled by pagecache activity. But note that there may be + * stalls if we need to run get_block(). We could test + * PagePrivate for that. + * + * If this process is currently in generic_file_write() against + * this page's queue, we can perform writeback even if that + * will block. + * + * If the page is swapcache, write it back even if that would + * block, for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the + * congestion state of the swapdevs. Easy to fix, if needed. + * See swapfile.c:page_queue_congested(). + */ + if (!is_page_cache_freeable(page)) + return PAGE_KEEP; + if (!mapping) { + /* + * Some data journaling orphaned pages can have + * page->mapping == NULL while being dirty with clean buffers. + */ + if (PagePrivate(page)) { + if (try_to_free_buffers(page)) { + ClearPageDirty(page); + printk("%s: orphaned page\n", __func__); + return PAGE_CLEAN; + } + } + return PAGE_KEEP; + } + if (mapping->a_ops->writepage == NULL) + return PAGE_ACTIVATE; + if (!may_write_to_queue(mapping->backing_dev_info)) + return PAGE_KEEP; + + if (clear_page_dirty_for_io(page)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1, + }; + + SetPageReclaim(page); + res = mapping->a_ops->writepage(page, &wbc); + if (res < 0) + handle_write_error(mapping, page, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + ClearPageReclaim(page); + return PAGE_ACTIVATE; + } + + /* + * Wait on writeback if requested to. This happens when + * direct reclaiming a large contiguous area and the + * first attempt to free a range of pages fails. + */ + if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + wait_on_page_writeback(page); + + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ + ClearPageReclaim(page); + } + inc_zone_page_state(page, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; + } + + return PAGE_CLEAN; +} + +/* + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. + */ +static int __remove_mapping(struct address_space *mapping, struct page *page) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(mapping != page_mapping(page)); + + spin_lock_irq(&mapping->tree_lock); + /* + * The non racy check for a busy page. + * + * Must be careful with the order of the tests. When someone has + * a ref to the page, it may be possible that they dirty it then + * drop the reference. So if PageDirty is tested before page_count + * here, then the following race may occur: + * + * get_user_pages(&page); + * [user mapping goes away] + * write_to(page); + * !PageDirty(page) [good] + * SetPageDirty(page); + * put_page(page); + * !page_count(page) [good, discard it] + * + * [oops, our write_to data is lost] + * + * Reversing the order of the tests ensures such a situation cannot + * escape unnoticed. The smp_rmb is needed to ensure the page->flags + * load is not satisfied before that of page->_count. + * + * Note that if SetPageDirty is always performed via set_page_dirty, + * and thus under tree_lock, then this ordering is not required. + */ + if (!page_freeze_refs(page, 2)) + goto cannot_free; + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); + goto cannot_free; + } + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; + __delete_from_swap_cache(page); + spin_unlock_irq(&mapping->tree_lock); + swap_free(swap); + } else { + __remove_from_page_cache(page); + spin_unlock_irq(&mapping->tree_lock); + } + + return 1; + +cannot_free: + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + page_unfreeze_refs(page, 1); + return 1; + } + return 0; +} + +/** + * putback_lru_page - put previously isolated page onto appropriate LRU list + * @page: page to be put back to appropriate lru list + * + * Add previously isolated @page to appropriate LRU list. + * Page may still be unevictable for other reasons. + * + * lru_lock must not be held, interrupts must be enabled. + */ +#ifdef CONFIG_UNEVICTABLE_LRU +void putback_lru_page(struct page *page) +{ + int lru; + int active = !!TestClearPageActive(page); + int was_unevictable = PageUnevictable(page); + + VM_BUG_ON(PageLRU(page)); + +redo: + ClearPageUnevictable(page); + + if (page_evictable(page, NULL)) { + /* + * For evictable pages, we can use the cache. + * In event of a race, worst case is we end up with an + * unevictable page on [in]active list. + * We know how to handle that. + */ + lru = active + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + } else { + /* + * Put unevictable pages directly on zone's unevictable + * list. + */ + lru = LRU_UNEVICTABLE; + add_page_to_unevictable_list(page); + } + mem_cgroup_move_lists(page, lru); + + /* + * page's status can change while we move it among lru. If an evictable + * page is on unevictable list, it never be freed. To avoid that, + * check after we added it to the list, again. + */ + if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (!isolate_lru_page(page)) { + put_page(page); + goto redo; + } + /* This means someone else dropped this page from LRU + * So, it will be freed or putback to LRU again. There is + * nothing to do here. + */ + } + + if (was_unevictable && lru != LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGRESCUED); + else if (!was_unevictable && lru == LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGCULLED); + + put_page(page); /* drop ref from isolate */ +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +void putback_lru_page(struct page *page) +{ + int lru; + VM_BUG_ON(PageLRU(page)); + + lru = !!TestClearPageActive(page) + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + mem_cgroup_move_lists(page, lru); + put_page(page); +} +#endif /* CONFIG_UNEVICTABLE_LRU */ + + +/* + * shrink_page_list() returns the number of reclaimed pages + */ +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc, + enum pageout_io sync_writeback) +{ + LIST_HEAD(ret_pages); + struct pagevec freed_pvec; + int pgactivate = 0; + unsigned long nr_reclaimed = 0; + + cond_resched(); + + pagevec_init(&freed_pvec, 1); + while (!list_empty(page_list)) { + struct address_space *mapping; + struct page *page; + int may_enter_fs; + int referenced; + + cond_resched(); + + page = lru_to_page(page_list); + list_del(&page->lru); + + if (!trylock_page(page)) + goto keep; + + VM_BUG_ON(PageActive(page)); + + sc->nr_scanned++; + + if (unlikely(!page_evictable(page, NULL))) + goto cull_mlocked; + + if (!sc->may_swap && page_mapped(page)) + goto keep_locked; + + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + sc->nr_scanned++; + + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + + if (PageWriteback(page)) { + /* + * Synchronous reclaim is performed in two passes, + * first an asynchronous pass over the list to + * start parallel writeback, and a second synchronous + * pass to wait for the IO to complete. Wait here + * for any page for which writeback has already + * started. + */ + if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + wait_on_page_writeback(page); + else + goto keep_locked; + } + + referenced = page_referenced(page, 1, sc->mem_cgroup); + /* In active use or really unfreeable? Activate it. */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && + referenced && page_mapping_inuse(page)) + goto activate_locked; + +#ifdef CONFIG_SWAP + /* + * Anonymous process memory has backing store? + * Try to allocate it some swap space here. + */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + switch (try_to_munlock(page)) { + case SWAP_FAIL: /* shouldn't happen */ + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* fall thru'; add to swap cache */ + } + if (!add_to_swap(page, GFP_ATOMIC)) + goto activate_locked; + may_enter_fs = 1; + } +#endif /* CONFIG_SWAP */ + + mapping = page_mapping(page); + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page_mapped(page) && mapping) { + switch (try_to_unmap(page, 0)) { + case SWAP_FAIL: + goto activate_locked; + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page)) { + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) + goto keep_locked; + if (!may_enter_fs) + goto keep_locked; + if (!sc->may_writepage) + goto keep_locked; + + /* Page is dirty, try to write it out here */ + switch (pageout(page, mapping, sync_writeback)) { + case PAGE_KEEP: + goto keep_locked; + case PAGE_ACTIVATE: + goto activate_locked; + case PAGE_SUCCESS: + if (PageWriteback(page) || PageDirty(page)) + goto keep; + /* + * A synchronous write - probably a ramdisk. Go + * ahead and try to reclaim the page. + */ + if (!trylock_page(page)) + goto keep; + if (PageDirty(page) || PageWriteback(page)) + goto keep_locked; + mapping = page_mapping(page); + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + * + * We do this even if the page is PageDirty(). + * try_to_release_page() does not perform I/O, but it is + * possible for a page to have PageDirty set, but it is actually + * clean (all its buffers are clean). This happens if the + * buffers were written out directly, with submit_bh(). ext3 + * will do this, as well as the blockdev mapping. + * try_to_release_page() will discover that cleanness and will + * drop the buffers and mark the page clean - it can be freed. + * + * Rarely, pages can have buffers and no ->mapping. These are + * the pages which were not successfully invalidated in + * truncate_complete_page(). We try to drop those buffers here + * and if that worked, and the page is no longer mapped into + * process address space (page_count == 1) it can be freed. + * Otherwise, leave the page on the LRU so it is swappable. + */ + if (PagePrivate(page)) { + if (!try_to_release_page(page, sc->gfp_mask)) + goto activate_locked; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } + } + + if (!mapping || !__remove_mapping(mapping, page)) + goto keep_locked; + + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); +free_it: + nr_reclaimed++; + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } + continue; + +cull_mlocked: + unlock_page(page); + putback_lru_page(page); + continue; + +activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + remove_exclusive_swap_page_ref(page); + VM_BUG_ON(PageActive(page)); + SetPageActive(page); + pgactivate++; +keep_locked: + unlock_page(page); +keep: + list_add(&page->lru, &ret_pages); + VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + } + list_splice(&ret_pages, page_list); + if (pagevec_count(&freed_pvec)) + __pagevec_free(&freed_pvec); + count_vm_events(PGACTIVATE, pgactivate); + return nr_reclaimed; +} + +/* LRU Isolation modes. */ +#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ +#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ +#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ + +/* + * Attempt to remove the specified page from its LRU. Only take this page + * if it is of the appropriate PageActive status. Pages which are being + * freed elsewhere are also ignored. + * + * page: page to consider + * mode: one of the LRU isolation modes defined above + * + * returns 0 on success, -ve errno on failure. + */ +int __isolate_lru_page(struct page *page, int mode, int file) +{ + int ret = -EINVAL; + + /* Only take pages on the LRU. */ + if (!PageLRU(page)) + return ret; + + /* + * When checking the active state, we need to be sure we are + * dealing with comparible boolean values. Take the logical not + * of each. + */ + if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + return ret; + + if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + return ret; + + /* + * When this function is being called for lumpy reclaim, we + * initially look into all LRU pages, active, inactive and + * unevictable; only give shrink_page_list evictable pages. + */ + if (PageUnevictable(page)) + return ret; + + ret = -EBUSY; + if (likely(get_page_unless_zero(page))) { + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + ClearPageLRU(page); + ret = 0; + } + + return ret; +} + +/* + * zone->lru_lock is heavily contended. Some of the functions that + * shrink the lists perform better by taking out a batch of pages + * and working on them outside the LRU lock. + * + * For pagecache intensive workloads, this function is the hottest + * spot in the kernel (apart from copy_*_user functions). + * + * Appropriate locks must be held before calling this function. + * + * @nr_to_scan: The number of pages to look through on the list. + * @src: The LRU list to pull pages off. + * @dst: The temp list to put pages on to. + * @scanned: The number of pages that were scanned. + * @order: The caller's attempted allocation order + * @mode: One of the LRU isolation modes + * @file: True [1] if isolating file [!anon] pages + * + * returns how many pages were moved onto *@dst. + */ +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned, int order, int mode, int file) +{ + unsigned long nr_taken = 0; + unsigned long scan; + + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct page *page; + unsigned long pfn; + unsigned long end_pfn; + unsigned long page_pfn; + int zone_id; + + page = lru_to_page(src); + prefetchw_prev_lru_page(page, src, flags); + + VM_BUG_ON(!PageLRU(page)); + + switch (__isolate_lru_page(page, mode, file)) { + case 0: + list_move(&page->lru, dst); + nr_taken++; + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&page->lru, src); + continue; + + default: + BUG(); + } + + if (!order) + continue; + + /* + * Attempt to take all pages in the order aligned region + * surrounding the tag page. Only take those pages of + * the same active state as that tag page. We may safely + * round the target page pfn down to the requested order + * as the mem_map is guarenteed valid out to MAX_ORDER, + * where that page is in a different zone we will detect + * it from its zone id and abort this block scan. + */ + zone_id = page_zone_id(page); + page_pfn = page_to_pfn(page); + pfn = page_pfn & ~((1 << order) - 1); + end_pfn = pfn + (1 << order); + for (; pfn < end_pfn; pfn++) { + struct page *cursor_page; + + /* The target page is in the block, ignore it. */ + if (unlikely(pfn == page_pfn)) + continue; + + /* Avoid holes within the zone. */ + if (unlikely(!pfn_valid_within(pfn))) + break; + + cursor_page = pfn_to_page(pfn); + + /* Check that we have not crossed a zone boundary. */ + if (unlikely(page_zone_id(cursor_page) != zone_id)) + continue; + switch (__isolate_lru_page(cursor_page, mode, file)) { + case 0: + list_move(&cursor_page->lru, dst); + nr_taken++; + scan++; + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&cursor_page->lru, src); + default: + break; /* ! on LRU or wrong list */ + } + } + } + + *scanned = scan; + return nr_taken; +} + +static unsigned long isolate_pages_global(unsigned long nr, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active, int file) +{ + int lru = LRU_BASE; + if (active) + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, + mode, !!file); +} + +/* + * clear_active_flags() is a helper for shrink_active_list(), clearing + * any active bits from the pages in the list. + */ +static unsigned long clear_active_flags(struct list_head *page_list, + unsigned int *count) +{ + int nr_active = 0; + int lru; + struct page *page; + + list_for_each_entry(page, page_list, lru) { + lru = page_is_file_cache(page); + if (PageActive(page)) { + lru += LRU_ACTIVE; + ClearPageActive(page); + nr_active++; + } + count[lru]++; + } + + return nr_active; +} + +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the unevictable list, it will have the PageUnevictable bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamentnal difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. + */ +int isolate_lru_page(struct page *page) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && get_page_unless_zero(page)) { + int lru = page_lru(page); + ret = 0; + ClearPageLRU(page); + + del_page_from_lru_list(zone, page, lru); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages + */ +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc, + int priority, int file) +{ + LIST_HEAD(page_list); + struct pagevec pvec; + unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; + + pagevec_init(&pvec, 1); + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + do { + struct page *page; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; + unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + int mode = ISOLATE_INACTIVE; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + mode = ISOLATE_BOTH; + else if (sc->order && priority < DEF_PRIORITY - 2) + mode = ISOLATE_BOTH; + + nr_taken = sc->isolate_pages(sc->swap_cluster_max, + &page_list, &nr_scan, sc->order, mode, + zone, sc->mem_cgroup, 0, file); + nr_active = clear_active_flags(&page_list, count); + __count_vm_events(PGDEACTIVATE, nr_active); + + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + if (scan_global_lru(sc)) { + zone->pages_scanned += nr_scan; + zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + } + spin_unlock_irq(&zone->lru_lock); + + nr_scanned += nr_scan; + nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + + /* + * If we are direct reclaiming for contiguous pages and we do + * not reclaim everything in the list, try again and wait + * for IO to complete. This will stall high-order allocations + * but that should be acceptable to the caller + */ + if (nr_freed < nr_taken && !current_is_kswapd() && + sc->order > PAGE_ALLOC_COSTLY_ORDER) { + congestion_wait(WRITE, HZ/10); + + /* + * The attempt at page out may have made some + * of the pages active, mark them inactive again. + */ + nr_active = clear_active_flags(&page_list, count); + count_vm_events(PGDEACTIVATE, nr_active); + + nr_freed += shrink_page_list(&page_list, sc, + PAGEOUT_IO_SYNC); + } + + nr_reclaimed += nr_freed; + local_irq_disable(); + if (current_is_kswapd()) { + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + __count_vm_events(KSWAPD_STEAL, nr_freed); + } else if (scan_global_lru(sc)) + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + + __count_zone_vm_events(PGSTEAL, zone, nr_freed); + + if (nr_taken == 0) + goto done; + + spin_lock(&zone->lru_lock); + /* + * Put back any unfreeable pages. + */ + while (!list_empty(&page_list)) { + int lru; + page = lru_to_page(&page_list); + VM_BUG_ON(PageLRU(page)); + list_del(&page->lru); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + mem_cgroup_move_lists(page, lru); + if (PageActive(page) && scan_global_lru(sc)) { + int file = !!page_is_file_cache(page); + zone->recent_rotated[file]++; + } + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + } while (nr_scanned < max_scan); + spin_unlock(&zone->lru_lock); +done: + local_irq_enable(); + pagevec_release(&pvec); + return nr_reclaimed; +} + +/* + * We are about to scan this zone at a certain priority level. If that priority + * level is smaller (ie: more urgent) than the previous priority, then note + * that priority level within the zone. This is done so that when the next + * process comes in to scan this zone, it will immediately start out at this + * priority level rather than having to build up its own scanning priority. + * Here, this priority affects only the reclaim-mapped threshold. + */ +static inline void note_zone_scanning_priority(struct zone *zone, int priority) +{ + if (priority < zone->prev_priority) + zone->prev_priority = priority; +} + +static inline int zone_is_near_oom(struct zone *zone) +{ + return zone->pages_scanned >= (zone_lru_pages(zone) * 3); +} + +/* + * This moves pages from the active list to the inactive list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone->lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone->lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_count against each page. + * But we had to alter page->flags anyway. + */ + + +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc, int priority, int file) +{ + unsigned long pgmoved; + int pgdeactivate = 0; + unsigned long pgscanned; + LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_inactive); + struct page *page; + struct pagevec pvec; + enum lru_list lru; + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_cgroup, 1, file); + /* + * zone->pages_scanned is used for detect zone's oom + * mem_cgroup remembers nr_scan by itself. + */ + if (scan_global_lru(sc)) { + zone->pages_scanned += pgscanned; + zone->recent_scanned[!!file] += pgmoved; + } + + if (file) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + else + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + spin_unlock_irq(&zone->lru_lock); + + pgmoved = 0; + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + + if (unlikely(!page_evictable(page, NULL))) { + putback_lru_page(page); + continue; + } + + /* page_referenced clears PageReferenced */ + if (page_mapping_inuse(page) && + page_referenced(page, 0, sc->mem_cgroup)) + pgmoved++; + + list_add(&page->lru, &l_inactive); + } + + spin_lock_irq(&zone->lru_lock); + /* + * Count referenced pages from currently used mappings as + * rotated, even though they are moved to the inactive list. + * This helps balance scan pressure between file and anonymous + * pages in get_scan_ratio. + */ + zone->recent_rotated[!!file] += pgmoved; + + /* + * Move the pages to the [file or anon] inactive list. + */ + pagevec_init(&pvec, 1); + + pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; + while (!list_empty(&l_inactive)) { + page = lru_to_page(&l_inactive); + prefetchw_prev_lru_page(page, &l_inactive, flags); + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + ClearPageActive(page); + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_move_lists(page, lru); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + spin_unlock_irq(&zone->lru_lock); + pgdeactivate += pgmoved; + pgmoved = 0; + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + pgdeactivate += pgmoved; + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + pagevec_strip(&pvec); + spin_lock_irq(&zone->lru_lock); + } + __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_vm_events(PGDEACTIVATE, pgdeactivate); + spin_unlock_irq(&zone->lru_lock); + if (vm_swap_full()) + pagevec_swap_free(&pvec); + + pagevec_release(&pvec); +} + +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct zone *zone, struct scan_control *sc, int priority) +{ + int file = is_file_lru(lru); + + if (lru == LRU_ACTIVE_FILE) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + + if (lru == LRU_ACTIVE_ANON && + (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); +} + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evict. + * + * percent[0] specifies how much pressure to put on ram/swap backed + * memory, while percent[1] determines pressure on the file LRUs. + */ +static void get_scan_ratio(struct zone *zone, struct scan_control *sc, + unsigned long *percent) +{ + unsigned long anon, file, free; + unsigned long anon_prio, file_prio; + unsigned long ap, fp; + + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + + /* If we have no swap space, do not bother scanning anon pages. */ + if (nr_swap_pages <= 0) { + percent[0] = 0; + percent[1] = 100; + return; + } + + /* If we have very few page cache pages, force-scan anon pages. */ + if (unlikely(file + free <= zone->pages_high)) { + percent[0] = 100; + percent[1] = 0; + return; + } + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + if (unlikely(zone->recent_scanned[0] > anon / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[0] /= 2; + zone->recent_rotated[0] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + if (unlikely(zone->recent_scanned[1] > file / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[1] /= 2; + zone->recent_rotated[1] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* + * The amount of pressure on anon vs file pages is inversely + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. + */ + ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); + ap /= zone->recent_rotated[0] + 1; + + fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); + fp /= zone->recent_rotated[1] + 1; + + /* Normalize to percentages */ + percent[0] = 100 * ap / (ap + fp + 1); + percent[1] = 100 - percent[0]; +} + + +/* + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + */ +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; + unsigned long percent[2]; /* anon @ 0; file @ 1 */ + enum lru_list l; + + get_scan_ratio(zone, sc, percent); + + for_each_evictable_lru(l) { + if (scan_global_lru(sc)) { + int file = is_file_lru(l); + int scan; + + scan = zone_page_state(zone, NR_LRU_BASE + l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + zone->lru[l].nr_scan += scan; + nr[l] = zone->lru[l].nr_scan; + if (nr[l] >= sc->swap_cluster_max) + zone->lru[l].nr_scan = 0; + else + nr[l] = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage + * but because memory controller hits its limit. + * Don't modify zone reclaim related data. + */ + nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, + priority, l); + } + } + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + for_each_evictable_lru(l) { + if (nr[l]) { + nr_to_scan = min(nr[l], + (unsigned long)sc->swap_cluster_max); + nr[l] -= nr_to_scan; + + nr_reclaimed += shrink_list(l, nr_to_scan, + zone, sc, priority); + } + } + } + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + else if (!scan_global_lru(sc)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + + throttle_vm_writeout(sc->gfp_mask); + return nr_reclaimed; +} + +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. + * + * We reclaim from a zone even if that zone is over pages_high. Because: + * a) The caller may be trying to free *extra* pages to satisfy a higher-order + * allocation or + * b) The zones may be over pages_high but they must go *over* pages_high to + * satisfy the `incremental min' zone defense algorithm. + * + * Returns the number of reclaimed pages. + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. + */ +static unsigned long shrink_zones(int priority, struct zonelist *zonelist, + struct scan_control *sc) +{ + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long nr_reclaimed = 0; + struct zoneref *z; + struct zone *zone; + + sc->all_unreclaimable = 1; + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + if (!populated_zone(zone)) + continue; + /* + * Take care memory controller reclaiming has small influence + * to global LRU. + */ + if (scan_global_lru(sc)) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + note_zone_scanning_priority(zone, priority); + + if (zone_is_all_unreclaimable(zone) && + priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ + sc->all_unreclaimable = 0; + } else { + /* + * Ignore cpuset limitation here. We just want to reduce + * # of used pages by us regardless of memory shortage. + */ + sc->all_unreclaimable = 0; + mem_cgroup_note_reclaim_priority(sc->mem_cgroup, + priority); + } + + nr_reclaimed += shrink_zone(priority, zone, sc); + } + + return nr_reclaimed; +} + +/* + * This is the main entry point to direct page reclaim. + * + * If a full scan of the inactive list fails to free enough memory then we + * are "out of memory" and something needs to be killed. + * + * If the caller is !__GFP_FS then the probability of a failure is reasonably + * high - the zone may be full of dirty or under-writeback pages, which this + * caller can't do much about. We kick pdflush and take explicit naps in the + * hope that some of these pages can be written. But if the allocating task + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. + * + * returns: 0, if no pages reclaimed + * else, the number of pages reclaimed + */ +static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + struct scan_control *sc) +{ + int priority; + unsigned long ret = 0; + unsigned long total_scanned = 0; + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + + delayacct_freepages_start(); + + if (scan_global_lru(sc)) + count_vm_event(ALLOCSTALL); + /* + * mem_cgroup will not do shrink_slab. + */ + if (scan_global_lru(sc)) { + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + lru_pages += zone_lru_pages(zone); + } + } + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + sc->nr_scanned = 0; + if (!priority) + disable_swap_token(); + nr_reclaimed += shrink_zones(priority, zonelist, sc); + /* + * Don't shrink slabs when reclaiming memory from + * over limit cgroups + */ + if (scan_global_lru(sc)) { + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); + if (reclaim_state) { + nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + total_scanned += sc->nr_scanned; + if (nr_reclaimed >= sc->swap_cluster_max) { + ret = nr_reclaimed; + goto out; + } + + /* + * Try to write back as many pages as we just scanned. This + * tends to cause slow streaming writers to write data to the + * disk smoothly, at the dirtying rate, which is nice. But + * that's undesirable in laptop mode, where we *want* lumpy + * writeout. So in laptop mode, write out the whole world. + */ + if (total_scanned > sc->swap_cluster_max + + sc->swap_cluster_max / 2) { + wakeup_pdflush(laptop_mode ? 0 : total_scanned); + sc->may_writepage = 1; + } + + /* Take a nap, wait for some writeback to complete */ + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_zones still had more to do? don't OOM, then */ + if (!sc->all_unreclaimable && scan_global_lru(sc)) + ret = nr_reclaimed; +out: + /* + * Now that we've scanned all the zones at this priority level, note + * that level within the zone so that the next thread which performs + * scanning of this zone will immediately start out at this priority + * level. This affects only the decision whether or not to bring + * mapped pages onto the inactive list. + */ + if (priority < 0) + priority = 0; + + if (scan_global_lru(sc)) { + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + zone->prev_priority = priority; + } + } else + mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + + delayacct_freepages_end(); + + return ret; +} + +unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask) +{ + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + .swappiness = vm_swappiness, + .order = order, + .mem_cgroup = NULL, + .isolate_pages = isolate_pages_global, + }; + + return do_try_to_free_pages(zonelist, &sc); +} + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, + gfp_t gfp_mask) +{ + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, + .order = 0, + .mem_cgroup = mem_cont, + .isolate_pages = mem_cgroup_isolate_pages, + }; + struct zonelist *zonelist; + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + zonelist = NODE_DATA(numa_node_id())->node_zonelists; + return do_try_to_free_pages(zonelist, &sc); +} +#endif + +/* + * For kswapd, balance_pgdat() will work across all this node's zones until + * they are all at pages_high. + * + * Returns the number of pages which were actually freed. + * + * There is special handling here for zones which are full of pinned pages. + * This can happen if the pages are all mlocked, or if they are all used by + * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. + * What we do is to detect the case where all pages in the zone have been + * scanned twice and there has been zero successful reclaim. Mark the zone as + * dead and from now on, only perform a short scan. Basically we're polling + * the zone for when the problem goes away. + * + * kswapd scans the zones in the highmem->normal->dma direction. It skips + * zones which have free_pages > pages_high, but once a zone is found to have + * free_pages <= pages_high, we scan that zone and the lower zones regardless + * of the number of free pages in the lower zones. This interoperates with + * the page allocator fallback scheme to ensure that aging of pages is balanced + * across the zones. + */ +static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +{ + int all_zones_ok; + int priority; + int i; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, + .order = order, + .mem_cgroup = NULL, + .isolate_pages = isolate_pages_global, + }; + /* + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. + */ + int temp_priority[MAX_NR_ZONES]; + +loop_again: + total_scanned = 0; + nr_reclaimed = 0; + sc.may_writepage = !laptop_mode; + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) + temp_priority[i] = DEF_PRIORITY; + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + + /* The swap token gets in the way of swapout... */ + if (!priority) + disable_swap_token(); + + all_zones_ok = 1; + + /* + * Scan in the highmem->dma direction for the highest + * zone which needs scanning + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone_is_all_unreclaimable(zone) && + priority != DEF_PRIORITY) + continue; + + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. + */ + if (inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, + &sc, priority, 0); + + if (!zone_watermark_ok(zone, order, zone->pages_high, + 0, 0)) { + end_zone = i; + break; + } + } + if (i < 0) + goto out; + + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + lru_pages += zone_lru_pages(zone); + } + + /* + * Now scan the zone in the dma->highmem direction, stopping + * at the last zone which needs scanning. + * + * We do this because the page allocator works in the opposite + * direction. This prevents the page allocator from allocating + * pages behind kswapd's direction of progress, which would + * cause too much scanning of the lower zones. + */ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; + + if (!populated_zone(zone)) + continue; + + if (zone_is_all_unreclaimable(zone) && + priority != DEF_PRIORITY) + continue; + + if (!zone_watermark_ok(zone, order, zone->pages_high, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; + sc.nr_scanned = 0; + note_zone_scanning_priority(zone, priority); + /* + * We put equal pressure on every zone, unless one + * zone has way too many pages free already. + */ + if (!zone_watermark_ok(zone, order, 8*zone->pages_high, + end_zone, 0)) + nr_reclaimed += shrink_zone(priority, zone, &sc); + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, + lru_pages); + nr_reclaimed += reclaim_state->reclaimed_slab; + total_scanned += sc.nr_scanned; + if (zone_is_all_unreclaimable(zone)) + continue; + if (nr_slab == 0 && zone->pages_scanned >= + (zone_lru_pages(zone) * 6)) + zone_set_flag(zone, + ZONE_ALL_UNRECLAIMABLE); + /* + * If we've done a decent amount of scanning and + * the reclaim ratio is low, start doing writepage + * even in laptop mode + */ + if (total_scanned > SWAP_CLUSTER_MAX * 2 && + total_scanned > nr_reclaimed + nr_reclaimed / 2) + sc.may_writepage = 1; + } + if (all_zones_ok) + break; /* kswapd: all done */ + /* + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ + if (total_scanned && priority < DEF_PRIORITY - 2) + congestion_wait(WRITE, HZ/10); + + /* + * We do this so kswapd doesn't build up large priorities for + * example when it is freeing in parallel with allocators. It + * matches the direct reclaim path behaviour in terms of impact + * on zone->*_priority. + */ + if (nr_reclaimed >= SWAP_CLUSTER_MAX) + break; + } +out: + /* + * Note within each zone the priority level at which this zone was + * brought into a happy state. So that the next thread which scans this + * zone will start out at that priority level. + */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->prev_priority = temp_priority[i]; + } + if (!all_zones_ok) { + cond_resched(); + + try_to_freeze(); + + goto loop_again; + } + + return nr_reclaimed; +} + +/* + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically trickles out pages so that we have _some_ + * free memory available even if there is no other activity + * that frees anything up. This is needed for things like routing + * etc, where we otherwise might have all activity going on in + * asynchronous contexts that cannot page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. + */ +static int kswapd(void *p) +{ + unsigned long order; + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + node_to_cpumask_ptr(cpumask, pgdat->node_id); + + if (!cpus_empty(*cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + current->reclaim_state = &reclaim_state; + + /* + * Tell the memory management that we're a "memory allocator", + * and that if we need more memory we should get access to it + * regardless (see "__alloc_pages()"). "kswapd" should + * never get caught in the normal page freeing logic. + * + * (Kswapd normally doesn't need memory anyway, but sometimes + * you need a small amount of memory in order to be able to + * page out something else, and this flag essentially protects + * us from recursively trying to free more memory as we're + * trying to free the first piece of memory in the first place). + */ + tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + set_freezable(); + + order = 0; + for ( ; ; ) { + unsigned long new_order; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; + if (order < new_order) { + /* + * Don't sleep if someone wants a larger 'order' + * allocation + */ + order = new_order; + } else { + if (!freezing(current)) + schedule(); + + order = pgdat->kswapd_max_order; + } + finish_wait(&pgdat->kswapd_wait, &wait); + + if (!try_to_freeze()) { + /* We can speed up thawing tasks if we don't call + * balance_pgdat after returning from the refrigerator + */ + balance_pgdat(pgdat, order); + } + } + return 0; +} + +/* + * A zone is low on free memory, so wake its kswapd task to service it. + */ +void wakeup_kswapd(struct zone *zone, int order) +{ + pg_data_t *pgdat; + + if (!populated_zone(zone)) + return; + + pgdat = zone->zone_pgdat; + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + return; + if (pgdat->kswapd_max_order < order) + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; + if (!waitqueue_active(&pgdat->kswapd_wait)) + return; + wake_up_interruptible(&pgdat->kswapd_wait); +} + +unsigned long global_lru_pages(void) +{ + return global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_INACTIVE_FILE); +} + +#ifdef CONFIG_PM +/* + * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages + * from LRU lists system-wide, for given pass and priority, and returns the + * number of reclaimed pages + * + * For pass > 3 we also try to shrink the LRU lists that contain a few pages + */ +static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, + int pass, struct scan_control *sc) +{ + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + enum lru_list l; + + for_each_zone(zone) { + + if (!populated_zone(zone)) + continue; + + if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) + continue; + + for_each_evictable_lru(l) { + /* For pass = 0, we don't shrink the active list */ + if (pass == 0 && + (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + continue; + + zone->lru[l].nr_scan += + (zone_page_state(zone, NR_LRU_BASE + l) + >> prio) + 1; + if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_scan = 0; + nr_to_scan = min(nr_pages, + zone_page_state(zone, + NR_LRU_BASE + l)); + ret += shrink_list(l, nr_to_scan, zone, + sc, prio); + if (ret >= nr_pages) + return ret; + } + } + } + + return ret; +} + +/* + * Try to free `nr_pages' of memory, system-wide, and return the number of + * freed pages. + * + * Rather than trying to age LRUs the aim is to preserve the overall + * LRU order by reclaiming preferentially + * inactive > active > active referenced > active mapped + */ +unsigned long shrink_all_memory(unsigned long nr_pages) +{ + unsigned long lru_pages, nr_slab; + unsigned long ret = 0; + int pass; + struct reclaim_state reclaim_state; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, + .swappiness = vm_swappiness, + .isolate_pages = isolate_pages_global, + }; + + current->reclaim_state = &reclaim_state; + + lru_pages = global_lru_pages(); + nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); + /* If slab caches are huge, it's better to hit them first */ + while (nr_slab >= lru_pages) { + reclaim_state.reclaimed_slab = 0; + shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + if (!reclaim_state.reclaimed_slab) + break; + + ret += reclaim_state.reclaimed_slab; + if (ret >= nr_pages) + goto out; + + nr_slab -= reclaim_state.reclaimed_slab; + } + + /* + * We try to shrink LRUs in 5 passes: + * 0 = Reclaim from inactive_list only + * 1 = Reclaim from active list but don't reclaim mapped + * 2 = 2nd pass of type 1 + * 3 = Reclaim mapped (normal reclaim) + * 4 = 2nd pass of type 3 + */ + for (pass = 0; pass < 5; pass++) { + int prio; + + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; + sc.swappiness = 100; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { + unsigned long nr_to_scan = nr_pages - ret; + + sc.nr_scanned = 0; + ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (ret >= nr_pages) + goto out; + + reclaim_state.reclaimed_slab = 0; + shrink_slab(sc.nr_scanned, sc.gfp_mask, + global_lru_pages()); + ret += reclaim_state.reclaimed_slab; + if (ret >= nr_pages) + goto out; + + if (sc.nr_scanned && prio < DEF_PRIORITY - 2) + congestion_wait(WRITE, HZ / 10); + } + } + + /* + * If ret = 0, we could not shrink LRUs, but there may be something + * in slab caches + */ + if (!ret) { + do { + reclaim_state.reclaimed_slab = 0; + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + ret += reclaim_state.reclaimed_slab; + } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + } + +out: + current->reclaim_state = NULL; + + return ret; +} +#endif + +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, we get changed to run anywhere: as the first one comes back, + restore their cpu bindings. */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_HIGH_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + node_to_cpumask_ptr(mask, pgdat->node_id); + + if (any_online_cpu(*mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kswapd, mask); + } + } + return NOTIFY_OK; +} + +/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. + */ +int kswapd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kswapd) + return 0; + + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; + } + return ret; +} + +static int __init kswapd_init(void) +{ + int nid; + + swap_setup(); + for_each_node_state(nid, N_HIGH_MEMORY) + kswapd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} + +module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Zone reclaim mode + * + * If non-zero call zone_reclaim when the number of free pages falls below + * the watermarks. + */ +int zone_reclaim_mode __read_mostly; + +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ + +/* + * Priority for ZONE_RECLAIM. This determines the fraction of pages + * of a node considered for each zone_reclaim. 4 scans 1/16th of + * a zone. + */ +#define ZONE_RECLAIM_PRIORITY 4 + +/* + * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * occur. + */ +int sysctl_min_unmapped_ratio = 1; + +/* + * If the number of slab pages in a zone grows beyond this percentage then + * slab reclaim needs to occur. + */ +int sysctl_min_slab_ratio = 5; + +/* + * Try to free up some pages from this zone through reclaim. + */ +static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct task_struct *p = current; + struct reclaim_state reclaim_state; + int priority; + unsigned long nr_reclaimed = 0; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + .swappiness = vm_swappiness, + .isolate_pages = isolate_pages_global, + }; + unsigned long slab_reclaimable; + + disable_swap_token(); + cond_resched(); + /* + * We need to be able to allocate from the reserves for RECLAIM_SWAP + * and we also need to be able to write out pages for RECLAIM_WRITE + * and RECLAIM_SWAP. + */ + p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) > + zone->min_unmapped_pages) { + /* + * Free memory by calling shrink zone with increasing + * priorities until we have enough memory freed. + */ + priority = ZONE_RECLAIM_PRIORITY; + do { + note_zone_scanning_priority(zone, priority); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); + } + + slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (slab_reclaimable > zone->min_slab_pages) { + /* + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we take the current + * number of slab pages and shake the slab until it is reduced + * by the same nr_pages that we used for reclaiming unmapped + * pages. + * + * Note that shrink_slab will free memory on all zones and may + * take a long time. + */ + while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) > + slab_reclaimable - nr_pages) + ; + + /* + * Update nr_reclaimed by the number of slab pages we + * reclaimed from this zone. + */ + nr_reclaimed += slab_reclaimable - + zone_page_state(zone, NR_SLAB_RECLAIMABLE); + } + + p->reclaim_state = NULL; + current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + return nr_reclaimed >= nr_pages; +} + +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + int node_id; + int ret; + + /* + * Zone reclaim reclaims unmapped file backed pages and + * slab pages if we are over the defined limits. + * + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately + * thrown out if the zone is overallocated. So we do not reclaim + * if less than a specified percentage of the zone is used by + * unmapped file backed pages. + */ + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages + && zone_page_state(zone, NR_SLAB_RECLAIMABLE) + <= zone->min_slab_pages) + return 0; + + if (zone_is_all_unreclaimable(zone)) + return 0; + + /* + * Do not scan if the allocation should not be delayed. + */ + if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + return 0; + + /* + * Only run zone reclaim on the local zone or on zones that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + node_id = zone_to_nid(zone); + if (node_state(node_id, N_CPU) && node_id != numa_node_id()) + return 0; + + if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) + return 0; + ret = __zone_reclaim(zone, gfp_mask, order); + zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + + return ret; +} +#endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * page_evictable - test whether a page is evictable + * @page: the page to test + * @vma: the VMA in which the page is or will be mapped, may be NULL + * + * Test whether page is evictable--i.e., should be placed on active/inactive + * lists vs unevictable list. The vma argument is !NULL when called from the + * fault path to determine how to instantate a new page. + * + * Reasons page might not be evictable: + * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA + * + */ +int page_evictable(struct page *page, struct vm_area_struct *vma) +{ + + if (mapping_unevictable(page_mapping(page))) + return 0; + + if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + return 0; + + return 1; +} + +/** + * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list + * @page: page to check evictability and move to appropriate lru list + * @zone: zone page is in + * + * Checks a page for evictability and moves the page to the appropriate + * zone lru list. + * + * Restrictions: zone->lru_lock must be held, page must be on LRU and must + * have PageUnevictable set. + */ +static void check_move_unevictable_page(struct page *page, struct zone *zone) +{ + VM_BUG_ON(PageActive(page)); + +retry: + ClearPageUnevictable(page); + if (page_evictable(page, NULL)) { + enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + + __dec_zone_state(zone, NR_UNEVICTABLE); + list_move(&page->lru, &zone->lru[l].list); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); + __count_vm_event(UNEVICTABLE_PGRESCUED); + } else { + /* + * rotate unevictable list + */ + SetPageUnevictable(page); + list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + if (page_evictable(page, NULL)) + goto retry; + } +} + +/** + * scan_mapping_unevictable_pages - scan an address space for evictable pages + * @mapping: struct address_space to scan for evictable pages + * + * Scan all pages in mapping. Check unevictable pages for + * evictability and move them to the appropriate zone lru list. + */ +void scan_mapping_unevictable_pages(struct address_space *mapping) +{ + pgoff_t next = 0; + pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + struct zone *zone; + struct pagevec pvec; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + while (next < end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + int i; + int pg_scanned = 0; + + zone = NULL; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + struct zone *pagezone = page_zone(page); + + pg_scanned++; + if (page_index > next) + next = page_index; + next++; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + + if (PageLRU(page) && PageUnevictable(page)) + check_move_unevictable_page(page, zone); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); + } + +} + +/** + * scan_zone_unevictable_pages - check unevictable list for evictable pages + * @zone - zone of which to scan the unevictable list + * + * Scan @zone's unevictable LRU lists to check for pages that have become + * evictable. Move those that have to @zone's inactive list where they + * become candidates for reclaim, unless shrink_inactive_zone() decides + * to reactivate them. Pages that are still unevictable are rotated + * back onto @zone's unevictable list. + */ +#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ +void scan_zone_unevictable_pages(struct zone *zone) +{ + struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; + unsigned long scan; + unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); + + while (nr_to_scan > 0) { + unsigned long batch_size = min(nr_to_scan, + SCAN_UNEVICTABLE_BATCH_SIZE); + + spin_lock_irq(&zone->lru_lock); + for (scan = 0; scan < batch_size; scan++) { + struct page *page = lru_to_page(l_unevictable); + + if (!trylock_page(page)) + continue; + + prefetchw_prev_lru_page(page, l_unevictable, flags); + + if (likely(PageLRU(page) && PageUnevictable(page))) + check_move_unevictable_page(page, zone); + + unlock_page(page); + } + spin_unlock_irq(&zone->lru_lock); + + nr_to_scan -= batch_size; + } +} + + +/** + * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages + * + * A really big hammer: scan all zones' unevictable LRU lists to check for + * pages that have become evictable. Move those back to the zones' + * inactive list where they become candidates for reclaim. + * This occurs when, e.g., we have unswappable pages on the unevictable lists, + * and we add swap to the system. As such, it runs in the context of a task + * that has possibly/probably made some previously unevictable pages + * evictable. + */ +void scan_all_zones_unevictable_pages(void) +{ + struct zone *zone; + + for_each_zone(zone) { + scan_zone_unevictable_pages(zone); + } +} + +/* + * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of + * all nodes' unevictable lists for evictable pages + */ +unsigned long scan_unevictable_pages; + +int scan_unevictable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write && *(unsigned long *)table->data) + scan_all_zones_unevictable_pages(); + + scan_unevictable_pages = 0; + return 0; +} + +/* + * per node 'scan_unevictable_pages' attribute. On demand re-scan of + * a specified node's per zone unevictable lists for evictable pages. + */ + +static ssize_t read_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "0\n"); /* always zero; should fit... */ +} + +static ssize_t write_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct zone *node_zones = NODE_DATA(dev->id)->node_zones; + struct zone *zone; + unsigned long res; + unsigned long req = strict_strtoul(buf, 10, &res); + + if (!req) + return 1; /* zero is no-op */ + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + scan_zone_unevictable_pages(zone); + } + return 1; +} + + +static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, + read_scan_unevictable_node, + write_scan_unevictable_node); + +int scan_unevictable_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +void scan_unevictable_unregister_node(struct node *node) +{ + sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 0000000..c3ccfda --- /dev/null +++ b/mm/vmstat.c @@ -0,0 +1,969 @@ +/* + * linux/mm/vmstat.c + * + * Manages VM statistics + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * zoned VM statistics + * Copyright (C) 2006 Silicon Graphics, Inc., + * Christoph Lameter <christoph@lameter.com> + */ +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/vmstat.h> +#include <linux/sched.h> + +#ifdef CONFIG_VM_EVENT_COUNTERS +DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; +EXPORT_PER_CPU_SYMBOL(vm_event_states); + +static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) +{ + int cpu; + int i; + + memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); + + for_each_cpu_mask_nr(cpu, *cpumask) { + struct vm_event_state *this = &per_cpu(vm_event_states, cpu); + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + ret[i] += this->event[i]; + } +} + +/* + * Accumulate the vm event counters across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. +*/ +void all_vm_events(unsigned long *ret) +{ + get_online_cpus(); + sum_vm_events(ret, &cpu_online_map); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(all_vm_events); + +#ifdef CONFIG_HOTPLUG +/* + * Fold the foreign cpu events into our own. + * + * This is adding to the events on one processor + * but keeps the global counts constant. + */ +void vm_events_fold_cpu(int cpu) +{ + struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); + int i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + count_vm_events(i, fold_state->event[i]); + fold_state->event[i] = 0; + } +} +#endif /* CONFIG_HOTPLUG */ + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + +/* + * Manage combined zone based / global counters + * + * vm_stat contains the global counters + */ +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +EXPORT_SYMBOL(vm_stat); + +#ifdef CONFIG_SMP + +static int calculate_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem+1) + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone->present_pages >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +/* + * Refresh the thresholds for each zone. + */ +static void refresh_zone_stat_thresholds(void) +{ + struct zone *zone; + int cpu; + int threshold; + + for_each_zone(zone) { + + if (!zone->present_pages) + continue; + + threshold = calculate_threshold(zone); + + for_each_online_cpu(cpu) + zone_pcp(zone, cpu)->stat_threshold = threshold; + } +} + +/* + * For use when we know that interrupts are disabled. + */ +void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; + long x; + + x = delta + *p; + + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { + zone_page_state_add(x, zone, item); + x = 0; + } + *p = x; +} +EXPORT_SYMBOL(__mod_zone_page_state); + +/* + * For an unknown interrupt state + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + +/* + * Optimized increment and decrement functions. + * + * These are only for a single page and therefore can take a struct page * + * argument instead of struct zone *. This allows the inclusion of the code + * generated for page_zone(page) into the optimized functions. + * + * No overflow check is necessary and therefore the differential can be + * incremented or decremented in place which may allow the compilers to + * generate better code. + * The increment or decrement is known and therefore one boundary check can + * be omitted. + * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * + * Some processors have inc/dec instructions that are atomic vs an interrupt. + * However, the code must first determine the differential location in a zone + * based on the processor number and then inc/dec the counter. There is no + * guarantee without disabling preemption that the processor will not change + * in between and therefore the atomicity vs. interrupt cannot be exploited + * in a useful way here. + */ +void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; + + (*p)++; + + if (unlikely(*p > pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; + } +} + +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} +EXPORT_SYMBOL(__inc_zone_page_state); + +void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +{ + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; + + (*p)--; + + if (unlikely(*p < - pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; + } +} + +void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __dec_zone_state(page_zone(page), item); +} +EXPORT_SYMBOL(__dec_zone_page_state); + +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_zone_page_state); + +/* + * Update the zone counters for one cpu. + * + * The cpu specified must be either the current cpu or a processor that + * is not online. If it is the current cpu then the execution thread must + * be pinned to the current cpu. + * + * Note that refresh_cpu_vm_stats strives to only access + * node local memory. The per cpu pagesets on remote zones are placed + * in the memory local to the processor using that pageset. So the + * loop over all zones will access a series of cachelines local to + * the processor. + * + * The call to zone_page_state_add updates the cachelines with the + * statistics in the remote zone struct as well as the global cachelines + * with the global counters. These could cause remote node cache line + * bouncing and will have to be only done when necessary. + */ +void refresh_cpu_vm_stats(int cpu) +{ + struct zone *zone; + int i; + int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + + for_each_zone(zone) { + struct per_cpu_pageset *p; + + if (!populated_zone(zone)) + continue; + + p = zone_pcp(zone, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (p->vm_stat_diff[i]) { + unsigned long flags; + int v; + + local_irq_save(flags); + v = p->vm_stat_diff[i]; + p->vm_stat_diff[i] = 0; + local_irq_restore(flags); + atomic_long_add(v, &zone->vm_stat[i]); + global_diff[i] += v; +#ifdef CONFIG_NUMA + /* 3 seconds idle till flush */ + p->expire = 3; +#endif + } + cond_resched(); +#ifdef CONFIG_NUMA + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!p->expire || !p->pcp.count) + continue; + + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + p->expire = 0; + continue; + } + + p->expire--; + if (p->expire) + continue; + + if (p->pcp.count) + drain_zone_pages(zone, &p->pcp); +#endif + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (global_diff[i]) + atomic_long_add(global_diff[i], &vm_stat[i]); +} + +#endif + +#ifdef CONFIG_NUMA +/* + * zonelist = the list of zones passed to the allocator + * z = the zone from which the allocation occurred. + * + * Must be called with interrupts disabled. + */ +void zone_statistics(struct zone *preferred_zone, struct zone *z) +{ + if (z->zone_pgdat == preferred_zone->zone_pgdat) { + __inc_zone_state(z, NUMA_HIT); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } + if (z->node == numa_node_id()) + __inc_zone_state(z, NUMA_LOCAL); + else + __inc_zone_state(z, NUMA_OTHER); +} +#endif + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +static char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Reclaimable", + "Movable", + "Reserve", + "Isolate", +}; + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* Walk all the zones in a node and print using a callback */ +static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + print(m, pgdat, zone); + spin_unlock_irqrestore(&zone->lock, flags); + } +} + +static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) +{ + int order; + + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + seq_putc(m, '\n'); +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + walk_zones_in_node(m, pgdat, frag_show_print); + return 0; +} + +static void pagetypeinfo_showfree_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + int order, mtype; + + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { + seq_printf(m, "Node %4d, zone %8s, type %12s ", + pgdat->node_id, + zone->name, + migratetype_names[mtype]); + for (order = 0; order < MAX_ORDER; ++order) { + unsigned long freecount = 0; + struct free_area *area; + struct list_head *curr; + + area = &(zone->free_area[order]); + + list_for_each(curr, &area->free_list[mtype]) + freecount++; + seq_printf(m, "%6lu ", freecount); + } + seq_putc(m, '\n'); + } +} + +/* Print out the free pages at each order for each migatetype */ +static int pagetypeinfo_showfree(struct seq_file *m, void *arg) +{ + int order; + pg_data_t *pgdat = (pg_data_t *)arg; + + /* Print header */ + seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6d ", order); + seq_putc(m, '\n'); + + walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + + return 0; +} + +static void pagetypeinfo_showblockcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + int mtype; + unsigned long pfn; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = start_pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); +#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES + /* + * Ordinarily, memory holes in flatmem still have a valid + * memmap for the PFN range. However, an architecture for + * embedded systems (e.g. ARM) can free up the memmap backing + * holes to save memory on the assumption the memmap is + * never used. The page_zone linkages are then broken even + * though pfn_valid() returns true. Skip the page if the + * linkages are broken. Even if this test passed, the impact + * is that the counters for the movable type are off but + * fragmentation monitoring is likely meaningless on small + * systems. + */ + if (page_zone(page) != zone) + continue; +#endif + mtype = get_pageblock_migratetype(page); + + if (mtype < MIGRATE_TYPES) + count[mtype]++; + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12lu ", count[mtype]); + seq_putc(m, '\n'); +} + +/* Print out the free pages at each order for each migratetype */ +static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +{ + int mtype; + pg_data_t *pgdat = (pg_data_t *)arg; + + seq_printf(m, "\n%-23s", "Number of blocks type "); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12s ", migratetype_names[mtype]); + seq_putc(m, '\n'); + walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + + return 0; +} + +/* + * This prints out statistics in relation to grouping pages by mobility. + * It is expensive to collect so do not constantly read the file. + */ +static int pagetypeinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + return 0; + + seq_printf(m, "Page block order: %d\n", pageblock_order); + seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); + seq_putc(m, '\n'); + pagetypeinfo_showfree(m, pgdat); + pagetypeinfo_showblockcount(m, pgdat); + + return 0; +} + +static const struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static int fragmentation_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fragmentation_op); +} + +static const struct file_operations fragmentation_file_operations = { + .open = fragmentation_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct seq_operations pagetypeinfo_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = pagetypeinfo_show, +}; + +static int pagetypeinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pagetypeinfo_op); +} + +static const struct file_operations pagetypeinfo_file_ops = { + .open = pagetypeinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef CONFIG_ZONE_DMA +#define TEXT_FOR_DMA(xx) xx "_dma", +#else +#define TEXT_FOR_DMA(xx) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define TEXT_FOR_DMA32(xx) xx "_dma32", +#else +#define TEXT_FOR_DMA32(xx) +#endif + +#ifdef CONFIG_HIGHMEM +#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#else +#define TEXT_FOR_HIGHMEM(xx) +#endif + +#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ + TEXT_FOR_HIGHMEM(xx) xx "_movable", + +static const char * const vmstat_text[] = { + /* Zoned VM counters */ + "nr_free_pages", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", +#ifdef CONFIG_UNEVICTABLE_LRU + "nr_unevictable", + "nr_mlock", +#endif + "nr_anon_pages", + "nr_mapped", + "nr_file_pages", + "nr_dirty", + "nr_writeback", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", + "nr_page_table_pages", + "nr_unstable", + "nr_bounce", + "nr_vmscan_write", + "nr_writeback_temp", + +#ifdef CONFIG_NUMA + "numa_hit", + "numa_miss", + "numa_foreign", + "numa_interleave", + "numa_local", + "numa_other", +#endif + +#ifdef CONFIG_VM_EVENT_COUNTERS + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + TEXTS_FOR_ZONES("pgalloc") + + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + + TEXTS_FOR_ZONES("pgrefill") + TEXTS_FOR_ZONES("pgsteal") + TEXTS_FOR_ZONES("pgscan_kswapd") + TEXTS_FOR_ZONES("pgscan_direct") + + "pginodesteal", + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", +#ifdef CONFIG_HUGETLB_PAGE + "htlb_buddy_alloc_success", + "htlb_buddy_alloc_fail", +#endif +#ifdef CONFIG_UNEVICTABLE_LRU + "unevictable_pgs_culled", + "unevictable_pgs_scanned", + "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", + "unevictable_pgs_mlockfreed", +#endif +#endif +}; + +static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) +{ + int i; + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" + "\n spanned %lu" + "\n present %lu", + zone_page_state(zone, NR_FREE_PAGES), + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->pages_scanned, + zone->lru[LRU_ACTIVE_ANON].nr_scan, + zone->lru[LRU_INACTIVE_ANON].nr_scan, + zone->lru[LRU_ACTIVE_FILE].nr_scan, + zone->lru[LRU_INACTIVE_FILE].nr_scan, + zone->spanned_pages, + zone->present_pages); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pageset *pageset; + + pageset = zone_pcp(zone, i); + seq_printf(m, + "\n cpu: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, + pageset->pcp.count, + pageset->pcp.high, + pageset->pcp.batch); +#ifdef CONFIG_SMP + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); +#endif + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n start_pfn: %lu" + "\n inactive_ratio: %u", + zone_is_all_unreclaimable(zone), + zone->prev_priority, + zone->zone_start_pfn, + zone->inactive_ratio); + seq_putc(m, '\n'); +} + +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + walk_zones_in_node(m, pgdat, zoneinfo_show_print); + return 0; +} + +static const struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + +static int zoneinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &zoneinfo_op); +} + +static const struct file_operations proc_zoneinfo_file_operations = { + .open = zoneinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + unsigned long *v; +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *e; +#endif + int i; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + +#ifdef CONFIG_VM_EVENT_COUNTERS + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + sizeof(struct vm_event_state), GFP_KERNEL); +#else + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), + GFP_KERNEL); +#endif + m->private = v; + if (!v) + return ERR_PTR(-ENOMEM); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); +#ifdef CONFIG_VM_EVENT_COUNTERS + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; +#endif + return v + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +static const struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +static int vmstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vmstat_op); +} + +static const struct file_operations proc_vmstat_file_operations = { + .open = vmstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct delayed_work, vmstat_work); +int sysctl_stat_interval __read_mostly = HZ; + +static void vmstat_update(struct work_struct *w) +{ + refresh_cpu_vm_stats(smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(vmstat_work), + sysctl_stat_interval); +} + +static void __cpuinit start_cpu_timer(int cpu) +{ + struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); + + INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); + schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); +} + +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. + */ +static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + start_cpu_timer(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); + per_cpu(vmstat_work, cpu).work.func = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + start_cpu_timer(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + refresh_zone_stat_thresholds(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; +#endif + +static int __init setup_vmstat(void) +{ +#ifdef CONFIG_SMP + int cpu; + + refresh_zone_stat_thresholds(); + register_cpu_notifier(&vmstat_notifier); + + for_each_online_cpu(cpu) + start_cpu_timer(cpu); +#endif +#ifdef CONFIG_PROC_FS + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); + proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); + proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); +#endif + return 0; +} +module_init(setup_vmstat) |