diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-12-16 18:33:49 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-12-16 18:33:49 +0100 |
commit | ee1156c11a1121e118b0a7f2dec240f0d421b1fd (patch) | |
tree | b8771cc5a9758af9d7410fc519227c036c222130 /mm | |
parent | b9f8fcd55bbdb037e5332dbdb7b494f0b70861ac (diff) | |
parent | 8bea8672edfca7ec5f661cafb218f1205863b343 (diff) | |
download | op-kernel-dev-ee1156c11a1121e118b0a7f2dec240f0d421b1fd.zip op-kernel-dev-ee1156c11a1121e118b0a7f2dec240f0d421b1fd.tar.gz |
Merge branch 'linus' into sched/urgent
Conflicts:
kernel/sched_idletask.c
Merge reason: resolve the conflicts, pick up latest changes.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 16 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/allocpercpu.c | 177 | ||||
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/hugetlb.c | 551 | ||||
-rw-r--r-- | mm/internal.h | 23 | ||||
-rw-r--r-- | mm/ksm.c | 953 | ||||
-rw-r--r-- | mm/memcontrol.c | 7 | ||||
-rw-r--r-- | mm/memory-failure.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 29 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 16 | ||||
-rw-r--r-- | mm/mempolicy.c | 69 | ||||
-rw-r--r-- | mm/migrate.c | 133 | ||||
-rw-r--r-- | mm/mincore.c | 37 | ||||
-rw-r--r-- | mm/mlock.c | 45 | ||||
-rw-r--r-- | mm/mmap.c | 50 | ||||
-rw-r--r-- | mm/nommu.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 40 | ||||
-rw-r--r-- | mm/page_alloc.c | 4 | ||||
-rw-r--r-- | mm/page_io.c | 17 | ||||
-rw-r--r-- | mm/pagewalk.c | 32 | ||||
-rw-r--r-- | mm/percpu.c | 24 | ||||
-rw-r--r-- | mm/rmap.c | 350 | ||||
-rw-r--r-- | mm/shmem.c | 11 | ||||
-rw-r--r-- | mm/slab.c | 30 | ||||
-rw-r--r-- | mm/slub.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 847 | ||||
-rw-r--r-- | mm/vmalloc.c | 11 | ||||
-rw-r--r-- | mm/vmscan.c | 321 | ||||
-rw-r--r-- | mm/vmstat.c | 10 |
30 files changed, 2330 insertions, 1499 deletions
@@ -158,11 +158,13 @@ config PAGEFLAGS_EXTENDED # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS int - default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && !PA20 + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" # @@ -200,14 +202,6 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config HAVE_MLOCK - bool - default y if MMU=y - -config HAVE_MLOCKED_PAGE_BIT - bool - default y if HAVE_MLOCK=y - config MMU_NOTIFIER bool @@ -218,7 +212,7 @@ config KSM Enable Kernel Samepage Merging: KSM periodically scans those areas of an application's address space that an app has advised may be mergeable. When it finds pages of identical content, it replaces - the many instances by a single resident page with that content, so + the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. See Documentation/vm/ksm.txt for more information: KSM is inactive diff --git a/mm/Makefile b/mm/Makefile index ebf8490..82131d0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,11 +34,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o -else -obj-$(CONFIG_SMP) += allocpercpu.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c deleted file mode 100644 index df34cea..0000000 --- a/mm/allocpercpu.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * linux/mm/allocpercpu.c - * - * Separated from slab.c August 11, 2006 Christoph Lameter - */ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/bootmem.h> -#include <asm/sections.h> - -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - -/** - * percpu_depopulate - depopulate per-cpu data for given cpu - * @__pdata: per-cpu data to depopulate - * @cpu: depopulate per-cpu data for this cpu - * - * Depopulating per-cpu data for a cpu going offline would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - */ -static void percpu_depopulate(void *__pdata, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - - kfree(pdata->ptrs[cpu]); - pdata->ptrs[cpu] = NULL; -} - -/** - * percpu_depopulate_mask - depopulate per-cpu data for some cpu's - * @__pdata: per-cpu data to depopulate - * @mask: depopulate per-cpu data for cpu's selected through mask bits - */ -static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) -{ - int cpu; - for_each_cpu_mask_nr(cpu, *mask) - percpu_depopulate(__pdata, cpu); -} - -#define percpu_depopulate_mask(__pdata, mask) \ - __percpu_depopulate_mask((__pdata), &(mask)) - -/** - * percpu_populate - populate per-cpu data for given cpu - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @cpu: populate per-data for this cpu - * - * Populating per-cpu data for a cpu coming online would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - * Per-cpu object is populated with zeroed buffer. - */ -static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - int node = cpu_to_node(cpu); - - /* - * We should make sure each CPU gets private memory. - */ - size = roundup(size, cache_line_size()); - - BUG_ON(pdata->ptrs[cpu]); - if (node_online(node)) - pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); - else - pdata->ptrs[cpu] = kzalloc(size, gfp); - return pdata->ptrs[cpu]; -} - -/** - * percpu_populate_mask - populate per-cpu data for more cpu's - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-cpu data for cpu's selected through mask bits - * - * Per-cpu objects are populated with zeroed buffers. - */ -static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) -{ - cpumask_t populated; - int cpu; - - cpus_clear(populated); - for_each_cpu_mask_nr(cpu, *mask) - if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { - __percpu_depopulate_mask(__pdata, &populated); - return -ENOMEM; - } else - cpu_set(cpu, populated); - return 0; -} - -#define percpu_populate_mask(__pdata, size, gfp, mask) \ - __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) - -/** - * alloc_percpu - initial setup of per-cpu data - * @size: size of per-cpu object - * @align: alignment - * - * Allocate dynamic percpu area. Percpu objects are populated with - * zeroed buffers. - */ -void *__alloc_percpu(size_t size, size_t align) -{ - /* - * We allocate whole cache lines to avoid false sharing - */ - size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, GFP_KERNEL); - void *__pdata = __percpu_disguise(pdata); - - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - - if (unlikely(!pdata)) - return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, - &cpu_possible_map))) - return __pdata; - kfree(pdata); - return NULL; -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * free_percpu - final cleanup of per-cpu data - * @__pdata: object to clean up - * - * We simply clean up any per-cpu object left. No need for the client to - * track and specify through a bis mask which per-cpu objects are to free. - */ -void free_percpu(void *__pdata) -{ - if (unlikely(!__pdata)) - return; - __percpu_depopulate_mask(__pdata, cpu_possible_mask); - kfree(__percpu_disguise(__pdata)); -} -EXPORT_SYMBOL_GPL(free_percpu); - -/* - * Generic percpu area setup. - */ -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/bootmem.c b/mm/bootmem.c index d1dc23c..7d14868 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -432,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, - unsigned long step) +static unsigned long __init align_idx(struct bootmem_data *bdata, + unsigned long idx, unsigned long step) { unsigned long base = bdata->node_min_pfn; @@ -445,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, return ALIGN(base + idx, step) - base; } -static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, - unsigned long align) +static unsigned long __init align_off(struct bootmem_data *bdata, + unsigned long off, unsigned long align) { unsigned long base = PFN_PHYS(bdata->node_min_pfn); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b..65f38c2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,6 +24,7 @@ #include <asm/io.h> #include <linux/hugetlb.h> +#include <linux/node.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) } /* - * Use a helper variable to find the next node and then - * copy it back to next_nid_to_alloc afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. */ -static int hstate_next_node_to_alloc(struct hstate *h) +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->next_nid_to_alloc, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->next_nid_to_alloc = next_nid; - return next_nid; + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; } -static int alloc_fresh_huge_page(struct hstate *h) +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = h->next_nid_to_alloc; + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); next_nid = start_nid; do { page = alloc_fresh_huge_page_node(h, next_nid); - if (page) + if (page) { ret = 1; - next_nid = hstate_next_node_to_alloc(h); - } while (!page && next_nid != start_nid); + break; + } + next_nid = hstate_next_node_to_alloc(h, nodes_allowed); + } while (next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) } /* - * helper for free_pool_huge_page() - find next node - * from which to free a huge page + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. */ -static int hstate_next_node_to_free(struct hstate *h) +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->next_nid_to_free, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->next_nid_to_free = next_nid; - return next_nid; + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; } /* @@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) * balanced over allowed nodes. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + bool acct_surplus) { int start_nid; int next_nid; int ret = 0; - start_nid = h->next_nid_to_free; + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { @@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) } update_and_free_page(h, page); ret = 1; + break; } - next_nid = hstate_next_node_to_free(h); - } while (!ret && next_nid != start_nid); + next_nid = hstate_next_node_to_free(h, nodes_allowed); + } while (next_nid != start_nid); return ret; } @@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, /* * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the the frees across the - * on-line nodes for us and will handle the hstate accounting. + * evenly across all nodes with memory. Iterate across these nodes + * until we can no longer free unreserved surplus pages. This occurs + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - if (!free_pool_huge_page(h, 1)) + if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) break; } } @@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_online_map); + int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); while (nr_nodes) { void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->next_nid_to_alloc), + NODE_DATA(hstate_next_node_to_alloc(h, + &node_states[N_HIGH_MEMORY])), huge_page_size(h), huge_page_size(h), 0); - hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (h->order >= MAX_ORDER) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h)) + } else if (!alloc_fresh_huge_page(h, + &node_states[N_HIGH_MEMORY])) break; } h->max_huge_pages = i; @@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) } #ifdef CONFIG_HIGHMEM -static void try_to_free_low(struct hstate *h, unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { int i; if (h->order >= MAX_ORDER) return; - for (i = 0; i < MAX_NUMNODES; ++i) { + for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { @@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) } } #else -static inline void try_to_free_low(struct hstate *h, unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { } #endif @@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ -static int adjust_pool_surplus(struct hstate *h, int delta) +static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, + int delta) { int start_nid, next_nid; int ret = 0; @@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) - start_nid = h->next_nid_to_alloc; + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); else - start_nid = h->next_nid_to_free; + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { int nid = next_nid; if (delta < 0) { - next_nid = hstate_next_node_to_alloc(h); /* * To shrink on this node, there must be a surplus page */ - if (!h->surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) { + next_nid = hstate_next_node_to_alloc(h, + nodes_allowed); continue; + } } if (delta > 0) { - next_nid = hstate_next_node_to_free(h); /* * Surplus cannot exceed the total number of pages */ if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) + h->nr_huge_pages_node[nid]) { + next_nid = hstate_next_node_to_free(h, + nodes_allowed); continue; + } } h->surplus_huge_pages += delta; @@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; @@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, -1)) + if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } @@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; + /* Bail for signals. Probably ctrl-c from user */ + if (signal_pending(current)) + goto out; } /* @@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(h, min_count); + try_to_free_low(h, min_count, nodes_allowed); while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, 0)) + if (!free_pool_huge_page(h, nodes_allowed, 0)) break; } while (count < persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, 1)) + if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: @@ -1282,43 +1325,117 @@ out: static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; -static struct hstate *kobj_to_hstate(struct kobject *kobj) +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) { int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (hstate_kobjs[i] == kobj) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; return &hstates[i]; - BUG(); - return NULL; + } + + return kobj_to_node_hstate(kobj, nidp); } -static ssize_t nr_hugepages_show(struct kobject *kobj, +static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->nr_huge_pages); + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) { int err; - unsigned long input; - struct hstate *h = kobj_to_hstate(kobj); + int nid; + unsigned long count; + struct hstate *h; + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &input); + err = strict_strtoul(buf, 10, &count); if (err) return 0; - h->max_huge_pages = set_max_huge_pages(h, input); + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) { + /* + * global hstate attribute + */ + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + } else if (nodes_allowed) { + /* + * per node hstate attribute: adjust count to global, + * but restrict alloc/free to the specified node. + */ + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + init_nodemask_of_node(nodes_allowed, nid); + } else + nodes_allowed = &node_states[N_HIGH_MEMORY]; + + h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); - return count; + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + + return len; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, attr, buf, len); } HSTATE_ATTR(nr_hugepages); +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. + */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, attr, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, @@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, { int err; unsigned long input; - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); err = strict_strtoul(buf, 10, &input); if (err) @@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); static ssize_t free_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->free_huge_pages); + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); @@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); static ssize_t surplus_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->surplus_huge_pages); + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); @@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif NULL, }; @@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +static int __init hugetlb_sysfs_add_hstate(struct hstate *h, + struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; + int hi = h - hstates; - hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, - hugepages_kobj); - if (!hstate_kobjs[h - hstates]) + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) return -ENOMEM; - retval = sysfs_create_group(hstate_kobjs[h - hstates], - &hstate_attr_group); + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); if (retval) - kobject_put(hstate_kobjs[h - hstates]); + kobject_put(hstate_kobjs[hi]); return retval; } @@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void) return; for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h); + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); if (err) printk(KERN_ERR "Hugetlb: Unable to add hstate %s", h->name); } } +#ifdef CONFIG_NUMA + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node sysdevs in node_devices[] using a parallel array. The array + * index of a node sysdev or _hstate == node id. + * This is here to avoid any static dependency of the node sysdev driver, in + * the base kernel, on the hugetlb module. + */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node sysdevs + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * Returns node id via non-NULL nidp. + */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node sysdev. + * No-op if no hstate attributes attached. + */ +void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) + if (nhs->hstate_kobjs[h - hstates]) { + kobject_put(nhs->hstate_kobjs[h - hstates]); + nhs->hstate_kobjs[h - hstates] = NULL; + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + +/* + * hugetlb module exit: unregister hstate attributes from node sysdevs + * that have them. + */ +static void hugetlb_unregister_all_nodes(void) +{ + int nid; + + /* + * disable node sysdev registrations. + */ + register_hugetlbfs_with_node(NULL, NULL); + + /* + * remove hstate attributes from any nodes that have them. + */ + for (nid = 0; nid < nr_node_ids; nid++) + hugetlb_unregister_node(&node_devices[nid]); +} + +/* + * Register hstate attributes for a single node sysdev. + * No-op if attributes already registered. + */ +void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + int err; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->sysdev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + printk(KERN_ERR "Hugetlb: Unable to add hstate %s" + " for node %d\n", + h->name, node->sysdev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * sysdevs of nodes that have memory. All on-line nodes should have + * registered their associated sysdev by this time. + */ +static void hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + struct node *node = &node_devices[nid]; + if (node->sysdev.id == nid) + hugetlb_register_node(node); + } + + /* + * Let the node sysdev driver know we're here so it can + * [un]register hstate attributes on node hotplug. + */ + register_hugetlbfs_with_node(hugetlb_register_node, + hugetlb_unregister_node); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_unregister_all_nodes(void) { } + +static void hugetlb_register_all_nodes(void) { } + +#endif + static void __exit hugetlb_exit(void) { struct hstate *h; + hugetlb_unregister_all_nodes(); + for_each_hstate(h) { kobject_put(hstate_kobjs[h - hstates]); } @@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void) hugetlb_sysfs_init(); + hugetlb_register_all_nodes(); + return 0; } module_init(hugetlb_init); @@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->next_nid_to_alloc = first_node(node_online_map); - h->next_nid_to_free = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); + h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) } #ifdef CONFIG_SYSCTL -int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; @@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, buffer, length, ppos); - if (write) - h->max_huge_pages = set_max_huge_pages(h, tmp); + if (write) { + NODEMASK_ALLOC(nodemask_t, nodes_allowed, + GFP_KERNEL | __GFP_NORETRY); + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); + + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + } return 0; } +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + int hugetlb_treat_movable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); + /* + * Take the mapping lock for the duration of the table walk. As + * this mapping should be shared between all the VMAs, + * __unmap_hugepage_range() is called as the lock is already held + */ + spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) - unmap_hugepage_range(iter_vma, + __unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page); } + spin_unlock(&mapping->i_mmap_lock); return 1; } @@ -1959,6 +2303,9 @@ retry_avoidcopy: outside_reserve = 1; page_cache_get(old_page); + + /* Drop page_table_lock as buddy allocator may be called */ + spin_unlock(&mm->page_table_lock); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -1976,19 +2323,25 @@ retry_avoidcopy: if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); + spin_lock(&mm->page_table_lock); goto retry_avoidcopy; } WARN_ON_ONCE(1); } + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return -PTR_ERR(new_page); } - spin_unlock(&mm->page_table_lock); copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - spin_lock(&mm->page_table_lock); + /* + * Retake the page_table_lock to check for racing updates + * before the page tables are altered + */ + spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2..4fe67a1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,7 +63,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } -#ifdef CONFIG_HAVE_MLOCK +#ifdef CONFIG_MMU extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -72,21 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } -#endif - -/* - * unevictable_migrate_page() called only from migrate_page_copy() to - * migrate unevictable flag to new page. - * Note that the old page has been isolated from the LRU lists at this - * point so we don't need to worry about LRU statistics. - */ -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ - if (TestClearPageUnevictable(old)) - SetPageUnevictable(new); -} -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. @@ -107,9 +93,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) } /* - * must be called with vma's mmap_sem held for read, and page locked. + * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). This can be useful in a situation where @@ -144,7 +131,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ +#else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -153,7 +140,7 @@ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ +#endif /* !CONFIG_MMU */ /* * Return the mem_map entry representing the 'offset' subpage within @@ -29,11 +29,13 @@ #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <asm/tlbflush.h> +#include "internal.h" /* * A few notes about the KSM scanning process, @@ -79,13 +81,13 @@ * struct mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list * @mm_list: link into the mm_slots list, rooted in ksm_mm_head - * @rmap_list: head for this mm_slot's list of rmap_items + * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct mm_slot { struct hlist_node link; struct list_head mm_list; - struct list_head rmap_list; + struct rmap_item *rmap_list; struct mm_struct *mm; }; @@ -93,7 +95,7 @@ struct mm_slot { * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned - * @rmap_item: the current rmap that we are scanning inside the rmap_list + * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. @@ -101,37 +103,51 @@ struct mm_slot { struct ksm_scan { struct mm_slot *mm_slot; unsigned long address; - struct rmap_item *rmap_item; + struct rmap_item **rmap_list; unsigned long seqnr; }; /** + * struct stable_node - node of the stable rbtree + * @node: rb node of this ksm page in the stable tree + * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page + */ +struct stable_node { + struct rb_node node; + struct hlist_head hlist; + unsigned long kpfn; +}; + +/** * struct rmap_item - reverse mapping item for virtual addresses - * @link: link into mm_slot's rmap_list (rmap_list is per mm) + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address - * @node: rb_node of this rmap_item in either unstable or stable tree - * @next: next rmap_item hanging off the same node of the stable tree - * @prev: previous rmap_item hanging off the same node of the stable tree + * @node: rb node of this rmap_item in the unstable tree + * @head: pointer to stable_node heading this list in the stable tree + * @hlist: link into hlist of rmap_items hanging off that stable_node */ struct rmap_item { - struct list_head link; + struct rmap_item *rmap_list; + struct anon_vma *anon_vma; /* when stable */ struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ + unsigned int oldchecksum; /* when unstable */ union { - unsigned int oldchecksum; /* when unstable */ - struct rmap_item *next; /* when stable */ - }; - union { - struct rb_node node; /* when tree node */ - struct rmap_item *prev; /* in stable list */ + struct rb_node node; /* when node of unstable tree */ + struct { /* when listed from stable tree */ + struct stable_node *head; + struct hlist_node hlist; + }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ -#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ -#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ +#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ +#define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root root_stable_tree = RB_ROOT; @@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { }; static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* The number of nodes in the stable tree */ @@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; -/* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; - /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -190,13 +204,19 @@ static int __init ksm_slab_init(void) if (!rmap_item_cache) goto out; + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + if (!stable_node_cache) + goto out_free1; + mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) - goto out_free; + goto out_free2; return 0; -out_free: +out_free2: + kmem_cache_destroy(stable_node_cache); +out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; @@ -205,6 +225,7 @@ out: static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } @@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) kmem_cache_free(rmap_item_cache, rmap_item); } +static inline struct stable_node *alloc_stable_node(void) +{ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); +} + +static inline void free_stable_node(struct stable_node *stable_node) +{ + kmem_cache_free(stable_node_cache, stable_node); +} + static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) % MM_SLOTS_HASH_HEADS]; mm_slot->mm = mm; - INIT_LIST_HEAD(&mm_slot->rmap_list); hlist_add_head(&mm_slot->link, bucket); } @@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + atomic_inc(&anon_vma->ksm_refcount); +} + +static void drop_anon_vma(struct rmap_item *rmap_item) +{ + struct anon_vma *anon_vma = rmap_item->anon_vma; + + if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } +} + /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } -static void break_cow(struct mm_struct *mm, unsigned long addr) +static void break_cow(struct rmap_item *rmap_item) { + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; struct vm_area_struct *vma; + /* + * It is not an accident that whenever we want to break COW + * to undo, we also need to drop a reference to the anon_vma. + */ + drop_anon_vma(rmap_item); + down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) goto out; @@ -403,21 +460,77 @@ out: page = NULL; return page; } +static void remove_node_from_stable_tree(struct stable_node *stable_node) +{ + struct rmap_item *rmap_item; + struct hlist_node *hlist; + + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + if (rmap_item->hlist.next) + ksm_pages_sharing--; + else + ksm_pages_shared--; + drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + cond_resched(); + } + + rb_erase(&stable_node->node, &root_stable_tree); + free_stable_node(stable_node); +} + /* - * get_ksm_page: checks if the page at the virtual address in rmap_item - * is still PageKsm, in which case we can trust the content of the page, - * and it returns the gotten page; but NULL if the page has been zapped. + * get_ksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. + * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, + * but this is different - made simpler by ksm_thread_mutex being held, but + * interesting for assuming that no other use of the struct page could ever + * put our expected_mapping into page->mapping (or a field of the union which + * coincides with page->mapping). The RCU calls are not for KSM at all, but + * to keep the page_count protocol described with page_cache_get_speculative. + * + * Note: it is possible that get_ksm_page() will return NULL one moment, + * then page the next, if the page is in between page_freeze_refs() and + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page + * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct rmap_item *rmap_item) +static struct page *get_ksm_page(struct stable_node *stable_node) { struct page *page; - - page = get_mergeable_page(rmap_item); - if (page && !PageKsm(page)) { + void *expected_mapping; + + page = pfn_to_page(stable_node->kpfn); + expected_mapping = (void *)stable_node + + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + rcu_read_lock(); + if (page->mapping != expected_mapping) + goto stale; + if (!get_page_unless_zero(page)) + goto stale; + if (page->mapping != expected_mapping) { put_page(page); - page = NULL; + goto stale; } + rcu_read_unlock(); return page; +stale: + rcu_read_unlock(); + remove_node_from_stable_tree(stable_node); + return NULL; } /* @@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) */ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) { - if (in_stable_tree(rmap_item)) { - struct rmap_item *next_item = rmap_item->next; - - if (rmap_item->address & NODE_FLAG) { - if (next_item) { - rb_replace_node(&rmap_item->node, - &next_item->node, - &root_stable_tree); - next_item->address |= NODE_FLAG; - ksm_pages_sharing--; - } else { - rb_erase(&rmap_item->node, &root_stable_tree); - ksm_pages_shared--; - } - } else { - struct rmap_item *prev_item = rmap_item->prev; + if (rmap_item->address & STABLE_FLAG) { + struct stable_node *stable_node; + struct page *page; - BUG_ON(prev_item->next != rmap_item); - prev_item->next = next_item; - if (next_item) { - BUG_ON(next_item->prev != rmap_item); - next_item->prev = rmap_item->prev; - } + stable_node = rmap_item->head; + page = get_ksm_page(stable_node); + if (!page) + goto out; + + lock_page(page); + hlist_del(&rmap_item->hlist); + unlock_page(page); + put_page(page); + + if (stable_node->hlist.first) ksm_pages_sharing--; - } + else + ksm_pages_shared--; - rmap_item->next = NULL; + drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; - } else if (rmap_item->address & NODE_FLAG) { + } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because @@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared--; + rmap_item->address &= PAGE_MASK; } - - rmap_item->address &= PAGE_MASK; - +out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct mm_slot *mm_slot, - struct list_head *cur) + struct rmap_item **rmap_list) { - struct rmap_item *rmap_item; - - while (cur != &mm_slot->rmap_list) { - rmap_item = list_entry(cur, struct rmap_item, link); - cur = cur->next; + while (*rmap_list) { + struct rmap_item *rmap_item = *rmap_list; + *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); - list_del(&rmap_item->link); free_rmap_item(rmap_item); } } @@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } - remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); + remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -646,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * Check that no O_DIRECT or similar I/O is in progress on the * page */ - if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { + if (page_mapcount(page) + 1 + swapped != page_count(page)) { set_pte_at_notify(mm, addr, ptep, entry); goto out_unlock; } @@ -664,15 +768,15 @@ out: /** * replace_page - replace page in vma by new ksm page - * @vma: vma that holds the pte pointing to oldpage - * @oldpage: the page we are replacing by newpage - * @newpage: the ksm page we replace oldpage by + * @vma: vma that holds the pte pointing to page + * @page: the page we are replacing by kpage + * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ -static int replace_page(struct vm_area_struct *vma, struct page *oldpage, - struct page *newpage, pte_t orig_pte) +static int replace_page(struct vm_area_struct *vma, struct page *page, + struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, pte_t *ptep; spinlock_t *ptl; unsigned long addr; - pgprot_t prot; int err = -EFAULT; - prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); - - addr = page_address_in_vma(oldpage, vma); + addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; @@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, goto out; } - get_page(newpage); - page_add_ksm_rmap(newpage); + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); + set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(oldpage); - put_page(oldpage); + page_remove_rmap(page); + put_page(page); pte_unmap_unlock(ptep, ptl); err = 0; @@ -726,32 +827,27 @@ out: /* * try_to_merge_one_page - take two pages and merge them into one - * @vma: the vma that hold the pte pointing into oldpage - * @oldpage: the page that we want to replace with newpage - * @newpage: the page that we want to map instead of oldpage - * - * Note: - * oldpage should be a PageAnon page, while newpage should be a PageKsm page, - * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. + * @vma: the vma that holds the pte pointing to page + * @page: the PageAnon page that we want to replace with kpage + * @kpage: the PageKsm page that we want to map instead of page, + * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, - struct page *oldpage, - struct page *newpage) + struct page *page, struct page *kpage) { pte_t orig_pte = __pte(0); int err = -EFAULT; + if (page == kpage) /* ksm page forked */ + return 0; + if (!(vma->vm_flags & VM_MERGEABLE)) goto out; - - if (!PageAnon(oldpage)) + if (!PageAnon(page)) goto out; - get_page(newpage); - get_page(oldpage); - /* * We need the page lock to read a stable PageSwapCache in * write_protect_page(). We use trylock_page() instead of @@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * prefer to continue scanning and merging different pages, * then come back to this page when it is unlocked. */ - if (!trylock_page(oldpage)) - goto out_putpage; + if (!trylock_page(page)) + goto out; /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, oldpage, &orig_pte)) { - unlock_page(oldpage); - goto out_putpage; + if (write_protect_page(vma, page, &orig_pte) == 0) { + if (!kpage) { + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + set_page_stable_node(page, NULL); + mark_page_accessed(page); + err = 0; + } else if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); } - unlock_page(oldpage); - if (pages_identical(oldpage, newpage)) - err = replace_page(vma, oldpage, newpage, orig_pte); + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { + munlock_vma_page(page); + if (!PageMlocked(kpage)) { + unlock_page(page); + lock_page(kpage); + mlock_vma_page(kpage); + page = kpage; /* for final unlock */ + } + } -out_putpage: - put_page(oldpage); - put_page(newpage); + unlock_page(page); out: return err; } @@ -786,26 +895,31 @@ out: /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. */ -static int try_to_merge_with_ksm_page(struct mm_struct *mm1, - unsigned long addr1, - struct page *page1, - struct page *kpage) +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, + struct page *page, struct page *kpage) { + struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; - down_read(&mm1->mmap_sem); - if (ksm_test_exit(mm1)) + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, rmap_item->address); + if (!vma || vma->vm_start > rmap_item->address) goto out; - vma = find_vma(mm1, addr1); - if (!vma || vma->vm_start > addr1) + err = try_to_merge_one_page(vma, page, kpage); + if (err) goto out; - err = try_to_merge_one_page(vma, page1, kpage); + /* Must get reference to anon_vma while still holding mmap_sem */ + hold_anon_vma(rmap_item, vma->anon_vma); out: - up_read(&mm1->mmap_sem); + up_read(&mm->mmap_sem); return err; } @@ -813,109 +927,73 @@ out: * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * - * This function returns 0 if we successfully mapped two identical pages - * into one page, -EFAULT otherwise. + * This function returns the kpage if we successfully merged two identical + * pages into one ksm page, NULL otherwise. * - * Note that this function allocates a new kernel page: if one of the pages + * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, - struct page *page1, struct mm_struct *mm2, - unsigned long addr2, struct page *page2) +static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, + struct page *page, + struct rmap_item *tree_rmap_item, + struct page *tree_page) { - struct vm_area_struct *vma; - struct page *kpage; - int err = -EFAULT; - - /* - * The number of nodes in the stable tree - * is the number of kernel pages that we hold. - */ - if (ksm_max_kernel_pages && - ksm_max_kernel_pages <= ksm_pages_shared) - return err; - - kpage = alloc_page(GFP_HIGHUSER); - if (!kpage) - return err; - - down_read(&mm1->mmap_sem); - if (ksm_test_exit(mm1)) { - up_read(&mm1->mmap_sem); - goto out; - } - vma = find_vma(mm1, addr1); - if (!vma || vma->vm_start > addr1) { - up_read(&mm1->mmap_sem); - goto out; - } - - copy_user_highpage(kpage, page1, addr1, vma); - err = try_to_merge_one_page(vma, page1, kpage); - up_read(&mm1->mmap_sem); + int err; + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { - err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); + err = try_to_merge_with_ksm_page(tree_rmap_item, + tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) - break_cow(mm1, addr1); + break_cow(rmap_item); } -out: - put_page(kpage); - return err; + return err ? NULL : page; } /* - * stable_tree_search - search page inside the stable tree - * @page: the page that we are searching identical pages to. - * @page2: pointer into identical page that we are holding inside the stable - * tree that we have found. - * @rmap_item: the reverse mapping item + * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * - * This function return rmap_item pointer to the identical item if found, + * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct rmap_item *stable_tree_search(struct page *page, - struct page **page2, - struct rmap_item *rmap_item) +static struct page *stable_tree_search(struct page *page) { struct rb_node *node = root_stable_tree.rb_node; + struct stable_node *stable_node; + + stable_node = page_stable_node(page); + if (stable_node) { /* ksm page forked */ + get_page(page); + return page; + } while (node) { - struct rmap_item *tree_rmap_item, *next_rmap_item; + struct page *tree_page; int ret; - tree_rmap_item = rb_entry(node, struct rmap_item, node); - while (tree_rmap_item) { - BUG_ON(!in_stable_tree(tree_rmap_item)); - cond_resched(); - page2[0] = get_ksm_page(tree_rmap_item); - if (page2[0]) - break; - next_rmap_item = tree_rmap_item->next; - remove_rmap_item_from_tree(tree_rmap_item); - tree_rmap_item = next_rmap_item; - } - if (!tree_rmap_item) + cond_resched(); + stable_node = rb_entry(node, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) return NULL; - ret = memcmp_pages(page, page2[0]); + ret = memcmp_pages(page, tree_page); if (ret < 0) { - put_page(page2[0]); + put_page(tree_page); node = node->rb_left; } else if (ret > 0) { - put_page(page2[0]); + put_page(tree_page); node = node->rb_right; - } else { - return tree_rmap_item; - } + } else + return tree_page; } return NULL; @@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, * stable_tree_insert - insert rmap_item pointing to new ksm page * into the stable tree. * - * @page: the page that we are searching identical page to inside the stable - * tree. - * @rmap_item: pointer to the reverse mapping item. - * - * This function returns rmap_item if success, NULL otherwise. + * This function returns the stable tree node just allocated on success, + * NULL otherwise. */ -static struct rmap_item *stable_tree_insert(struct page *page, - struct rmap_item *rmap_item) +static struct stable_node *stable_tree_insert(struct page *kpage) { struct rb_node **new = &root_stable_tree.rb_node; struct rb_node *parent = NULL; + struct stable_node *stable_node; while (*new) { - struct rmap_item *tree_rmap_item, *next_rmap_item; struct page *tree_page; int ret; - tree_rmap_item = rb_entry(*new, struct rmap_item, node); - while (tree_rmap_item) { - BUG_ON(!in_stable_tree(tree_rmap_item)); - cond_resched(); - tree_page = get_ksm_page(tree_rmap_item); - if (tree_page) - break; - next_rmap_item = tree_rmap_item->next; - remove_rmap_item_from_tree(tree_rmap_item); - tree_rmap_item = next_rmap_item; - } - if (!tree_rmap_item) + cond_resched(); + stable_node = rb_entry(*new, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) return NULL; - ret = memcmp_pages(page, tree_page); + ret = memcmp_pages(kpage, tree_page); put_page(tree_page); parent = *new; @@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, } } - rmap_item->address |= NODE_FLAG | STABLE_FLAG; - rmap_item->next = NULL; - rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, &root_stable_tree); + stable_node = alloc_stable_node(); + if (!stable_node) + return NULL; - ksm_pages_shared++; - return rmap_item; + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, &root_stable_tree); + + INIT_HLIST_HEAD(&stable_node->hlist); + + stable_node->kpfn = page_to_pfn(kpage); + set_page_stable_node(kpage, stable_node); + + return stable_node; } /* - * unstable_tree_search_insert - search and insert items into the unstable tree. - * - * @page: the page that we are going to search for identical page or to insert - * into the unstable tree - * @page2: pointer into identical page that was found inside the unstable tree - * @rmap_item: the reverse mapping item of page + * unstable_tree_search_insert - search for identical page, + * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the @@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ -static struct rmap_item *unstable_tree_search_insert(struct page *page, - struct page **page2, - struct rmap_item *rmap_item) +static +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, + struct page *page, + struct page **tree_pagep) + { struct rb_node **new = &root_unstable_tree.rb_node; struct rb_node *parent = NULL; while (*new) { struct rmap_item *tree_rmap_item; + struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); - page2[0] = get_mergeable_page(tree_rmap_item); - if (!page2[0]) + tree_page = get_mergeable_page(tree_rmap_item); + if (!tree_page) return NULL; /* - * Don't substitute an unswappable ksm page - * just for one good swappable forked page. + * Don't substitute a ksm page for a forked page. */ - if (page == page2[0]) { - put_page(page2[0]); + if (page == tree_page) { + put_page(tree_page); return NULL; } - ret = memcmp_pages(page, page2[0]); + ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { - put_page(page2[0]); + put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { - put_page(page2[0]); + put_page(tree_page); new = &parent->rb_right; } else { + *tree_pagep = tree_page; return tree_rmap_item; } } - rmap_item->address |= NODE_FLAG; + rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, &root_unstable_tree); @@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct rmap_item *tree_rmap_item) + struct stable_node *stable_node) { - rmap_item->next = tree_rmap_item->next; - rmap_item->prev = tree_rmap_item; - - if (tree_rmap_item->next) - tree_rmap_item->next->prev = rmap_item; - - tree_rmap_item->next = rmap_item; + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; + hlist_add_head(&rmap_item->hlist, &stable_node->hlist); - ksm_pages_sharing++; + if (rmap_item->hlist.next) + ksm_pages_sharing++; + else + ksm_pages_shared++; } /* @@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) { - struct page *page2[1]; struct rmap_item *tree_rmap_item; + struct page *tree_page = NULL; + struct stable_node *stable_node; + struct page *kpage; unsigned int checksum; int err; - if (in_stable_tree(rmap_item)) - remove_rmap_item_from_tree(rmap_item); + remove_rmap_item_from_tree(rmap_item); /* We first start with searching the page inside the stable tree */ - tree_rmap_item = stable_tree_search(page, page2, rmap_item); - if (tree_rmap_item) { - if (page == page2[0]) /* forked */ - err = 0; - else - err = try_to_merge_with_ksm_page(rmap_item->mm, - rmap_item->address, - page, page2[0]); - put_page(page2[0]); - + kpage = stable_tree_search(page); + if (kpage) { + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ - stable_tree_append(rmap_item, tree_rmap_item); + lock_page(kpage); + stable_tree_append(rmap_item, page_stable_node(kpage)); + unlock_page(kpage); } + put_page(kpage); return; } /* - * A ksm page might have got here by fork, but its other - * references have already been removed from the stable tree. - * Or it might be left over from a break_ksm which failed - * when the mem_cgroup had reached its limit: try again now. - */ - if (PageKsm(page)) - break_cow(rmap_item->mm, rmap_item->address); - - /* - * In case the hash value of the page was changed from the last time we - * have calculated it, this page to be changed frequely, therefore we - * don't want to insert it to the unstable tree, and we don't want to - * waste our time to search if there is something identical to it there. + * If the hash value of the page has changed from the last time + * we calculated it, this page is changing frequently: therefore we + * don't want to insert it in the unstable tree, and we don't want + * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { @@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) return; } - tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); + tree_rmap_item = + unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { - err = try_to_merge_two_pages(rmap_item->mm, - rmap_item->address, page, - tree_rmap_item->mm, - tree_rmap_item->address, page2[0]); + kpage = try_to_merge_two_pages(rmap_item, page, + tree_rmap_item, tree_page); + put_page(tree_page); /* * As soon as we merge this page, we want to remove the * rmap_item of the page we have merged with from the unstable * tree, and insert it instead as new node in the stable tree. */ - if (!err) { - rb_erase(&tree_rmap_item->node, &root_unstable_tree); - tree_rmap_item->address &= ~NODE_FLAG; - ksm_pages_unshared--; + if (kpage) { + remove_rmap_item_from_tree(tree_rmap_item); + + lock_page(kpage); + stable_node = stable_tree_insert(kpage); + if (stable_node) { + stable_tree_append(tree_rmap_item, stable_node); + stable_tree_append(rmap_item, stable_node); + } + unlock_page(kpage); /* * If we fail to insert the page into the stable tree, @@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ - if (stable_tree_insert(page2[0], tree_rmap_item)) - stable_tree_append(rmap_item, tree_rmap_item); - else { - break_cow(tree_rmap_item->mm, - tree_rmap_item->address); - break_cow(rmap_item->mm, rmap_item->address); + if (!stable_node) { + break_cow(tree_rmap_item); + break_cow(rmap_item); } } - - put_page(page2[0]); } } static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, - struct list_head *cur, + struct rmap_item **rmap_list, unsigned long addr) { struct rmap_item *rmap_item; - while (cur != &mm_slot->rmap_list) { - rmap_item = list_entry(cur, struct rmap_item, link); - if ((rmap_item->address & PAGE_MASK) == addr) { - if (!in_stable_tree(rmap_item)) - remove_rmap_item_from_tree(rmap_item); + while (*rmap_list) { + rmap_item = *rmap_list; + if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; - } if (rmap_item->address > addr) break; - cur = cur->next; + *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); - list_del(&rmap_item->link); free_rmap_item(rmap_item); } @@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, /* It has already been zeroed */ rmap_item->mm = mm_slot->mm; rmap_item->address = addr; - list_add_tail(&rmap_item->link, cur); + rmap_item->rmap_list = *rmap_list; + *rmap_list = rmap_item; } return rmap_item; } @@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) spin_unlock(&ksm_mmlist_lock); next_mm: ksm_scan.address = 0; - ksm_scan.rmap_item = list_entry(&slot->rmap_list, - struct rmap_item, link); + ksm_scan.rmap_list = &slot->rmap_list; } mm = slot->mm; @@ -1244,10 +1298,10 @@ next_mm: flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, - ksm_scan.rmap_item->link.next, - ksm_scan.address); + ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { - ksm_scan.rmap_item = rmap_item; + ksm_scan.rmap_list = + &rmap_item->rmap_list; ksm_scan.address += PAGE_SIZE; } else put_page(*page); @@ -1263,14 +1317,13 @@ next_mm: if (ksm_test_exit(mm)) { ksm_scan.address = 0; - ksm_scan.rmap_item = list_entry(&slot->rmap_list, - struct rmap_item, link); + ksm_scan.rmap_list = &slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ - remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); + remove_trailing_rmap_items(slot, ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, @@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) return; if (!PageKsm(page) || !in_stable_tree(rmap_item)) cmp_and_merge_page(page, rmap_item); - else if (page_mapcount(page) == 1) { - /* - * Replace now-unshared ksm page by ordinary page. - */ - break_cow(rmap_item->mm, rmap_item->address); - remove_rmap_item_from_tree(rmap_item); - rmap_item->oldchecksum = calc_checksum(page); - } put_page(page); } } @@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) return 0; /* just ignore the advice */ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { @@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { - if (list_empty(&mm_slot->rmap_list)) { + if (!mm_slot->rmap_list) { hlist_del(&mm_slot->link); list_del(&mm_slot->mm_list); easy_to_free = 1; @@ -1473,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm) } } +struct page *ksm_does_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct page *new_page; + + unlock_page(page); /* any racers will COW it, not modify it */ + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page) { + copy_user_highpage(new_page, page, address, vma); + + SetPageDirty(new_page); + __SetPageUptodate(new_page); + SetPageSwapBacked(new_page); + __set_page_locked(new_page); + + if (page_evictable(new_page, vma)) + lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); + else + add_page_to_unevictable_list(new_page); + } + + page_cache_release(page); + return new_page; +} + +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, + unsigned long *vm_flags) +{ + struct stable_node *stable_node; + struct rmap_item *rmap_item; + struct hlist_node *hlist; + unsigned int mapcount = page_mapcount(page); + int referenced = 0; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return 0; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + continue; + + referenced += page_referenced_one(page, vma, + rmap_item->address, &mapcount, vm_flags); + if (!search_new_forks || !mapcount) + break; + } + spin_unlock(&anon_vma->lock); + if (!mapcount) + goto out; + } + if (!search_new_forks++) + goto again; +out: + return referenced; +} + +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return SWAP_FAIL; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = try_to_unmap_one(page, vma, + rmap_item->address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = rmap_one(page, vma, rmap_item->address, arg); + if (ret != SWAP_AGAIN) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON(!PageLocked(oldpage)); + VM_BUG_ON(!PageLocked(newpage)); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int ksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + */ + mutex_lock(&ksm_thread_mutex); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&ksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. @@ -1551,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, - * breaking COW to free the unswappable pages_shared (but leaves - * mm_slots on the list for when ksmd may be set running again). + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); @@ -1577,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); -static ssize_t max_kernel_pages_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long nr_pages; - - err = strict_strtoul(buf, 10, &nr_pages); - if (err) - return -EINVAL; - - ksm_max_kernel_pages = nr_pages; - - return count; -} - -static ssize_t max_kernel_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", ksm_max_kernel_pages); -} -KSM_ATTR(max_kernel_pages); - static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1649,7 +1914,6 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, - &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, @@ -1669,8 +1933,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_max_kernel_pages = totalram_pages / 4; - err = ksm_slab_init(); if (err) goto out; @@ -1698,6 +1960,13 @@ static int __init ksm_init(void) #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes ksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. + */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif return 0; out_free2: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c31a310..e0c2066 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1737,11 +1737,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, goto charge_cur_mm; /* * A racing thread's fault, or swapoff, may have already updated - * the pte, and even removed page from swap cache: return success - * to go on to do_swap_page()'s pte_same() test, which should fail. + * the pte, and even removed page from swap cache: in those cases + * do_swap_page()'s pte_same() test will fail; but there's also a + * KSM case which does need to charge the page. */ if (!PageSwapCache(page)) - return 0; + goto charge_cur_mm; mem = try_get_mem_cgroup_from_swapcache(page); if (!mem) goto charge_cur_mm; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1ac49fe..50d4f8d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -582,10 +582,8 @@ static struct page_state { { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, { unevict, unevict, "unevictable LRU", me_pagecache_clean}, -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, { mlock, mlock, "mlocked LRU", me_pagecache_clean }, -#endif { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd..a54b2c4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -572,7 +572,7 @@ out: * covered by this vma. */ -static inline void +static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) @@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) + return entry.val; + /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[2]; + swp_entry_t entry = (swp_entry_t){0}; again: rss[1] = rss[0] = 0; @@ -674,7 +678,10 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss); + if (entry.val) + break; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -684,6 +691,12 @@ again: add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); + + if (entry.val) { + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + return -ENOMEM; + progress = 0; + } if (addr != end) goto again; return 0; @@ -2514,7 +2527,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_HWPOISON; } else { print_bad_pte(vma, address, orig_pte, NULL); - ret = VM_FAULT_OOM; + ret = VM_FAULT_SIGBUS; } goto out; } @@ -2548,6 +2561,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + page = ksm_might_need_to_copy(page, vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; goto out_page; @@ -2910,7 +2929,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Page table corrupted: show pte and kill process. */ print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; } pgoff = pte_to_pgoff(orig_pte); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2047465..030ce8a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -27,6 +27,7 @@ #include <linux/page-isolation.h> #include <linux/pfn.h> #include <linux/suspend.h> +#include <linux/mm_inline.h> #include <asm/tlbflush.h> @@ -71,7 +72,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) atomic_inc(&page->_count); } -void put_page_bootmem(struct page *page) +/* reference to __meminit __free_pages_bootmem is valid + * so use __ref to tell modpost not to generate a warning */ +void __ref put_page_bootmem(struct page *page) { int type; @@ -672,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!ret) { /* Success */ list_add_tail(&page->lru, &source); move_pages--; + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } else { /* Becasue we don't have big zone->lock. we should check this again here. */ @@ -694,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (list_empty(&source)) goto out; /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); out: return ret; @@ -747,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -int offline_pages(unsigned long start_pfn, +static int offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -849,6 +855,10 @@ repeat: setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); + if (!node_present_pages(node)) { + node_clear_state(node, N_HIGH_MEMORY); + kswapd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4545d59..290fb5b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -85,10 +85,12 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> +#include <linux/mm_inline.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; /* - * The check for PageReserved here is important to avoid - * handling zero pages and other pages that may have been - * marked special by the system. - * - * If the PageReserved would not be checked here then f.e. - * the location of the zero page could have an influence - * on MPOL_MF_STRICT, zero pages would be counted for - * the per node stats, and there would be useless attempts - * to put zero pages on the migration list. + * vm_normal_page() filters out zero pages, but there might + * still be PageReserved pages to skip, perhaps in a VDSO. + * And we cannot move PageKsm pages sensibly or safely yet. */ - if (PageReserved(page)) + if (PageReserved(page) || PageKsm(page)) continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) - err = migrate_pages(&pagelist, new_node_page, dest); + err = migrate_pages(&pagelist, new_node_page, dest, 0); return err; } @@ -1053,7 +1051,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma); + (unsigned long)vma, 0); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; @@ -1565,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, } return zl; } + +/* + * init_nodemask_of_mempolicy + * + * If the current task's mempolicy is "default" [NULL], return 'false' + * to indicate default policy. Otherwise, extract the policy nodemask + * for 'bind' or 'interleave' policy into the argument nodemask, or + * initialize the argument nodemask to contain the single node for + * 'preferred' or 'local' policy and return 'true' to indicate presence + * of non-default mempolicy. + * + * We don't bother with reference counting the mempolicy [mpol_get/put] + * because the current task is examining it's own mempolicy and a task's + * mempolicy is only ever changed by the task itself. + * + * N.B., it is the caller's responsibility to free a returned nodemask. + */ +bool init_nodemask_of_mempolicy(nodemask_t *mask) +{ + struct mempolicy *mempolicy; + int nid; + + if (!(mask && current->mempolicy)) + return false; + + mempolicy = current->mempolicy; + switch (mempolicy->mode) { + case MPOL_PREFERRED: + if (mempolicy->flags & MPOL_F_LOCAL) + nid = numa_node_id(); + else + nid = mempolicy->v.preferred_node; + init_nodemask_of_node(mask, nid); + break; + + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *mask = mempolicy->v.nodes; + break; + + default: + BUG(); + } + + return true; +} #endif /* Allocate a page in interleaved policy. diff --git a/mm/migrate.c b/mm/migrate.c index 7dbcb22..efddbf0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -21,6 +21,7 @@ #include <linux/mm_inline.h> #include <linux/nsproxy.h> #include <linux/pagevec.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> @@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static void remove_migration_pte(struct vm_area_struct *vma, - struct page *old, struct page *new) +static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, + unsigned long addr, void *old) { struct mm_struct *mm = vma->vm_mm; swp_entry_t entry; @@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long addr = page_address_in_vma(new, vma); - - if (addr == -EFAULT) - return; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) - return; + goto out; pud = pud_offset(pgd, addr); if (!pud_present(*pud)) - return; + goto out; pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - return; + goto out; ptep = pte_offset_map(pmd, addr); if (!is_swap_pte(*ptep)) { pte_unmap(ptep); - return; + goto out; } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) - goto out; + goto unlock; entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) - goto out; + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != old) + goto unlock; get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); @@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, pte); - -out: +unlock: pte_unmap_unlock(ptep, ptl); -} - -/* - * Note that remove_file_migration_ptes will only work on regular mappings, - * Nonlinear mappings do not use migration entries. - */ -static void remove_file_migration_ptes(struct page *old, struct page *new) -{ - struct vm_area_struct *vma; - struct address_space *mapping = new->mapping; - struct prio_tree_iter iter; - pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (!mapping) - return; - - spin_lock(&mapping->i_mmap_lock); - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) - remove_migration_pte(vma, old, new); - - spin_unlock(&mapping->i_mmap_lock); -} - -/* - * Must hold mmap_sem lock on at least one of the vmas containing - * the page so that the anon_vma cannot vanish. - */ -static void remove_anon_migration_ptes(struct page *old, struct page *new) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - mapping = (unsigned long)new->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. - */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_migration_pte(vma, old, new); - - spin_unlock(&anon_vma->lock); +out: + return SWAP_AGAIN; } /* @@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) */ static void remove_migration_ptes(struct page *old, struct page *new) { - if (PageAnon(new)) - remove_anon_migration_ptes(old, new); - else - remove_file_migration_ptes(old, new); + rmap_walk(new, remove_migration_pte, old); } /* @@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) if (TestClearPageActive(page)) { VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); - } else - unevictable_migrate_page(newpage, page); + } else if (TestClearPageUnevictable(page)) + SetPageUnevictable(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page) } mlock_migrate_page(newpage, page); + ksm_migrate_page(newpage, page); ClearPageSwapCache(page); ClearPagePrivate(page); @@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page) else rc = fallback_migrate_page(mapping, newpage, page); - if (!rc) { + if (!rc) remove_migration_ptes(page, newpage); - } else + else newpage->mapping = NULL; unlock_page(newpage); @@ -595,7 +543,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force) + struct page *page, int force, int offlining) { int rc = 0; int *result = NULL; @@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, lock_page(page); } + /* + * Only memory hotplug's offline_pages() caller has locked out KSM, + * and can safely migrate a KSM page. The other cases have skipped + * PageKsm along with PageReserved - but it is only now when we have + * the page lock that we can be certain it will not go KSM beneath us + * (KSM will not upgrade a page from PageAnon to PageKsm when it sees + * its pagecount raised, but only here do we take the page lock which + * serializes that). + */ + if (PageKsm(page) && !offlining) { + rc = -EBUSY; + goto unlock; + } + /* charge against new page */ charge = mem_cgroup_prepare_migration(page, &mem); if (charge == -ENOMEM) { @@ -737,7 +699,7 @@ move_newpage: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private) + new_page_t get_new_page, unsigned long private, int offlining) { int retry = 1; int nr_failed = 0; @@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc; - unsigned long flags; - - local_irq_save(flags); - list_for_each_entry(page, from, lru) - __inc_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - local_irq_restore(flags); if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2); + page, pass > 2, offlining); switch(rc) { case -ENOMEM: @@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - if (PageReserved(page)) /* Check for zero page */ + /* Use PageReserved to check for zero page */ + if (PageReserved(page) || PageKsm(page)) goto put_and_set; pp->page = page; @@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; err = isolate_lru_page(page); - if (!err) + if (!err) { list_add_tail(&page->lru, &pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } put_and_set: /* * Either remove the duplicate refcount from @@ -894,7 +853,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm); + (unsigned long)pm, 0); up_read(&mm->mmap_sem); return err; @@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, err = -ENOENT; /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) + if (!page || PageReserved(page) || PageKsm(page)) goto set_status; err = page_to_nid(page); @@ -1044,7 +1003,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, int err; for (i = 0; i < nr_pages; i += chunk_nr) { - if (chunk_nr + i > nr_pages) + if (chunk_nr > nr_pages - i) chunk_nr = nr_pages - i; err = copy_from_user(chunk_pages, &pages[i], diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f..7a3436e 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,6 +14,7 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/hugetlb.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag if (!vma || addr < vma->vm_start) return -ENOMEM; +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) { + struct hstate *h; + unsigned long nr_huge; + unsigned char present; + + i = 0; + nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); + h = hstate_vma(vma); + nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) + - (addr >> huge_page_shift(h)) + 1; + nr_huge = min(nr_huge, + (vma->vm_end - addr) >> huge_page_shift(h)); + while (1) { + /* hugepage always in RAM for now, + * but generally it needs to be check */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = !!(ptep && + !huge_pte_none(huge_ptep_get(ptep))); + while (1) { + vec[i++] = present; + addr += PAGE_SIZE; + /* reach buffer limit */ + if (i == nr) + return nr; + /* check hugepage border */ + if (!((addr & ~huge_page_mask(h)) + >> PAGE_SHIFT)) + break; + } + } + return nr; + } +#endif + /* * Calculate how many pages there are left in the last level of the * PTE array for our address. @@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) } } -/* - * called from munlock()/munmap() path with page supposedly on the LRU. +/** + * munlock_vma_page - munlock a vma page + * @page - page to be unlocked * - * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked - * [in try_to_munlock()] and then attempt to isolate the page. We must - * isolate the page to keep others from messing with its unevictable - * and mlocked state while trying to munlock. However, we pre-clear the - * mlocked state anyway as we might lose the isolation race and we might - * not get another chance to clear PageMlocked. If we successfully - * isolate the page and try_to_munlock() detects other VM_LOCKED vmas - * mapping the page, it will restore the PageMlocked state, unless the page - * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), - * perhaps redundantly. - * If we lose the isolation race, and the page is mapped by other VM_LOCKED - * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() - * either of which will restore the PageMlocked state by calling - * mlock_vma_page() above, if it can grab the vma's mmap sem. + * called from munlock()/munmap() path with page supposedly on the LRU. + * When we munlock a page, because the vma where we found the page is being + * munlock()ed or munmap()ed, we want to check whether other vmas hold the + * page locked so that we can leave it on the unevictable lru list and not + * bother vmscan with it. However, to walk the page's rmap list in + * try_to_munlock() we must isolate the page from the LRU. If some other + * task has removed the page from the LRU, we won't be able to do that. + * So we clear the PageMlocked as we might not get another chance. If we + * can't isolate the page, we leave it for putback_lru_page() and vmscan + * [page_referenced()/try_to_unmap()] to deal with. */ -static void munlock_vma_page(struct page *page) +void munlock_vma_page(struct page *page) { BUG_ON(!PageLocked(page)); @@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) /* * did try_to_unlock() succeed or punt? */ - if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + if (ret != SWAP_MLOCK) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); } else { /* - * We lost the race. let try_to_unmap() deal - * with it. At least we get the page state and - * mlock stats right. However, page is still on - * the noreclaim list. We'll fix that up when - * the page is eventually freed or we scan the - * noreclaim list. + * Some other task has removed the page from the LRU. + * putback_lru_page() will take care of removing the + * page from the unevictable list, if necessary. + * vmscan [page_referenced()] will move the page back + * to the unevictable list if some other vma has it + * mlocked. */ if (PageUnevictable(page)) count_vm_event(UNEVICTABLE_PGSTRANDED); @@ -1198,8 +1198,20 @@ munmap_back: goto free_vma; } - if (vma_wants_writenotify(vma)) + if (vma_wants_writenotify(vma)) { + pgprot_t pprot = vma->vm_page_prot; + + /* Can vma->vm_page_prot have changed?? + * + * Answer: Yes, drivers may have changed it in their + * f_op->mmap method. + * + * Ensures that vmas marked as uncached stay that way. + */ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); + if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + } vma_link(mm, vma, prev, rb_link, rb_parent); file = vma->vm_file; @@ -1811,10 +1823,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the tail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this on the + * munmap path where it doesn't make sense to fail. */ -int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, +static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { struct mempolicy *pol; @@ -1824,9 +1836,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; - if (mm->map_count >= sysctl_max_map_count) - return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return -ENOMEM; @@ -1866,6 +1875,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, return 0; } +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the tail. + */ +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + return __split_vma(mm, vma, addr, new_below); +} + /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -1901,7 +1923,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) * places tmp vma above, and higher split_vma places tmp vma below. */ if (start > vma->vm_start) { - int error = split_vma(mm, vma, start, 0); + int error; + + /* + * Make sure that map_count on return from munmap() will + * not exceed its limit; but let map_count go just above + * its limit temporarily, to help free resources as expected. + */ + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + error = __split_vma(mm, vma, start, 0); if (error) return error; prev = vma; @@ -1910,7 +1942,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* Does it split the last one? */ last = find_vma(mm, end); if (last && end > last->vm_start) { - int error = split_vma(mm, last, end, 1); + int error = __split_vma(mm, last, end, 1); if (error) return error; } @@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma, if (ret < rlen) memset(base + ret, 0, rlen - ret); - } else { - /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, rlen); } return 0; @@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file, goto error_just_free; add_nommu_region(region); + /* clear anonymous mappings that don't ask for uninitialized data */ + if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + memset((void *)region->vm_start, 0, + region->vm_end - region->vm_start); + /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147d..492c986 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -337,6 +337,21 @@ static void dump_tasks(const struct mem_cgroup *mem) } while_each_thread(g, p); } +static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem) +{ + pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " + "oom_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj); + task_lock(current); + cpuset_print_task_mems_allowed(current); + task_unlock(current); + dump_stack(); + mem_cgroup_print_oom_info(mem, current); + show_mem(); + if (sysctl_oom_dump_tasks) + dump_tasks(mem); +} + /* * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO @@ -395,20 +410,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, { struct task_struct *c; - if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->signal->oom_adj); - task_lock(current); - cpuset_print_task_mems_allowed(current); - task_unlock(current); - dump_stack(); - mem_cgroup_print_oom_info(mem, current); - show_mem(); - if (sysctl_oom_dump_tasks) - dump_tasks(mem); - } + if (printk_ratelimit()) + dump_header(gfp_mask, order, mem); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -544,6 +547,7 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { read_unlock(&tasklist_lock); + dump_header(gfp_mask, order, NULL); panic("Out of memory and no killable processes...\n"); } @@ -609,8 +613,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) /* Got some memory back in the last second. */ return; - if (sysctl_panic_on_oom == 2) + if (sysctl_panic_on_oom == 2) { + dump_header(gfp_mask, order, NULL); panic("out of memory. Compulsory panic_on_oom is selected.\n"); + } /* * Check if there were limitations on the allocation (only relevant for @@ -626,8 +632,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) break; case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) + if (sysctl_panic_on_oom) { + dump_header(gfp_mask, order, NULL); panic("out of memory. panic_on_oom is selected\n"); + } /* Fall-through */ case CONSTRAINT_CPUSET: __out_of_memory(gfp_mask, order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bc2ac6..59d2e88 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -486,7 +486,6 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * free_page_mlock() -- clean up attempts to free and mlocked() page. * Page should not be on lru, so no need to fix that up. @@ -497,9 +496,6 @@ static inline void free_page_mlock(struct page *page) __dec_zone_page_state(page, NR_MLOCK); __count_vm_event(UNEVICTABLE_MLOCKFREED); } -#else -static void free_page_mlock(struct page *page) { } -#endif static inline int free_pages_check(struct page *page) { diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e50..a19af95 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -19,20 +19,15 @@ #include <linux/writeback.h> #include <asm/pgtable.h> -static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, +static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { struct bio *bio; bio = bio_alloc(gfp_flags, 1); if (bio) { - struct swap_info_struct *sis; - swp_entry_t entry = { .val = index, }; - - sis = get_swap_info_struct(swp_type(entry)); - bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * - (PAGE_SIZE >> 9); - bio->bi_bdev = sis->bdev; + bio->bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; @@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page_private(page), page, - end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -127,8 +121,7 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); - bio = get_swap_bio(GFP_KERNEL, page_private(page), page, - end_swap_bio_read); + bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878be..7b47a57 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/hugetlb.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; + struct vm_area_struct *vma; if (addr >= end) return err; @@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); + + /* + * handle hugetlb vma individually because pagetable walk for + * the hugetlb page is dependent on the architecture and + * we can't handled it in the same manner as non-huge pages. + */ + vma = find_vma(walk->mm, addr); +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) { + pte_t *pte; + struct hstate *hs; + + if (vma->vm_end < next) + next = vma->vm_end; + hs = hstate_vma(vma); + pte = huge_pte_offset(walk->mm, + addr & huge_page_mask(hs)); + if (pte && !huge_pte_none(huge_ptep_get(pte)) + && walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, addr, + next, walk); + if (err) + break; + continue; + } +#endif if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; + pgd++; continue; } if (walk->pgd_entry) @@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end, err = walk_pud_range(pgd, addr, next, walk); if (err) break; - } while (pgd++, addr = next, addr != end); + pgd++; + } while (addr = next, addr != end); return err; } diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc26..442010c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -46,8 +46,6 @@ * * To use this allocator, arch code should do the followings. * - * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA - * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default @@ -74,6 +72,7 @@ #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> +#include <asm/io.h> #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -1302,6 +1301,27 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +/** + * per_cpu_ptr_to_phys - convert translated percpu address to physical address + * @addr: the address to be converted to physical address + * + * Given @addr which is dereferenceable address obtained via one of + * percpu access macros, this function translates it into its physical + * address. The caller is responsible for ensuring @addr stays valid + * until this function finishes. + * + * RETURNS: + * The physical address for @addr. + */ +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + if ((unsigned long)addr < VMALLOC_START || + (unsigned long)addr >= VMALLOC_END) + return __pa(addr); + else + return page_to_phys(vmalloc_to_page(addr)); +} + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) @@ -49,6 +49,7 @@ #include <linux/swapops.h> #include <linux/slab.h> #include <linux/init.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/module.h> @@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void) return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); } -static inline void anon_vma_free(struct anon_vma *anon_vma) +void anon_vma_free(struct anon_vma *anon_vma) { kmem_cache_free(anon_vma_cachep, anon_vma); } @@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) list_del(&vma->anon_vma_node); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); + empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); spin_unlock(&anon_vma->lock); if (empty) @@ -183,6 +184,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); + ksm_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } @@ -202,8 +204,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) page->mapping; - if (!(anon_mapping & PAGE_MAPPING_ANON)) + anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) goto out; @@ -248,8 +250,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { if (PageAnon(page)) { - if ((void *)vma->anon_vma != - (void *)page->mapping - PAGE_MAPPING_ANON) + if (vma->anon_vma != page_anon_vma(page)) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || @@ -337,21 +338,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. */ -static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, - unsigned int *mapcount, - unsigned long *vm_flags) +int page_referenced_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; spinlock_t *ptl; int referenced = 0; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -388,9 +383,10 @@ static int page_referenced_one(struct page *page, out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); -out: + if (referenced) *vm_flags |= vma->vm_flags; +out: return referenced; } @@ -409,6 +405,9 @@ static int page_referenced_anon(struct page *page, mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -416,7 +415,7 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); if (!mapcount) break; @@ -474,6 +473,9 @@ static int page_referenced_file(struct page *page, mapcount = page_mapcount(page); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -481,7 +483,7 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); if (!mapcount) break; @@ -507,46 +509,47 @@ int page_referenced(struct page *page, unsigned long *vm_flags) { int referenced = 0; + int we_locked = 0; if (TestClearPageReferenced(page)) referenced++; *vm_flags = 0; - if (page_mapped(page) && page->mapping) { - if (PageAnon(page)) + if (page_mapped(page) && page_rmapping(page)) { + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) { + referenced++; + goto out; + } + } + if (unlikely(PageKsm(page))) + referenced += page_referenced_ksm(page, mem_cont, + vm_flags); + else if (PageAnon(page)) referenced += page_referenced_anon(page, mem_cont, vm_flags); - else if (is_locked) + else if (page->mapping) referenced += page_referenced_file(page, mem_cont, vm_flags); - else if (!trylock_page(page)) - referenced++; - else { - if (page->mapping) - referenced += page_referenced_file(page, - mem_cont, vm_flags); + if (we_locked) unlock_page(page); - } } - +out: if (page_test_and_clear_young(page)) referenced++; return referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) +static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, + unsigned long address) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; spinlock_t *ptl; int ret = 0; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -578,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) - ret += page_mkclean_one(page, vma); + if (vma->vm_flags & VM_SHARED) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret += page_mkclean_one(page, vma, address); + } } spin_unlock(&mapping->i_mmap_lock); return ret; @@ -620,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; - page->index = linear_page_index(vma, address); - - /* - * nr_mapped state can be updated without turning off - * interrupts because it is not modified via interrupt. - */ - __inc_zone_page_state(page, NR_ANON_PAGES); } /** @@ -665,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page, * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * The caller needs to hold the pte lock and the page must be locked. + * The caller needs to hold the pte lock, and the page must be locked in + * the anon_vma case: to serialize mapping,index checking after setting, + * and to ensure that PageAnon is not being upgraded racily to PageKsm + * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + int first = atomic_inc_and_test(&page->_mapcount); + if (first) + __inc_zone_page_state(page, NR_ANON_PAGES); + if (unlikely(PageKsm(page))) + return; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); - if (atomic_inc_and_test(&page->_mapcount)) + if (first) __page_set_anon_rmap(page, vma, address); else __page_check_anon_rmap(page, vma, address); @@ -694,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ + __inc_zone_page_state(page, NR_ANON_PAGES); __page_set_anon_rmap(page, vma, address); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -760,20 +770,15 @@ void page_remove_rmap(struct page *page) * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - enum ttu_flags flags) +int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -784,10 +789,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_MLOCK; + if (vma->vm_flags & VM_LOCKED) + goto out_mlock; + + if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out_unmap; - } } if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -822,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) @@ -849,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, file_rss); - page_remove_rmap(page); page_cache_release(page); @@ -857,6 +866,27 @@ out_unmap: pte_unmap_unlock(pte, ptl); out: return ret; + +out_mlock: + pte_unmap_unlock(pte, ptl); + + + /* + * We need mmap_sem locking, Otherwise VM_LOCKED check makes + * unstable result and race. Plus, We can't wait here because + * we now hold anon_vma->lock or mapping->i_mmap_lock. + * if trylock failed, the page remain in evictable lru and later + * vmscan could retry to move the page to unevictable lru if the + * page is actually mlocked. + */ + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + ret = SWAP_MLOCK; + } + up_read(&vma->vm_mm->mmap_sem); + } + return ret; } /* @@ -922,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; /* - * MLOCK_PAGES => feature is configured. - * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. */ - if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { locked_vma = (vma->vm_flags & VM_LOCKED); if (!locked_vma) up_read(&vma->vm_mm->mmap_sem); /* don't need it */ @@ -976,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -/* - * common handling for pages mapped in VM_LOCKED vmas - */ -static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) -{ - int mlocked = 0; - - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - mlocked++; /* really mlocked the page */ - } - up_read(&vma->vm_mm->mmap_sem); - } - return mlocked; -} - /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. @@ -1014,42 +1025,22 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; struct vm_area_struct *vma; - unsigned int mlocked = 0; int ret = SWAP_AGAIN; - int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; - - if (MLOCK_PAGES && unlikely(unlock)) - ret = SWAP_SUCCESS; /* default for try_to_munlock() */ anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!((vma->vm_flags & VM_LOCKED) && - page_mapped_in_vma(page, vma))) - continue; /* must visit all unlocked vmas */ - ret = SWAP_MLOCK; /* saw at least one mlocked vma */ - } else { - ret = try_to_unmap_one(page, vma, flags); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; - } - if (ret == SWAP_MLOCK) { - mlocked = try_to_mlock_page(page, vma); - if (mlocked) - break; /* stop if actually mlocked page */ - } + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) + break; } page_unlock_anon_vma(anon_vma); - - if (mlocked) - ret = SWAP_MLOCK; /* actually mlocked the page */ - else if (ret == SWAP_MLOCK) - ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ - return ret; } @@ -1079,48 +1070,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; - unsigned int mlocked = 0; - int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; - - if (MLOCK_PAGES && unlikely(unlock)) - ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!((vma->vm_flags & VM_LOCKED) && - page_mapped_in_vma(page, vma))) - continue; /* must visit all vmas */ - ret = SWAP_MLOCK; - } else { - ret = try_to_unmap_one(page, vma, flags); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; - } - if (ret == SWAP_MLOCK) { - mlocked = try_to_mlock_page(page, vma); - if (mlocked) - break; /* stop if actually mlocked page */ - } + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) + goto out; } - if (mlocked) + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; - if (list_empty(&mapping->i_mmap_nonlinear)) + /* + * We don't bother to try to find the munlocked page in nonlinears. + * It's costly. Instead, later, page reclaim logic may call + * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. + */ + if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) - continue; /* must visit all vmas */ - ret = SWAP_MLOCK; /* leave mlocked == 0 */ - goto out; /* no need to look further */ - } - if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && - (vma->vm_flags & VM_LOCKED)) - continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1153,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && - (vma->vm_flags & VM_LOCKED)) - continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - ret = try_to_unmap_cluster(cursor, &mapcount, - vma, page); - if (ret == SWAP_MLOCK) - mlocked = 2; /* to return below */ + if (try_to_unmap_cluster(cursor, &mapcount, + vma, page) == SWAP_MLOCK) + ret = SWAP_MLOCK; cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -1183,10 +1152,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); - if (mlocked) - ret = SWAP_MLOCK; /* actually mlocked the page */ - else if (ret == SWAP_MLOCK) - ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } @@ -1210,7 +1175,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) BUG_ON(!PageLocked(page)); - if (PageAnon(page)) + if (unlikely(PageKsm(page))) + ret = try_to_unmap_ksm(page, flags); + else if (PageAnon(page)) ret = try_to_unmap_anon(page, flags); else ret = try_to_unmap_file(page, flags); @@ -1229,17 +1196,98 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) * * Return values are: * - * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - no vma is holding page mlocked, or, * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_FAIL - page cannot be located at present * SWAP_MLOCK - page is now mlocked. */ int try_to_munlock(struct page *page) { VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (PageAnon(page)) + if (unlikely(PageKsm(page))) + return try_to_unmap_ksm(page, TTU_MUNLOCK); + else if (PageAnon(page)) return try_to_unmap_anon(page, TTU_MUNLOCK); else return try_to_unmap_file(page, TTU_MUNLOCK); } +#ifdef CONFIG_MIGRATION +/* + * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): + * Called by migrate.c to remove migration ptes, but might be used more later. + */ +static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + /* + * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * because that depends on page_mapped(); but not all its usages + * are holding mmap_sem, which also gave the necessary guarantee + * (that this anon_vma's slab has not already been destroyed). + * This needs to be reviewed later: avoiding page_lock_anon_vma() + * is risky, and currently limits the usefulness of rmap_walk(). + */ + anon_vma = page_anon_vma(page); + if (!anon_vma) + return ret; + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = rmap_one(page, vma, address, arg); + if (ret != SWAP_AGAIN) + break; + } + spin_unlock(&anon_vma->lock); + return ret; +} + +static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = SWAP_AGAIN; + + if (!mapping) + return ret; + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = rmap_one(page, vma, address, arg); + if (ret != SWAP_AGAIN) + break; + } + /* + * No nonlinear handling: being always shared, nonlinear vmas + * never contain migration ptes. Decide what to do about this + * limitation to linear when we need rmap_walk() on nonlinear. + */ + spin_unlock(&mapping->i_mmap_lock); + return ret; +} + +int rmap_walk(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + VM_BUG_ON(!PageLocked(page)); + + if (unlikely(PageKsm(page))) + return rmap_walk_ksm(page, rmap_one, arg); + else if (PageAnon(page)) + return rmap_walk_anon(page, rmap_one, arg); + else + return rmap_walk_file(page, rmap_one, arg); +} +#endif /* CONFIG_MIGRATION */ @@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) goto out; } mutex_unlock(&shmem_swaplist_mutex); -out: return found; /* 0 or 1 or -ENOMEM */ + /* + * Can some race bring us here? We've been holding page lock, + * so I think not; but would rather try again later than BUG() + */ + unlock_page(page); + page_cache_release(page); +out: + return (found < 0) ? found : 0; } /* @@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) else inode = NULL; spin_unlock(&info->lock); - swap_duplicate(swap); + swap_shmem_alloc(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ swap_writepage(page, wbc); @@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING size_t slab_buffer_size(struct kmem_cache *cachep) { return cachep->buffer_size; @@ -697,7 +697,7 @@ static inline void init_lock_keys(void) static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; -static DEFINE_PER_CPU(struct delayed_work, reap_work); +static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { @@ -838,7 +838,7 @@ __setup("noaliencache", noaliencache_setup); * objects freed on different nodes from which they were allocated) and the * flushing of remote pcps by calling drain_node_pages. */ -static DEFINE_PER_CPU(unsigned long, reap_node); +static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { @@ -848,17 +848,17 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - per_cpu(reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = node; } static void next_reap_node(void) { - int node = __get_cpu_var(reap_node); + int node = __get_cpu_var(slab_reap_node); node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + __get_cpu_var(slab_reap_node) = node; } #else @@ -875,7 +875,7 @@ static void next_reap_node(void) */ static void __cpuinit start_cpu_timer(int cpu) { - struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); /* * When this gets called from do_initcalls via cpucache_init(), @@ -1039,7 +1039,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, */ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) { - int node = __get_cpu_var(reap_node); + int node = __get_cpu_var(slab_reap_node); if (l3->alien) { struct array_cache *ac = l3->alien[node]; @@ -1300,9 +1300,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * anything expensive but will only modify reap_work * and reschedule the timer. */ - cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(reap_work, cpu).work.func = NULL; + per_cpu(slab_reap_work, cpu).work.func = NULL; break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: @@ -3578,7 +3578,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { return __cache_alloc(cachep, flags, __builtin_return_address(0)); @@ -3641,7 +3641,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, gfp_t flags, int nodeid) @@ -3669,7 +3669,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) return ret; } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3689,7 +3689,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return __do_kmalloc_node(size, flags, node, NULL); } EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_DEBUG_SLAB */ +#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ #endif /* CONFIG_NUMA */ /** @@ -3721,7 +3721,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) } EXPORT_SYMBOL(kmem_cache_alloc); -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) { return slab_alloc(s, gfpflags, -1, _RET_IP_); @@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) EXPORT_SYMBOL(kmem_cache_alloc_node); #endif -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, gfp_t gfpflags, int node) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590ee..6c0585b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> @@ -35,11 +36,15 @@ #include <linux/swapops.h> #include <linux/page_cgroup.h> +static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +static void free_swap_count_continuations(struct swap_info_struct *); +static sector_t map_swap_entry(swp_entry_t, struct block_device**); + static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; -static int swap_overflow; static int least_priority; static const char Bad_file[] = "Bad swap file entry "; @@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; static struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); -/* For reference count accounting in swap_map */ -/* enum for swap_map[] handling. internal use only */ -enum { - SWAP_MAP = 0, /* ops for reference from swap users */ - SWAP_CACHE, /* ops for reference from swap cache */ -}; - -static inline int swap_count(unsigned short ent) -{ - return ent & SWAP_COUNT_MASK; -} - -static inline bool swap_has_cache(unsigned short ent) +static inline unsigned char swap_count(unsigned char ent) { - return !!(ent & SWAP_HAS_CACHE); + return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } -static inline unsigned short encode_swapmap(int count, bool has_cache) -{ - unsigned short ret = count; - - if (has_cache) - return SWAP_HAS_CACHE | ret; - return ret; -} - -/* returnes 1 if swap entry is freed */ +/* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) { - int type = si - swap_info; - swp_entry_t entry = swp_entry(type, offset); + swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; @@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) down_read(&swap_unplug_sem); entry.val = page_private(page); if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)].bdev; + struct block_device *bdev = swap_info[swp_type(entry)]->bdev; struct backing_dev_info *bdi; /* @@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; + sector_t start_block; + sector_t nr_blocks; int err = 0; - list_for_each_entry(se, &si->extent_list, list) { - sector_t start_block = se->start_block << (PAGE_SHIFT - 9); - sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + /* Do not discard the swap header page! */ + se = &si->first_swap_extent; + start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); + nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); + if (nr_blocks) { + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); + if (err) + return err; + cond_resched(); + } - if (se->start_page == 0) { - /* Do not discard the swap header page! */ - start_block += 1 << (PAGE_SHIFT - 9); - nr_blocks -= 1 << (PAGE_SHIFT - 9); - if (!nr_blocks) - continue; - } + list_for_each_entry(se, &si->first_swap_extent.list, list) { + start_block = se->start_block << (PAGE_SHIFT - 9); + nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); if (err) break; @@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, - DISCARD_FL_BARRIER)) + nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) break; } lh = se->list.next; - if (lh == &si->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); } } @@ -223,7 +208,7 @@ static int wait_for_discard(void *word) #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si, - int cache) + unsigned char usage) { unsigned long offset; unsigned long scan_base; @@ -354,10 +339,7 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ - si->swap_map[offset] = encode_swapmap(0, true); - else /* at suspend */ - si->swap_map[offset] = encode_swapmap(1, false); + si->swap_map[offset] = usage; si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - si = swap_info + type; + si = swap_info[type]; next = si->next; if (next < 0 || - (!wrapped && si->prio != swap_info[next].prio)) { + (!wrapped && si->prio != swap_info[next]->prio)) { next = swap_list.head; wrapped++; } @@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) swap_list.next = next; /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_CACHE); + offset = scan_swap_map(si, SWAP_HAS_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) pgoff_t offset; spin_lock(&swap_lock); - si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { nr_swap_pages--; /* This is called for allocating swap entry, not cache */ - offset = scan_swap_map(si, SWAP_MAP); + offset = scan_swap_map(si, 1); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct * swap_info_get(swp_entry_t entry) +static struct swap_info_struct *swap_info_get(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; if (!entry.val) @@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) type = swp_type(entry); if (type >= nr_swapfiles) goto bad_nofile; - p = & swap_info[type]; + p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -554,41 +536,56 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, - swp_entry_t ent, int cache) +static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { - unsigned long offset = swp_offset(ent); - int count = swap_count(p->swap_map[offset]); - bool has_cache; + unsigned long offset = swp_offset(entry); + unsigned char count; + unsigned char has_cache; - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if (cache == SWAP_MAP) { /* dropping usage count of swap */ - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = encode_swapmap(count, has_cache); - } - } else { /* dropping swap cache flag */ + if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); - p->swap_map[offset] = encode_swapmap(count, false); - + has_cache = 0; + } else if (count == SWAP_MAP_SHMEM) { + /* + * Or we could insist on shmem.c using a special + * swap_shmem_free() and free_shmem_swap_and_cache()... + */ + count = 0; + } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if (count == COUNT_CONTINUED) { + if (swap_count_continued(p, offset, count)) + count = SWAP_MAP_MAX | COUNT_CONTINUED; + else + count = SWAP_MAP_MAX; + } else + count--; } - /* return code. */ - count = p->swap_map[offset]; + + if (!count) + mem_cgroup_uncharge_swap(entry); + + usage = count | has_cache; + p->swap_map[offset] = usage; + /* free if no reference */ - if (!count) { + if (!usage) { if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; + if (swap_list.next >= 0 && + p->prio > swap_info[swap_list.next]->prio) + swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; } - if (!swap_count(count)) - mem_cgroup_uncharge_swap(ent); - return count; + + return usage; } /* @@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, */ void swap_free(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_MAP); + swap_entry_free(p, entry, 1); spin_unlock(&swap_lock); } } @@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; - int ret; + unsigned char count; p = swap_info_get(entry); if (p) { - ret = swap_entry_free(p, entry, SWAP_CACHE); - if (page) { - bool swapout; - if (ret) - swapout = true; /* the end of swap out */ - else - swapout = false; /* no more swap users! */ - mem_cgroup_uncharge_swapcache(page, entry, swapout); - } + count = swap_entry_free(p, entry, SWAP_HAS_CACHE); + if (page) + mem_cgroup_uncharge_swapcache(page, entry, count != 0); spin_unlock(&swap_lock); } - return; } /* * How many references to page are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. */ static inline int page_swapcount(struct page *page) { @@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) int count; VM_BUG_ON(!PageLocked(page)); + if (unlikely(PageKsm(page))) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) SetPageDirty(page); } } - return count == 1; + return count <= 1; } /* @@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { + if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry) int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) { struct block_device *bdev = NULL; - int i; + int type; if (device) bdev = bdget(device); spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *sis = swap_info + i; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; @@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); - return i; + return type; } if (bdev == sis->bdev) { - struct swap_extent *se; + struct swap_extent *se = &sis->first_swap_extent; - se = list_entry(sis->extent_list.next, - struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); - return i; + return type; } } } @@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) } /* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). + */ +sector_t swapdev_block(int type, pgoff_t offset) +{ + struct block_device *bdev; + + if ((unsigned int)type >= nr_swapfiles) + return 0; + if (!(swap_info[type]->flags & SWP_WRITEOK)) + return 0; + return map_swap_entry(swp_entry(type, offset), &bdev); +} + +/* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * @@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; - if (type < nr_swapfiles) { - spin_lock(&swap_lock); - if (swap_info[type].flags & SWP_WRITEOK) { - n = swap_info[type].pages; + spin_lock(&swap_lock); + if ((unsigned int)type < nr_swapfiles) { + struct swap_info_struct *sis = swap_info[type]; + + if (sis->flags & SWP_WRITEOK) { + n = sis->pages; if (free) - n -= swap_info[type].inuse_pages; + n -= sis->inuse_pages; } - spin_unlock(&swap_lock); } + spin_unlock(&swap_lock); return n; } -#endif +#endif /* CONFIG_HIBERNATION */ /* * No need to decide whether this PTE shares the swap entry with others, @@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned long addr, end, next; int ret; - if (page->mapping) { + if (page_anon_vma(page)) { addr = page_address_in_vma(page, vma); if (addr == -EFAULT) return 0; @@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int max = si->max; unsigned int i = prev; - int count; + unsigned char count; /* * No need for swap_lock here: we're just looking @@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ static int try_to_unuse(unsigned int type) { - struct swap_info_struct * si = &swap_info[type]; + struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned short *swap_map; - unsigned short swcount; + unsigned char *swap_map; + unsigned char swcount; struct page *page; swp_entry_t entry; unsigned int i = 0; int retval = 0; - int reset_overflow = 0; - int shmem; /* * When searching mms for an entry, a good strategy is to @@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type) * together, child after parent. If we race with dup_mmap(), we * prefer to resolve parent before child, lest we miss entries * duplicated after we scanned child: using last mm would invert - * that. Though it's only a serious concern when an overflowed - * swap count is reset from SWAP_MAP_MAX, preventing a rescan. + * that. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type) /* * Remove all references to entry. - * Whenever we reach init_mm, there's no address space - * to search, but use it as a reminder to search shmem. */ - shmem = 0; swcount = *swap_map; - if (swap_count(swcount)) { - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else - retval = unuse_mm(start_mm, entry, page); + if (swap_count(swcount) == SWAP_MAP_SHMEM) { + retval = shmem_unuse(entry, page); + /* page has already been unlocked and released */ + if (retval < 0) + break; + continue; } + if (swap_count(swcount) && start_mm != &init_mm) + retval = unuse_mm(start_mm, entry, page); + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; @@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && !shmem && + while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1145,10 +1152,9 @@ static int try_to_unuse(unsigned int type) swcount = *swap_map; if (!swap_count(swcount)) /* any usage ? */ ; - else if (mm == &init_mm) { + else if (mm == &init_mm) set_start_mm = 1; - shmem = shmem_unuse(entry, page); - } else + else retval = unuse_mm(mm, entry, page); if (set_start_mm && *swap_map < swcount) { @@ -1164,13 +1170,6 @@ static int try_to_unuse(unsigned int type) mmput(start_mm); start_mm = new_start_mm; } - if (shmem) { - /* page has already been unlocked and released */ - if (shmem > 0) - continue; - retval = shmem; - break; - } if (retval) { unlock_page(page); page_cache_release(page); @@ -1178,30 +1177,6 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7ffe ? - * There's no way to repeat a swap page within an mm - * (except in shmem, where it's the shared object which takes - * the reference count)? - * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned - * short is too small....) - * If that's wrong, then we should worry more about - * exit_mmap() and do_munmap() cases described above: - * we might be resetting SWAP_MAP_MAX too early here. - * We know "Undead"s can happen, they're okay, so don't - * report them; but do report if we reset SWAP_MAP_MAX. - */ - /* We might release the lock_page() in unuse_mm(). */ - if (!PageSwapCache(page) || page_private(page) != entry.val) - goto retry; - - if (swap_count(*swap_map) == SWAP_MAP_MAX) { - spin_lock(&swap_lock); - *swap_map = encode_swapmap(0, true); - spin_unlock(&swap_lock); - reset_overflow = 1; - } - - /* * If a reference remains (rare), we would like to leave * the page in the swap cache; but try_to_unmap could * then re-duplicate the entry once we drop page lock, @@ -1213,6 +1188,12 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. + * + * Given how unuse_vma() targets one particular offset + * in an anon_vma, once the anon_vma has been determined, + * this splitting happens to be just what is needed to + * handle where KSM pages have been swapped out: re-reading + * is unnecessarily slow, but we can fix that later on. */ if (swap_count(*swap_map) && PageDirty(page) && PageSwapCache(page)) { @@ -1242,7 +1223,6 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); -retry: unlock_page(page); page_cache_release(page); @@ -1254,10 +1234,6 @@ retry: } mmput(start_mm); - if (reset_overflow) { - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); - swap_overflow = 0; - } return retval; } @@ -1270,10 +1246,10 @@ retry: static void drain_mmlist(void) { struct list_head *p, *next; - unsigned int i; + unsigned int type; - for (i = 0; i < nr_swapfiles; i++) - if (swap_info[i].inuse_pages) + for (type = 0; type < nr_swapfiles; type++) + if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -1283,12 +1259,23 @@ static void drain_mmlist(void) /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset `offset'. + * corresponds to page offset for the specified swap entry. + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. */ -sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { - struct swap_extent *se = sis->curr_swap_extent; - struct swap_extent *start_se = se; + struct swap_info_struct *sis; + struct swap_extent *start_se; + struct swap_extent *se; + pgoff_t offset; + + sis = swap_info[swp_type(entry)]; + *bdev = sis->bdev; + + offset = swp_offset(entry); + start_se = sis->curr_swap_extent; + se = start_se; for ( ; ; ) { struct list_head *lh; @@ -1298,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) return se->start_block + (offset - se->start_page); } lh = se->list.next; - if (lh == &sis->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ } } -#ifdef CONFIG_HIBERNATION /* - * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev - * corresponding to given index in swap_info (swap type). + * Returns the page offset into bdev for the specified page's swap entry. */ -sector_t swapdev_block(int swap_type, pgoff_t offset) +sector_t map_swap_page(struct page *page, struct block_device **bdev) { - struct swap_info_struct *sis; - - if (swap_type >= nr_swapfiles) - return 0; - - sis = swap_info + swap_type; - return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; + swp_entry_t entry; + entry.val = page_private(page); + return map_swap_entry(entry, bdev); } -#endif /* CONFIG_HIBERNATION */ /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { - while (!list_empty(&sis->extent_list)) { + while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->extent_list.next, + se = list_entry(sis->first_swap_extent.list.next, struct swap_extent, list); list_del(&se->list); kfree(se); @@ -1352,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, struct swap_extent *new_se; struct list_head *lh; - lh = sis->extent_list.prev; /* The highest page extent */ - if (lh != &sis->extent_list) { + if (start_page == 0) { + se = &sis->first_swap_extent; + sis->curr_swap_extent = se; + se->start_page = 0; + se->nr_pages = nr_pages; + se->start_block = start_block; + return 1; + } else { + lh = sis->first_swap_extent.list.prev; /* Highest extent */ se = list_entry(lh, struct swap_extent, list); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { @@ -1373,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - list_add_tail(&new_se->list, &sis->extent_list); + list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } @@ -1425,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto done; + goto out; } blkbits = inode->i_blkbits; @@ -1496,25 +1481,22 @@ reprobe: sis->max = page_no; sis->pages = page_no - 1; sis->highest_bit = page_no - 1; -done: - sis->curr_swap_extent = list_entry(sis->extent_list.prev, - struct swap_extent, list); - goto out; +out: + return ret; bad_bmap: printk(KERN_ERR "swapon: swapfile has holes\n"); ret = -EINVAL; -out: - return ret; + goto out; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { - struct swap_info_struct * p = NULL; - unsigned short *swap_map; + struct swap_info_struct *p = NULL; + unsigned char *swap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char * pathname; + char *pathname; int i, type, prev; int err; @@ -1535,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mapping = victim->f_mapping; prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type].next) { - p = swap_info + type; + for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { + p = swap_info[type]; if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; @@ -1555,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) { + if (prev < 0) swap_list.head = p->next; - } else { - swap_info[prev].next = p->next; - } + else + swap_info[prev]->next = p->next; if (type == swap_list.next) { /* just pick something that's safe... */ swap_list.next = swap_list.head; } if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i].next) - swap_info[i].prio = p->prio--; + for (i = p->next; i >= 0; i = swap_info[i]->next) + swap_info[i]->prio = p->prio--; least_priority++; } nr_swap_pages -= p->pages; @@ -1584,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->prio < 0) p->prio = --least_priority; prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; prev = i; } p->next = i; if (prev < 0) - swap_list.head = swap_list.next = p - swap_info; + swap_list.head = swap_list.next = type; else - swap_info[prev].next = p - swap_info; + swap_info[prev]->next = type; nr_swap_pages += p->pages; total_swap_pages += p->pages; p->flags |= SWP_WRITEOK; @@ -1606,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) up_write(&swap_unplug_sem); destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) + free_swap_count_continuations(p); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); drain_mmlist(); @@ -1653,8 +1637,8 @@ out: /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { - struct swap_info_struct *ptr = swap_info; - int i; + struct swap_info_struct *si; + int type; loff_t l = *pos; mutex_lock(&swapon_mutex); @@ -1662,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (i = 0; i < nr_swapfiles; i++, ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (type = 0; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) - return ptr; + return si; } return NULL; @@ -1674,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { - struct swap_info_struct *ptr; - struct swap_info_struct *endptr = swap_info + nr_swapfiles; + struct swap_info_struct *si = v; + int type; if (v == SEQ_START_TOKEN) - ptr = swap_info; - else { - ptr = v; - ptr++; - } + type = 0; + else + type = si->type + 1; - for (; ptr < endptr; ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; - return ptr; + return si; } return NULL; @@ -1701,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v) static int swap_show(struct seq_file *swap, void *v) { - struct swap_info_struct *ptr = v; + struct swap_info_struct *si = v; struct file *file; int len; - if (ptr == SEQ_START_TOKEN) { + if (si == SEQ_START_TOKEN) { seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); return 0; } - file = ptr->swap_file; + file = si->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? "partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + si->pages << (PAGE_SHIFT - 10), + si->inuse_pages << (PAGE_SHIFT - 10), + si->prio); return 0; } @@ -1765,7 +1751,7 @@ late_initcall(max_swapfiles_check); */ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { - struct swap_info_struct * p; + struct swap_info_struct *p; char *name = NULL; struct block_device *bdev = NULL; struct file *swap_file = NULL; @@ -1779,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages = 1; unsigned long swapfilepages; - unsigned short *swap_map = NULL; + unsigned char *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + spin_lock(&swap_lock); - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) + for (type = 0; type < nr_swapfiles; type++) { + if (!(swap_info[type]->flags & SWP_USED)) break; + } error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); + kfree(p); goto out; } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->extent_list); + if (type >= nr_swapfiles) { + p->type = type; + swap_info[type] = p; + /* + * Write swap_info[type] before nr_swapfiles, in case a + * racing procfs swap_start() or swap_next() is reading them. + * (We never shrink nr_swapfiles, we never free this entry.) + */ + smp_wmb(); + nr_swapfiles++; + } else { + kfree(p); + p = swap_info[type]; + /* + * Do not memset this entry: a racing procfs swap_next() + * would be relying on p->type to remain valid. + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { @@ -1822,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EBUSY; for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = &swap_info[i]; + struct swap_info_struct *q = swap_info[i]; if (i == type || !q->swap_file) continue; @@ -1897,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->lowest_bit = 1; p->cluster_next = 1; + p->cluster_nr = 0; /* * Find out how many pages are allowed for a single swap @@ -1932,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); + swap_map = vmalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap; } - memset(swap_map, 0, maxpages * sizeof(short)); + memset(swap_map, 0, maxpages); for (i = 0; i < swap_header->info.nr_badpages; i++) { int page_nr = swap_header->info.badpages[i]; if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { @@ -2003,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* insert swap space into swap_list: */ prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; - } prev = i; } p->next = i; - if (prev < 0) { - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } + if (prev < 0) + swap_list.head = swap_list.next = type; + else + swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); error = 0; @@ -2051,15 +2058,15 @@ out: void si_swapinfo(struct sysinfo *val) { - unsigned int i; + unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - if (!(swap_info[i].flags & SWP_USED) || - (swap_info[i].flags & SWP_WRITEOK)) - continue; - nr_to_be_unused += swap_info[i].inuse_pages; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; } val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -2069,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val) /* * Verify that a swap entry is valid and increment its swap map count. * - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as - * "permanent", but will be reclaimed by the next swapoff. * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, bool cache) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; - int result = -EINVAL; - int count; - bool has_cache; + unsigned char count; + unsigned char has_cache; + int err = -EINVAL; if (non_swap_entry(entry)) - return -EINVAL; + goto out; type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; - p = type + swap_info; + p = swap_info[type]; offset = swp_offset(entry); spin_lock(&swap_lock); - if (unlikely(offset >= p->max)) goto unlock_out; - count = swap_count(p->swap_map[offset]); - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + err = 0; - if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + if (usage == SWAP_HAS_CACHE) { /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) { - p->swap_map[offset] = encode_swapmap(count, true); - result = 0; - } else if (has_cache) /* someone added cache */ - result = -EEXIST; - else if (!count) /* no users */ - result = -ENOENT; + if (!has_cache && count) + has_cache = SWAP_HAS_CACHE; + else if (has_cache) /* someone else added cache */ + err = -EEXIST; + else /* no users remaining */ + err = -ENOENT; } else if (count || has_cache) { - if (count < SWAP_MAP_MAX - 1) { - p->swap_map[offset] = encode_swapmap(count + 1, - has_cache); - result = 0; - } else if (count <= SWAP_MAP_MAX) { - if (swap_overflow++ < 5) - printk(KERN_WARNING - "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, - has_cache); - result = 0; - } + + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + count += usage; + else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) + err = -EINVAL; + else if (swap_count_continued(p, offset, count)) + count = COUNT_CONTINUED; + else + err = -ENOMEM; } else - result = -ENOENT; /* unused swap entry */ + err = -ENOENT; /* unused swap entry */ + + p->swap_map[offset] = count | has_cache; + unlock_out: spin_unlock(&swap_lock); out: - return result; + return err; bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } + +/* + * Help swapoff by noting that swap entry belongs to shmem/tmpfs + * (in which case its reference count is never incremented). + */ +void swap_shmem_alloc(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP_SHMEM); +} + /* * increase reference count of swap entry by 1. */ -void swap_duplicate(swp_entry_t entry) +int swap_duplicate(swp_entry_t entry) { - __swap_duplicate(entry, SWAP_MAP); + int err = 0; + + while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + return err; } /* * @entry: swap entry for which we allocate swap cache. * - * Called when allocating swap cache for exising swap entry, + * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. * -EBUSY means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { - return __swap_duplicate(entry, SWAP_CACHE); -} - - -struct swap_info_struct * -get_swap_info_struct(unsigned type) -{ - return &swap_info[type]; + return __swap_duplicate(entry, SWAP_HAS_CACHE); } /* @@ -2181,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) if (!our_page_cluster) /* no readahead */ return 0; - si = &swap_info[swp_type(entry)]; + si = swap_info[swp_type(entry)]; target = swp_offset(entry); base = (target >> our_page_cluster) << our_page_cluster; end = base + (1 << our_page_cluster); @@ -2217,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) *offset = ++toff; return nr_pages? ++nr_pages: 0; } + +/* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's + * page of the original vmalloc'ed swap_map, to hold the continuation count + * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called + * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. + * + * These continuation pages are seldom referenced: the common paths all work + * on the original swap_map, only referring to a continuation page when the + * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. + * + * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding + * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) + * can be called after dropping locks. + */ +int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) +{ + struct swap_info_struct *si; + struct page *head; + struct page *page; + struct page *list_page; + pgoff_t offset; + unsigned char count; + + /* + * When debugging, it's easier to use __GFP_ZERO here; but it's better + * for latency not to zero a page while GFP_ATOMIC and holding locks. + */ + page = alloc_page(gfp_mask | __GFP_HIGHMEM); + + si = swap_info_get(entry); + if (!si) { + /* + * An acceptable race has occurred since the failing + * __swap_duplicate(): the swap entry has been freed, + * perhaps even the whole swap_map cleared for swapoff. + */ + goto outer; + } + + offset = swp_offset(entry); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; + + if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { + /* + * The higher the swap count, the more likely it is that tasks + * will race to add swap count continuation: we need to avoid + * over-provisioning. + */ + goto out; + } + + if (!page) { + spin_unlock(&swap_lock); + return -ENOMEM; + } + + /* + * We are fortunate that although vmalloc_to_page uses pte_offset_map, + * no architecture is using highmem pages for kernel pagetables: so it + * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + */ + head = vmalloc_to_page(si->swap_map + offset); + offset &= ~PAGE_MASK; + + /* + * Page allocation does not initialize the page's lru field, + * but it does always reset its private field. + */ + if (!page_private(head)) { + BUG_ON(count & COUNT_CONTINUED); + INIT_LIST_HEAD(&head->lru); + set_page_private(head, SWP_CONTINUED); + si->flags |= SWP_CONTINUED; + } + + list_for_each_entry(list_page, &head->lru, lru) { + unsigned char *map; + + /* + * If the previous map said no continuation, but we've found + * a continuation page, free our allocation and use this one. + */ + if (!(count & COUNT_CONTINUED)) + goto out; + + map = kmap_atomic(list_page, KM_USER0) + offset; + count = *map; + kunmap_atomic(map, KM_USER0); + + /* + * If this continuation count now has some space in it, + * free our allocation and use this one. + */ + if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) + goto out; + } + + list_add_tail(&page->lru, &head->lru); + page = NULL; /* now it's attached, don't free it */ +out: + spin_unlock(&swap_lock); +outer: + if (page) + __free_page(page); + return 0; +} + +/* + * swap_count_continued - when the original swap_map count is incremented + * from SWAP_MAP_MAX, check if there is already a continuation page to carry + * into, carry if so, or else fail until a new continuation page is allocated; + * when the original swap_map count is decremented from 0 with continuation, + * borrow from the continuation and report whether it still holds more. + * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + */ +static bool swap_count_continued(struct swap_info_struct *si, + pgoff_t offset, unsigned char count) +{ + struct page *head; + struct page *page; + unsigned char *map; + + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head) != SWP_CONTINUED) { + BUG_ON(count & COUNT_CONTINUED); + return false; /* need to add count continuation */ + } + + offset &= ~PAGE_MASK; + page = list_entry(head->lru.next, struct page, lru); + map = kmap_atomic(page, KM_USER0) + offset; + + if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ + goto init_map; /* jump over SWAP_CONT_MAX checks */ + + if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ + /* + * Think of how you add 1 to 999 + */ + while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + if (*map == SWAP_CONT_MAX) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + if (page == head) + return false; /* add count continuation */ + map = kmap_atomic(page, KM_USER0) + offset; +init_map: *map = 0; /* we didn't zero the page */ + } + *map += 1; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return true; /* incremented */ + + } else { /* decrementing */ + /* + * Think of how you subtract 1 from 1000 + */ + BUG_ON(count != COUNT_CONTINUED); + while (*map == COUNT_CONTINUED) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + BUG_ON(*map == 0); + *map -= 1; + if (*map == 0) + count = 0; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = SWAP_CONT_MAX | count; + count = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return count == COUNT_CONTINUED; + } +} + +/* + * free_swap_count_continuations - swapoff free all the continuation pages + * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. + */ +static void free_swap_count_continuations(struct swap_info_struct *si) +{ + pgoff_t offset; + + for (offset = 0; offset < si->max; offset += PAGE_SIZE) { + struct page *head; + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head)) { + struct list_head *this, *next; + list_for_each_safe(this, next, &head->lru) { + struct page *page; + page = list_entry(this, struct page, lru); + list_del(this); + __free_page(page); + } + } + } +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f551a4..37e6929 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -761,7 +761,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) spin_lock(&vbq->lock); list_add(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_cpu_blocks); + put_cpu_var(vmap_block_queue); return vb; } @@ -826,7 +826,7 @@ again: } spin_unlock(&vb->lock); } - put_cpu_var(vmap_cpu_blocks); + put_cpu_var(vmap_block_queue); rcu_read_unlock(); if (!addr) { @@ -1411,6 +1411,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { struct page **pages; unsigned int nr_pages, array_size, i; + gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1418,13 +1419,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { - pages = kmalloc_node(array_size, - (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, - node); + pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; area->caller = caller; diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57..885207a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,11 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -66,12 +71,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; - int swappiness; int all_unreclaimable; @@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_aio_write() against * this page's queue, we can perform writeback even if that * will block. * @@ -1132,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_anon; unsigned long nr_file; - nr_taken = sc->isolate_pages(sc->swap_cluster_max, + nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); @@ -1166,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; spin_unlock_irq(&zone->lru_lock); @@ -1464,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, + int file) +{ + if (file) + return inactive_file_is_low(zone, sc); + else + return inactive_anon_is_low(zone, sc); +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (is_active_lru(lru)) { + if (inactive_list_is_low(zone, sc, file)) + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); - return 0; - } return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } @@ -1567,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * until we collected @swap_cluster_max pages to scan. */ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan, - unsigned long swap_cluster_max) + unsigned long *nr_saved_scan) { unsigned long nr; *nr_saved_scan += nr_to_scan; nr = *nr_saved_scan; - if (nr >= swap_cluster_max) + if (nr >= SWAP_CLUSTER_MAX) *nr_saved_scan = 0; else nr = 0; @@ -1594,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; - unsigned long swap_cluster_max = sc->swap_cluster_max; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; @@ -1616,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone, scan = (scan * percent[file]) / 100; } nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l], - swap_cluster_max); + &reclaim_stat->nr_saved_scan[l]); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], swap_cluster_max); + nr_to_scan = min_t(unsigned long, + nr[l], SWAP_CLUSTER_MAX); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, @@ -1639,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > swap_cluster_max && - priority < DEF_PRIORITY && !current_is_kswapd()) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } @@ -1738,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long writeback_threshold; delayacct_freepages_start(); @@ -1773,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->swap_cluster_max) { + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ret = sc->nr_reclaimed; goto out; } @@ -1785,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) + if (!sc->hibernation_mode && sc->nr_scanned && + priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ @@ -1831,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, @@ -1855,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, @@ -1889,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, @@ -1904,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* is kswapd sleeping prematurely? */ +static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +{ + int i; + + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return 1; + + /* If after HZ/10, a zone is below the high mark, it's premature */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + 0, 0)) + return 1; + } + + return 0; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -1936,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, + /* + * kswapd doesn't want to be bailed out while reclaim. because + * we want to put equal scanning pressure on each zone. + */ + .nr_to_reclaim = ULONG_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1961,6 +1991,7 @@ loop_again: for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + int has_under_min_watermark_zone = 0; /* The swap token gets in the way of swapout... */ if (!priority) @@ -2067,6 +2098,15 @@ loop_again: if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + + /* + * We are still under min water mark. it mean we have + * GFP_ATOMIC allocation failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), + end_zone, 0)) + has_under_min_watermark_zone = 1; + } if (all_zones_ok) break; /* kswapd: all done */ @@ -2074,8 +2114,12 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (has_under_min_watermark_zone) + count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); + else + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* * We do this so kswapd doesn't build up large priorities for @@ -2173,6 +2217,7 @@ static int kswapd(void *p) order = 0; for ( ; ; ) { unsigned long new_order; + int ret; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; @@ -2184,19 +2229,45 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current)) - schedule(); + if (!freezing(current) && !kthread_should_stop()) { + long remaining = 0; + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a + * premature sleep. If not, then go fully + * to sleep until explicitly woken up + */ + if (!sleeping_prematurely(pgdat, order, remaining)) + schedule(); + else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + } order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) balance_pgdat(pgdat, order); - } } return 0; } @@ -2260,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) #ifdef CONFIG_HIBERNATION /* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority. - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages - */ -static void shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) -{ - struct zone *zone; - unsigned long nr_reclaimed = 0; - struct zone_reclaim_stat *reclaim_stat; - - for_each_populated_zone(zone) { - enum lru_list l; - - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; - - for_each_evictable_lru(l) { - enum zone_stat_item ls = NR_LRU_BASE + l; - unsigned long lru_pages = zone_page_state(zone, ls); - - /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && (l == LRU_ACTIVE_ANON || - l == LRU_ACTIVE_FILE)) - continue; - - reclaim_stat = get_reclaim_stat(zone, sc); - reclaim_stat->nr_saved_scan[l] += - (lru_pages >> prio) + 1; - if (reclaim_stat->nr_saved_scan[l] - >= nr_pages || pass > 3) { - unsigned long nr_to_scan; - - reclaim_stat->nr_saved_scan[l] = 0; - nr_to_scan = min(nr_pages, lru_pages); - nr_reclaimed += shrink_list(l, nr_to_scan, zone, - sc, prio); - if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed += nr_reclaimed; - return; - } - } - } - } - sc->nr_reclaimed += nr_reclaimed; -} - -/* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_unmap = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .swappiness = vm_swappiness, + .order = 0, .isolate_pages = isolate_pages_global, - .nr_reclaimed = 0, }; + struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = global_reclaimable_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) - sc.may_unmap = 1; - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; - - sc.nr_scanned = 0; - sc.swap_cluster_max = nr_to_scan; - shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (sc.nr_reclaimed >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ / 10); - } - } - - /* - * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be - * something in slab caches - */ - if (!sc.nr_reclaimed) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - } while (sc.nr_reclaimed < nr_pages && - reclaim_state.reclaimed_slab > 0); - } + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return sc.nr_reclaimed; + return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ @@ -2451,6 +2417,17 @@ int kswapd_run(int nid) return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. + */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) + kthread_stop(kswapd); +} + static int __init kswapd_init(void) { int nid; @@ -2553,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + .nr_to_reclaim = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, diff --git a/mm/vmstat.c b/mm/vmstat.c index c81321f..6051fba 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -683,6 +683,9 @@ static const char * const vmstat_text[] = { "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", + "kswapd_low_wmark_hit_quickly", + "kswapd_high_wmark_hit_quickly", + "kswapd_skip_congestion_wait", "pageoutrun", "allocstall", @@ -883,11 +886,10 @@ static void vmstat_update(struct work_struct *w) static void __cpuinit start_cpu_timer(int cpu) { - struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); + struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); - schedule_delayed_work_on(cpu, vmstat_work, - __round_jiffies_relative(HZ, cpu)); + INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); + schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } /* |