summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/ksm.h54
-rw-r--r--include/linux/rmap.h5
-rw-r--r--mm/ksm.c172
-rw-r--r--mm/memory.c6
-rw-r--r--mm/rmap.c65
-rw-r--r--mm/swapfile.c11
6 files changed, 264 insertions, 49 deletions
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index ef55ce1..157d83d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -9,10 +9,12 @@
#include <linux/bitops.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
#include <linux/sched.h>
-#include <linux/vmstat.h>
struct stable_node;
+struct mem_cgroup;
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -57,11 +59,36 @@ static inline void set_page_stable_node(struct page *page,
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
-static inline void page_add_ksm_rmap(struct page *page)
+/*
+ * When do_swap_page() first faults in from swap what used to be a KSM page,
+ * no problem, it will be assigned to this vma's anon_vma; but thereafter,
+ * it might be faulted into a different anon_vma (or perhaps to a different
+ * offset in the same anon_vma). do_swap_page() cannot do all the locking
+ * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
+ * a copy, and leave remerging the pages to a later pass of ksmd.
+ *
+ * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
+ * but what if the vma was unmerged while the page was swapped out?
+ */
+struct page *ksm_does_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address);
+static inline struct page *ksm_might_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
{
- if (atomic_inc_and_test(&page->_mapcount))
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ struct anon_vma *anon_vma = page_anon_vma(page);
+
+ if (!anon_vma ||
+ (anon_vma == vma->anon_vma &&
+ page->index == linear_page_index(vma, address)))
+ return page;
+
+ return ksm_does_need_to_copy(page, vma, address);
}
+
+int page_referenced_ksm(struct page *page,
+ struct mem_cgroup *memcg, unsigned long *vm_flags);
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
+
#else /* !CONFIG_KSM */
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -84,7 +111,22 @@ static inline int PageKsm(struct page *page)
return 0;
}
-/* No stub required for page_add_ksm_rmap(page) */
+static inline struct page *ksm_might_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return page;
+}
+
+static inline int page_referenced_ksm(struct page *page,
+ struct mem_cgroup *memcg, unsigned long *vm_flags)
+{
+ return 0;
+}
+
+static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+ return 0;
+}
#endif /* !CONFIG_KSM */
-#endif
+#endif /* __LINUX_KSM_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1f65af4..0b4913a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ static inline void page_dup_rmap(struct page *page)
*/
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *cnt, unsigned long *vm_flags);
+int page_referenced_one(struct page *, struct vm_area_struct *,
+ unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
+
enum ttu_flags {
TTU_UNMAP = 0, /* unmap mode */
TTU_MIGRATION = 1, /* migration mode */
@@ -102,6 +105,8 @@ enum ttu_flags {
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
int try_to_unmap(struct page *, enum ttu_flags flags);
+int try_to_unmap_one(struct page *, struct vm_area_struct *,
+ unsigned long address, enum ttu_flags flags);
/*
* Called from mm/filemap_xip.c to unmap empty zero page
diff --git a/mm/ksm.c b/mm/ksm.c
index af5f571..2f58cee 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -196,6 +196,13 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
+/*
+ * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
+ * later we rework things a little to get the right vma to them.
+ */
+static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
+static struct vm_area_struct ksm_fallback_vma;
+
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
@@ -445,14 +452,20 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
struct stable_node *stable_node;
+ struct page *page;
stable_node = rmap_item->head;
+ page = stable_node->page;
+ lock_page(page);
+
hlist_del(&rmap_item->hlist);
- if (stable_node->hlist.first)
+ if (stable_node->hlist.first) {
+ unlock_page(page);
ksm_pages_sharing--;
- else {
- set_page_stable_node(stable_node->page, NULL);
- put_page(stable_node->page);
+ } else {
+ set_page_stable_node(page, NULL);
+ unlock_page(page);
+ put_page(page);
rb_erase(&stable_node->node, &root_stable_tree);
free_stable_node(stable_node);
@@ -710,7 +723,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}
get_page(kpage);
- page_add_ksm_rmap(kpage);
+ page_add_anon_rmap(kpage, vma, addr);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
@@ -763,8 +776,16 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
- if ((vma->vm_flags & VM_LOCKED) && !err)
+ if ((vma->vm_flags & VM_LOCKED) && !err) {
munlock_vma_page(page);
+ if (!PageMlocked(kpage)) {
+ unlock_page(page);
+ lru_add_drain();
+ lock_page(kpage);
+ mlock_vma_page(kpage);
+ page = kpage; /* for final unlock */
+ }
+ }
unlock_page(page);
out:
@@ -841,7 +862,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
copy_user_highpage(kpage, page, rmap_item->address, vma);
+ SetPageDirty(kpage);
+ __SetPageUptodate(kpage);
+ SetPageSwapBacked(kpage);
set_page_stable_node(kpage, NULL); /* mark it PageKsm */
+ lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
err = try_to_merge_one_page(vma, page, kpage);
up:
@@ -1071,7 +1096,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
+ lock_page(kpage);
stable_tree_append(rmap_item, stable_node);
+ unlock_page(kpage);
}
put_page(kpage);
return;
@@ -1112,11 +1139,13 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
if (kpage) {
remove_rmap_item_from_tree(tree_rmap_item);
+ lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node);
stable_tree_append(rmap_item, stable_node);
}
+ unlock_page(kpage);
put_page(kpage);
/*
@@ -1285,14 +1314,6 @@ static void ksm_do_scan(unsigned int scan_npages)
return;
if (!PageKsm(page) || !in_stable_tree(rmap_item))
cmp_and_merge_page(page, rmap_item);
- else if (page_mapcount(page) == 1) {
- /*
- * Replace now-unshared ksm page by ordinary page.
- */
- break_cow(rmap_item);
- remove_rmap_item_from_tree(rmap_item);
- rmap_item->oldchecksum = calc_checksum(page);
- }
put_page(page);
}
}
@@ -1337,7 +1358,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
+ VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
return 0; /* just ignore the advice */
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1435,6 +1456,127 @@ void __ksm_exit(struct mm_struct *mm)
}
}
+struct page *ksm_does_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct page *new_page;
+
+ unlock_page(page); /* any racers will COW it, not modify it */
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page) {
+ copy_user_highpage(new_page, page, address, vma);
+
+ SetPageDirty(new_page);
+ __SetPageUptodate(new_page);
+ SetPageSwapBacked(new_page);
+ __set_page_locked(new_page);
+
+ if (page_evictable(new_page, vma))
+ lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
+ else
+ add_page_to_unevictable_list(new_page);
+ }
+
+ page_cache_release(page);
+ return new_page;
+}
+
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
+{
+ struct stable_node *stable_node;
+ struct rmap_item *rmap_item;
+ struct hlist_node *hlist;
+ unsigned int mapcount = page_mapcount(page);
+ int referenced = 0;
+ struct vm_area_struct *vma;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return 0;
+
+ /*
+ * Temporary hack: really we need anon_vma in rmap_item, to
+ * provide the correct vma, and to find recently forked instances.
+ * Use zalloc to avoid weirdness if any other fields are involved.
+ */
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+ if (!vma) {
+ spin_lock(&ksm_fallback_vma_lock);
+ vma = &ksm_fallback_vma;
+ }
+
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
+ continue;
+
+ vma->vm_mm = rmap_item->mm;
+ vma->vm_start = rmap_item->address;
+ vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+ referenced += page_referenced_one(page, vma,
+ rmap_item->address, &mapcount, vm_flags);
+ if (!mapcount)
+ goto out;
+ }
+out:
+ if (vma == &ksm_fallback_vma)
+ spin_unlock(&ksm_fallback_vma_lock);
+ else
+ kmem_cache_free(vm_area_cachep, vma);
+ return referenced;
+}
+
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+ struct stable_node *stable_node;
+ struct hlist_node *hlist;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ struct vm_area_struct *vma;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return SWAP_FAIL;
+
+ /*
+ * Temporary hack: really we need anon_vma in rmap_item, to
+ * provide the correct vma, and to find recently forked instances.
+ * Use zalloc to avoid weirdness if any other fields are involved.
+ */
+ if (TTU_ACTION(flags) != TTU_UNMAP)
+ return SWAP_FAIL;
+
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+ if (!vma) {
+ spin_lock(&ksm_fallback_vma_lock);
+ vma = &ksm_fallback_vma;
+ }
+
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ vma->vm_mm = rmap_item->mm;
+ vma->vm_start = rmap_item->address;
+ vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+ ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page))
+ goto out;
+ }
+out:
+ if (vma == &ksm_fallback_vma)
+ spin_unlock(&ksm_fallback_vma_lock);
+ else
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
+}
+
#ifdef CONFIG_SYSFS
/*
* This all compiles without CONFIG_SYSFS, but is a waste of space.
diff --git a/mm/memory.c b/mm/memory.c
index 1c9dc46..a54b2c4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2561,6 +2561,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ page = ksm_might_need_to_copy(page, vma, address);
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
goto out_page;
diff --git a/mm/rmap.c b/mm/rmap.c
index ebee816..869aaa3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
@@ -336,9 +337,9 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
* Subfunctions of page_referenced: page_referenced_one called
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, unsigned int *mapcount,
- unsigned long *vm_flags)
+int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *mapcount,
+ unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
@@ -507,28 +508,33 @@ int page_referenced(struct page *page,
unsigned long *vm_flags)
{
int referenced = 0;
+ int we_locked = 0;
if (TestClearPageReferenced(page))
referenced++;
*vm_flags = 0;
if (page_mapped(page) && page_rmapping(page)) {
- if (PageAnon(page))
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked) {
+ referenced++;
+ goto out;
+ }
+ }
+ if (unlikely(PageKsm(page)))
+ referenced += page_referenced_ksm(page, mem_cont,
+ vm_flags);
+ else if (PageAnon(page))
referenced += page_referenced_anon(page, mem_cont,
vm_flags);
- else if (is_locked)
+ else if (page->mapping)
referenced += page_referenced_file(page, mem_cont,
vm_flags);
- else if (!trylock_page(page))
- referenced++;
- else {
- if (page->mapping)
- referenced += page_referenced_file(page,
- mem_cont, vm_flags);
+ if (we_locked)
unlock_page(page);
- }
}
-
+out:
if (page_test_and_clear_young(page))
referenced++;
@@ -620,14 +626,7 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
-
page->index = linear_page_index(vma, address);
-
- /*
- * nr_mapped state can be updated without turning off
- * interrupts because it is not modified via interrupt.
- */
- __inc_zone_page_state(page, NR_ANON_PAGES);
}
/**
@@ -665,14 +664,21 @@ static void __page_check_anon_rmap(struct page *page,
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the pte lock and the page must be locked.
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ int first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (unlikely(PageKsm(page)))
+ return;
+
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- if (atomic_inc_and_test(&page->_mapcount))
+ if (first)
__page_set_anon_rmap(page, vma, address);
else
__page_check_anon_rmap(page, vma, address);
@@ -694,6 +700,7 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+ __inc_zone_page_state(page, NR_ANON_PAGES);
__page_set_anon_rmap(page, vma, address);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -760,8 +767,8 @@ void page_remove_rmap(struct page *page)
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, enum ttu_flags flags)
+int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, enum ttu_flags flags)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
@@ -1156,7 +1163,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
BUG_ON(!PageLocked(page));
- if (PageAnon(page))
+ if (unlikely(PageKsm(page)))
+ ret = try_to_unmap_ksm(page, flags);
+ else if (PageAnon(page))
ret = try_to_unmap_anon(page, flags);
else
ret = try_to_unmap_file(page, flags);
@@ -1177,15 +1186,17 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
*
* SWAP_AGAIN - no vma is holding page mlocked, or,
* SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL - page cannot be located at present
* SWAP_MLOCK - page is now mlocked.
*/
int try_to_munlock(struct page *page)
{
VM_BUG_ON(!PageLocked(page) || PageLRU(page));
- if (PageAnon(page))
+ if (unlikely(PageKsm(page)))
+ return try_to_unmap_ksm(page, TTU_MUNLOCK);
+ else if (PageAnon(page))
return try_to_unmap_anon(page, TTU_MUNLOCK);
else
return try_to_unmap_file(page, TTU_MUNLOCK);
}
-
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e74112e..6c0585b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
@@ -650,6 +651,8 @@ int reuse_swap_page(struct page *page)
int count;
VM_BUG_ON(!PageLocked(page));
+ if (unlikely(PageKsm(page)))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@@ -658,7 +661,7 @@ int reuse_swap_page(struct page *page)
SetPageDirty(page);
}
}
- return count == 1;
+ return count <= 1;
}
/*
@@ -1185,6 +1188,12 @@ static int try_to_unuse(unsigned int type)
* read from disk into another page. Splitting into two
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
+ *
+ * Given how unuse_vma() targets one particular offset
+ * in an anon_vma, once the anon_vma has been determined,
+ * this splitting happens to be just what is needed to
+ * handle where KSM pages have been swapped out: re-reading
+ * is unnecessarily slow, but we can fix that later on.
*/
if (swap_count(*swap_map) &&
PageDirty(page) && PageSwapCache(page)) {
OpenPOWER on IntegriCloud