summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorJérôme Glisse <jglisse@redhat.com>2017-09-08 16:12:17 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-08 18:26:46 -0700
commita5430dda8a3a1cdd532e37270e6f36436241b6e7 (patch)
tree86a2edaf2d6aa11f4e76bcc3b0103cfb0bb2de09 /mm
parent8c3328f1f36a5efe817ad4e06497af601936a460 (diff)
downloadop-kernel-dev-a5430dda8a3a1cdd532e37270e6f36436241b6e7.zip
op-kernel-dev-a5430dda8a3a1cdd532e37270e6f36436241b6e7.tar.gz
mm/migrate: support un-addressable ZONE_DEVICE page in migration
Allow unmapping and restoring the special swap entries used for un-addressable ZONE_DEVICE memory.

Link: http://lkml.kernel.org/r/20170817000548.32038-17-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/migrate.c149
-rw-r--r--mm/page_vma_mapped.c10
-rw-r--r--mm/rmap.c26
3 files changed, 157 insertions, 28 deletions
diff --git a/mm/migrate.c b/mm/migrate.c
index 652b2c6..77cb2fef 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/memremap.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -237,7 +238,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- flush_dcache_page(new);
+ if (unlikely(is_zone_device_page(new)) &&
+ is_device_private_page(new)) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ } else
+ flush_dcache_page(new);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -2205,17 +2212,40 @@ again:
pte = *ptep;
pfn = pte_pfn(pte);
- if (!pte_present(pte)) {
+ if (pte_none(pte)) {
mpfn = pfn = 0;
goto next;
}
+ if (!pte_present(pte)) {
+ mpfn = pfn = 0;
+
+ /*
+ * Only care about the special page table entries of
+ * un-addressable device pages. Other special swap entries are
+ * not migratable, and we ignore regular swapped pages.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = device_private_entry_to_page(entry);
+ mpfn = migrate_pfn(page_to_pfn(page))|
+ MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ page = vm_normal_page(migrate->vma, addr, pte);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
/* FIXME support THP */
- page = vm_normal_page(migrate->vma, addr, pte);
if (!page || !page->mapping || PageTransCompound(page)) {
mpfn = pfn = 0;
goto next;
}
+ pfn = page_to_pfn(page);
/*
* By getting a reference on the page we pin it and that blocks
@@ -2228,8 +2258,6 @@ again:
*/
get_page(page);
migrate->cpages++;
- mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
/*
* Optimize for the common case where page is only mapped once
@@ -2256,10 +2284,13 @@ again:
*/
page_remove_rmap(page, false);
put_page(page);
- unmapped++;
+
+ if (pte_present(pte))
+ unmapped++;
}
next:
+ migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
arch_leave_lazy_mmu_mode();
@@ -2329,6 +2360,28 @@ static bool migrate_vma_check_page(struct page *page)
if (PageCompound(page))
return false;
+ /* Pages from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page)) {
+ /*
+ * Private pages can never be pinned as they have no valid pte and
+ * GUP will fail for those. Yet if there is a pending migration
+ * a thread might try to wait on the pte migration entry and
+ * will bump the page reference count. Sadly there is no way to
+ * differentiate a regular pin from a migration wait. Hence, to
+ * avoid two racing threads trying to migrate back to the CPU
+ * entering an infinite loop (one stopping migration because the
+ * other is waiting on the pte migration entry), we always return
+ * true here.
+ *
+ * FIXME: the proper solution is to rework migration_entry_wait()
+ * so that it does not need to take a reference on the page.
+ */
+ if (is_device_private_page(page))
+ return true;
+
+ /* Other ZONE_DEVICE memory types are not supported */
+ return false;
+ }
+
if ((page_count(page) - extra) > page_mapcount(page))
return false;
@@ -2379,24 +2432,30 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
migrate->src[i] |= MIGRATE_PFN_LOCKED;
}
- if (!PageLRU(page) && allow_drain) {
- /* Drain CPU's pagevec */
- lru_add_drain_all();
- allow_drain = false;
- }
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
- if (isolate_lru_page(page)) {
- if (remap) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
- } else {
- migrate->src[i] = 0;
- unlock_page(page);
- migrate->cpages--;
- put_page(page);
+ if (isolate_lru_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+ put_page(page);
+ }
+ continue;
}
- continue;
+
+ /* Drop the reference we took in collect */
+ put_page(page);
}
if (!migrate_vma_check_page(page)) {
@@ -2405,14 +2464,19 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
migrate->cpages--;
restore++;
- get_page(page);
- putback_lru_page(page);
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
} else {
migrate->src[i] = 0;
unlock_page(page);
migrate->cpages--;
- putback_lru_page(page);
+ if (!is_zone_device_page(page))
+ putback_lru_page(page);
+ else
+ put_page(page);
}
}
}
@@ -2483,7 +2547,10 @@ restore:
unlock_page(page);
restore--;
- putback_lru_page(page);
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
}
}
@@ -2514,6 +2581,26 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
mapping = page_mapping(page);
+ if (is_zone_device_page(newpage)) {
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now only support private anonymous when
+ * migrating to un-addressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ }
+
r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -2554,11 +2641,17 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
unlock_page(page);
migrate->cpages--;
- putback_lru_page(page);
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
if (newpage != page) {
unlock_page(newpage);
- putback_lru_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
}
}
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 3bd3008..6a03946 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
if (!is_swap_pte(*pvmw->pte))
return false;
entry = pte_to_swp_entry(*pvmw->pte);
+
if (!is_migration_entry(entry))
return false;
if (migration_entry_to_page(entry) - pvmw->page >=
@@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
WARN_ON_ONCE(1);
#endif
} else {
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (is_device_private_entry(entry) &&
+ device_private_entry_to_page(entry) == pvmw->page)
+ return true;
+ }
+
if (!pte_present(*pvmw->pte))
return false;
diff --git a/mm/rmap.c b/mm/rmap.c
index 7dc9c02..0618cd8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -63,6 +63,7 @@
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
+#include <linux/memremap.h>
#include <asm/tlbflush.h>
@@ -1346,6 +1347,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
flags & TTU_SPLIT_FREEZE, page);
@@ -1403,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address = pvmw.address;
+ if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, 0);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ goto discard;
+ }
+
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
OpenPOWER on IntegriCloud