diff options
author | Jérôme Glisse <jglisse@redhat.com> | 2017-09-08 16:12:24 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-08 18:26:46 -0700 |
commit | df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609 (patch) | |
tree | d5774eba9a9c2204123b8ca36d9cba90bfa9ad64 /mm | |
parent | 8315ada7f095bfa2cae0cd1e915b95bf6226897d (diff) | |
download | op-kernel-dev-df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609.zip op-kernel-dev-df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609.tar.gz |
mm/device-public-memory: device memory cache coherent with CPU
Platform with advance system bus (like CAPI or CCIX) allow device memory
to be accessible from CPU in a cache coherent fashion. Add a new type of
ZONE_DEVICE to represent such memory. The use case are the same as for
the un-addressable device memory but without all the corners cases.
Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 11 | ||||
-rw-r--r-- | mm/gup.c | 7 | ||||
-rw-r--r-- | mm/hmm.c | 4 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 12 | ||||
-rw-r--r-- | mm/memory.c | 46 | ||||
-rw-r--r-- | mm/migrate.c | 57 | ||||
-rw-r--r-- | mm/swap.c | 11 |
8 files changed, 118 insertions, 32 deletions
@@ -720,12 +720,23 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ARCH_HAS_HMM + select HMM help Allows creation of struct pages to represent unaddressable device memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool @@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). + */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: @@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, unsigned long addr) { @@ -1177,4 +1177,4 @@ static int __init hmm_init(void) } device_initcall(hmm_init); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/madvise.c b/mm/madvise.c index eea1c73..21261ff 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8aa98f9..126a939b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4623,10 +4623,11 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). For now we such page is - * charge like a regular page would be as for all intent and purposes it is - * just special memory taking the place of a regular page. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. * * See Documentations/vm/hmm.txt and include/linux/hmm.h * @@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_public_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory.c b/mm/memory.c index 079eeac..ad0ea1a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). + * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1267,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to diff --git a/mm/migrate.c b/mm/migrate.c index e581253..618aeb5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/balloon_compaction.h> @@ -239,10 +240,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - if (unlikely(is_zone_device_page(new)) && - is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } } else flush_dcache_page(new); @@ -437,12 +442,11 @@ int migrate_page_move_mapping(struct address_space *mapping, void **pslot; /* - * ZONE_DEVICE pages have 1 refcount always held by their device - * - * Note that DAX memory will never reach that point as it does not have - * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h). + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. */ - expected_count += is_zone_device_page(page); + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); if (!mapping) { /* Anonymous page without mapping */ @@ -2123,7 +2127,6 @@ out_unlock: #endif /* CONFIG_NUMA */ - struct migrate_vma { struct vm_area_struct *vma; unsigned long *dst; @@ -2263,7 +2266,7 @@ again: pfn = 0; goto next; } - page = vm_normal_page(migrate->vma, addr, pte); + page = _vm_normal_page(migrate->vma, addr, pte, true); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2406,10 +2409,19 @@ static bool migrate_vma_check_page(struct page *page) if (is_device_private_page(page)) return true; - /* Other ZONE_DEVICE memory type are not supported */ - return false; + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. + */ + if (!is_device_public_page(page)) + return false; + extra++; } + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + if ((page_count(page) - extra) > page_mapcount(page)) return false; @@ -2647,11 +2659,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page) && is_device_private_page(page)) { - swp_entry_t swp_entry; - - swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); - entry = swp_entry_to_pte(swp_entry); + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -2768,7 +2787,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else { + } else if (!is_device_public_page(newpage)) { /* * Other types of ZONE_DEVICE page are not * supported. @@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold) if (is_huge_zero_page(page)) continue; + /* Device public page can not be huge page */ + if (is_device_public_page(page)) { + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, + flags); + locked_pgdat = NULL; + } + put_zone_device_private_or_public_page(page); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; |