From 73e64c51afc56d4863ae225e947ba2f16ad04487 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 14 Dec 2016 15:04:07 -0800 Subject: mm, compaction: allow compaction for GFP_NOFS requests compaction has been disabled for GFP_NOFS and GFP_NOIO requests since the direct compaction was introduced by commit 56de7263fcf3 ("mm: compaction: direct compact when a high-order allocation fails"). The main reason is that the migration of page cache pages might recurse back to fs/io layer and we could potentially deadlock. This is overly conservative because all the anonymous memory is migrateable in the GFP_NOFS context just fine. This might be a large portion of the memory in many/most workkloads. Remove the GFP_NOFS restriction and make sure that we skip all fs pages (those with a mapping) while isolating pages to be migrated. We cannot consider clean fs pages because they might need a metadata update so only isolate pages without any mapping for nofs requests. The effect of this patch will be probably very limited in many/most workloads because higher order GFP_NOFS requests are quite rare, although different configurations might lead to very different results. David Chinner has mentioned a heavy metadata workload with 64kB block which to quote him: : Unfortunately, there was an era of cargo cult configuration tweaks in the : Ceph community that has resulted in a large number of production machines : with XFS filesystems configured this way. And a lot of them store large : numbers of small files and run under significant sustained memory : pressure. : : I slowly working towards getting rid of these high order allocations and : replacing them with the equivalent number of single page allocations, but : I haven't got that (complex) change working yet. We can do the following to simulate that workload: $ mkfs.xfs -f -n size=64k $ mount /mnt/scratch $ time ./fs_mark -D 10000 -S0 -n 100000 -s 0 -L 32 \ -d /mnt/scratch/0 -d /mnt/scratch/1 \ -d /mnt/scratch/2 -d /mnt/scratch/3 \ -d /mnt/scratch/4 -d /mnt/scratch/5 \ -d /mnt/scratch/6 -d /mnt/scratch/7 \ -d /mnt/scratch/8 -d /mnt/scratch/9 \ -d /mnt/scratch/10 -d /mnt/scratch/11 \ -d /mnt/scratch/12 -d /mnt/scratch/13 \ -d /mnt/scratch/14 -d /mnt/scratch/15 and indeed is hammers the system with many high order GFP_NOFS requests as per a simle tracepoint during the load: $ echo '!(gfp_flags & 0x80) && (gfp_flags &0x400000)' > $TRACE_MNT/events/kmem/mm_page_alloc/filter I am getting 5287609 order=0 37 order=1 1594905 order=2 3048439 order=3 6699207 order=4 66645 order=5 My testing was done in a kvm guest so performance numbers should be taken with a grain of salt but there seems to be a difference when the patch is applied: * Original kernel FSUse% Count Size Files/sec App Overhead 1 1600000 0 4300.1 20745838 3 3200000 0 4239.9 23849857 5 4800000 0 4243.4 25939543 6 6400000 0 4248.4 19514050 8 8000000 0 4262.1 20796169 9 9600000 0 4257.6 21288675 11 11200000 0 4259.7 19375120 13 12800000 0 4220.7 22734141 14 14400000 0 4238.5 31936458 16 16000000 0 4231.5 23409901 18 17600000 0 4045.3 23577700 19 19200000 0 2783.4 58299526 21 20800000 0 2678.2 40616302 23 22400000 0 2693.5 83973996 and xfs complaining about memory allocation not making progress [ 2304.372647] XFS: fs_mark(3289) possible memory allocation deadlock size 65624 in kmem_alloc (mode:0x2408240) [ 2304.443323] XFS: fs_mark(3285) possible memory allocation deadlock size 65728 in kmem_alloc (mode:0x2408240) [ 4796.772477] XFS: fs_mark(3424) possible memory allocation deadlock size 46936 in 
kmem_alloc (mode:0x2408240) [ 4796.775329] XFS: fs_mark(3423) possible memory allocation deadlock size 51416 in kmem_alloc (mode:0x2408240) [ 4797.388808] XFS: fs_mark(3424) possible memory allocation deadlock size 65728 in kmem_alloc (mode:0x2408240) * Patched kernel FSUse% Count Size Files/sec App Overhead 1 1600000 0 4289.1 19243934 3 3200000 0 4241.6 32828865 5 4800000 0 4248.7 32884693 6 6400000 0 4314.4 19608921 8 8000000 0 4269.9 24953292 9 9600000 0 4270.7 33235572 11 11200000 0 4346.4 40817101 13 12800000 0 4285.3 29972397 14 14400000 0 4297.2 20539765 16 16000000 0 4219.6 18596767 18 17600000 0 4273.8 49611187 19 19200000 0 4300.4 27944451 21 20800000 0 4270.6 22324585 22 22400000 0 4317.6 22650382 24 24000000 0 4065.2 22297964 So the dropdown at Count 19200000 didn't happen and there was only a single warning about allocation not making progress [ 3063.815003] XFS: fs_mark(3272) possible memory allocation deadlock size 65624 in kmem_alloc (mode:0x2408240) This suggests that the patch has helped even though there is not all that much of anonymous memory as the workload mostly generates fs metadata. I assume the success rate would be higher with more anonymous memory which should be the case in many workloads. [akpm@linux-foundation.org: fix comment] Link: http://lkml.kernel.org/r/20161012114721.31853-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 2234642..949198d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page_count(page) > page_mapcount(page)) goto isolate_fail; + /* + * Only allow to migrate anonymous pages in GFP_NOFS context + * because those do not depend on fs locks. 
+ */ + if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) + goto isolate_fail; + /* If we already hold the lock, we can skip some rechecking */ if (!locked) { locked = compact_trylock_irqsave(zone_lru_lock(zone), @@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio) { - int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* Check if the GFP flags allow compaction */ - if (!may_enter_fs || !may_perform_io) + /* + * Check if the GFP flags allow compaction - GFP_NOIO is really + * tricky context because the migration might require IO + */ + if (!may_perform_io) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); @@ -1751,6 +1760,7 @@ static void compact_node(int nid) .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, + .gfp_mask = GFP_KERNEL, }; @@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, + .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, -- cgit v1.1 From 44fdffd70504c15b617686753dfdf9eb0ddf3729 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 14 Dec 2016 15:05:26 -0800 Subject: mm: add support for releasing multiple instances of a page Add a function that allows us to batch free a page that has multiple references outstanding. Specifically this function can be used to drop a page being used in the page frag alloc cache. With this drivers can make use of functionality similar to the page frag alloc cache without having to do any workarounds for the fact that there is no function that frees multiple references. Link: http://lkml.kernel.org/r/20161110113606.76501.70752.stgit@ahduyck-blue-test.jf.intel.com Signed-off-by: Alexander Duyck Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Hans-Christian Noren Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jeff Kirsher Cc: Jonas Bonn Cc: Keguang Zhang Cc: Ley Foon Tan Cc: Mark Salter Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Kuo Cc: Russell King Cc: Steven Miao Cc: Tobias Klauser Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f64e7bc..2c6d5f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } +void __page_frag_drain(struct page *page, unsigned int order, + unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) { + if (order == 0) + free_hot_cold_page(page, false); + else + __free_pages_ok(page, order); + } +} +EXPORT_SYMBOL(__page_frag_drain); + void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { -- cgit v1.1 From 5b56d49fc31dbb0487e14ead790fc81ca9fb2c99 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Dec 2016 15:06:52 -0800 Subject: mm: add locked parameter to get_user_pages_remote() Patch series "mm: unexport __get_user_pages_unlocked()". 
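The calling convention this series converges on looks roughly like the sketch below (it mirrors the process_vm_access.c conversion done in the second patch; the variable names are placeholders and the snippet is illustrative only, not part of the change itself). A caller operating on another task's mm takes mmap_sem, passes a locked flag, and drops the lock afterwards only if it is still held, because a fault that returned VM_FAULT_RETRY may already have released it:

	int locked = 1;
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_remote(task, mm, addr, nr_pages, gup_flags,
				       pages, NULL /* vmas */, &locked);
	if (locked)
		up_read(&mm->mmap_sem);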
This patch series continues the cleanup of get_user_pages*() functions taking advantage of the fact we can now pass gup_flags as we please. It firstly adds an additional 'locked' parameter to get_user_pages_remote() to allow for its callers to utilise VM_FAULT_RETRY functionality. This is necessary as the invocation of __get_user_pages_unlocked() in process_vm_rw_single_vec() makes use of this and no other existing higher level function would allow it to do so. Secondly, existing callers of __get_user_pages_unlocked() are replaced with the appropriate higher-level replacement - get_user_pages_unlocked() if the current task and memory descriptor are referenced, or get_user_pages_remote() if other task/memory descriptors are referenced (having acquired mmap_sem.) This patch (of 2): Add an int *locked parameter to get_user_pages_remote() to allow VM_FAULT_RETRY faulting behaviour similar to get_user_pages_[un]locked(). Taking into account the previous adjustments to get_user_pages*() functions allowing for the passing of gup_flags, we are now in a position where __get_user_pages_unlocked() need only be exported for its ability to allow VM_FAULT_RETRY behaviour; this adjustment allows us to subsequently unexport __get_user_pages_unlocked() as well as allowing for future flexibility in the use of get_user_pages_remote(). [sfr@canb.auug.org.au: merge fix for get_user_pages_remote API change] Link: http://lkml.kernel.org/r/20161122210511.024ec341@canb.auug.org.au Link: http://lkml.kernel.org/r/20161027095141.2569-2-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Jan Kara Cc: Hugh Dickins Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Paolo Bonzini Cc: Radim Krcmar Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 12 ++++++++---- mm/memory.c | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index e50178c..b64c907 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. 
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/memory.c b/mm/memory.c index c264f7c..3a6a123 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3919,7 +3919,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; -- cgit v1.1 From 8b7457ef9a9eb46cd1675d40d8e1fd3c47a38395 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Dec 2016 15:06:55 -0800 Subject: mm: unexport __get_user_pages_unlocked() Unexport the low-level __get_user_pages_unlocked() function and replaces invocations with calls to more appropriate higher-level functions. In hva_to_pfn_slow() we are able to replace __get_user_pages_unlocked() with get_user_pages_unlocked() since we can now pass gup_flags. In async_pf_execute() and process_vm_rw_single_vec() we need to pass different tsk, mm arguments so get_user_pages_remote() is the sane replacement in these cases (having added manual acquisition and release of mmap_sem.) Additionally get_user_pages_remote() reintroduces use of the FOLL_TOUCH flag. However, this flag was originally silently dropped by commit 1e9877902dc7 ("mm/gup: Introduce get_user_pages_remote()"), so this appears to have been unintentional and reintroducing it is therefore not an issue. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20161027095141.2569-3-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Jan Kara Cc: Hugh Dickins Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Paolo Bonzini Cc: Radim Krcmar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 8 ++++---- mm/nommu.c | 8 ++++---- mm/process_vm_access.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index b64c907..5531555 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); * caller if required (just like with __get_user_pages). "FOLL_GET" * is set implicitly if "pages" is non-NULL. 
*/ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: diff --git a/mm/nommu.c b/mm/nommu.c index 9720e0b..c299a7f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d..84d0c7e 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; -- cgit v1.1 From 82b0f8c39a3869b6fd2a10e180a862248736ec6f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:06:58 -0800 Subject: mm: join struct fault_env and vm_fault Currently we have two different structures for passing fault information around - struct vm_fault and struct fault_env. DAX will need more information in struct vm_fault to handle its faults so the content of that structure would become event closer to fault_env. Furthermore it would need to generate struct fault_env to be able to call some of the generic functions. So at this point I don't think there's much use in keeping these two structures separate. Just embed into struct vm_fault all that is needed to use it for both purposes. Link: http://lkml.kernel.org/r/1479460644-25076-2-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. 
Shutemov Cc: Ross Zwisler Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 +- mm/huge_memory.c | 173 ++++++++--------- mm/internal.h | 2 +- mm/khugepaged.c | 20 +- mm/memory.c | 568 ++++++++++++++++++++++++++++--------------------------- mm/nommu.c | 2 +- 6 files changed, 392 insertions(+), 387 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6956838..235021e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2164,12 +2164,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2225,11 +2225,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2239,7 +2239,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf..10eedbf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + 
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -879,30 +879,30 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, + vmf->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct 
mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -993,7 +993,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -1005,23 +1005,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1034,13 +1034,13 @@ int 
do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1053,12 +1053,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1070,7 +1070,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1090,11 +1090,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1102,12 +1102,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1117,13 +1117,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } @@ -1196,12 +1196,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if 
(unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1304,18 +1304,19 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + vmf->flags); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 537ac99..093b1ea 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf, pte_t orig_pte); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0946095..d950c25 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -877,7 +877,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, { pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + 
HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + pteval = *vmf.pte; if (!is_swap_pte(pteval)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf, pteval); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } diff --git a/mm/memory.c b/mm/memory.c index 3a6a123..512e1c3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2070,11 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, +static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2084,12 +2084,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (dirty_shared) { struct address_space *mapping; @@ -2135,15 +2135,15 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. 
*/ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, +static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, struct page *old_page) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; @@ -2151,15 +2151,16 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, goto oom; if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2172,8 +2173,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2183,7 +2184,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2192,8 +2193,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2201,8 +2202,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. 
*/ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2239,7 +2240,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2267,43 +2268,43 @@ oom: * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { + struct vm_fault vmf2 = { .page = NULL, - .pgoff = linear_page_index(vma, fe->address), + .pgoff = linear_page_index(vma, vmf->address), .virtual_address = - (void __user *)(fe->address & PAGE_MASK), + (void __user *)(vmf->address & PAGE_MASK), .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); + pte_unmap_unlock(vmf->pte, vmf->ptl); + ret = vma->vm_ops->pfn_mkwrite(vma, &vmf2); if (ret & VM_FAULT_ERROR) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); /* * We might have raced with another page fault while we * released the pte_offset_map_lock. */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); + if (!pte_same(*vmf->pte, orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + return wp_page_reuse(vmf, orig_pte, NULL, 0, 0); } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, +static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, struct page *old_page) - __releases(fe->ptl) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; int page_mkwrite = 0; get_page(old_page); @@ -2311,8 +2312,8 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vma, old_page, vmf->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(old_page); @@ -2324,18 +2325,18 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. 
*/ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); return 0; } page_mkwrite = 1; } - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return wp_page_reuse(vmf, orig_pte, old_page, page_mkwrite, 1); } /* @@ -2356,13 +2357,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *old_page; - old_page = vm_normal_page(vma, fe->address, orig_pte); + old_page = vm_normal_page(vma, vmf->address, orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a @@ -2373,10 +2374,10 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf, orig_pte); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf, orig_pte, old_page); } /* @@ -2387,13 +2388,13 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); return 0; } @@ -2411,12 +2412,12 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) page_move_anon_rmap(old_page, vma); } unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + return wp_page_reuse(vmf, orig_pte, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf, orig_pte, old_page); } /* @@ -2424,8 +2425,8 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf, orig_pte, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2513,9 +2514,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). 
*/ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2524,17 +2525,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2542,16 +2544,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2573,7 +2575,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2590,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2606,9 +2608,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. 
*/ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2629,22 +2631,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2667,22 +2669,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2733,9 +2735,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2745,7 +2747,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2758,26 +2760,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). 
*/ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2785,7 +2787,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2803,30 +2805,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2843,62 +2845,62 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). 
*/ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, +static int __do_fault(struct vm_fault *vmf, pgoff_t pgoff, struct page *cow_page, struct page **page, void **entry) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; + struct vm_fault vmf2; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; + vmf2.virtual_address = (void __user *)(vmf->address & PAGE_MASK); + vmf2.pgoff = pgoff; + vmf2.flags = vmf->flags; + vmf2.page = NULL; + vmf2.gfp_mask = __get_fault_gfp_mask(vma); + vmf2.cow_page = cow_page; - ret = vma->vm_ops->fault(vma, &vmf); + ret = vma->vm_ops->fault(vma, &vmf2); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; + *entry = vmf2.entry; return ret; } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf2.page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf2.page); + put_page(vmf2.page); return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf2.page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf2.page), vmf2.page); - *page = vmf.page; + *page = vmf2.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2913,11 +2915,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2935,24 +2937,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static void deposit_prealloc_pte(struct fault_env *fe) +static void deposit_prealloc_pte(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. 
*/ atomic_long_inc(&vma->vm_mm->nr_ptes); - fe->prealloc_pte = 0; + vmf->prealloc_pte = 0; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2966,15 +2968,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ - if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { - fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2990,11 +2992,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) - deposit_prealloc_pte(fe); + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; @@ -3005,13 +3007,13 @@ out: * withdraw with pmd lock held. */ if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) - fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, - fe->pmd); - spin_unlock(fe->ptl); + vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + vmf->pmd); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3022,41 +3024,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. + * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? 
*/ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) goto fault_handled; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) { + if (unlikely(!pte_none(*vmf->pte))) { ret = VM_FAULT_NOPAGE; goto fault_handled; } @@ -3068,24 +3071,24 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); ret = 0; fault_handled: /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; + if (vmf->prealloc_pte) { + pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = 0; } return ret; } @@ -3154,17 +3157,17 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf, pgoff_t start_pgoff) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3172,44 +3175,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? 
*/ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf, pgoff_t pgoff) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *fault_page; int ret = 0; @@ -3219,27 +3223,27 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf, pgoff); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= alloc_set_pte(vmf, NULL, fault_page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); unlock_page(fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) put_page(fault_page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf, pgoff_t pgoff) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *fault_page, *new_page; void *fault_entry; struct mem_cgroup *memcg; @@ -3248,7 +3252,7 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) return VM_FAULT_OOM; @@ -3258,17 +3262,17 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf, pgoff, new_page, &fault_page, &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); + copy_user_highpage(new_page, fault_page, vmf->address, vma); __SetPageUptodate(new_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= alloc_set_pte(vmf, memcg, new_page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); @@ -3284,15 +3288,15 @@ uncharge_out: return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf, pgoff_t pgoff) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *fault_page; struct address_space *mapping; int dirtied = 0; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3302,7 +3306,7 @@ static int do_shared_fault(struct 
fault_env *fe, pgoff_t pgoff) */ if (vma->vm_ops->page_mkwrite) { unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, fe->address); + tmp = do_page_mkwrite(vma, fault_page, vmf->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(fault_page); @@ -3310,9 +3314,9 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= alloc_set_pte(vmf, NULL, fault_page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { unlock_page(fault_page); @@ -3350,19 +3354,19 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; + pgoff_t pgoff = linear_page_index(vma, vmf->address); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return do_read_fault(vmf, pgoff); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + return do_cow_fault(vmf, pgoff); + return do_shared_fault(vmf, pgoff); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3380,9 +3384,9 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf, pte_t pte) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; @@ -3400,10 +3404,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. 
*/ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -3412,18 +3416,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3447,9 +3451,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3469,28 +3473,28 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); + return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, + vmf->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, + vmf->pmd, vmf->flags); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3515,21 +3519,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. 
*/ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3537,9 +3541,9 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ - fe->pte = pte_offset_map(fe->pmd, fe->address); + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - entry = *fe->pte; + entry = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3551,37 +3555,37 @@ static int handle_pte_fault(struct fault_env *fe) */ barrier(); if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } if (!pte_present(entry)) - return do_swap_page(fe, entry); + return do_swap_page(vmf, entry); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(entry) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf, entry); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3589,11 +3593,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids useless tlb flushes for .text page faults * with threads. 
*/ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3606,7 +3610,7 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = flags, @@ -3619,35 +3623,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + vmf.pmd = pmd_alloc(mm, pud, address); + if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; + pmd_t orig_pmd = *vmf.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* diff --git a/mm/nommu.c b/mm/nommu.c index c299a7f..9902d81 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); -- cgit v1.1 From 1a29d85eb0f19b7d8271923d8917d7b4f5540b3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:01 -0800 Subject: mm: use vmf->address instead of of vmf->virtual_address Every single user of vmf->virtual_address typed that entry to unsigned long before doing anything with it so the type of virtual_address does not really provide us any additional safety. Just use masked vmf->address which already has the appropriate type. Link: http://lkml.kernel.org/r/1479460644-25076-3-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. 
Shutemov Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 512e1c3..3798362 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2040,7 +2040,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, struct vm_fault vmf; int ret; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.address = address & PAGE_MASK; vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.gfp_mask = __get_fault_gfp_mask(vma); @@ -2276,8 +2276,7 @@ static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) struct vm_fault vmf2 = { .page = NULL, .pgoff = linear_page_index(vma, vmf->address), - .virtual_address = - (void __user *)(vmf->address & PAGE_MASK), + .address = vmf->address, .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; int ret; @@ -2852,7 +2851,7 @@ static int __do_fault(struct vm_fault *vmf, pgoff_t pgoff, struct vm_fault vmf2; int ret; - vmf2.virtual_address = (void __user *)(vmf->address & PAGE_MASK); + vmf2.address = vmf->address; vmf2.pgoff = pgoff; vmf2.flags = vmf->flags; vmf2.page = NULL; @@ -3612,7 +3611,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, { struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, }; struct mm_struct *mm = vma->vm_mm; -- cgit v1.1 From 0721ec8bc156fafc9057ec1df95cdb3bbc3cbae8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:04 -0800 Subject: mm: use pgoff in struct vm_fault instead of passing it separately struct vm_fault has already pgoff entry. Use it instead of passing pgoff as a separate argument and then assigning it later. Link: http://lkml.kernel.org/r/1479460644-25076-4-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 1 + mm/memory.c | 35 ++++++++++++++++++----------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d950c25..f062d54 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -882,6 +882,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, + .pgoff = linear_page_index(vma, address), }; /* we only decide to swapin, if there is enough young ptes */ diff --git a/mm/memory.c b/mm/memory.c index 3798362..c514b4a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2275,7 +2275,7 @@ static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { struct vm_fault vmf2 = { .page = NULL, - .pgoff = linear_page_index(vma, vmf->address), + .pgoff = vmf->pgoff, .address = vmf->address, .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; @@ -2844,15 +2844,15 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). 
*/ -static int __do_fault(struct vm_fault *vmf, pgoff_t pgoff, - struct page *cow_page, struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf, struct page *cow_page, + struct page **page, void **entry) { struct vm_area_struct *vma = vmf->vma; struct vm_fault vmf2; int ret; vmf2.address = vmf->address; - vmf2.pgoff = pgoff; + vmf2.pgoff = vmf->pgoff; vmf2.flags = vmf->flags; vmf2.page = NULL; vmf2.gfp_mask = __get_fault_gfp_mask(vma); @@ -3156,9 +3156,10 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct vm_fault *vmf, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf) { unsigned long address = vmf->address, nr_pages, mask; + pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off, ret = 0; @@ -3210,7 +3211,7 @@ out: return ret; } -static int do_read_fault(struct vm_fault *vmf, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *fault_page; @@ -3222,12 +3223,12 @@ static int do_read_fault(struct vm_fault *vmf, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf, pgoff); + ret = do_fault_around(vmf); if (ret) return ret; } - ret = __do_fault(vmf, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3240,7 +3241,7 @@ static int do_read_fault(struct vm_fault *vmf, pgoff_t pgoff) return ret; } -static int do_cow_fault(struct vm_fault *vmf, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *fault_page, *new_page; @@ -3261,7 +3262,7 @@ static int do_cow_fault(struct vm_fault *vmf, pgoff_t pgoff) return VM_FAULT_OOM; } - ret = __do_fault(vmf, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf, new_page, &fault_page, &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; @@ -3276,7 +3277,7 @@ static int do_cow_fault(struct vm_fault *vmf, pgoff_t pgoff) unlock_page(fault_page); put_page(fault_page); } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, vmf->pgoff); } if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; @@ -3287,7 +3288,7 @@ uncharge_out: return ret; } -static int do_shared_fault(struct vm_fault *vmf, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *fault_page; @@ -3295,7 +3296,7 @@ static int do_shared_fault(struct vm_fault *vmf, pgoff_t pgoff) int dirtied = 0; int ret, tmp; - ret = __do_fault(vmf, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3356,16 +3357,15 @@ static int do_shared_fault(struct vm_fault *vmf, pgoff_t pgoff) static int do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - pgoff_t pgoff = linear_page_index(vma, vmf->address); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; if (!(vmf->flags & FAULT_FLAG_WRITE)) - return do_read_fault(vmf, pgoff); + return do_read_fault(vmf); if 
(!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(vmf, pgoff); - return do_shared_fault(vmf, pgoff); + return do_cow_fault(vmf); + return do_shared_fault(vmf); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3613,6 +3613,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, .vma = vma, .address = address & PAGE_MASK, .flags = flags, + .pgoff = linear_page_index(vma, address), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; -- cgit v1.1 From 667240e0f2e13e792a5af99b3c34dfab12ef125b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:07 -0800 Subject: mm: use passed vm_fault structure in __do_fault() Instead of creating another vm_fault structure, use the one passed to __do_fault() for passing arguments into fault handler. Link: http://lkml.kernel.org/r/1479460644-25076-5-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c514b4a..cbc6d47 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2848,37 +2848,31 @@ static int __do_fault(struct vm_fault *vmf, struct page *cow_page, struct page **page, void **entry) { struct vm_area_struct *vma = vmf->vma; - struct vm_fault vmf2; int ret; - vmf2.address = vmf->address; - vmf2.pgoff = vmf->pgoff; - vmf2.flags = vmf->flags; - vmf2.page = NULL; - vmf2.gfp_mask = __get_fault_gfp_mask(vma); - vmf2.cow_page = cow_page; + vmf->cow_page = cow_page; - ret = vma->vm_ops->fault(vma, &vmf2); + ret = vma->vm_ops->fault(vma, vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf2.entry; + *entry = vmf->entry; return ret; } - if (unlikely(PageHWPoison(vmf2.page))) { + if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf2.page); - put_page(vmf2.page); + unlock_page(vmf->page); + put_page(vmf->page); return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf2.page); + lock_page(vmf->page); else - VM_BUG_ON_PAGE(!PageLocked(vmf2.page), vmf2.page); + VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf2.page; + *page = vmf->page; return ret; } @@ -3614,6 +3608,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, .address = address & PAGE_MASK, .flags = flags, .pgoff = linear_page_index(vma, address), + .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; -- cgit v1.1 From 936ca80d3773bd9b6dda8a0dfd54425f9ec1be9d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:10 -0800 Subject: mm: trim __do_fault() arguments Use vm_fault structure to pass cow_page, page, and entry in and out of the function. That reduces number of __do_fault() arguments from 4 to 1. Link: http://lkml.kernel.org/r/1479460644-25076-6-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. 
Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 67 ++++++++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index cbc6d47..78b81e8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2844,26 +2844,22 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct vm_fault *vmf, struct page *cow_page, - struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; int ret; - vmf->cow_page = cow_page; - ret = vma->vm_ops->fault(vma, vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf->entry; + if (ret & VM_FAULT_DAX_LOCKED) return ret; - } if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf->page); put_page(vmf->page); + vmf->page = NULL; return VM_FAULT_HWPOISON; } @@ -2872,7 +2868,6 @@ static int __do_fault(struct vm_fault *vmf, struct page *cow_page, else VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf->page; return ret; } @@ -3208,7 +3203,6 @@ out: static int do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *fault_page; int ret = 0; /* @@ -3222,54 +3216,52 @@ static int do_read_fault(struct vm_fault *vmf) return ret; } - ret = __do_fault(vmf, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(vmf, NULL, fault_page); + ret |= alloc_set_pte(vmf, NULL, vmf->page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); - unlock_page(fault_page); + unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(fault_page); + put_page(vmf->page); return ret; } static int do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *fault_page, *new_page; - void *fault_entry; struct mem_cgroup *memcg; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); - if (!new_page) + vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, &memcg, false)) { - put_page(new_page); + put_page(vmf->cow_page); return VM_FAULT_OOM; } - ret = __do_fault(vmf, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, vmf->address, vma); - __SetPageUptodate(new_page); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(vmf, memcg, new_page); + ret |= alloc_set_pte(vmf, memcg, vmf->cow_page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); } else { dax_unlock_mapping_entry(vma->vm_file->f_mapping, vmf->pgoff); } @@ -3277,20 +3269,19 @@ static int do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; 
uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg, false); - put_page(new_page); + mem_cgroup_cancel_charge(vmf->cow_page, memcg, false); + put_page(vmf->cow_page); return ret; } static int do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *fault_page; struct address_space *mapping; int dirtied = 0; int ret, tmp; - ret = __do_fault(vmf, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3299,26 +3290,26 @@ static int do_shared_fault(struct vm_fault *vmf) * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, vmf->address); + unlock_page(vmf->page); + tmp = do_page_mkwrite(vma, vmf->page, vmf->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(fault_page); + put_page(vmf->page); return tmp; } } - ret |= alloc_set_pte(vmf, NULL, fault_page); + ret |= alloc_set_pte(vmf, NULL, vmf->page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); return ret; } - if (set_page_dirty(fault_page)) + if (set_page_dirty(vmf->page)) dirtied = 1; /* * Take a local copy of the address_space - page.mapping may be zeroed @@ -3326,8 +3317,8 @@ static int do_shared_fault(struct vm_fault *vmf) * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = page_rmapping(fault_page); - unlock_page(fault_page); + mapping = page_rmapping(vmf->page); + unlock_page(vmf->page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { /* * Some device drivers do not set page.mapping but still -- cgit v1.1 From fe82221f57ea6840a4238a8e077e3f93f257a03f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:13 -0800 Subject: mm: use passed vm_fault structure for in wp_pfn_shared() Instead of creating another vm_fault structure, use the one passed to wp_pfn_shared() for passing arguments into pfn_mkwrite handler. Link: http://lkml.kernel.org/r/1479460644-25076-7-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 78b81e8..7ba9cc5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2273,16 +2273,11 @@ static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf2 = { - .page = NULL, - .pgoff = vmf->pgoff, - .address = vmf->address, - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; int ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf2); + vmf->flags |= FAULT_FLAG_MKWRITE; + ret = vma->vm_ops->pfn_mkwrite(vma, vmf); if (ret & VM_FAULT_ERROR) return ret; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, -- cgit v1.1 From 2994302bc8a17180788fac66a47102d338d5d0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:16 -0800 Subject: mm: add orig_pte field into vm_fault Add orig_pte field to vm_fault structure to allow ->page_mkwrite handlers to fully handle the fault. 
This also allows us to save some passing of extra arguments around. Link: http://lkml.kernel.org/r/1479460644-25076-8-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- mm/khugepaged.c | 7 +++-- mm/memory.c | 82 ++++++++++++++++++++++++++++----------------------------- 3 files changed, 45 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 093b1ea..44d6889 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct vm_fault *vmf, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f062d54..7434a63 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -875,7 +875,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; struct vm_fault vmf = { .vma = vma, @@ -893,11 +892,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, vmf.pte = pte_offset_map(pmd, address); for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; vmf.pte++, vmf.address += PAGE_SIZE) { - pteval = *vmf.pte; - if (!is_swap_pte(pteval)) + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&vmf, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { diff --git a/mm/memory.c b/mm/memory.c index 7ba9cc5..cf74f7c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2070,8 +2070,8 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) +static inline int wp_page_reuse(struct vm_fault *vmf, struct page *page, + int page_mkwrite, int dirty_shared) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2084,8 +2084,8 @@ static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); @@ -2135,8 +2135,7 @@ static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. 
*/ -static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf, struct page *old_page) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -2150,7 +2149,7 @@ static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!new_page) @@ -2174,7 +2173,7 @@ static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, orig_pte))) { + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2184,7 +2183,7 @@ static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2268,7 +2267,7 @@ oom: * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -2286,16 +2285,15 @@ static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) * We might have raced with another page fault while we * released the pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, orig_pte)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } } - return wp_page_reuse(vmf, orig_pte, NULL, 0, 0); + return wp_page_reuse(vmf, NULL, 0, 0); } -static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, - struct page *old_page) +static int wp_page_shared(struct vm_fault *vmf, struct page *old_page) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2321,7 +2319,7 @@ static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, orig_pte)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) { unlock_page(old_page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); @@ -2330,7 +2328,7 @@ static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, page_mkwrite = 1; } - return wp_page_reuse(vmf, orig_pte, old_page, page_mkwrite, 1); + return wp_page_reuse(vmf, old_page, page_mkwrite, 1); } /* @@ -2351,13 +2349,13 @@ static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ -static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) +static int do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; struct page *old_page; - old_page = vm_normal_page(vma, vmf->address, orig_pte); + old_page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a @@ -2368,10 +2366,10 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(vmf, orig_pte); + return wp_pfn_shared(vmf); pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf, orig_pte, old_page); + return wp_page_copy(vmf, old_page); } /* @@ -2386,7 +2384,7 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) lock_page(old_page); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, orig_pte)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) { unlock_page(old_page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); @@ -2406,12 +2404,12 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) page_move_anon_rmap(old_page, vma); } unlock_page(old_page); - return wp_page_reuse(vmf, orig_pte, old_page, 0, 0); + return wp_page_reuse(vmf, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(vmf, orig_pte, old_page); + return wp_page_shared(vmf, old_page); } /* @@ -2420,7 +2418,7 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) get_page(old_page); pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf, orig_pte, old_page); + return wp_page_copy(vmf, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2508,7 +2506,7 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). 
*/ -int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; @@ -2519,10 +2517,10 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, @@ -2530,7 +2528,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, vmf->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2547,7 +2545,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, orig_pte))) + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2604,7 +2602,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, orig_pte))) + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2632,9 +2630,10 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); @@ -2664,7 +2663,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) } if (vmf->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(vmf, pte); + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; @@ -3363,7 +3362,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct vm_fault *vmf, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; @@ -3371,6 +3370,7 @@ static int do_numa_page(struct vm_fault *vmf, pte_t pte) int last_cpupid; int target_nid; bool migrated = false; + pte_t pte = vmf->orig_pte; bool was_writable = pte_write(pte); int flags = 0; @@ -3521,8 +3521,7 @@ static int handle_pte_fault(struct vm_fault *vmf) * So now it's safe to run pte_offset_map(). */ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - - entry = *vmf->pte; + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3533,7 +3532,7 @@ static int handle_pte_fault(struct vm_fault *vmf) * ptl lock held. So here a barrier will do. 
*/ barrier(); - if (pte_none(entry)) { + if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } @@ -3546,19 +3545,20 @@ static int handle_pte_fault(struct vm_fault *vmf) return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(vmf, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(vmf->vma)) - return do_numa_page(vmf, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); + entry = vmf->orig_pte; if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(vmf, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); -- cgit v1.1 From 3917048d4572b9cabf6f8f5ad395eb693717367c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:18 -0800 Subject: mm: allow full handling of COW faults in ->fault handlers Patch series "dax: Clear dirty bits after flushing caches", v5. Patchset to clear dirty bits from radix tree of DAX inodes when caches for corresponding pfns have been flushed. In principle, these patches enable handlers to easily update PTEs and do other work necessary to finish the fault without duplicating the functionality present in the generic code. I'd like to thank Kirill and Ross for reviews of the series! This patch (of 20): To allow full handling of COW faults add memcg field to struct vm_fault and a return value of ->fault() handler meaning that COW fault is fully handled and memcg charge must not be canceled. This will allow us to remove knowledge about special DAX locking from the generic fault code. Link: http://lkml.kernel.org/r/1479460644-25076-9-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. 
Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index cf74f7c..02504cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2844,9 +2844,8 @@ static int __do_fault(struct vm_fault *vmf) int ret; ret = vma->vm_ops->fault(vma, vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; - if (ret & VM_FAULT_DAX_LOCKED) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DAX_LOCKED | VM_FAULT_DONE_COW))) return ret; if (unlikely(PageHWPoison(vmf->page))) { @@ -3226,7 +3225,6 @@ static int do_read_fault(struct vm_fault *vmf) static int do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; int ret; if (unlikely(anon_vma_prepare(vma))) @@ -3237,7 +3235,7 @@ static int do_cow_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } @@ -3245,12 +3243,14 @@ static int do_cow_fault(struct vm_fault *vmf) ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; if (!(ret & VM_FAULT_DAX_LOCKED)) copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(vmf, memcg, vmf->cow_page); + ret |= alloc_set_pte(vmf, vmf->memcg, vmf->cow_page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { @@ -3263,7 +3263,7 @@ static int do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, memcg, false); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } -- cgit v1.1 From 9118c0cbd44262d0015568266f314e645ed6b9ce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:21 -0800 Subject: mm: factor out functionality to finish page faults Introduce finish_fault() as a helper function for finishing page faults. It is rather thin wrapper around alloc_set_pte() but since we'd want to call this from DAX code or filesystems, it is still useful to avoid some boilerplate code. Link: http://lkml.kernel.org/r/1479460644-25076-10-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 02504cd..22f7f6e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3074,6 +3074,38 @@ fault_handled: return ret; } + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). 
+ */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; +} + static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); @@ -3213,9 +3245,7 @@ static int do_read_fault(struct vm_fault *vmf) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(vmf, NULL, vmf->page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + ret |= finish_fault(vmf); unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) put_page(vmf->page); @@ -3250,9 +3280,7 @@ static int do_cow_fault(struct vm_fault *vmf) copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(vmf, vmf->memcg, vmf->cow_page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + ret |= finish_fault(vmf); if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(vmf->page); put_page(vmf->page); @@ -3293,9 +3321,7 @@ static int do_shared_fault(struct vm_fault *vmf) } } - ret |= alloc_set_pte(vmf, NULL, vmf->page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { unlock_page(vmf->page); -- cgit v1.1 From b1aa812b21084285e9f6098639be9cd5bf9e05d7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:24 -0800 Subject: mm: move handling of COW faults into DAX code Move final handling of COW faults from generic code into DAX fault handler. That way generic code doesn't have to be aware of peculiarities of DAX locking so remove that knowledge and make locking functions private to fs/dax.c. Link: http://lkml.kernel.org/r/1479460644-25076-11-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. 
Shutemov Reviewed-by: Ross Zwisler Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 22f7f6e..ca3b95f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2845,7 +2845,7 @@ static int __do_fault(struct vm_fault *vmf) ret = vma->vm_ops->fault(vma, vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | - VM_FAULT_DAX_LOCKED | VM_FAULT_DONE_COW))) + VM_FAULT_DONE_COW))) return ret; if (unlikely(PageHWPoison(vmf->page))) { @@ -3276,17 +3276,12 @@ static int do_cow_fault(struct vm_fault *vmf) if (ret & VM_FAULT_DONE_COW) return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); ret |= finish_fault(vmf); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(vmf->page); - put_page(vmf->page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, vmf->pgoff); - } + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; -- cgit v1.1 From 97ba0c2b4b0994044e404b7a96fc92a2e0424534 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:27 -0800 Subject: mm: factor out common parts of write fault handling Currently we duplicate handling of shared write faults in wp_page_reuse() and do_shared_fault(). Factor them out into a common function. Link: http://lkml.kernel.org/r/1479460644-25076-12-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 78 +++++++++++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index ca3b95f..6fd8278 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2063,6 +2063,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. + */ +static void fault_dirty_shared_page(struct vm_area_struct *vma, + struct page *page) +{ + struct address_space *mapping; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. 
+ */ + mapping = page_rmapping(page); + unlock_page(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); +} + +/* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, @@ -2092,28 +2127,11 @@ static inline int wp_page_reuse(struct vm_fault *vmf, struct page *page, pte_unmap_unlock(vmf->pte, vmf->ptl); if (dirty_shared) { - struct address_space *mapping; - int dirtied; - if (!page_mkwrite) lock_page(page); - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); - mapping = page->mapping; - unlock_page(page); + fault_dirty_shared_page(vma, page); put_page(page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); } return VM_FAULT_WRITE; @@ -3294,8 +3312,6 @@ uncharge_out: static int do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping; - int dirtied = 0; int ret, tmp; ret = __do_fault(vmf); @@ -3324,27 +3340,7 @@ static int do_shared_fault(struct vm_fault *vmf) return ret; } - if (set_page_dirty(vmf->page)) - dirtied = 1; - /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s - * release semantics to prevent the compiler from undoing this copying. - */ - mapping = page_rmapping(vmf->page); - unlock_page(vmf->page); - if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!vma->vm_ops->page_mkwrite) - file_update_time(vma->vm_file); - + fault_dirty_shared_page(vma, vmf->page); return ret; } -- cgit v1.1 From 38b8cb7fbb892503fe9fcf748ebbed8c9fde7bf8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:30 -0800 Subject: mm: pass vm_fault structure into do_page_mkwrite() We will need more information in the ->page_mkwrite() helper for DAX to be able to fully finish faults there. Pass vm_fault structure to do_page_mkwrite() and use it there so that information propagates properly from upper layers. Link: http://lkml.kernel.org/r/1479460644-25076-13-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6fd8278..e8a5278 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. 
*/ -static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static int do_page_mkwrite(struct vm_fault *vmf) { - struct vm_fault vmf; int ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; - vmf.address = address & PAGE_MASK; - vmf.pgoff = page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.page = page; - vmf.cow_page = NULL; + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->page_mkwrite(vma, &vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { @@ -2323,7 +2320,8 @@ static int wp_page_shared(struct vm_fault *vmf, struct page *old_page) int tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - tmp = do_page_mkwrite(vma, old_page, vmf->address); + vmf->page = old_page; + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(old_page); @@ -3324,7 +3322,7 @@ static int do_shared_fault(struct vm_fault *vmf) */ if (vma->vm_ops->page_mkwrite) { unlock_page(vmf->page); - tmp = do_page_mkwrite(vma, vmf->page, vmf->address); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(vmf->page); -- cgit v1.1 From a41b70d6dfc28b9e1a17c2a9f3181c2b614bfd54 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:33 -0800 Subject: mm: use vmf->page during WP faults So far we set vmf->page during WP faults only when we needed to pass it to the ->page_mkwrite handler. Set it in all the cases now and use that instead of passing page pointer explicitly around. Link: http://lkml.kernel.org/r/1479460644-25076-14-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 58 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e8a5278..ad45289 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2102,11 +2102,12 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct vm_fault *vmf, struct page *page, +static inline int wp_page_reuse(struct vm_fault *vmf, int page_mkwrite, int dirty_shared) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2150,10 +2151,11 @@ static inline int wp_page_reuse(struct vm_fault *vmf, struct page *page, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. 
*/ -static int wp_page_copy(struct vm_fault *vmf, struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; + struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; @@ -2305,26 +2307,25 @@ static int wp_pfn_shared(struct vm_fault *vmf) return 0; } } - return wp_page_reuse(vmf, NULL, 0, 0); + return wp_page_reuse(vmf, 0, 0); } -static int wp_page_shared(struct vm_fault *vmf, struct page *old_page) +static int wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; int page_mkwrite = 0; - get_page(old_page); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - vmf->page = old_page; tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(old_page); + put_page(vmf->page); return tmp; } /* @@ -2336,15 +2337,15 @@ static int wp_page_shared(struct vm_fault *vmf, struct page *old_page) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_same(*vmf->pte, vmf->orig_pte)) { - unlock_page(old_page); + unlock_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(old_page); + put_page(vmf->page); return 0; } page_mkwrite = 1; } - return wp_page_reuse(vmf, old_page, page_mkwrite, 1); + return wp_page_reuse(vmf, page_mkwrite, 1); } /* @@ -2369,10 +2370,9 @@ static int do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - struct page *old_page; - old_page = vm_normal_page(vma, vmf->address, vmf->orig_pte); - if (!old_page) { + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -2385,30 +2385,30 @@ static int do_wp_page(struct vm_fault *vmf) return wp_pfn_shared(vmf); pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf, old_page); + return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_mapcount; - if (!trylock_page(old_page)) { - get_page(old_page); + if (!trylock_page(vmf->page)) { + get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); - lock_page(old_page); + lock_page(vmf->page); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_same(*vmf->pte, vmf->orig_pte)) { - unlock_page(old_page); + unlock_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(old_page); + put_page(vmf->page); return 0; } - put_page(old_page); + put_page(vmf->page); } - if (reuse_swap_page(old_page, &total_mapcount)) { + if (reuse_swap_page(vmf->page, &total_mapcount)) { if (total_mapcount == 1) { /* * The page is all ours. Move it to @@ -2417,24 +2417,24 @@ static int do_wp_page(struct vm_fault *vmf) * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(old_page, vma); + page_move_anon_rmap(vmf->page, vma); } - unlock_page(old_page); - return wp_page_reuse(vmf, old_page, 0, 0); + unlock_page(vmf->page); + return wp_page_reuse(vmf, 0, 0); } - unlock_page(old_page); + unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(vmf, old_page); + return wp_page_shared(vmf); } /* * Ok, we need to copy. 
Oh, well.. */ - get_page(old_page); + get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf, old_page); + return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, -- cgit v1.1 From 997dd98dd68beb2aea74cac53e7fd440cc8dba68 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:36 -0800 Subject: mm: move part of wp_page_reuse() into the single call site wp_page_reuse() handles write shared faults which is needed only in wp_page_shared(). Move the handling only into that location to make wp_page_reuse() simpler and avoid a strange situation when we sometimes pass in locked page, sometimes unlocked etc. Link: http://lkml.kernel.org/r/1479460644-25076-15-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index ad45289..82e7689 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2102,8 +2102,7 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct vm_fault *vmf, - int page_mkwrite, int dirty_shared) +static inline void wp_page_reuse(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2123,16 +2122,6 @@ static inline int wp_page_reuse(struct vm_fault *vmf, if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); - - if (dirty_shared) { - if (!page_mkwrite) - lock_page(page); - - fault_dirty_shared_page(vma, page); - put_page(page); - } - - return VM_FAULT_WRITE; } /* @@ -2307,7 +2296,8 @@ static int wp_pfn_shared(struct vm_fault *vmf) return 0; } } - return wp_page_reuse(vmf, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } static int wp_page_shared(struct vm_fault *vmf) @@ -2345,7 +2335,13 @@ static int wp_page_shared(struct vm_fault *vmf) page_mkwrite = 1; } - return wp_page_reuse(vmf, page_mkwrite, 1); + wp_page_reuse(vmf); + if (!page_mkwrite) + lock_page(vmf->page); + fault_dirty_shared_page(vma, vmf->page); + put_page(vmf->page); + + return VM_FAULT_WRITE; } /* @@ -2420,7 +2416,8 @@ static int do_wp_page(struct vm_fault *vmf) page_move_anon_rmap(vmf->page, vma); } unlock_page(vmf->page); - return wp_page_reuse(vmf, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == -- cgit v1.1 From 66a6197c118540d454913eef24d68d7491ab5d5f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:39 -0800 Subject: mm: provide helper for finishing mkwrite faults Provide a helper function for finishing write faults due to PTE being read-only. The helper will be used by DAX to avoid the need of complicating generic MM code with DAX locking specifics. Link: http://lkml.kernel.org/r/1479460644-25076-16-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. 
Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 67 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 82e7689..bbc25da 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2269,6 +2269,38 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). + */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } + wp_page_reuse(vmf); + return VM_FAULT_WRITE; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping @@ -2285,16 +2317,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) ret = vma->vm_ops->pfn_mkwrite(vma, vmf); if (ret & VM_FAULT_ERROR) return ret; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); return VM_FAULT_WRITE; @@ -2304,7 +2327,6 @@ static int wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - int page_mkwrite = 0; get_page(vmf->page); @@ -2318,26 +2340,17 @@ static int wp_page_shared(struct vm_fault *vmf) put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + tmp = finish_mkwrite_fault(vmf); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { unlock_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(vmf->page); - return 0; + return tmp; } - page_mkwrite = 1; - } - - wp_page_reuse(vmf); - if (!page_mkwrite) + } else { + wp_page_reuse(vmf); lock_page(vmf->page); + } fault_dirty_shared_page(vma, vmf->page); put_page(vmf->page); -- cgit v1.1 From a19e25536ed3a20845f642ce531e10c27fb2add5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:42 -0800 Subject: mm: change return values of finish_mkwrite_fault() Currently finish_mkwrite_fault() returns 0 when PTE got changed before we acquired PTE lock and VM_FAULT_WRITE when we succeeded in modifying the PTE. 
This is somewhat confusing since 0 generally means success; it is also inconsistent with finish_fault(), which returns 0 on success. Change finish_mkwrite_fault() to return 0 on success and VM_FAULT_NOPAGE when PTE changed. Practically, there should be no behavioral difference since we bail out from the fault the same way regardless of whether we return 0, VM_FAULT_NOPAGE, or VM_FAULT_WRITE. Also note that VM_FAULT_WRITE has no effect for shared mappings since the only two places that check it - KSM and GUP - care about private mappings only. Generally the meaning of VM_FAULT_WRITE for shared mappings is not well defined and we should probably clean that up. Link: http://lkml.kernel.org/r/1479460644-25076-17-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. Shutemov Cc: Ross Zwisler Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index bbc25da..8b7f065 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2295,10 +2295,10 @@ int finish_mkwrite_fault(struct vm_fault *vmf) */ if (!pte_same(*vmf->pte, vmf->orig_pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return VM_FAULT_NOPAGE; } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } /* @@ -2341,8 +2341,7 @@ static int wp_page_shared(struct vm_fault *vmf) return tmp; } tmp = finish_mkwrite_fault(vmf); - if (unlikely(!tmp || (tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { unlock_page(vmf->page); put_page(vmf->page); return tmp; -- cgit v1.1 From cae1240257d9ba4b40eb240124c530de8ee349bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:45 -0800 Subject: mm: export follow_pte() DAX will need to implement its own version of page_check_address(). To avoid duplicating page table walking code, export follow_pte(), which does what we need. Link: http://lkml.kernel.org/r/1479460644-25076-18-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8b7f065..edd899d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3817,8 +3817,8 @@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp) { int res; -- cgit v1.1 From 2f89dc12a25ddf995b9acd7b6543fe892e3473d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:50 -0800 Subject: dax: protect PTE modification on WP fault by radix tree entry lock Currently the PTE gets updated in wp_pfn_shared() after dax_pfn_mkwrite() has released the corresponding radix tree entry lock. When we want to write-protect the PTE on cache flush, we need the PTE modification to happen under the radix tree entry lock to ensure consistent updates of the PTE and the radix tree (standard faults use the page lock to ensure this consistency). So move the update of the PTE bit into dax_pfn_mkwrite(). Link: http://lkml.kernel.org/r/1479460644-25076-20-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Kirill A.
Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index edd899d..57d0bd1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2315,7 +2315,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vma, vmf); - if (ret & VM_FAULT_ERROR) + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); } -- cgit v1.1 From 148deab223b23734069abcacb5c7118b0e7deadc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:49 -0800 Subject: radix-tree: improve multiorder iterators This fixes several interlinked problems with the iterators in the presence of multiorder entries. 1. radix_tree_iter_next() would only advance by one slot, which would result in the iterators returning the same entry more than once if there were sibling entries. 2. radix_tree_next_slot() could return an internal pointer instead of a user pointer if a tagged multiorder entry was immediately followed by an entry of lower order. 3. radix_tree_next_slot() expanded to a lot more code than it used to when multiorder support was compiled in. And I wasn't comfortable with entry_to_node() being in a header file. Fixing radix_tree_iter_next() for the presence of sibling entries necessarily involves examining the contents of the radix tree, so we now need to pass 'slot' to radix_tree_iter_next(), and we need to change the calling convention so it is called *before* dropping the lock which protects the tree. Also rename it to radix_tree_iter_resume(), as some people thought it was necessary to call radix_tree_iter_next() each time around the loop. radix_tree_next_slot() becomes closer to how it looked before multiorder support was introduced. It only checks to see if the next entry in the chunk is a sibling entry or a pointer to a node; this should be rare enough that handling this case out of line is not a performance impact (and such impact is amortised by the fact that the entry we just processed was a multiorder entry). Also, radix_tree_next_slot() used to force a new chunk lookup for untagged entries, which is more expensive than the out of line sibling entry skipping. Link: http://lkml.kernel.org/r/1480369871-5271-55-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 7 +++---- mm/shmem.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7434a63..e32389a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: @@ -1546,7 +1546,6 @@ tree_unlocked: /* Put holes back where they were */ radix_tree_delete(&mapping->page_tree, iter.index); - slot = radix_tree_iter_next(&iter); continue; } @@ -1557,11 +1556,11 @@ tree_unlocked: page_ref_unfreeze(page, 2); radix_tree_replace_slot(&mapping->page_tree, slot, page); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); - slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); @@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/shmem.c b/mm/shmem.c index abd7403..be11c6d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -2447,8 +2447,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2517,8 +2517,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); -- cgit v1.1 From 478922e2b0f41567e4a530771bfb3f693f857d45 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:52 -0800 Subject: radix-tree: delete radix_tree_locate_item() This rather complicated function can be better implemented as an iterator. It has only one caller, so move the functionality to the only place that needs it. Update the test suite to follow the same pattern. Link: http://lkml.kernel.org/r/1480369871-5271-56-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Acked-by: Konstantin Khlebnikov Tested-by: Kirill A. 
Shutemov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index be11c6d..54287d44 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } +static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned int checked = 0; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4096) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + + rcu_read_unlock(); + return found; +} + /* * If swap found in inode, free it and move page from swapcache to filecache. */ @@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->page_tree, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ -- cgit v1.1 From 268f42de718128cd0301293177e79c08c38e39a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:55 -0800 Subject: radix-tree: delete radix_tree_range_tag_if_tagged() This is an exceptionally complicated function with just one caller (tag_pages_for_writeback). We devote a large portion of the runtime of the test suite to testing this one function which has one caller. By introducing the new function radix_tree_iter_tag_set(), we can eliminate all of the complexity while keeping the performance. The caller can now use a fairly standard radix_tree_for_each() loop, and it doesn't need to worry about tricksy things like 'start' wrapping. The test suite continues to spend a large amount of time investigating this function, but now it's testing the underlying primitives such as radix_tree_iter_resume() and the radix_tree_for_each_tagged() iterator which are also used by other parts of the kernel. Link: http://lkml.kernel.org/r/1480369871-5271-57-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 52e2f8e..290e8b7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { #define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + PAGECACHE_TAG_DIRTY) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(&mapping->page_tree, &iter, + PAGECACHE_TAG_TOWRITE); + tagged++; + if ((tagged % WRITEBACK_TAG_BATCH) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + spin_lock_irq(&mapping->tree_lock); + } + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(tag_pages_for_writeback); -- cgit v1.1
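The conversions in the last few patches (khugepaged, shmem, page-writeback) all share one calling convention: radix_tree_iter_resume() is called on the current slot while the tree is still protected, i.e. before the lock is dropped or the task reschedules, and then the same radix_tree_for_each_*() loop simply continues. The sketch below is illustrative only and is not part of any patch above: walk_tree() and my_tree are hypothetical names, while radix_tree_for_each_slot(), radix_tree_iter_resume(), radix_tree_iter_retry() and cond_resched_rcu() are the kernel interfaces used in the hunks above (modelled on find_swap_entry() and shmem_tag_pins()).

/*
 * Illustrative sketch of the post-conversion iteration pattern.
 * walk_tree() and my_tree are hypothetical; the helpers are the ones
 * used by the shmem/khugepaged/page-writeback hunks above.
 */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static void walk_tree(struct radix_tree_root *my_tree)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int seen = 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, my_tree, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(entry)) {
			/* Raced with a node replacement; retry this index. */
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

		/* ... process entry here ... */

		if ((++seen % 4096) == 0 && need_resched()) {
			/*
			 * Resume the iterator *before* cond_resched_rcu()
			 * drops the RCU read lock (or before tree_lock is
			 * dropped in the locked variants above), so that the
			 * walk does not revisit sibling slots of a multiorder
			 * entry and continues at the correct index.
			 */
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}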