summaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-12-14 17:25:18 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-12-14 17:25:18 -0800
commita57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch)
tree5a42ee9a668f171143464bc86013954c1bbe94ad /mm/memory.c
parentcf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff)
parente1e14ab8411df344a17687821f8f78f0a1e73cbb (diff)
downloadop-kernel-dev-a57cb1c1d7974c62a5c80f7869e35b492ace12cd.zip
op-kernel-dev-a57cb1c1d7974c62a5c80f7869e35b492ace12cd.tar.gz
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: - a few misc things - kexec updates - DMA-mapping updates to better support networking DMA operations - IPC updates - various MM changes to improve DAX fault handling - lots of radix-tree changes, mainly to the test suite. All leading up to reimplementing the IDA/IDR code to be a wrapper layer over the radix-tree. However the final trigger-pulling patch is held off for 4.11. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits) radix tree test suite: delete unused rcupdate.c radix tree test suite: add new tag check radix-tree: ensure counts are initialised radix tree test suite: cache recently freed objects radix tree test suite: add some more functionality idr: reduce the number of bits per level from 8 to 6 rxrpc: abstract away knowledge of IDR internals tpm: use idr_find(), not idr_find_slowpath() idr: add ida_is_empty radix tree test suite: check multiorder iteration radix-tree: fix replacement for multiorder entries radix-tree: add radix_tree_split_preload() radix-tree: add radix_tree_split radix-tree: add radix_tree_join radix-tree: delete radix_tree_range_tag_if_tagged() radix-tree: delete radix_tree_locate_item() radix-tree: improve multiorder iterators btrfs: fix race in btrfs_free_dummy_fs_info() radix-tree: improve dump output radix-tree: make radix_tree_find_next_bit more useful ...
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c859
1 files changed, 434 insertions, 425 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da3..455c3e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
{
- struct vm_fault vmf;
int ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.page = page;
- vmf.cow_page = NULL;
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ struct address_space *mapping;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+}
+
+/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
- struct page *page, int page_mkwrite, int dirty_shared)
- __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
- update_mmu_cache(vma, fe->address, fe->pte);
- pte_unmap_unlock(fe->pte, fe->ptl);
-
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
-
- if (!page_mkwrite)
- lock_page(page);
-
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
-
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
- }
-
- return VM_FAULT_WRITE;
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- fe->address);
+ vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, fe->address, vma);
+ cow_user_page(new_page, old_page, vmf->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte))) {
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, fe->address, fe->pte, entry);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2263,79 +2269,91 @@ oom:
return VM_FAULT_OOM;
}
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it. The function returns
+ * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- struct vm_fault vmf = {
- .page = NULL,
- .pgoff = linear_page_index(vma, fe->address),
- .virtual_address =
- (void __user *)(fe->address & PAGE_MASK),
- .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
- };
int ret;
- pte_unmap_unlock(fe->pte, fe->ptl);
- ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
- if (ret & VM_FAULT_ERROR)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- /*
- * We might have raced with another page fault while we
- * released the pte_offset_map_lock.
- */
- if (!pte_same(*fe->pte, orig_pte)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
- }
+ return finish_mkwrite_fault(vmf);
}
- return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
- __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- int page_mkwrite = 0;
+ struct vm_area_struct *vma = vmf->vma;
- get_page(old_page);
+ get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(fe->pte, fe->ptl);
- tmp = do_page_mkwrite(vma, old_page, fe->address);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(old_page);
+ put_page(vmf->page);
return tmp;
}
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
- return 0;
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
}
- page_mkwrite = 1;
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
}
+ fault_dirty_shared_page(vma, vmf->page);
+ put_page(vmf->page);
- return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+ return VM_FAULT_WRITE;
}
/*
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
- __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *old_page;
+ struct vm_area_struct *vma = vmf->vma;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
- if (!old_page) {
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(fe, orig_pte);
+ return wp_pfn_shared(vmf);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
int total_mapcount;
- if (!trylock_page(old_page)) {
- get_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
+ if (!trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ lock_page(vmf->page);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ unlock_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(vmf->page);
return 0;
}
- put_page(old_page);
+ put_page(vmf->page);
}
- if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (reuse_swap_page(vmf->page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
* Protected against the rmap code by
* the page lock.
*/
- page_move_anon_rmap(old_page, vma);
+ page_move_anon_rmap(vmf->page, vma);
}
- unlock_page(old_page);
- return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+ unlock_page(vmf->page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
- unlock_page(old_page);
+ unlock_page(vmf->page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(fe, orig_pte, old_page);
+ return wp_page_shared(vmf);
}
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(old_page);
+ get_page(vmf->page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- entry = pte_to_swp_entry(orig_pte);
+ entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, fe->address, orig_pte, NULL);
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, fe->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
swapcache = page;
- locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, fe->address);
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (unlikely(!pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- fe->flags &= ~FAULT_FLAG_WRITE;
+ vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
+ if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ vmf->orig_pte = pte;
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
put_page(swapcache);
}
- if (fe->flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(fe, pte);
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, fe->address) < 0)
+ if (check_stack_guard_page(vma, vmf->address) < 0)
return VM_FAULT_SIGSEGV;
/*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(fe->pmd)))
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
- struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct vm_fault vmf;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
- vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = fe->flags;
- vmf.page = NULL;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.cow_page = cow_page;
-
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
- if (ret & VM_FAULT_DAX_LOCKED) {
- *entry = vmf.entry;
+ ret = vma->vm_ops->fault(vma, vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
return ret;
- }
- if (unlikely(PageHWPoison(vmf.page))) {
+ if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- put_page(vmf.page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf->page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
- *page = vmf.page;
return ret;
}
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- if (!pmd_none(*fe->pmd))
+ if (!pmd_none(*vmf->pmd))
goto map_pte;
- if (fe->prealloc_pte) {
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
- pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
- spin_unlock(fe->ptl);
- fe->prealloc_pte = 0;
- } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = 0;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
return 0;
}
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
-static void deposit_prealloc_pte(struct fault_env *fe)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
atomic_long_inc(&vma->vm_mm->nr_ptes);
- fe->prealloc_pte = 0;
+ vmf->prealloc_pte = 0;
}
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* Archs like ppc64 need additonal space to store information
* related to pte entry. Use the preallocated table for that.
*/
- if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
- fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
- deposit_prealloc_pte(fe);
+ deposit_prealloc_pte(vmf);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, haddr, fe->pmd);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
@@ -3005,13 +3013,13 @@ out:
* withdraw with pmd lock held.
*/
if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
- fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
- fe->pmd);
- spin_unlock(fe->ptl);
+ vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+ vmf->pmd);
+ spin_unlock(vmf->ptl);
return ret;
}
#else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
*
- * @fe: fault environment
+ * @vmf: fault environment
* @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
int ret;
- if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
- ret = do_set_pmd(fe, page);
+ ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
goto fault_handled;
}
- if (!fe->pte) {
- ret = pte_alloc_one_map(fe);
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
if (ret)
goto fault_handled;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*fe->pte))) {
+ if (unlikely(!pte_none(*vmf->pte))) {
ret = VM_FAULT_NOPAGE;
goto fault_handled;
}
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
ret = 0;
fault_handled:
/* preallocated pagetable is unused: free it */
- if (fe->prealloc_pte) {
- pte_free(fe->vma->vm_mm, fe->prealloc_pte);
- fe->prealloc_pte = 0;
+ if (vmf->prealloc_pte) {
+ pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = 0;
}
return ret;
}
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition. The function returns 0 on success, VM_FAULT_ code in case of
+ * error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int ret;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
+}
+
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = fe->address, nr_pages, mask;
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- fe->address = max(address & mask, fe->vma->vm_start);
- off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
- if (pmd_none(*fe->pmd)) {
- fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
/* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*fe->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() haven't done anything useful. Cold page cache? */
- if (!fe->pte)
+ if (!vmf->pte)
goto out;
/* check if the page fault is solved */
- fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*fe->pte))
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- fe->address = address;
- fe->pte = NULL;
+ vmf->address = address;
+ vmf->pte = NULL;
return ret;
}
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
+ struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(fe, pgoff);
+ ret = do_fault_around(vmf);
if (ret)
return ret;
}
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- unlock_page(fault_page);
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(fault_page);
+ put_page(vmf->page);
return ret;
}
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page, *new_page;
- void *fault_entry;
- struct mem_cgroup *memcg;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
- if (!new_page)
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- put_page(new_page);
+ if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ &vmf->memcg, false)) {
+ put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
- if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, fe->address, vma);
- __SetPageUptodate(new_page);
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
- ret |= alloc_set_pte(fe, memcg, new_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
- }
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg, false);
- put_page(new_page);
+ mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+ put_page(vmf->cow_page);
return ret;
}
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
- struct address_space *mapping;
- int dirtied = 0;
+ struct vm_area_struct *vma = vmf->vma;
int ret, tmp;
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, fe->address);
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(fault_page);
+ put_page(vmf->page);
return tmp;
}
}
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
+ ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(fault_page);
- put_page(fault_page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
return ret;
}
- if (set_page_dirty(fault_page))
- dirtied = 1;
- /*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
- * release semantics to prevent the compiler from undoing this copying.
- */
- mapping = page_rmapping(fault_page);
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
+ fault_dirty_shared_page(vma, vmf->page);
return ret;
}
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- pgoff_t pgoff = linear_page_index(vma, fe->address);
+ struct vm_area_struct *vma = vmf->vma;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
+ if (!(vmf->flags & FAULT_FLAG_WRITE))
+ return do_read_fault(vmf);
if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ return do_cow_fault(vmf);
+ return do_shared_fault(vmf);
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = -1;
int last_cpupid;
int target_nid;
bool migrated = false;
+ pte_t pte = vmf->orig_pte;
bool was_writable = pte_write(pte);
int flags = 0;
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, pte))) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3469,28 +3476,28 @@ out:
return 0;
}
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(fe);
+ return do_huge_pmd_anonymous_page(vmf);
if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
- fe->flags);
+ return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
+ vmf->flags);
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(fe->vma))
- return do_huge_pmd_wp_page(fe, orig_pmd);
- if (fe->vma->vm_ops->pmd_fault)
- return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ if (vmf->vma->vm_ops->pmd_fault)
+ return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
+ vmf->pmd, vmf->flags);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
- __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
- if (unlikely(pmd_none(*fe->pmd))) {
+ if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
- fe->pte = NULL;
+ vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- fe->pte = pte_offset_map(fe->pmd, fe->address);
-
- entry = *fe->pte;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
* ptl lock held. So here a barrier will do.
*/
barrier();
- if (pte_none(entry)) {
- pte_unmap(fe->pte);
- fe->pte = NULL;
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
}
}
- if (!fe->pte) {
- if (vma_is_anonymous(fe->vma))
- return do_anonymous_page(fe);
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
else
- return do_fault(fe);
+ return do_fault(vmf);
}
- if (!pte_present(entry))
- return do_swap_page(fe, entry);
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
- if (pte_protnone(entry) && vma_is_accessible(fe->vma))
- return do_numa_page(fe, entry);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, entry)))
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
- if (fe->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(fe, entry);
+ return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
- fe->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(fe->vma, fe->address, fe->pte);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (fe->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3606,10 +3613,12 @@ unlock:
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = address & PAGE_MASK,
.flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- fe.pmd = pmd_alloc(mm, pud, address);
- if (!fe.pmd)
+ vmf.pmd = pmd_alloc(mm, pud, address);
+ if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&fe);
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *fe.pmd;
+ pmd_t orig_pmd = *vmf.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&fe, orig_pmd);
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((fe.flags & FAULT_FLAG_WRITE) &&
+ if ((vmf.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&fe, orig_pmd);
+ ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&fe, orig_pmd);
+ huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
- return handle_pte_fault(&fe);
+ return handle_pte_fault(&vmf);
}
/*
@@ -3808,8 +3817,8 @@ out:
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
+ spinlock_t **ptlp)
{
int res;
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct page *page = NULL;
ret = get_user_pages_remote(tsk, mm, addr, 1,
- gup_flags, &page, &vma);
+ gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
OpenPOWER on IntegriCloud