diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/fadvise.c | 6 | ||||
-rw-r--r-- | mm/filemap.c | 25 | ||||
-rw-r--r-- | mm/filemap_xip.c | 478 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 42 |
6 files changed, 52 insertions, 502 deletions
diff --git a/mm/Makefile b/mm/Makefile index 088c68e..3c1caa2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,7 +55,6 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o -obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/mm/fadvise.c b/mm/fadvise.c index fac23ec..4a3907c 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -28,6 +28,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); + struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) if (!f.file) return -EBADF; - if (S_ISFIFO(file_inode(f.file)->i_mode)) { + inode = file_inode(f.file); + if (S_ISFIFO(inode->i_mode)) { ret = -ESPIPE; goto out; } @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (mapping->a_ops->get_xip_mem) { + if (IS_DAX(inode)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap.c b/mm/filemap.c index d9f5336..ad72420 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (file->f_flags & O_DIRECT) { + if (io_is_direct(file)) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for - * the rest of the read. + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { file_accessed(file); goto out; } @@ -2582,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { + if (io_is_direct(file)) { loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) - goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). */ + if (written < 0 || written == count || IS_DAX(inode)) + goto out; + pos += written; count -= written; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c deleted file mode 100644 index c175f9f..0000000 --- a/mm/filemap_xip.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * linux/mm/filemap_xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte <cotte@de.ibm.com> - * - * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds - * - */ - -#include <linux/fs.h> -#include <linux/backing-dev.h> -#include <linux/pagemap.h> -#include <linux/export.h> -#include <linux/uio.h> -#include <linux/rmap.h> -#include <linux/mmu_notifier.h> -#include <linux/sched.h> -#include <linux/seqlock.h> -#include <linux/mutex.h> -#include <linux/gfp.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -/* - * We do use our own empty page to avoid interference with other users - * of ZERO_PAGE(), such as /dev/zero - */ -static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); -static struct page *__xip_sparse_page; - -/* called under xip_sparse_mutex */ -static struct page *xip_sparse_page(void) -{ - if (!__xip_sparse_page) { - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - - if (page) - __xip_sparse_page = page; - } - return __xip_sparse_page; -} - -/* - * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_mem() function for the actual low-level - * stuff. - * - * Note the struct file* is not used at all. It may be NULL. - */ -static ssize_t -do_xip_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, - struct file *filp, - char __user *buf, - size_t len, - loff_t *ppos) -{ - struct inode *inode = mapping->host; - pgoff_t index, end_index; - unsigned long offset; - loff_t isize, pos; - size_t copied = 0, error = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - pos = *ppos; - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; - - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - do { - unsigned long nr, left; - void *xip_mem; - unsigned long xip_pfn; - int zero = 0; - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - if (nr > len - copied) - nr = len - copied; - - error = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(error)) { - if (error == -ENODATA) { - /* sparse */ - zero = 1; - } else - goto out; - } - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - /* address based flush */ ; - - /* - * Ok, we have the mem, so now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). - */ - if (!zero) - left = __copy_to_user(buf+copied, xip_mem+offset, nr); - else - left = __clear_user(buf + copied, nr); - - if (left) { - error = -EFAULT; - goto out; - } - - copied += (nr - left); - offset += (nr - left); - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; - } while (copied < len); - -out: - *ppos = pos + copied; - if (filp) - file_accessed(filp); - - return (copied ? copied : error); -} - -ssize_t -xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - if (!access_ok(VERIFY_WRITE, buf, len)) - return -EFAULT; - - return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - buf, len, ppos); -} -EXPORT_SYMBOL_GPL(xip_file_read); - -/* - * __xip_unmap is invoked from xip_unmap and xip_write - * - * This function walks all vmas of the address_space and unmaps the - * __xip_sparse_page when found at pgoff. - */ -static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) -{ - struct vm_area_struct *vma; - struct page *page; - unsigned count; - int locked = 0; - - count = read_seqcount_begin(&xip_sparse_seq); - - page = __xip_sparse_page; - if (!page) - return; - -retry: - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - pte_t *pte, pteval; - spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long address = vma->vm_start + - ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl, 1); - if (pte) { - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); - dec_mm_counter(mm, MM_FILEPAGES); - BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - /* must invalidate_page _before_ freeing the page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(page); - } - } - i_mmap_unlock_read(mapping); - - if (locked) { - mutex_unlock(&xip_sparse_mutex); - } else if (read_seqcount_retry(&xip_sparse_seq, count)) { - mutex_lock(&xip_sparse_mutex); - locked = 1; - goto retry; - } -} - -/* - * xip_fault() is invoked via the vma operations vector for a - * mapped memory region to read in file data during a page fault. - * - * This function is derived from filemap_fault, but used for execute in place - */ -static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - pgoff_t size; - void *xip_mem; - unsigned long xip_pfn; - struct page *page; - int error; - - /* XXX: are VM_FAULT_ codes OK? */ -again: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (likely(!error)) - goto found; - if (error != -ENODATA) - return VM_FAULT_OOM; - - /* sparse block */ - if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && - (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { - int err; - - /* maybe shared writable, allocate new block */ - mutex_lock(&xip_sparse_mutex); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (error) - return VM_FAULT_SIGBUS; - /* unmap sparse mappings at pgoff from all other vmas */ - __xip_unmap(mapping, vmf->pgoff); - -found: - err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - xip_pfn); - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* - * err == -EBUSY is fine, we've raced against another thread - * that faulted-in the same page - */ - if (err != -EBUSY) - BUG_ON(err); - return VM_FAULT_NOPAGE; - } else { - int err, ret = VM_FAULT_OOM; - - mutex_lock(&xip_sparse_mutex); - write_seqcount_begin(&xip_sparse_seq); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (unlikely(!error)) { - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - goto again; - } - if (error != -ENODATA) - goto out; - /* not shared and writable, use xip_sparse_page() */ - page = xip_sparse_page(); - if (!page) - goto out; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, - page); - if (err == -ENOMEM) - goto out; - - ret = VM_FAULT_NOPAGE; -out: - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - - return ret; - } -} - -static const struct vm_operations_struct xip_file_vm_ops = { - .fault = xip_file_fault, - .page_mkwrite = filemap_page_mkwrite, -}; - -int xip_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - BUG_ON(!file->f_mapping->a_ops->get_xip_mem); - - file_accessed(file); - vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; - return 0; -} -EXPORT_SYMBOL_GPL(xip_file_mmap); - -static ssize_t -__xip_file_write(struct file *filp, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) -{ - struct address_space * mapping = filp->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - size_t bytes; - ssize_t written = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - do { - unsigned long index; - unsigned long offset; - size_t copied; - void *xip_mem; - unsigned long xip_pfn; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (status == -ENODATA) { - /* we allocate a new page unmap it */ - mutex_lock(&xip_sparse_mutex); - status = a_ops->get_xip_mem(mapping, index, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (!status) - /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, index); - } - - if (status) - break; - - copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); - - if (likely(copied > 0)) { - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; - if (status < 0) - break; - } while (count); - *ppos = pos; - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - - return written ? written : status; -} - -ssize_t -xip_file_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; - size_t count; - loff_t pos; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - - if (!access_ok(VERIFY_READ, buf, len)) { - ret=-EFAULT; - goto out_up; - } - - pos = *ppos; - count = len; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out_backing; - if (count == 0) - goto out_backing; - - ret = file_remove_suid(filp); - if (ret) - goto out_backing; - - ret = file_update_time(filp); - if (ret) - goto out_backing; - - ret = __xip_file_write (filp, buf, count, pos, ppos); - - out_backing: - current->backing_dev_info = NULL; - out_up: - mutex_unlock(&inode->i_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xip_file_write); - -/* - * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_mem - * to get the page instead of page cache - */ -int -xip_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - unsigned length; - void *xip_mem; - unsigned long xip_pfn; - int err; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - blocksize = 1 << mapping->host->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - - err = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(err)) { - if (err == -ENODATA) - /* Hole? No need to truncate */ - return 0; - else - return err; - } - memset(xip_mem + offset, 0, length); - return 0; -} -EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/madvise.c b/mm/madvise.c index 1077cbd..d551475 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -239,7 +239,7 @@ static long madvise_willneed(struct vm_area_struct *vma, return -EBADF; #endif - if (file->f_mapping->a_ops->get_xip_mem) { + if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } diff --git a/mm/memory.c b/mm/memory.c index 9927532..8068893 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = page; + vmf.cow_page = NULL; ret = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2329,6 +2330,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; + /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); @@ -2638,7 +2640,8 @@ oom: * See filemap_fault() and __lock_page_retry(). */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, struct page **page) + pgoff_t pgoff, unsigned int flags, + struct page *cow_page, struct page **page) { struct vm_fault vmf; int ret; @@ -2647,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + if (!vmf.page) + goto out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2664,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + out: *page = vmf.page; return ret; } @@ -2834,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -2874,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - copy_user_highpage(new_page, fault_page, address, vma); + if (fault_page) + copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg); @@ -2912,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; |