From de0e8c20ba3a65b0f15040aabbefdc1999876e6b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 23 Feb 2015 21:44:19 +1100
Subject: xfs: use i_mmaplock on read faults

Take the i_mmaplock over read page faults. These come through the
->fault callout, so we need to wrap the generic implementation
with the i_mmaplock. While there, add tracepoints for the read
fault as it passes through XFS.

This gives us a lock order of mmap_sem -> i_mmaplock -> page_lock
-> i_lock.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d1..ac17422 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1379,6 +1379,32 @@ xfs_file_llseek(
 	}
 }
 
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ */
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_fault(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = filemap_fault(vma, vmf);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
 const struct file_operations xfs_file_operations = {
 	.llseek		= xfs_file_llseek,
 	.read		= new_sync_read,
@@ -1411,7 +1437,7 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= filemap_fault,
+	.fault		= xfs_filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_vm_page_mkwrite,
 };
-- 
cgit v1.1


From 075a924d45cc69c75a35f20b4912b85aa98b180a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 23 Feb 2015 21:44:54 +1100
Subject: xfs: use i_mmaplock on write faults

Take the i_mmaplock over write page faults. These come through the
->page_mkwrite callout, so we need to wrap that calls with the
i_mmaplock.

This gives us a lock order of mmap_sem -> i_mmaplock -> page_lock
-> i_lock.

Also, move the page_mkwrite wrapper to the same region of xfs_file.c
as the read fault wrappers and add a tracepoint.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ac17422..d55f011 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -991,20 +991,6 @@ xfs_file_mmap(
 }
 
 /*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-	struct vm_area_struct	*vma,
-	struct vm_fault		*vmf)
-{
-	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
-/*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
  */
@@ -1405,6 +1391,29 @@ xfs_filemap_fault(
 	return error;
 }
 
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_page_mkwrite(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
 const struct file_operations xfs_file_operations = {
 	.llseek		= xfs_file_llseek,
 	.read		= new_sync_read,
@@ -1439,5 +1448,5 @@ const struct file_operations xfs_dir_file_operations = {
 static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_vm_page_mkwrite,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
 };
-- 
cgit v1.1


From e8e9ad42c1f1e1bfbe0e8c32c8cac02e9ebfb7ef Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 23 Feb 2015 21:45:32 +1100
Subject: xfs: take i_mmap_lock on extent manipulation operations

Now we have the i_mmap_lock being held across the page fault IO
path, we now add extent manipulation operation exclusion by adding
the lock to the paths that directly modify extent maps. This
includes truncate, hole punching and other fallocate based
operations. The operations will now take both the i_iolock and the
i_mmaplock in exclusive mode, thereby ensuring that all IO and page
faults block without holding any page locks while the extent
manipulation is in progress.

This gives us the lock order during truncate of i_iolock ->
i_mmaplock -> page_lock -> i_lock, hence providing the same
lock order as the iolock provides the normal IO path without
involving the mmap_sem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d55f011..609b5aa 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -841,6 +841,9 @@ xfs_file_fallocate(
 	if (error)
 		goto out_unlock;
 
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
-- 
cgit v1.1


From a904b1ca5751faf5ece8600e18cd3b674afcca1b Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Wed, 25 Mar 2015 15:08:56 +1100
Subject: xfs: Add support FALLOC_FL_INSERT_RANGE for fallocate

This patch implements fallocate's FALLOC_FL_INSERT_RANGE for XFS.

1) Make sure that both offset and len are block size aligned.
2) Update the i_size of inode by len bytes.
3) Compute the file's logical block number against offset. If the computed
   block number is not the starting block of the extent, split the extent
   such that the block number is the starting block of the extent.
4) Shift all the extents which are lying bewteen [offset, last allocated extent]
   towards right by len bytes. This step will make a hole of len bytes
   at offset.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d1..edeaccc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -816,6 +816,11 @@ xfs_file_write_iter(
 	return ret;
 }
 
+#define	XFS_FALLOC_FL_SUPPORTED						\
+		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
+		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
+		 FALLOC_FL_INSERT_RANGE)
+
 STATIC long
 xfs_file_fallocate(
 	struct file		*file,
@@ -829,11 +834,11 @@ xfs_file_fallocate(
 	enum xfs_prealloc_flags	flags = 0;
 	uint			iolock = XFS_IOLOCK_EXCL;
 	loff_t			new_size = 0;
+	bool			do_file_insert = 0;
 
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
@@ -867,6 +872,27 @@ xfs_file_fallocate(
 		error = xfs_collapse_file_space(ip, offset, len);
 		if (error)
 			goto out_unlock;
+	} else if (mode & FALLOC_FL_INSERT_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+		new_size = i_size_read(inode) + len;
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* check the new inode size does not wrap through zero */
+		if (new_size > inode->i_sb->s_maxbytes) {
+			error = -EFBIG;
+			goto out_unlock;
+		}
+
+		/* Offset should be less than i_size */
+		if (offset >= i_size_read(inode)) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+		do_file_insert = 1;
 	} else {
 		flags |= XFS_PREALLOC_SET;
 
@@ -901,8 +927,19 @@ xfs_file_fallocate(
 		iattr.ia_valid = ATTR_SIZE;
 		iattr.ia_size = new_size;
 		error = xfs_setattr_size(ip, &iattr);
+		if (error)
+			goto out_unlock;
 	}
 
+	/*
+	 * Perform hole insertion now that the file size has been
+	 * updated so that if we crash during the operation we don't
+	 * leave shifted extents past EOF and hence losing access to
+	 * the data that is contained within them.
+	 */
+	if (do_file_insert)
+		error = xfs_insert_file_space(ip, offset, len);
+
 out_unlock:
 	xfs_iunlock(ip, iolock);
 	return error;
-- 
cgit v1.1


From 21c3ea18819b5f650c75f59a0457415bc05d2b17 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Apr 2015 11:38:29 +1000
Subject: xfs: unlock i_mutex in xfs_break_layouts

We want to drop all I/O path locks when recalling layouts, and that includes
i_mutex for the write path.  Without this we get stuck processe when recalls
take too long.

[dchinner: fix build with !CONFIG_PNFS]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index edeaccc..f63aedd 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -555,7 +555,7 @@ restart:
 	if (error)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock);
+	error = xfs_break_layouts(inode, iolock, true);
 	if (error)
 		return error;
 
@@ -842,7 +842,7 @@ xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_break_layouts(inode, &iolock, false);
 	if (error)
 		goto out_unlock;
 
-- 
cgit v1.1


From b9d59846f73713d77f0f3fb784c7f84249fc2b93 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:03:07 +1000
Subject: xfs: DIO write completion size updates race

xfs_end_io_direct_write() can race with other IO completions when
updating the in-core inode size. The IO completion processing is not
serialised for direct IO - they are done either under the
IOLOCK_SHARED for non-AIO DIO, and without any IOLOCK held at all
during AIO DIO completion. Hence the non-atomic test-and-set update
of the in-core inode size is racy and can result in the in-core
inode size going backwards if the race if hit just right.

If the inode size goes backwards, this can trigger the EOF zeroing
code to run incorrectly on the next IO, which then will zero data
that has successfully been written to disk by a previous DIO.

To fix this bug, we need to serialise the test/set updates of the
in-core inode size. This first patch introduces locking around the
relevant updates and checks in the DIO path. Because we now have an
ioend in xfs_end_io_direct_write(), we know exactly then we are
doing an IO that requires an in-core EOF update, and we know that
they are not running in interrupt context. As such, we do not need to
use irqsave() spinlock variants to protect against interrupts while
the lock is held.

Hence we can use an existing spinlock in the inode to do this
serialisation and so not need to grow the struct xfs_inode just to
work around this problem.

This patch does not address the test/set EOF update in
generic_file_write_direct() for various reasons - that will be done
as a followup with separate explanation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d1..2323b8b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -565,8 +565,18 @@ restart:
 	 * write.  If zeroing is needed and we are currently holding the
 	 * iolock shared, we need to update it to exclusive which implies
 	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
 	 */
+	spin_lock(&ip->i_flags_lock);
 	if (*pos > i_size_read(inode)) {
+		spin_unlock(&ip->i_flags_lock);
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
@@ -576,7 +586,8 @@ restart:
 		error = xfs_zero_eof(ip, *pos, i_size_read(inode));
 		if (error)
 			return error;
-	}
+	} else
+		spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * Updating the timestamps will grab the ilock again from
-- 
cgit v1.1


From 40c63fbc55a968383b8bb5cacad81585e80cd323 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:03:17 +1000
Subject: xfs: direct IO EOF zeroing needs to drain AIO

When we are doing AIO DIO writes, the IOLOCK only provides an IO
submission barrier. When we need to do EOF zeroing, we need to ensure
that no other IO is in progress and all pending in-core EOF updates
have been completed. This requires us to wait for all outstanding
AIO DIO writes to the inode to complete and, if necessary, run their
EOF updates.

Once all the EOF updates are complete, we can then restart
xfs_file_aio_write_checks() while holding the IOLOCK_EXCL, knowing
that EOF is up to date and we have exclusive IO access to the file
so we can run EOF block zeroing if we need to without interference.
This gives EOF zeroing the same exclusivity against other IO as we
provide truncate operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2323b8b..f6f0e96 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -581,6 +581,16 @@ restart:
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, *iolock);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, *pos, i_size_read(inode));
-- 
cgit v1.1


From 0cefb29e6a63727bc7606c47fc538467594ef112 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:03:27 +1000
Subject: xfs: using generic_file_direct_write() is unnecessary

generic_file_direct_write() does all sorts of things to make DIO
work "sorta ok" with mixed buffered IO workloads. We already do
most of this work in xfs_file_aio_dio_write() because of the locking
requirements, so there's only a couple of things it does for us.

The first thing is that it does a page cache invalidation after the
->direct_IO callout. This can easily be added to the XFS code.

The second thing it does is that if data was written, it updates the
iov_iter structure to reflect the data written, and then does EOF
size updates if necessary. For XFS, these EOF size updates are now
not necessary, as we do them safely and race-free in IO completion
context. That leaves just the iov_iter update, and that's also moved
to the XFS code.

Therefore we don't need to call generic_file_direct_write() and in
doing so remove redundant buffered writeback and page cache
invalidation calls from the DIO submission path. We also remove a
racy EOF size update, and make the DIO submission code in XFS much
easier to follow. Wins all round, really.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f6f0e96..79ffb3e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -659,6 +659,8 @@ xfs_file_dio_aio_write(
 	int			iolock;
 	size_t			count = iov_iter_count(from);
 	loff_t			pos = iocb->ki_pos;
+	loff_t			end;
+	struct iov_iter		data;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -698,10 +700,11 @@ xfs_file_dio_aio_write(
 	if (ret)
 		goto out;
 	iov_iter_truncate(from, count);
+	end = pos + count - 1;
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, pos + count - 1);
+						   pos, end);
 		if (ret)
 			goto out;
 		/*
@@ -711,7 +714,7 @@ xfs_file_dio_aio_write(
 		 */
 		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
 					pos >> PAGE_CACHE_SHIFT,
-					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+					end >> PAGE_CACHE_SHIFT);
 		WARN_ON_ONCE(ret);
 		ret = 0;
 	}
@@ -728,8 +731,22 @@ xfs_file_dio_aio_write(
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-	ret = generic_file_direct_write(iocb, from, pos);
 
+	data = *from;
+	ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+
+	/* see generic_file_direct_write() for why this is necessary */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT,
+					      end >> PAGE_CACHE_SHIFT);
+	}
+
+	if (ret > 0) {
+		pos += ret;
+		iov_iter_advance(from, ret);
+		iocb->ki_pos = pos;
+	}
 out:
 	xfs_rw_iunlock(ip, iolock);
 
-- 
cgit v1.1