From de0e8c20ba3a65b0f15040aabbefdc1999876e6b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Feb 2015 21:44:19 +1100 Subject: xfs: use i_mmaplock on read faults Take the i_mmaplock over read page faults. These come through the ->fault callout, so we need to wrap the generic implementation with the i_mmaplock. While there, add tracepoints for the read fault as it passes through XFS. This gives us a lock order of mmap_sem -> i_mmaplock -> page_lock -> i_lock. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ce615d1..ac17422 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1379,6 +1379,32 @@ xfs_file_llseek( } } +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = filemap_fault(vma, vmf); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read = new_sync_read, @@ -1411,7 +1437,7 @@ const struct file_operations xfs_dir_file_operations = { }; static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = filemap_fault, + .fault = xfs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, }; -- cgit v1.1 From 075a924d45cc69c75a35f20b4912b85aa98b180a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Feb 2015 21:44:54 +1100 Subject: xfs: use i_mmaplock on write faults Take the i_mmaplock over write page faults. These come through the ->page_mkwrite callout, so we need to wrap that calls with the i_mmaplock. This gives us a lock order of mmap_sem -> i_mmaplock -> page_lock -> i_lock. Also, move the page_mkwrite wrapper to the same region of xfs_file.c as the read fault wrappers and add a tracepoint. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ac17422..d55f011 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -991,20 +991,6 @@ xfs_file_mmap( } /* - * mmap()d file has taken write protection fault and is being made - * writable. We can set the page state up correctly for a writable - * page, which means we can do correct delalloc accounting (ENOSPC - * checking!) and unwritten extent mapping. - */ -STATIC int -xfs_vm_page_mkwrite( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return block_page_mkwrite(vma, vmf, xfs_get_blocks); -} - -/* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). */ @@ -1405,6 +1391,29 @@ xfs_filemap_fault( return error; } +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. + */ +STATIC int +xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read = new_sync_read, @@ -1439,5 +1448,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .map_pages = filemap_map_pages, - .page_mkwrite = xfs_vm_page_mkwrite, + .page_mkwrite = xfs_filemap_page_mkwrite, }; -- cgit v1.1 From e8e9ad42c1f1e1bfbe0e8c32c8cac02e9ebfb7ef Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Feb 2015 21:45:32 +1100 Subject: xfs: take i_mmap_lock on extent manipulation operations Now we have the i_mmap_lock being held across the page fault IO path, we now add extent manipulation operation exclusion by adding the lock to the paths that directly modify extent maps. This includes truncate, hole punching and other fallocate based operations. The operations will now take both the i_iolock and the i_mmaplock in exclusive mode, thereby ensuring that all IO and page faults block without holding any page locks while the extent manipulation is in progress. This gives us the lock order during truncate of i_iolock -> i_mmaplock -> page_lock -> i_lock, hence providing the same lock order as the iolock provides the normal IO path without involving the mmap_sem. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d55f011..609b5aa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -841,6 +841,9 @@ xfs_file_fallocate( if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) -- cgit v1.1 From a904b1ca5751faf5ece8600e18cd3b674afcca1b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 25 Mar 2015 15:08:56 +1100 Subject: xfs: Add support FALLOC_FL_INSERT_RANGE for fallocate This patch implements fallocate's FALLOC_FL_INSERT_RANGE for XFS. 1) Make sure that both offset and len are block size aligned. 2) Update the i_size of inode by len bytes. 3) Compute the file's logical block number against offset. If the computed block number is not the starting block of the extent, split the extent such that the block number is the starting block of the extent. 4) Shift all the extents which are lying bewteen [offset, last allocated extent] towards right by len bytes. This step will make a hole of len bytes at offset. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ce615d1..edeaccc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -816,6 +816,11 @@ xfs_file_write_iter( return ret; } +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + STATIC long xfs_file_fallocate( struct file *file, @@ -829,11 +834,11 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; + bool do_file_insert = 0; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~XFS_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; xfs_ilock(ip, iolock); @@ -867,6 +872,27 @@ xfs_file_fallocate( error = xfs_collapse_file_space(ip, offset, len); if (error) goto out_unlock; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + new_size = i_size_read(inode) + len; + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* check the new inode size does not wrap through zero */ + if (new_size > inode->i_sb->s_maxbytes) { + error = -EFBIG; + goto out_unlock; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + do_file_insert = 1; } else { flags |= XFS_PREALLOC_SET; @@ -901,8 +927,19 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; error = xfs_setattr_size(ip, &iattr); + if (error) + goto out_unlock; } + /* + * Perform hole insertion now that the file size has been + * updated so that if we crash during the operation we don't + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. + */ + if (do_file_insert) + error = xfs_insert_file_space(ip, offset, len); + out_unlock: xfs_iunlock(ip, iolock); return error; -- cgit v1.1 From 21c3ea18819b5f650c75f59a0457415bc05d2b17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2015 11:38:29 +1000 Subject: xfs: unlock i_mutex in xfs_break_layouts We want to drop all I/O path locks when recalling layouts, and that includes i_mutex for the write path. Without this we get stuck processe when recalls take too long. [dchinner: fix build with !CONFIG_PNFS] Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index edeaccc..f63aedd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -555,7 +555,7 @@ restart: if (error) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, true); if (error) return error; @@ -842,7 +842,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; -- cgit v1.1 From b9d59846f73713d77f0f3fb784c7f84249fc2b93 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Apr 2015 22:03:07 +1000 Subject: xfs: DIO write completion size updates race xfs_end_io_direct_write() can race with other IO completions when updating the in-core inode size. The IO completion processing is not serialised for direct IO - they are done either under the IOLOCK_SHARED for non-AIO DIO, and without any IOLOCK held at all during AIO DIO completion. Hence the non-atomic test-and-set update of the in-core inode size is racy and can result in the in-core inode size going backwards if the race if hit just right. If the inode size goes backwards, this can trigger the EOF zeroing code to run incorrectly on the next IO, which then will zero data that has successfully been written to disk by a previous DIO. To fix this bug, we need to serialise the test/set updates of the in-core inode size. This first patch introduces locking around the relevant updates and checks in the DIO path. Because we now have an ioend in xfs_end_io_direct_write(), we know exactly then we are doing an IO that requires an in-core EOF update, and we know that they are not running in interrupt context. As such, we do not need to use irqsave() spinlock variants to protect against interrupts while the lock is held. Hence we can use an existing spinlock in the inode to do this serialisation and so not need to grow the struct xfs_inode just to work around this problem. This patch does not address the test/set EOF update in generic_file_write_direct() for various reasons - that will be done as a followup with separate explanation. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ce615d1..2323b8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -565,8 +565,18 @@ restart: * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. */ + spin_lock(&ip->i_flags_lock); if (*pos > i_size_read(inode)) { + spin_unlock(&ip->i_flags_lock); if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; @@ -576,7 +586,8 @@ restart: error = xfs_zero_eof(ip, *pos, i_size_read(inode)); if (error) return error; - } + } else + spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from -- cgit v1.1 From 40c63fbc55a968383b8bb5cacad81585e80cd323 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Apr 2015 22:03:17 +1000 Subject: xfs: direct IO EOF zeroing needs to drain AIO When we are doing AIO DIO writes, the IOLOCK only provides an IO submission barrier. When we need to do EOF zeroing, we need to ensure that no other IO is in progress and all pending in-core EOF updates have been completed. This requires us to wait for all outstanding AIO DIO writes to the inode to complete and, if necessary, run their EOF updates. Once all the EOF updates are complete, we can then restart xfs_file_aio_write_checks() while holding the IOLOCK_EXCL, knowing that EOF is up to date and we have exclusive IO access to the file so we can run EOF block zeroing if we need to without interference. This gives EOF zeroing the same exclusivity against other IO as we provide truncate operations. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2323b8b..f6f0e96 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -581,6 +581,16 @@ restart: xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); goto restart; } error = xfs_zero_eof(ip, *pos, i_size_read(inode)); -- cgit v1.1 From 0cefb29e6a63727bc7606c47fc538467594ef112 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Apr 2015 22:03:27 +1000 Subject: xfs: using generic_file_direct_write() is unnecessary generic_file_direct_write() does all sorts of things to make DIO work "sorta ok" with mixed buffered IO workloads. We already do most of this work in xfs_file_aio_dio_write() because of the locking requirements, so there's only a couple of things it does for us. The first thing is that it does a page cache invalidation after the ->direct_IO callout. This can easily be added to the XFS code. The second thing it does is that if data was written, it updates the iov_iter structure to reflect the data written, and then does EOF size updates if necessary. For XFS, these EOF size updates are now not necessary, as we do them safely and race-free in IO completion context. That leaves just the iov_iter update, and that's also moved to the XFS code. Therefore we don't need to call generic_file_direct_write() and in doing so remove redundant buffered writeback and page cache invalidation calls from the DIO submission path. We also remove a racy EOF size update, and make the DIO submission code in XFS much easier to follow. Wins all round, really. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f6f0e96..79ffb3e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -659,6 +659,8 @@ xfs_file_dio_aio_write( int iolock; size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; + loff_t end; + struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -698,10 +700,11 @@ xfs_file_dio_aio_write( if (ret) goto out; iov_iter_truncate(from, count); + end = pos + count - 1; if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, pos + count - 1); + pos, end); if (ret) goto out; /* @@ -711,7 +714,7 @@ xfs_file_dio_aio_write( */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + count - 1) >> PAGE_CACHE_SHIFT); + end >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -728,8 +731,22 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, from, pos); + data = *from; + ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); + + /* see generic_file_direct_write() for why this is necessary */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + } + + if (ret > 0) { + pos += ret; + iov_iter_advance(from, ret); + iocb->ki_pos = pos; + } out: xfs_rw_iunlock(ip, iolock); -- cgit v1.1