summaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2015-04-16 22:13:18 +1000
committerDave Chinner <david@fromorbit.com>2015-04-16 22:13:18 +1000
commit542c311813d5cb2e6f0dfa9557f41c829b8fb6a0 (patch)
tree573c5644eb966e44112016c9ae86e80251326223 /fs/xfs
parent6a63ef064b2444883ce8b68b0779d0c739d27204 (diff)
parent0cefb29e6a63727bc7606c47fc538467594ef112 (diff)
downloadop-kernel-dev-542c311813d5cb2e6f0dfa9557f41c829b8fb6a0.zip
op-kernel-dev-542c311813d5cb2e6f0dfa9557f41c829b8fb6a0.tar.gz
Merge branch 'xfs-dio-extend-fix' into for-next
Conflicts: fs/xfs/xfs_file.c
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_aops.c270
-rw-r--r--fs/xfs/xfs_file.c46
-rw-r--r--fs/xfs/xfs_trace.h5
3 files changed, 239 insertions, 82 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1..598b259 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
return try_to_free_buffers(page);
}
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ * ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+ struct inode *inode,
+ struct buffer_head *bh_result,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ struct xfs_ioend *ioend;
+ xfs_off_t size = bh_result->b_size;
+ int type;
+
+ if (ISUNWRITTEN(imap))
+ type = XFS_IO_UNWRITTEN;
+ else
+ type = XFS_IO_OVERWRITE;
+
+ trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+ if (bh_result->b_private) {
+ ioend = bh_result->b_private;
+ ASSERT(ioend->io_size > 0);
+ ASSERT(offset >= ioend->io_offset);
+ if (offset + size > ioend->io_offset + ioend->io_size)
+ ioend->io_size = offset - ioend->io_offset + size;
+
+ if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+ ioend->io_type = XFS_IO_UNWRITTEN;
+
+ trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+ ioend->io_size, ioend->io_type,
+ imap);
+ } else if (type == XFS_IO_UNWRITTEN ||
+ offset + size > i_size_read(inode)) {
+ ioend = xfs_alloc_ioend(inode, type);
+ ioend->io_offset = offset;
+ ioend->io_size = size;
+
+ bh_result->b_private = ioend;
+ set_buffer_defer_completion(bh_result);
+
+ trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+ imap);
+ } else {
+ trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+ imap);
+ }
+}
+
+/*
+ * If this is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset,
+ ssize_t size)
+{
+ xfs_off_t mapping_size;
+
+ mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+ mapping_size <<= inode->i_blkbits;
+
+ ASSERT(mapping_size > 0);
+ if (mapping_size > size)
+ mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
+ if (mapping_size > LONG_MAX)
+ mapping_size = LONG_MAX;
+
+ bh_result->b_size = mapping_size;
+}
+
STATIC int
__xfs_get_blocks(
struct inode *inode,
@@ -1321,31 +1432,37 @@ __xfs_get_blocks(
xfs_iunlock(ip, lockmode);
}
-
- trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+ trace_xfs_get_blocks_alloc(ip, offset, size,
+ ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+ : XFS_IO_DELALLOC, &imap);
} else if (nimaps) {
- trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ trace_xfs_get_blocks_found(ip, offset, size,
+ ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+ : XFS_IO_OVERWRITE, &imap);
xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
goto out_unlock;
}
+ /* trim mapping down to size requested */
+ if (direct || size > (1 << inode->i_blkbits))
+ xfs_map_trim_size(inode, iblock, bh_result,
+ &imap, offset, size);
+
+ /*
+ * For unwritten extents do not report a disk address in the buffered
+ * read case (treat as if we're reading into a hole).
+ */
if (imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK) {
- /*
- * For unwritten extents do not report a disk address on
- * the read case (treat as if we're reading into a hole).
- */
- if (create || !ISUNWRITTEN(&imap))
- xfs_map_buffer(inode, bh_result, &imap, offset);
- if (create && ISUNWRITTEN(&imap)) {
- if (direct) {
- bh_result->b_private = inode;
- set_buffer_defer_completion(bh_result);
- }
+ imap.br_startblock != DELAYSTARTBLOCK &&
+ (create || !ISUNWRITTEN(&imap))) {
+ xfs_map_buffer(inode, bh_result, &imap, offset);
+ if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
- }
+ /* direct IO needs special help */
+ if (create && direct)
+ xfs_map_direct(inode, bh_result, &imap, offset);
}
/*
@@ -1378,39 +1495,6 @@ __xfs_get_blocks(
}
}
- /*
- * If this is O_DIRECT or the mpage code calling tell them how large
- * the mapping is, so that we can avoid repeated get_blocks calls.
- *
- * If the mapping spans EOF, then we have to break the mapping up as the
- * mapping for blocks beyond EOF must be marked new so that sub block
- * regions can be correctly zeroed. We can't do this for mappings within
- * EOF unless the mapping was just allocated or is unwritten, otherwise
- * the callers would overwrite existing data with zeros. Hence we have
- * to split the mapping into a range up to and including EOF, and a
- * second mapping for beyond EOF.
- */
- if (direct || size > (1 << inode->i_blkbits)) {
- xfs_off_t mapping_size;
-
- mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
- mapping_size <<= inode->i_blkbits;
-
- ASSERT(mapping_size > 0);
- if (mapping_size > size)
- mapping_size = size;
- if (offset < i_size_read(inode) &&
- offset + mapping_size >= i_size_read(inode)) {
- /* limit mapping to block that spans EOF */
- mapping_size = roundup_64(i_size_read(inode) - offset,
- 1 << inode->i_blkbits);
- }
- if (mapping_size > LONG_MAX)
- mapping_size = LONG_MAX;
-
- bh_result->b_size = mapping_size;
- }
-
return 0;
out_unlock:
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
/*
* Complete a direct I/O write request.
*
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
*/
STATIC void
xfs_end_io_direct_write(
@@ -1455,43 +1541,71 @@ xfs_end_io_direct_write(
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ioend *ioend = private;
- if (XFS_FORCED_SHUTDOWN(mp))
+ trace_xfs_gbmap_direct_endio(ip, offset, size,
+ ioend ? ioend->io_type : 0, NULL);
+
+ if (!ioend) {
+ ASSERT(offset + size <= i_size_read(inode));
return;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out_end_io;
/*
- * While the generic direct I/O code updates the inode size, it does
- * so only after the end_io handler is called, which means our
- * end_io handler thinks the on-disk size is outside the in-core
- * size. To prevent this just update it a little bit earlier here.
+ * dio completion end_io functions are only called on writes if more
+ * than 0 bytes was written.
*/
+ ASSERT(size > 0);
+
+ /*
+ * The ioend only maps whole blocks, while the IO may be sector aligned.
+ * Hence the ioend offset/size may not match the IO offset/size exactly.
+ * Because we don't map overwrites within EOF into the ioend, the offset
+ * may not match, but only if the endio spans EOF. Either way, write
+ * the IO sizes into the ioend so that completion processing does the
+ * right thing.
+ */
+ ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+ ioend->io_size = size;
+ ioend->io_offset = offset;
+
+ /*
+ * The ioend tells us whether we are doing unwritten extent conversion
+ * or an append transaction that updates the on-disk file size. These
+ * cases are the only cases where we should *potentially* be needing
+ * to update the VFS inode size.
+ *
+ * We need to update the in-core inode size here so that we don't end up
+ * with the on-disk inode size being outside the in-core inode size. We
+ * have no other method of updating EOF for AIO, so always do it here
+ * if necessary.
+ *
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ */
+ spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
+ spin_unlock(&ip->i_flags_lock);
/*
- * For direct I/O we do not know if we need to allocate blocks or not,
- * so we can't preallocate an append transaction, as that results in
- * nested reservations and log space deadlocks. Hence allocate the
- * transaction here. While this is sub-optimal and can block IO
- * completion for some time, we're stuck with doing it this way until
- * we can pass the ioend to the direct IO allocation callbacks and
- * avoid nesting that way.
+ * If we are doing an append IO that needs to update the EOF on disk,
+ * do the transaction reserve now so we can use common end io
+ * processing. Stashing the error (if there is one) in the ioend will
+ * result in the ioend processing passing on the error if it is
+ * possible as we can't return it from here.
*/
- if (private && size > 0) {
- xfs_iomap_write_unwritten(ip, offset, size);
- } else if (offset + size > ip->i_d.di_size) {
- struct xfs_trans *tp;
- int error;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return;
- }
+ if (ioend->io_type == XFS_IO_OVERWRITE)
+ ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
- xfs_setfilesize(ip, tp, offset, size);
- }
+out_end_io:
+ xfs_end_io(&ioend->io_work);
+ return;
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c203839..3a5d305 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -569,20 +569,41 @@ restart:
* write. If zeroing is needed and we are currently holding the
* iolock shared, we need to update it to exclusive which implies
* having to redo all checks before.
+ *
+ * We need to serialise against EOF updates that occur in IO
+ * completions here. We want to make sure that nobody is changing the
+ * size while we do this check until we have placed an IO barrier (i.e.
+ * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+ * The spinlock effectively forms a memory barrier once we have the
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+ * and hence be able to correctly determine if we need to run zeroing.
*/
+ spin_lock(&ip->i_flags_lock);
if (*pos > i_size_read(inode)) {
bool zero = false;
+ spin_unlock(&ip->i_flags_lock);
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, *iolock);
+
+ /*
+ * We now have an IO submission barrier in place, but
+ * AIO can do EOF updates during IO completion and hence
+ * we now need to wait for all of them to drain. Non-AIO
+ * DIO will have drained before we are given the
+ * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+ * no-op.
+ */
+ inode_dio_wait(inode);
goto restart;
}
error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
if (error)
return error;
- }
+ } else
+ spin_unlock(&ip->i_flags_lock);
/*
* Updating the timestamps will grab the ilock again from
@@ -644,6 +665,8 @@ xfs_file_dio_aio_write(
int iolock;
size_t count = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
+ loff_t end;
+ struct iov_iter data;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -683,10 +706,11 @@ xfs_file_dio_aio_write(
if (ret)
goto out;
iov_iter_truncate(from, count);
+ end = pos + count - 1;
if (mapping->nrpages) {
ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, pos + count - 1);
+ pos, end);
if (ret)
goto out;
/*
@@ -696,7 +720,7 @@ xfs_file_dio_aio_write(
*/
ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
pos >> PAGE_CACHE_SHIFT,
- (pos + count - 1) >> PAGE_CACHE_SHIFT);
+ end >> PAGE_CACHE_SHIFT);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -713,8 +737,22 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, from, pos);
+ data = *from;
+ ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+
+ /* see generic_file_direct_write() for why this is necessary */
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ end >> PAGE_CACHE_SHIFT);
+ }
+
+ if (ret > 0) {
+ pos += ret;
+ iov_iter_advance(from, ret);
+ iocb->ki_pos = pos;
+ }
out:
xfs_rw_iunlock(ip, iolock);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b2a45cc..615781b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1221,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
OpenPOWER on IntegriCloud