From a870fe6dfaba1cc67424cde4cfd2cd3eee62bf35 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:02:28 +1000 Subject: xfs: force the log before shutting down When we have marked the filesystem for shutdown, we want to prevent any further buffer IO from being submitted. However, we currently force the log after marking the filesystem as shut down, hence allowing IO to the log *after* we have marked both the filesystem and the log as in an error state. Clean this up by forcing the log before we mark the filesytem with an error. This replaces the pure CIL flush that we currently have which works around this same issue (i.e the CIL can't be flushed once the shutdown flags are set) and hence enables us to clean up the logic substantially. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 55 +++++++++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 38 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ca4fd5b..85f36f2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3867,18 +3867,17 @@ xlog_state_ioerror( * This is called from xfs_force_shutdown, when we're forcibly * shutting down the filesystem, typically because of an IO error. * Our main objectives here are to make sure that: - * a. the filesystem gets marked 'SHUTDOWN' for all interested + * a. if !logerror, flush the logs to disk. Anything modified + * after this is ignored. + * b. the filesystem gets marked 'SHUTDOWN' for all interested * parties to find out, 'atomically'. - * b. those who're sleeping on log reservations, pinned objects and + * c. those who're sleeping on log reservations, pinned objects and * other resources get woken up, and be told the bad news. - * c. nothing new gets queued up after (a) and (b) are done. - * d. if !logerror, flush the iclogs to disk, then seal them off - * for business. + * d. nothing new gets queued up after (b) and (c) are done. * - * Note: for delayed logging the !logerror case needs to flush the regions - * held in memory out to the iclogs before flushing them to disk. This needs - * to be done before the log is marked as shutdown, otherwise the flush to the - * iclogs will fail. + * Note: for the !logerror case we need to flush the regions held in memory out + * to disk first. This needs to be done before the log is marked as shutdown, + * otherwise the iclog writes will fail. */ int xfs_log_force_umount( @@ -3910,16 +3909,16 @@ xfs_log_force_umount( ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } - retval = 0; /* - * Flush the in memory commit item list before marking the log as - * being shut down. We need to do it in this order to ensure all the - * completed transactions are flushed to disk with the xfs_log_force() - * call below. + * Flush all the completed transactions to disk before marking the log + * being shut down. We need to do it in this order to ensure that + * completed operations are safely on disk before we shut down, and that + * we don't have to issue any buffer IO after the shutdown flags are set + * to guarantee this. */ if (!logerror) - xlog_cil_force(log); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); /* * mark the filesystem and the as in a shutdown state and wake @@ -3931,18 +3930,11 @@ xfs_log_force_umount( XFS_BUF_DONE(mp->m_sb_bp); /* - * This flag is sort of redundant because of the mount flag, but - * it's good to maintain the separation between the log and the rest - * of XFS. + * Mark the log and the iclogs with IO error flags to prevent any + * further log IO from being issued or completed. */ log->l_flags |= XLOG_IO_ERROR; - - /* - * If we hit a log error, we want to mark all the iclogs IOERROR - * while we're still holding the loglock. - */ - if (logerror) - retval = xlog_state_ioerror(log); + retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); /* @@ -3955,19 +3947,6 @@ xfs_log_force_umount( xlog_grant_head_wake_all(&log->l_reserve_head); xlog_grant_head_wake_all(&log->l_write_head); - if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(!logerror); - /* - * Force the incore logs to disk before shutting the - * log down completely. - */ - _xfs_log_force(mp, XFS_LOG_SYNC, NULL); - - spin_lock(&log->l_icloglock); - retval = xlog_state_ioerror(log); - spin_unlock(&log->l_icloglock); - } - /* * Wake up everybody waiting on xfs_log_force. Wake the CIL push first * as if the log writes were completed. The abort handling in the log -- cgit v1.1 From cf53e99d192171a58791136d33fd3fea5d8bab35 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:01 +1000 Subject: xfs: Don't use xfs_buf_iowait in the delwri buffer code For the special case of delwri buffer submission and waiting, we don't need to issue IO synchronously at all. The second pass to call xfs_buf_iowait() can be replaced with blocking on xfs_buf_lock() - the buffer will be unlocked when the async IO is complete. This formalises a sane the method of waiting for async IO - take an extra reference, submit the IO, call xfs_buf_lock() when you want to wait for IO completion. i.e.: bp = xfs_buf_find(); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_iosubmit(bp); xfs_buf_lock(bp) error = bp->b_error; .... xfs_buf_relse(bp); While this is somewhat racy for gathering IO errors, none of the code that calls xfs_buf_delwri_submit() will race against other users of the buffers being submitted. Even if they do, we don't really care if the error is detected by the delwri code or the user we raced against. Either way, the error will be detected and handled. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca..9dc4c22 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1813,12 +1813,17 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE; + bp->b_flags |= XBF_WRITE | XBF_ASYNC; - if (!wait) { - bp->b_flags |= XBF_ASYNC; + /* + * we do all Io submission async. This means if we need to wait + * for IO completion we need to take an extra reference so the + * buffer is still valid on the other side. + */ + if (wait) + xfs_buf_hold(bp); + else list_del_init(&bp->b_list); - } xfs_bdstrat_cb(bp); } blk_finish_plug(&plug); @@ -1866,7 +1871,10 @@ xfs_buf_delwri_submit( bp = list_first_entry(&io_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); - error2 = xfs_buf_iowait(bp); + + /* locking the buffer will wait for async IO completion. */ + xfs_buf_lock(bp); + error2 = bp->b_error; xfs_buf_relse(bp); if (!error) error = error2; -- cgit v1.1 From e11bb8052c3f500e66142f33579cc054d691a8fb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:11 +1000 Subject: xfs: synchronous buffer IO needs a reference When synchronous IO runs IO completion work, it does so without an IO reference or a hold reference on the buffer. The IO "hold reference" is owned by the submitter, and released when the submission is complete. The IO reference is released when both the submitter and the bio end_io processing is run, and so if the io completion work is run from IO completion context, it is run without an IO reference. Hence we can get the situation where the submitter can submit the IO, see an error on the buffer and unlock and free the buffer while there is still IO in progress. This leads to use-after-free and memory corruption. Fix this by taking a "sync IO hold" reference that is owned by the IO and not released until after the buffer completion calls are run to wake up synchronous waiters. This means that the buffer will not be freed in any circumstance until all IO processing is completed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9dc4c22..48b1e29 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1019,6 +1019,9 @@ xfs_buf_iodone_work( else { ASSERT(read && bp->b_ops); complete(&bp->b_iowait); + + /* release the !XBF_ASYNC ref now we are done. */ + xfs_buf_rele(bp); } } @@ -1044,6 +1047,7 @@ xfs_buf_ioend( } else { bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); + xfs_buf_rele(bp); } } @@ -1086,8 +1090,11 @@ xfs_bioerror( xfs_buf_ioerror(bp, -EIO); /* - * We're calling xfs_buf_ioend, so delete XBF_DONE flag. + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. For + * sync IO, xfs_buf_ioend is going to remove a ref here. */ + if (!(bp->b_flags & XBF_ASYNC)) + xfs_buf_hold(bp); XFS_BUF_UNREAD(bp); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); @@ -1383,22 +1390,48 @@ xfs_buf_iorequest( if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); + + /* + * Take references to the buffer. For XBF_ASYNC buffers, holding a + * reference for as long as submission takes is all that is necessary + * here. The IO inherits the lock and hold count from the submitter, + * and these are release during IO completion processing. Taking a hold + * over submission ensures that the buffer is not freed until we have + * completed all processing, regardless of when IO errors occur or are + * reported. + * + * However, for synchronous IO, the IO does not inherit the submitters + * reference count, nor the buffer lock. Hence we need to take an extra + * reference to the buffer for the for the IO context so that we can + * guarantee the buffer is not freed until all IO completion processing + * is done. Otherwise the caller can drop their reference while the IO + * is still in progress and hence trigger a use-after-free situation. + */ xfs_buf_hold(bp); + if (!(bp->b_flags & XBF_ASYNC)) + xfs_buf_hold(bp); + /* - * Set the count to 1 initially, this will stop an I/O - * completion callout which happens before we have started - * all the I/O from calling xfs_buf_ioend too early. + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); + /* - * If _xfs_buf_ioapply failed, we'll get back here with - * only the reference we took above. _xfs_buf_ioend will - * drop it to zero, so we'd better not queue it for later, - * or we'll free it before it's done. + * If _xfs_buf_ioapply failed or we are doing synchronous IO that + * completes extremely quickly, we can get back here with only the IO + * reference we took above. _xfs_buf_ioend will drop it to zero. Run + * completion processing synchronously so that we don't return to the + * caller with completion still pending. This avoids unnecessary context + * switches associated with the end_io workqueue. */ - _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); + if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + _xfs_buf_ioend(bp, 0); + else + _xfs_buf_ioend(bp, 1); xfs_buf_rele(bp); } -- cgit v1.1 From e8aaba9a783c8e5d2c58ebe69650ea31b91bb745 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:22 +1000 Subject: xfs: xfs_buf_ioend and xfs_buf_iodone_work duplicate functionality We do some work in xfs_buf_ioend, and some work in xfs_buf_iodone_work, but much of that functionality is the same. This work can all be done in a single function, leaving xfs_buf_iodone just a wrapper to determine if we should execute it by workqueue or directly. hence rename xfs_buf_iodone_work to xfs_buf_ioend(), and add a new xfs_buf_ioend_async() for places that need async processing. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 88 +++++++++++++++++++++--------------------------- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_buf_item.c | 4 +-- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 6 files changed, 45 insertions(+), 55 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 48b1e29..a046149 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -998,26 +998,30 @@ xfs_buf_wait_unpin( * Buffer Utility Routines */ -STATIC void -xfs_buf_iodone_work( - struct work_struct *work) +void +xfs_buf_ioend( + struct xfs_buf *bp) { - struct xfs_buf *bp = - container_of(work, xfs_buf_t, b_iodone_work); - bool read = !!(bp->b_flags & XBF_READ); + bool read = bp->b_flags & XBF_READ; + + trace_xfs_buf_iodone(bp, _RET_IP_); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* only validate buffers that were read without errors */ - if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) + /* Only validate buffers that were read without errors */ + if (read && !bp->b_error && bp->b_ops) { + ASSERT(!bp->b_iodone); bp->b_ops->verify_read(bp); + } + + if (!bp->b_error) + bp->b_flags |= XBF_DONE; if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else { - ASSERT(read && bp->b_ops); complete(&bp->b_iowait); /* release the !XBF_ASYNC ref now we are done. */ @@ -1025,30 +1029,22 @@ xfs_buf_iodone_work( } } -void -xfs_buf_ioend( - struct xfs_buf *bp, - int schedule) +static void +xfs_buf_ioend_work( + struct work_struct *work) { - bool read = !!(bp->b_flags & XBF_READ); - - trace_xfs_buf_iodone(bp, _RET_IP_); + struct xfs_buf *bp = + container_of(work, xfs_buf_t, b_iodone_work); - if (bp->b_error == 0) - bp->b_flags |= XBF_DONE; + xfs_buf_ioend(bp); +} - if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { - if (schedule) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); - } else { - xfs_buf_iodone_work(&bp->b_iodone_work); - } - } else { - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - complete(&bp->b_iowait); - xfs_buf_rele(bp); - } +void +xfs_buf_ioend_async( + struct xfs_buf *bp) +{ + INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); } void @@ -1099,7 +1095,7 @@ xfs_bioerror( XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); return -EIO; } @@ -1186,15 +1182,6 @@ xfs_bwrite( } STATIC void -_xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) -{ - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) - xfs_buf_ioend(bp, schedule); -} - -STATIC void xfs_buf_bio_end_io( struct bio *bio, int error) @@ -1211,7 +1198,8 @@ xfs_buf_bio_end_io( if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - _xfs_buf_ioend(bp, 1); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend_async(bp); bio_put(bio); } @@ -1423,15 +1411,17 @@ xfs_buf_iorequest( /* * If _xfs_buf_ioapply failed or we are doing synchronous IO that * completes extremely quickly, we can get back here with only the IO - * reference we took above. _xfs_buf_ioend will drop it to zero. Run - * completion processing synchronously so that we don't return to the - * caller with completion still pending. This avoids unnecessary context - * switches associated with the end_io workqueue. + * reference we took above. If we drop it to zero, run completion + * processing synchronously so that we don't return to the caller with + * completion still pending. This avoids unnecessary context switches + * associated with the end_io workqueue. */ - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) - _xfs_buf_ioend(bp, 0); - else - _xfs_buf_ioend(bp, 1); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + xfs_buf_ioend(bp); + else + xfs_buf_ioend_async(bp); + } xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c753183..4585c15 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -286,7 +286,7 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioend(struct xfs_buf *bp); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern void xfs_buf_iorequest(xfs_buf_t *); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 76007de..4fd41b5 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -491,7 +491,7 @@ xfs_buf_item_unpin( xfs_buf_ioerror(bp, -EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } } @@ -1115,7 +1115,7 @@ do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fea3c92..00d210b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3056,7 +3056,7 @@ cluster_corrupt_out: XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } else { xfs_buf_stale(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 85f36f2..3567396 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1678,7 +1678,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, -EIO); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fd5787..4ba19bf 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -383,7 +383,7 @@ xlog_recover_iodone( SHUTDOWN_META_IO_ERROR); } bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /* -- cgit v1.1 From 61be9c529a4a715ab8679e9ca82bc3790c7ab66c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:31 +1000 Subject: xfs: rework xfs_buf_bio_endio error handling Currently the report of a bio error from completion immediately marks the buffer with an error. The issue is that this is racy w.r.t. synchronous IO - the submitter can see b_error being set before the IO is complete, and hence we cannot differentiate between submission failures and completion failures. Add an internal b_io_error field protected by the b_lock to catch IO completion errors, and only propagate that to the buffer during final IO completion handling. Hence we can tell in xfs_buf_iorequest if we've had a submission failure bey checking bp->b_error before dropping our b_io_remaining reference - that reference will prevent b_io_error values from being propagated to b_error in the event that completion races with submission. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 18 ++++++++++++++++-- fs/xfs/xfs_buf.h | 1 + 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a046149..170d6c0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1008,6 +1008,13 @@ xfs_buf_ioend( bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + /* + * Pull in IO completion errors now. We are guaranteed to be running + * single threaded, so we don't need the lock to read b_io_error. + */ + if (!bp->b_error && bp->b_io_error) + xfs_buf_ioerror(bp, bp->b_io_error); + /* Only validate buffers that were read without errors */ if (read && !bp->b_error && bp->b_ops) { ASSERT(!bp->b_iodone); @@ -1192,8 +1199,12 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (!bp->b_error) - xfs_buf_ioerror(bp, error); + if (error) { + spin_lock(&bp->b_lock); + if (!bp->b_io_error) + bp->b_io_error = error; + spin_unlock(&bp->b_lock); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1379,6 +1390,9 @@ xfs_buf_iorequest( if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + /* * Take references to the buffer. For XBF_ASYNC buffers, holding a * reference for as long as submission takes is all that is necessary diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4585c15..44db8cd 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -158,6 +158,7 @@ typedef struct xfs_buf { struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ + int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ -- cgit v1.1 From 8dac39219827113f14e97507646a610ca426b69e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:40 +1000 Subject: xfs: kill xfs_bdstrat_cb Only has two callers, and is just a shutdown check and error handler around xfs_buf_iorequest. However, the error handling is a mess of read and write semantics, and both internal callers only call it for writes. Hence kill the wrapper, and follow up with a patch to sanitise the error handling. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 170d6c0..0eee0f1 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1146,27 +1146,6 @@ xfs_bioerror_relse( return -EIO; } -STATIC int -xfs_bdstrat_cb( - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but - * written delayed anyway. These aren't associated - * with a transaction, and can be ignored. - */ - if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) - return xfs_bioerror_relse(bp); - else - return xfs_bioerror(bp); - } - - xfs_buf_iorequest(bp); - return 0; -} - int xfs_bwrite( struct xfs_buf *bp) @@ -1178,7 +1157,20 @@ xfs_bwrite( bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); - xfs_bdstrat_cb(bp); + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + + /* + * Metadata write that didn't get logged but written anyway. + * These aren't associated with a transaction, and can be + * ignored. + */ + if (!bp->b_iodone) + return xfs_bioerror_relse(bp); + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { @@ -1861,7 +1853,17 @@ __xfs_buf_delwri_submit( xfs_buf_hold(bp); else list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); + + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + + if (!bp->b_iodone) + xfs_bioerror_relse(bp); + else + xfs_bioerror(bp); + continue; + } + xfs_buf_iorequest(bp); } blk_finish_plug(&plug); -- cgit v1.1 From 2718775469a521c8b35442db5d665ac0c8c3c8ac Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:56 +1000 Subject: xfs: xfs_bioerror can die. Internal buffer write error handling is a mess due to the unnatural split between xfs_bioerror and xfs_bioerror_relse(). xfs_bwrite() only does sync IO and determines the handler to call based on b_iodone, so for this caller the only difference between xfs_bioerror() and xfs_bioerror_release() is the XBF_DONE flag. We don't care what the XBF_DONE flag state is because we stale the buffer in both paths - the next buffer lookup will clear XBF_DONE because XBF_STALE is set. Hence we can use common error handling for xfs_bwrite(). __xfs_buf_delwri_submit() is a similar - it's only ever called on writes - all sync or async - and again there's no reason to handle them any differently at all. Clean up the nasty error handling and remove xfs_bioerror(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 58 ++++++++++++-------------------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0eee0f1..409a8a0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1075,39 +1075,6 @@ xfs_buf_ioerror_alert( } /* - * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call xfs_buf_ioend - * so that the proper iodone callbacks get called. - */ -STATIC int -xfs_bioerror( - xfs_buf_t *bp) -{ -#ifdef XFSERRORDEBUG - ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); -#endif - - /* - * No need to wait until the buffer is unpinned, we aren't flushing it. - */ - xfs_buf_ioerror(bp, -EIO); - - /* - * We're calling xfs_buf_ioend, so delete XBF_DONE flag. For - * sync IO, xfs_buf_ioend is going to remove a ref here. - */ - if (!(bp->b_flags & XBF_ASYNC)) - xfs_buf_hold(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - - xfs_buf_ioend(bp); - - return -EIO; -} - -/* * Same as xfs_bioerror, except that we are releasing the buffer * here ourselves, and avoiding the xfs_buf_ioend call. * This is meant for userdata errors; metadata bufs come with @@ -1155,19 +1122,19 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | + XBF_WRITE_FAIL | XBF_DONE); if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but written anyway. - * These aren't associated with a transaction, and can be - * ignored. - */ - if (!bp->b_iodone) - return xfs_bioerror_relse(bp); - return xfs_bioerror(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + + /* sync IO, xfs_buf_ioend is going to remove a ref here */ + xfs_buf_hold(bp); + xfs_buf_ioend(bp); + return -EIO; } xfs_buf_iorequest(bp); @@ -1857,10 +1824,9 @@ __xfs_buf_delwri_submit( if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); - if (!bp->b_iodone) - xfs_bioerror_relse(bp); - else - xfs_bioerror(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + xfs_buf_ioend(bp); continue; } xfs_buf_iorequest(bp); -- cgit v1.1 From 8b131973d1628f1a0c5a36fe02269d696bbe60a3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:05:05 +1000 Subject: xfs: kill xfs_bioerror_relse There is only one caller now - xfs_trans_read_buf_map() - and it has very well defined call semantics - read, synchronous, and b_iodone is NULL. Hence it's pretty clear what error handling is necessary for this case. The bigger problem of untangling xfs_trans_read_buf_map error handling is left to a future patch. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 39 --------------------------------------- fs/xfs/xfs_buf.h | 2 -- fs/xfs/xfs_trans_buf.c | 9 ++++++--- 3 files changed, 6 insertions(+), 44 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 409a8a0..108eba7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1074,45 +1074,6 @@ xfs_buf_ioerror_alert( (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } -/* - * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the xfs_buf_ioend call. - * This is meant for userdata errors; metadata bufs come with - * iodone functions attached, so that we can track down errors. - */ -int -xfs_bioerror_relse( - struct xfs_buf *bp) -{ - int64_t fl = bp->b_flags; - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - * - * chunkhold expects B_DONE to be set, whether - * we actually finish the I/O or not. We don't want to - * change that interface. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_DONE(bp); - xfs_buf_stale(bp); - bp->b_iodone = NULL; - if (!(fl & XBF_ASYNC)) { - /* - * Mark b_error and B_ERROR _both_. - * Lot's of chunkcache code assumes that. - * There's no reason to mark error for - * ASYNC buffers. - */ - xfs_buf_ioerror(bp, -EIO); - complete(&bp->b_iowait); - } else { - xfs_buf_relse(bp); - } - - return -EIO; -} - int xfs_bwrite( struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 44db8cd..d8f57f6 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -297,8 +297,6 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -extern int xfs_bioerror_relse(struct xfs_buf *); - /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 96c898e..db4be5b 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -324,11 +324,14 @@ xfs_trans_read_buf_map( */ if (XFS_FORCED_SHUTDOWN(mp)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - } else { - xfs_buf_iorequest(bp); + bp->b_flags &= ~(XBF_READ | XBF_DONE); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + xfs_buf_relse(bp); + return -EIO; } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); -- cgit v1.1 From 595bff75dce51e0d6d94877b4b6d11b4747a63fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:05:14 +1000 Subject: xfs: introduce xfs_buf_submit[_wait] There is a lot of cookie-cutter code that looks like: if (shutdown) handle buffer error xfs_buf_iorequest(bp) error = xfs_buf_iowait(bp) if (error) handle buffer error spread through XFS. There's significant complexity now in xfs_buf_iorequest() to specifically handle this sort of synchronous IO pattern, but there's all sorts of nasty surprises in different error handling code dependent on who owns the buffer references and the locks. Pull this pattern into a single helper, where we can hide all the synchronous IO warts and hence make the error handling for all the callers much saner. This removes the need for a special extra reference to protect IO completion processing, as we can now hold a single reference across dispatch and waiting, simplifying the sync IO smeantics and error handling. In doing this, also rename xfs_buf_iorequest to xfs_buf_submit and make it explicitly handle on asynchronous IO. This forces all users to be switched specifically to one interface or the other and removes any ambiguity between how the interfaces are to be used. It also means that xfs_buf_iowait() goes away. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 14 +--- fs/xfs/xfs_buf.c | 169 ++++++++++++++++++++++++++--------------------- fs/xfs/xfs_buf.h | 4 +- fs/xfs/xfs_buf_item.c | 4 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 30 ++++----- fs/xfs/xfs_trace.h | 3 +- fs/xfs/xfs_trans_buf.c | 19 +----- 8 files changed, 117 insertions(+), 128 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2f1e30d..c53cc03 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1157,12 +1157,7 @@ xfs_zero_remaining_bytes( XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - break; - } - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_buf_ioerror_alert(bp, "xfs_zero_remaining_bytes(read)"); @@ -1175,12 +1170,7 @@ xfs_zero_remaining_bytes( XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - break; - } - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_buf_ioerror_alert(bp, "xfs_zero_remaining_bytes(write)"); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 108eba7..d99ec83 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -623,10 +623,11 @@ _xfs_buf_read( bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - xfs_buf_iorequest(bp); - if (flags & XBF_ASYNC) + if (flags & XBF_ASYNC) { + xfs_buf_submit(bp); return 0; - return xfs_buf_iowait(bp); + } + return xfs_buf_submit_wait(bp); } xfs_buf_t * @@ -708,12 +709,7 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { - xfs_buf_relse(bp); - return NULL; - } - xfs_buf_iorequest(bp); - xfs_buf_iowait(bp); + xfs_buf_submit_wait(bp); return bp; } @@ -1028,12 +1024,8 @@ xfs_buf_ioend( (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); - else { + else complete(&bp->b_iowait); - - /* release the !XBF_ASYNC ref now we are done. */ - xfs_buf_rele(bp); - } } static void @@ -1086,21 +1078,7 @@ xfs_bwrite( bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL | XBF_DONE); - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - - /* sync IO, xfs_buf_ioend is going to remove a ref here */ - xfs_buf_hold(bp); - xfs_buf_ioend(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); @@ -1209,7 +1187,7 @@ next_chunk: } else { /* * This is guaranteed not to be the last io reference count - * because the caller (xfs_buf_iorequest) holds a count itself. + * because the caller (xfs_buf_submit) holds a count itself. */ atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, -EIO); @@ -1299,13 +1277,29 @@ _xfs_buf_ioapply( blk_finish_plug(&plug); } +/* + * Asynchronous IO submission path. This transfers the buffer lock ownership and + * the current reference to the IO. It is not safe to reference the buffer after + * a call to this function unless the caller holds an additional reference + * itself. + */ void -xfs_buf_iorequest( - xfs_buf_t *bp) +xfs_buf_submit( + struct xfs_buf *bp) { - trace_xfs_buf_iorequest(bp, _RET_IP_); + trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + ASSERT(bp->b_flags & XBF_ASYNC); + + /* on shutdown we stale and complete the buffer immediately */ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + return; + } if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1314,25 +1308,14 @@ xfs_buf_iorequest( bp->b_io_error = 0; /* - * Take references to the buffer. For XBF_ASYNC buffers, holding a - * reference for as long as submission takes is all that is necessary - * here. The IO inherits the lock and hold count from the submitter, - * and these are release during IO completion processing. Taking a hold - * over submission ensures that the buffer is not freed until we have - * completed all processing, regardless of when IO errors occur or are - * reported. - * - * However, for synchronous IO, the IO does not inherit the submitters - * reference count, nor the buffer lock. Hence we need to take an extra - * reference to the buffer for the for the IO context so that we can - * guarantee the buffer is not freed until all IO completion processing - * is done. Otherwise the caller can drop their reference while the IO - * is still in progress and hence trigger a use-after-free situation. + * The caller's reference is released during I/O completion. + * This occurs some time after the last b_io_remaining reference is + * released, so after we drop our Io reference we have to have some + * other reference to ensure the buffer doesn't go away from underneath + * us. Take a direct reference to ensure we have safe access to the + * buffer until we are finished with it. */ xfs_buf_hold(bp); - if (!(bp->b_flags & XBF_ASYNC)) - xfs_buf_hold(bp); - /* * Set the count to 1 initially, this will stop an I/O completion @@ -1343,40 +1326,82 @@ xfs_buf_iorequest( _xfs_buf_ioapply(bp); /* - * If _xfs_buf_ioapply failed or we are doing synchronous IO that - * completes extremely quickly, we can get back here with only the IO - * reference we took above. If we drop it to zero, run completion - * processing synchronously so that we don't return to the caller with - * completion still pending. This avoids unnecessary context switches - * associated with the end_io workqueue. + * If _xfs_buf_ioapply failed, we can get back here with only the IO + * reference we took above. If we drop it to zero, run completion so + * that we don't return to the caller with completion still pending. */ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + if (bp->b_error) xfs_buf_ioend(bp); else xfs_buf_ioend_async(bp); } xfs_buf_rele(bp); + /* Note: it is not safe to reference bp now we've dropped our ref */ } /* - * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer, in which - * case nothing will ever complete. It returns the I/O error code, if any, or - * 0 if there was no error. + * Synchronous buffer IO submission path, read or write. */ int -xfs_buf_iowait( - xfs_buf_t *bp) +xfs_buf_submit_wait( + struct xfs_buf *bp) { - trace_xfs_buf_iowait(bp, _RET_IP_); + int error; - if (!bp->b_error) - wait_for_completion(&bp->b_iowait); + trace_xfs_buf_submit_wait(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC))); + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + bp->b_flags &= ~XBF_DONE; + return -EIO; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * For synchronous IO, the IO does not inherit the submitters reference + * count, nor the buffer lock. Hence we cannot release the reference we + * are about to take until we've waited for all IO completion to occur, + * including any xfs_buf_ioend_async() work that may be pending. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * make sure we run completion synchronously if it raced with us and is + * already complete. + */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp); + + /* wait for completion before gathering the error from the buffer */ + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); - return bp->b_error; + error = bp->b_error; + + /* + * all done now, we can release the hold that keeps the buffer + * referenced for the entire IO. + */ + xfs_buf_rele(bp); + return error; } xfs_caddr_t @@ -1782,15 +1807,7 @@ __xfs_buf_delwri_submit( else list_del_init(&bp->b_list); - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - xfs_buf_ioend(bp); - continue; - } - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); } blk_finish_plug(&plug); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d8f57f6..0acfc30 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -290,8 +290,8 @@ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern void xfs_buf_iorequest(xfs_buf_t *); -extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_submit(struct xfs_buf *bp); +extern int xfs_buf_submit_wait(struct xfs_buf *bp); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); #define xfs_buf_zero(bp, off, len) \ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 4fd41b5..cbea909 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks( * a way to shut the filesystem down if the writes keep failing. * * In practice we'll shut the filesystem down soon as non-transient - * erorrs tend to affect the whole device and a failing log write + * errors tend to affect the whole device and a failing log write * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { @@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks( if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3567396..fe88ef6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1688,7 +1688,7 @@ xlog_bdstrat( return 0; } - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); return 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4ba19bf..980e296 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -193,12 +193,8 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - if (XFS_FORCED_SHUTDOWN(log->l_mp)) - return -EIO; - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); - if (error) + error = xfs_buf_submit_wait(bp); + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -378,9 +374,11 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_buf_ioerror_alert(bp, __func__); - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); + if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } } bp->b_iodone = NULL; xfs_buf_ioend(bp); @@ -4427,16 +4425,12 @@ xlog_do_recover( XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - if (XFS_FORCED_SHUTDOWN(log->l_mp)) { - xfs_buf_relse(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { - xfs_buf_ioerror_alert(bp, __func__); - ASSERT(0); + if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_ioerror_alert(bp, __func__); + ASSERT(0); + } xfs_buf_relse(bp); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 152f827..51372e3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); -DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_submit); +DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index db4be5b..e2b2216 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -318,23 +318,10 @@ xfs_trans_read_buf_map( XFS_BUF_READ(bp); bp->b_ops = ops; - /* - * XXX(hch): clean up the error handling here to be less - * of a mess.. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_DONE); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - xfs_buf_relse(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { - xfs_buf_ioerror_alert(bp, __func__); + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read -- cgit v1.1 From ba3726742c1712c43c5a18245476f3fe9fe74773 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:05:32 +1000 Subject: xfs: check xfs_buf_read_uncached returns correctly xfs_buf_read_uncached() has two failure modes. If can either return NULL or bp->b_error != 0 depending on the type of failure, and not all callers check for both. Fix it so that xfs_buf_read_uncached() always returns the error status, and the buffer is returned as a function parameter. The buffer will only be returned on success. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 18 +++++++++++++---- fs/xfs/xfs_buf.h | 6 +++--- fs/xfs/xfs_fsops.c | 11 +++-------- fs/xfs/xfs_mount.c | 55 +++++++++++++++++++++++++--------------------------- fs/xfs/xfs_rtalloc.c | 30 ++++++++++++---------------- 5 files changed, 58 insertions(+), 62 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d99ec83..6fbcbbf 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -688,29 +688,39 @@ xfs_buf_readahead_map( * Read an uncached buffer from disk. Allocates and returns a locked * buffer containing the disk contents or nothing. */ -struct xfs_buf * +int xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + *bpp = NULL; + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) - return NULL; + return -ENOMEM; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); - bp->b_bn = daddr; + bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; xfs_buf_submit_wait(bp); - return bp; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } + + *bpp = bp; + return 0; } /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 0acfc30..82002c0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -269,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags, - const struct xfs_buf_ops *ops); +int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, + size_t numblks, int flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f91de1e..c05ac8b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -172,16 +172,11 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp->m_ddev_targp, + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, NULL); - if (!bp) - return -EIO; - if (bp->b_error) { - error = bp->b_error; - xfs_buf_relse(bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) return error; - } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fbf0384..142c460 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -302,21 +302,15 @@ xfs_readsb( * access to the superblock. */ reread: - bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, buf_ops); - if (!bp) { - if (loud) - xfs_warn(mp, "SB buffer read failed"); - return -EIO; - } - if (bp->b_error) { - error = bp->b_error; + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0, &bp, buf_ops); + if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; - goto release_buf; + return error; } /* @@ -546,40 +540,43 @@ xfs_set_inoalignment(xfs_mount_t *mp) * Check that the data (and log if separate) is an ok size. */ STATIC int -xfs_check_sizes(xfs_mount_t *mp) +xfs_check_sizes( + struct xfs_mount *mp) { - xfs_buf_t *bp; + struct xfs_buf *bp; xfs_daddr_t d; + int error; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { xfs_warn(mp, "filesystem size mismatch detected"); return -EFBIG; } - bp = xfs_buf_read_uncached(mp->m_ddev_targp, + error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, NULL); - if (!bp) { + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { xfs_warn(mp, "last sector read failed"); - return -EIO; + return error; } xfs_buf_relse(bp); - if (mp->m_logdev_targp != mp->m_ddev_targp) { - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { - xfs_warn(mp, "log size mismatch detected"); - return -EFBIG; - } - bp = xfs_buf_read_uncached(mp->m_logdev_targp, + if (mp->m_logdev_targp == mp->m_ddev_targp) + return 0; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) { - xfs_warn(mp, "log device read failed"); - return -EIO; - } - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "log device read failed"); + return error; } + xfs_buf_relse(bp); return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 909e143..1ad0093 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -972,16 +972,11 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) - return -EIO; - if (bp->b_error) { - error = bp->b_error; - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) return error; - } xfs_buf_relse(bp); /* @@ -1235,11 +1230,12 @@ xfs_rtallocate_extent( */ int /* error */ xfs_rtmount_init( - xfs_mount_t *mp) /* file system mount structure */ + struct xfs_mount *mp) /* file system mount structure */ { - xfs_buf_t *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - xfs_sb_t *sbp; /* filesystem superblock copy in mount */ + struct xfs_buf *bp; /* buffer for last block of subvolume */ + struct xfs_sb *sbp; /* filesystem superblock copy in mount */ + xfs_daddr_t d; /* address of last block of subvolume */ + int error; sbp = &mp->m_sb; if (sbp->sb_rblocks == 0) @@ -1265,14 +1261,12 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return -EFBIG; } - bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + error = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp || bp->b_error) { + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { xfs_warn(mp, "realtime device size check failed"); - if (bp) - xfs_buf_relse(bp); - return -EIO; + return error; } xfs_buf_relse(bp); return 0; -- cgit v1.1 From 8c15612546bce1ecafb7dee3cce8a2a9b560e15e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Oct 2014 09:05:44 +1000 Subject: xfs: simplify xfs_zero_remaining_bytes xfs_zero_remaining_bytes() open codes a log of buffer manupulations to do a read forllowed by a write. It can simply be replaced by an uncached read followed by a xfs_bwrite() call. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c53cc03..aa70620 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes( if (endoff > XFS_ISIZE(ip)) endoff = XFS_ISIZE(ip); - bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - BTOBB(mp->m_sb.sb_blocksize), 0); - if (!bp) - return -ENOMEM; - - xfs_buf_unlock(bp); - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { uint lock_mode; @@ -1152,32 +1144,24 @@ xfs_zero_remaining_bytes( ASSERT(imap.br_startblock != DELAYSTARTBLOCK); if (imap.br_state == XFS_EXT_UNWRITTEN) continue; - XFS_BUF_UNDONE(bp); - XFS_BUF_UNWRITE(bp); - XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - error = xfs_buf_submit_wait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(read)"); - break; - } + error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + xfs_fsb_to_db(ip, imap.br_startblock), + BTOBB(mp->m_sb.sb_blocksize), + 0, &bp, NULL); + if (error) + return error; + memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); - XFS_BUF_UNDONE(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_WRITE(bp); + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); - error = xfs_buf_submit_wait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(write)"); - break; - } + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + return error; } - xfs_buf_free(bp); return error; } -- cgit v1.1