From 4ab9ed578e82851645f3dd69d36d91ae77564d6c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:11:58 +1000 Subject: xfs: kill buffers over failed write ranges properly When a write fails, if we don't clear the delalloc flags from the buffers over the failed range, they can persist beyond EOF and cause problems. writeback will see the pages in the page cache, see they are dirty and continually retry the write, assuming that the page beyond EOF is just racing with a truncate. The page will eventually be released due to some other operation (e.g. direct IO), and it will not pass through invalidation because it is dirty. Hence it will be released with buffer_delay set on it, and trigger warnings in xfs_vm_releasepage() and assert fail in xfs_file_aio_write_direct because invalidation failed and we didn't write the corect amount. This causes failures on block size < page size filesystems in fsx and fsstress workloads run by xfstests. Fix it by completely trashing any state on the buffer that could be used to imply that it contains valid data when the delalloc range over the buffer is punched out during the failed write handling. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d..282c726 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1566,6 +1566,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } -- cgit v1.1 From 72ab70a19b4ebb19dbe2a79faaa6a4ccead58e70 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:13:29 +1000 Subject: xfs: write failure beyond EOF truncates too much data If we fail a write beyond EOF and have to handle it in xfs_vm_write_begin(), we truncate the inode back to the current inode size. This doesn't take into account the fact that we may have already made successful writes to the same page (in the case of block size < page size) and hence we can truncate the page cache away from blocks with valid data in them. If these blocks are delayed allocation blocks, we now have a mismatch between the page cache and the extent tree, and this will trigger - at minimum - a delayed block count mismatch assert when the inode is evicted from the cache. We can also trip over it when block mapping for direct IO - this is the most common symptom seen from fsx and fsstress when run from xfstests. Fix it by only truncating away the exact range we are updating state for in this write_begin call. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 282c726..5f29693 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1609,12 +1609,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; -- cgit v1.1 From aad3f3755e7f043789b772856d1a2935f2b41a4b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:14:11 +1000 Subject: xfs: xfs_vm_write_end truncates too much on failure Similar to the write_begin problem, xfs-vm_write_end will truncate back to the old EOF, potentially removing page cache from over the top of delalloc blocks with valid data in them. Fix this by truncating back to just the start of the failed write. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f29693..e0a7931 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1634,9 +1634,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1659,8 +1662,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; -- cgit v1.1 From 897b73b6a2ee5d3c06648b601beb1724f7fbd678 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:15:11 +1000 Subject: xfs: zeroing space needs to punch delalloc blocks When we are zeroing space andit is covered by a delalloc range, we need to punch the delalloc range out before we truncate the page cache. Failing to do so leaves and inconsistency between the page cache and the extent tree, which we later trip over when doing direct IO over the same range. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 13 ++++++++++++- fs/xfs/xfs_trace.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a64..296160b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1418,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c..65d8c79 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL -- cgit v1.1 From 0e1f789d0dc38db79dfc4ddfd9cf541a8c198b7a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:19 +1000 Subject: xfs: don't map ranges that span EOF for direct IO Al Viro tracked down the problem that has caused generic/263 to fail on XFS since the test was introduced. If is caused by xfs_get_blocks() mapping a single extent that spans EOF without marking it as buffer-new() so that the direct IO code does not zero the tail of the block at the new EOF. This is a long standing bug that has been around for many, many years. Because xfs_get_blocks() starts the map before EOF, it can't set buffer_new(), because that causes he direct IO code to also zero unaligned sectors at the head of the IO. This would overwrite valid data with zeros, and hence we cannot validly return a single extent that spans EOF to direct IO. Fix this by detecting a mapping that spans EOF and truncate it down to EOF. This results in the the direct IO code doing the right thing for unaligned data blocks before EOF, and then returning to get another mapping for the region beyond EOF which XFS treats correctly by setting buffer_new() on it. This makes direct Io behave correctly w.r.t. tail block zeroing beyond EOF, and fsx is happy about that. Again, thanks to Al Viro for finding what I couldn't. [ dchinner: Fix for __divdi3 build error: Reported-by: Paul Gortmaker Tested-by: Paul Gortmaker Signed-off-by: Mark Tinguely Reviewed-by: Eric Sandeen ] Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e0a7931..0479c32 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1344,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; -- cgit v1.1 From d39a2ced0fa0172faa46df0866fc22419b876e2a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:25 +1000 Subject: xfs: collapse range is delalloc challenged FSX has been detecting data corruption after to collapse range calls. The key observation is that the offset of the last extent in the file was not being shifted, and hence when the file size was adjusted it was truncating away data because the extents handled been correctly shifted. Tracing indicated that before the collapse, the extent list looked like: .... ino 0x5788 state idx 6 offset 26 block 195904 count 10 flag 0 ino 0x5788 state idx 7 offset 39 block 195917 count 35 flag 0 ino 0x5788 state idx 8 offset 86 block 195964 count 32 flag 0 and after the shift of 2 blocks: ino 0x5788 state idx 6 offset 24 block 195904 count 10 flag 0 ino 0x5788 state idx 7 offset 37 block 195917 count 35 flag 0 ino 0x5788 state idx 8 offset 86 block 195964 count 32 flag 0 Note that the last extent did not change offset. After the changing of the file size: ino 0x5788 state idx 6 offset 24 block 195904 count 10 flag 0 ino 0x5788 state idx 7 offset 37 block 195917 count 35 flag 0 ino 0x5788 state idx 8 offset 86 block 195964 count 30 flag 0 You can see that the last extent had it's length truncated, indicating that we've lost data. The reason for this is that the xfs_bmap_shift_extents() loop uses XFS_IFORK_NEXTENTS() to determine how many extents are in the inode. This, unfortunately, doesn't take into account delayed allocation extents - it's a count of physically allocated extents - and hence when the file being collapsed has a delalloc extent like this one does prior to the range being collapsed: .... ino 0x5788 state idx 4 offset 11 block 4503599627239429 count 1 flag 0 .... it gets the count wrong and terminates the shift loop early. Fix it by using the in-memory extent array size that includes delayed allocation extents to determine the number of extents on the inode. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092e..f0efc7e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5574,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } -- cgit v1.1 From 9c23eccc1e746f64b18fab070a37189b4422e44a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:26 +1000 Subject: xfs: unmount does not wait for shutdown during unmount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And interesting situation can occur if a log IO error occurs during the unmount of a filesystem. The cases reported have the same signature - the update of the superblock counters fails due to a log write IO error: XFS (dm-16): xfs_do_force_shutdown(0x2) called from line 1170 of file fs/xfs/xfs_log.c. Return address = 0xffffffffa08a44a1 XFS (dm-16): Log I/O Error Detected. Shutting down filesystem XFS (dm-16): Unable to update superblock counters. Freespace may not be correct on next mount. XFS (dm-16): xfs_log_force: error 5 returned. XFS (¿-¿¿¿): Please umount the filesystem and rectify the problem(s) It can be seen that the last line of output contains a corrupt device name - this is because the log and xfs_mount structures have already been freed by the time this message is printed. A kernel oops closely follows. The issue is that the shutdown is occurring in a separate IO completion thread to the unmount. Once the shutdown processing has started and all the iclogs are marked with XLOG_STATE_IOERROR, the log shutdown code wakes anyone waiting on a log force so they can process the shutdown error. This wakes up the unmount code that is doing a synchronous transaction to update the superblock counters. The unmount path now sees all the iclogs are marked with XLOG_STATE_IOERROR and so never waits on them again, knowing that if it does, there will not be a wakeup trigger for it and we will hang the unmount if we do. Hence the unmount runs through all the remaining code and frees all the filesystem structures while the xlog_iodone() is still processing the shutdown. When the log shutdown processing completes, xfs_do_force_shutdown() emits the "Please umount the filesystem and rectify the problem(s)" message, and xlog_iodone() then aborts all the objects attached to the iclog. An iclog that has already been freed.... The real issue here is that there is no serialisation point between the log IO and the unmount. We have serialisations points for log writes, log forces, reservations, etc, but we don't actually have any code that wakes for log IO to fully complete. We do that for all other types of object, so why not iclogbufs? Well, it turns out that we can easily do this. We've got xfs_buf handles, and that's what everyone else uses for IO serialisation. i.e. bp->b_sema. So, lets hold iclogbufs locked over IO, and only release the lock in xlog_iodone() when we are finished with the buffer. That way before we tear down the iclog, we can lock and unlock the buffer to ensure IO completion has finished completely before we tear it down. Signed-off-by: Dave Chinner Tested-by: Mike Snitzer Tested-by: Bob Mastors Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 53 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00..08624dc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1371,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1409,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1436,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1644,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1657,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1665,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1868,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; il_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); -- cgit v1.1 From 07d5035a289f8bebe0ea86c293b2d5412478c481 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:27 +1000 Subject: xfs: wrong error sign conversion during failed DIO writes We negate the error value being returned from a generic function incorrectly. The code path that it is running in returned negative errors, so there is no need to negate it to get the correct error signs here. This was uncovered by generic/019. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79e96ce..82afdcb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; -- cgit v1.1 From 8d6c121018bf60d631c05a4a2efc468a392b97bb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 17 Apr 2014 08:15:28 +1000 Subject: xfs: fix buffer use after free on IO error When testing exhaustion of dm snapshots, the following appeared with CONFIG_DEBUG_OBJECTS_FREE enabled: ODEBUG: free active (active state 0) object type: work_struct hint: xfs_buf_iodone_work+0x0/0x1d0 [xfs] indicating that we'd freed a buffer which still had a pending reference, down this path: [ 190.867975] [] debug_check_no_obj_freed+0x22b/0x270 [ 190.880820] [] kmem_cache_free+0xd0/0x370 [ 190.892615] [] xfs_buf_free+0xe4/0x210 [xfs] [ 190.905629] [] xfs_buf_rele+0xe7/0x270 [xfs] [ 190.911770] [] xfs_trans_read_buf_map+0x7b6/0xac0 [xfs] At issue is the fact that if IO fails in xfs_buf_iorequest, we'll queue completion unconditionally, and then call xfs_buf_rele; but if IO failed, there are no IOs remaining, and xfs_buf_rele will free the bp while work is still queued. Fix this by not scheduling completion if the buffer has an error on it; run it immediately. The rest is only comment changes. Thanks to dchinner for spotting the root cause. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fd..cb10a0a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1372,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( -- cgit v1.1 From 330033d697ed8d296fa52b5303db9d802ad901cc Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 17 Apr 2014 08:15:30 +1000 Subject: xfs: fix tmpfile/selinux deadlock and initialize security xfstests generic/004 reproduces an ilock deadlock using the tmpfile interface when selinux is enabled. This occurs because xfs_create_tmpfile() takes the ilock and then calls d_tmpfile(). The latter eventually calls into xfs_xattr_get() which attempts to get the lock again. E.g.: xfs_io D ffffffff81c134c0 4096 3561 3560 0x00000080 ffff8801176a1a68 0000000000000046 ffff8800b401b540 ffff8801176a1fd8 00000000001d5800 00000000001d5800 ffff8800b401b540 ffff8800b401b540 ffff8800b73a6bd0 fffffffeffffffff ffff8800b73a6bd8 ffff8800b5ddb480 Call Trace: [] schedule+0x29/0x70 [] rwsem_down_read_failed+0xc5/0x120 [] ? xfs_ilock_attr_map_shared+0x1f/0x50 [xfs] [] call_rwsem_down_read_failed+0x14/0x30 [] ? down_read_nested+0x89/0xa0 [] ? xfs_ilock+0x122/0x250 [xfs] [] xfs_ilock+0x122/0x250 [xfs] [] xfs_ilock_attr_map_shared+0x1f/0x50 [xfs] [] xfs_attr_get+0x90/0xe0 [xfs] [] xfs_xattr_get+0x37/0x50 [xfs] [] generic_getxattr+0x4f/0x70 [] inode_doinit_with_dentry+0x1ae/0x650 [] selinux_d_instantiate+0x1c/0x20 [] security_d_instantiate+0x1b/0x30 [] d_instantiate+0x50/0x70 [] d_tmpfile+0xb5/0xc0 [] xfs_create_tmpfile+0x362/0x410 [xfs] [] xfs_vn_tmpfile+0x18/0x20 [xfs] [] path_openat+0x228/0x6a0 [] ? sched_clock+0x9/0x10 [] ? kvm_clock_read+0x27/0x40 [] ? __alloc_fd+0xaf/0x1f0 [] do_filp_open+0x3a/0x90 [] ? _raw_spin_unlock+0x27/0x40 [] ? __alloc_fd+0xaf/0x1f0 [] do_sys_open+0x12e/0x210 [] SyS_open+0x1e/0x20 [] system_call_fastpath+0x16/0x1b xfs_vn_tmpfile() also fails to initialize security on the newly created inode. Pull the d_tmpfile() call up into xfs_vn_tmpfile() after the transaction has been committed and the inode unlocked. Also, initialize security on the inode based on the parent directory provided via the tmpfile call. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 5 +++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_iops.c | 20 +++++++++++++++++--- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38f..768087b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1334,7 +1334,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1403,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1415,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1f..f2fcde5 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e4..ef1ca01 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1053,11 +1053,25 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; + int error; + struct xfs_inode *ip; + struct inode *inode; - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + if (unlikely(error)) + return -error; - return -error; + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) { + iput(inode); + return -error; + } + + d_tmpfile(dentry, inode); + + return 0; } static const struct inode_operations xfs_inode_operations = { -- cgit v1.1