From 4ab9ed578e82851645f3dd69d36d91ae77564d6c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 14 Apr 2014 18:11:58 +1000
Subject: xfs: kill buffers over failed write ranges properly

When a write fails, if we don't clear the delalloc flags from the
buffers over the failed range, they can persist beyond EOF and cause
problems. writeback will see the pages in the page cache, see they
are dirty and continually retry the write, assuming that the page
beyond EOF is just racing with a truncate. The page will eventually
be released due to some other operation (e.g. direct IO), and it
will not pass through invalidation because it is dirty. Hence it
will be released with buffer_delay set on it, and trigger warnings
in xfs_vm_releasepage() and assert fail in xfs_file_aio_write_direct
because invalidation failed and we didn't write the correct amount.

This causes failures on block size < page size filesystems in fsx
and fsstress workloads run by xfstests.

Fix it by completely trashing any state on the buffer that could be
used to imply that it contains valid data when the delalloc range
over the buffer is punched out during the failed write handling.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 75df77d..282c726 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1566,6 +1566,16 @@ xfs_vm_write_failed(
 
 		xfs_vm_kill_delalloc_range(inode, block_offset,
 					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
 	}
 
 }
-- 
cgit v1.1


From 72ab70a19b4ebb19dbe2a79faaa6a4ccead58e70 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 14 Apr 2014 18:13:29 +1000
Subject: xfs: write failure beyond EOF truncates too much data

If we fail a write beyond EOF and have to handle it in
xfs_vm_write_begin(), we truncate the inode back to the current inode
size. This doesn't take into account the fact that we may have
already made successful writes to the same page (in the case of block
size < page size) and hence we can truncate the page cache away from
blocks with valid data in them. If these blocks are delayed
allocation blocks, we now have a mismatch between the page cache and
the extent tree, and this will trigger - at minimum - a delayed
block count mismatch assert when the inode is evicted from the cache.
We can also trip over it when block mapping for direct IO - this is
the most common symptom seen from fsx and fsstress when run from
xfstests.

Fix it by only truncating away the exact range we are updating state
for in this write_begin call.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 282c726..5f29693 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1609,12 +1609,21 @@ xfs_vm_write_begin(
 	status = __block_write_begin(page, pos, len, xfs_get_blocks);
 	if (unlikely(status)) {
 		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
 
 		xfs_vm_write_failed(inode, page, pos, len);
 		unlock_page(page);
 
-		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, i_size_read(inode));
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			ssize_t start = max_t(ssize_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
 
 		page_cache_release(page);
 		page = NULL;
-- 
cgit v1.1


From aad3f3755e7f043789b772856d1a2935f2b41a4b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 14 Apr 2014 18:14:11 +1000
Subject: xfs: xfs_vm_write_end truncates too much on failure

Similar to the write_begin problem, xfs_vm_write_end will truncate
back to the old EOF, potentially removing page cache from over the
top of delalloc blocks with valid data in them. Fix this by
truncating back to just the start of the failed write.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5f29693..e0a7931 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1634,9 +1634,12 @@ xfs_vm_write_begin(
 }
 
 /*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
  */
 STATIC int
 xfs_vm_write_end(
@@ -1659,8 +1662,11 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;
 
 		if (to > isize) {
-			truncate_pagecache(inode, isize);
+			/* only kill blocks in this write beyond EOF */
+			if (pos > isize)
+				isize = pos;
 			xfs_vm_kill_delalloc_range(inode, isize, to);
+			truncate_pagecache_range(inode, isize, to);
 		}
 	}
 	return ret;
-- 
cgit v1.1


From 897b73b6a2ee5d3c06648b601beb1724f7fbd678 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 14 Apr 2014 18:15:11 +1000
Subject: xfs: zeroing space needs to punch delalloc blocks

When we are zeroing space and it is covered by a delalloc range, we
need to punch the delalloc range out before we truncate the page
cache. Failing to do so leaves an inconsistency between the page
cache and the extent tree, which we later trip over when doing
direct IO over the same range.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_bmap_util.c | 13 ++++++++++++-
 fs/xfs/xfs_trace.h     |  1 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 01f6a64..296160b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1418,6 +1418,8 @@ xfs_zero_file_space(
 	xfs_off_t		end_boundary;
 	int			error;
 
+	trace_xfs_zero_file_space(ip);
+
 	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 
 	/*
@@ -1432,9 +1434,18 @@ xfs_zero_file_space(
 	ASSERT(end_boundary <= offset + len);
 
 	if (start_boundary < end_boundary - 1) {
-		/* punch out the page cache over the conversion range */
+		/*
+		 * punch out delayed allocation blocks and the page cache over
+		 * the conversion range
+		 */
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip,
+				XFS_B_TO_FSBT(mp, start_boundary),
+				XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		truncate_pagecache_range(VFS_I(ip), start_boundary,
 					 end_boundary - 1);
+
 		/* convert the blocks */
 		error = xfs_alloc_file_space(ip, start_boundary,
 					end_boundary - start_boundary - 1,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a4ae41c..65d8c79 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
 DEFINE_INODE_EVENT(xfs_inactive_symlink);
 DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
 DEFINE_INODE_EVENT(xfs_collapse_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
 #ifdef CONFIG_XFS_POSIX_ACL
-- 
cgit v1.1


From 0e1f789d0dc38db79dfc4ddfd9cf541a8c198b7a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 17 Apr 2014 08:15:19 +1000
Subject: xfs: don't map ranges that span EOF for direct IO

Al Viro tracked down the problem that has caused generic/263 to fail
on XFS since the test was introduced. It is caused by
xfs_get_blocks() mapping a single extent that spans EOF without
marking it as buffer_new() so that the direct IO code does not zero
the tail of the block at the new EOF. This is a long standing bug
that has been around for many, many years.

Because xfs_get_blocks() starts the map before EOF, it can't set
buffer_new(), because that causes the direct IO code to also zero
unaligned sectors at the head of the IO. This would overwrite valid
data with zeros, and hence we cannot validly return a single extent
that spans EOF to direct IO.

Fix this by detecting a mapping that spans EOF and truncate it down
to EOF. This results in the direct IO code doing the right thing
for unaligned data blocks before EOF, and then returning to get
another mapping for the region beyond EOF which XFS treats correctly
by setting buffer_new() on it. This makes direct IO behave correctly
w.r.t. tail block zeroing beyond EOF, and fsx is happy about that.

Again, thanks to Al Viro for finding what I couldn't.

[ dchinner: Fix for __divdi3 build error:

	Reported-by: Paul Gortmaker <paul.gortmaker@windriver.com>
	Tested-by: Paul Gortmaker <paul.gortmaker@windriver.com>
	Signed-off-by: Mark Tinguely <tinguely@sgi.com>
	Reviewed-by: Eric Sandeen <sandeen@redhat.com>
]

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e0a7931..0479c32 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1344,6 +1344,14 @@ __xfs_get_blocks(
 	/*
 	 * If this is O_DIRECT or the mpage code calling tell them how large
 	 * the mapping is, so that we can avoid repeated get_blocks calls.
+	 *
+	 * If the mapping spans EOF, then we have to break the mapping up as the
+	 * mapping for blocks beyond EOF must be marked new so that sub block
+	 * regions can be correctly zeroed. We can't do this for mappings within
+	 * EOF unless the mapping was just allocated or is unwritten, otherwise
+	 * the callers would overwrite existing data with zeros. Hence we have
+	 * to split the mapping into a range up to and including EOF, and a
+	 * second mapping for beyond EOF.
 	 */
 	if (direct || size > (1 << inode->i_blkbits)) {
 		xfs_off_t		mapping_size;
@@ -1354,6 +1362,12 @@ __xfs_get_blocks(
 		ASSERT(mapping_size > 0);
 		if (mapping_size > size)
 			mapping_size = size;
+		if (offset < i_size_read(inode) &&
+		    offset + mapping_size >= i_size_read(inode)) {
+			/* limit mapping to block that spans EOF */
+			mapping_size = roundup_64(i_size_read(inode) - offset,
+						  1 << inode->i_blkbits);
+		}
 		if (mapping_size > LONG_MAX)
 			mapping_size = LONG_MAX;
 
-- 
cgit v1.1


From d39a2ced0fa0172faa46df0866fc22419b876e2a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 17 Apr 2014 08:15:25 +1000
Subject: xfs: collapse range is delalloc challenged

FSX has been detecting data corruption after two collapse range
calls. The key observation is that the offset of the last extent in
the file was not being shifted, and hence when the file size was
adjusted it was truncating away data because the extents handled
been correctly shifted.

Tracing indicated that before the collapse, the extent list looked
like:

....
ino 0x5788 state  idx 6 offset 26 block 195904 count 10 flag 0
ino 0x5788 state  idx 7 offset 39 block 195917 count 35 flag 0
ino 0x5788 state  idx 8 offset 86 block 195964 count 32 flag 0

and after the shift of 2 blocks:

ino 0x5788 state  idx 6 offset 24 block 195904 count 10 flag 0
ino 0x5788 state  idx 7 offset 37 block 195917 count 35 flag 0
ino 0x5788 state  idx 8 offset 86 block 195964 count 32 flag 0

Note that the last extent did not change offset. After the changing
of the file size:

ino 0x5788 state  idx 6 offset 24 block 195904 count 10 flag 0
ino 0x5788 state  idx 7 offset 37 block 195917 count 35 flag 0
ino 0x5788 state  idx 8 offset 86 block 195964 count 30 flag 0

You can see that the last extent had its length truncated,
indicating that we've lost data.

The reason for this is that the xfs_bmap_shift_extents() loop uses
XFS_IFORK_NEXTENTS() to determine how many extents are in the inode.
This, unfortunately, doesn't take into account delayed allocation
extents - it's a count of physically allocated extents - and hence
when the file being collapsed has a delalloc extent like this one
does prior to the range being collapsed:

....
ino 0x5788 state  idx 4 offset 11 block 4503599627239429 count 1 flag 0
....

it gets the count wrong and terminates the shift loop early.

Fix it by using the in-memory extent array size that includes
delayed allocation extents to determine the number of extents on the
inode.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_bmap.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5b6092e..f0efc7e 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents(
 	int				whichfork = XFS_DATA_FORK;
 	int				logflags;
 	xfs_filblks_t			blockcount = 0;
+	int				total_extents;
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents(
 	ASSERT(current_ext != NULL);
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		/* Read in all the extents */
 		error = xfs_iread_extents(tp, ip, whichfork);
@@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents(
 
 	/* We are going to change core inode */
 	logflags = XFS_ILOG_CORE;
-
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
@@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents(
 		logflags |= XFS_ILOG_DEXT;
 	}
 
-	while (nexts++ < num_exts &&
-	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
+	/*
+	 * There may be delalloc extents in the data fork before the range we
+	 * are collapsing out, so we cannot
+	 * use the count of real extents here. Instead we have to calculate it
+	 * from the incore fork.
+	 */
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	while (nexts++ < num_exts && *current_ext < total_extents) {
 
 		gotp = xfs_iext_get_ext(ifp, *current_ext);
 		xfs_bmbt_get_all(gotp, &got);
@@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents(
 		}
 
 		(*current_ext)++;
+		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
 	}
 
 	/* Check if we are done */
-	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, whichfork))
+	if (*current_ext == total_extents)
 		*done = 1;
 
 del_cursor:
@@ -5568,6 +5574,5 @@ del_cursor:
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 
 	xfs_trans_log_inode(tp, ip, logflags);
-
 	return error;
 }
-- 
cgit v1.1


From 9c23eccc1e746f64b18fab070a37189b4422e44a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 17 Apr 2014 08:15:26 +1000
Subject: xfs: unmount does not wait for shutdown during unmount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An interesting situation can occur if a log IO error occurs during
the unmount of a filesystem. The cases reported have the same
signature - the update of the superblock counters fails due to a log
write IO error:

XFS (dm-16): xfs_do_force_shutdown(0x2) called from line 1170 of file fs/xfs/xfs_log.c.  Return address = 0xffffffffa08a44a1
XFS (dm-16): Log I/O Error Detected.  Shutting down filesystem
XFS (dm-16): Unable to update superblock counters. Freespace may not be correct on next mount.
XFS (dm-16): xfs_log_force: error 5 returned.
XFS (¿-¿¿¿): Please umount the filesystem and rectify the problem(s)

It can be seen that the last line of output contains a corrupt
device name - this is because the log and xfs_mount structures have
already been freed by the time this message is printed. A kernel
oops closely follows.

The issue is that the shutdown is occurring in a separate IO
completion thread to the unmount. Once the shutdown processing has
started and all the iclogs are marked with XLOG_STATE_IOERROR, the
log shutdown code wakes anyone waiting on a log force so they can
process the shutdown error. This wakes up the unmount code that
is doing a synchronous transaction to update the superblock
counters.

The unmount path now sees all the iclogs are marked with
XLOG_STATE_IOERROR and so never waits on them again, knowing that if
it does, there will not be a wakeup trigger for it and we will hang
the unmount if we do. Hence the unmount runs through all the
remaining code and frees all the filesystem structures while the
xlog_iodone() is still processing the shutdown. When the log
shutdown processing completes, xfs_do_force_shutdown() emits the
"Please umount the filesystem and rectify the problem(s)" message,
and xlog_iodone() then aborts all the objects attached to the iclog.
An iclog that has already been freed....

The real issue here is that there is no serialisation point between
the log IO and the unmount. We have serialisation points for log
writes, log forces, reservations, etc, but we don't actually have
any code that waits for log IO to fully complete. We do that for all
other types of object, so why not iclogbufs?

Well, it turns out that we can easily do this. We've got xfs_buf
handles, and that's what everyone else uses for IO serialisation.
i.e. bp->b_sema. So, lets hold iclogbufs locked over IO, and only
release the lock in xlog_iodone() when we are finished with the
buffer. That way before we tear down the iclog, we can lock and
unlock the buffer to ensure IO completion has finished completely
before we tear it down.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Mike Snitzer <snitzer@redhat.com>
Tested-by: Bob Mastors <bob.mastors@solidfire.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_log.c | 53 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00..08624dc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp)
 	/* log I/O is always issued ASYNC */
 	ASSERT(XFS_BUF_ISASYNC(bp));
 	xlog_state_done_syncing(iclog, aborted);
+
 	/*
-	 * do not reference the buffer (bp) here as we could race
-	 * with it being freed after writing the unmount record to the
-	 * log.
+	 * drop the buffer lock now that we are done. Nothing references
+	 * the buffer after this, so an unmount waiting on this lock can now
+	 * tear it down safely. As such, it is unsafe to reference the buffer
+	 * (bp) after the unlock as we could race with it being freed.
 	 */
+	xfs_buf_unlock(bp);
 }
 
 /*
@@ -1368,8 +1371,16 @@ xlog_alloc_log(
 	bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
 	if (!bp)
 		goto out_free_log;
-	bp->b_iodone = xlog_iodone;
+
+	/*
+	 * The iclogbuf buffer locks are held over IO but we are not going to do
+	 * IO yet.  Hence unlock the buffer so that the log IO path can grab it
+	 * when appropriately.
+	 */
 	ASSERT(xfs_buf_islocked(bp));
+	xfs_buf_unlock(bp);
+
+	bp->b_iodone = xlog_iodone;
 	log->l_xbuf = bp;
 
 	spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1409,9 @@ xlog_alloc_log(
 		if (!bp)
 			goto out_free_iclog;
 
+		ASSERT(xfs_buf_islocked(bp));
+		xfs_buf_unlock(bp);
+
 		bp->b_iodone = xlog_iodone;
 		iclog->ic_bp = bp;
 		iclog->ic_data = bp->b_addr;
@@ -1422,7 +1436,6 @@ xlog_alloc_log(
 		iclog->ic_callback_tail = &(iclog->ic_callback);
 		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
 
-		ASSERT(xfs_buf_islocked(iclog->ic_bp));
 		init_waitqueue_head(&iclog->ic_force_wait);
 		init_waitqueue_head(&iclog->ic_write_wait);
 
@@ -1631,6 +1644,12 @@ xlog_cksum(
  * we transition the iclogs to IOERROR state *after* flushing all existing
  * iclogs to disk. This is because we don't want anymore new transactions to be
  * started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to acheive that.
  */
 STATIC int
 xlog_bdstrat(
@@ -1638,6 +1657,7 @@ xlog_bdstrat(
 {
 	struct xlog_in_core	*iclog = bp->b_fspriv;
 
+	xfs_buf_lock(bp);
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		xfs_buf_ioerror(bp, EIO);
 		xfs_buf_stale(bp);
@@ -1645,7 +1665,8 @@ xlog_bdstrat(
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
-		 * doing it here.
+		 * doing it here. Similarly, IO completion will unlock the
+		 * buffer, so we don't do it here.
 		 */
 		return 0;
 	}
@@ -1847,14 +1868,28 @@ xlog_dealloc_log(
 	xlog_cil_destroy(log);
 
 	/*
-	 * always need to ensure that the extra buffer does not point to memory
-	 * owned by another log buffer before we free it.
+	 * Cycle all the iclogbuf locks to make sure all log IO completion
+	 * is done before we tear down these buffers.
 	 */
+	iclog = log->l_iclog;
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		xfs_buf_lock(iclog->ic_bp);
+		xfs_buf_unlock(iclog->ic_bp);
+		iclog = iclog->ic_next;
+	}
+
+	/*
+	 * Always need to ensure that the extra buffer does not point to memory
+	 * owned by another log buffer before we free it. Also, cycle the lock
+	 * first to ensure we've completed IO on it.
+	 */
+	xfs_buf_lock(log->l_xbuf);
+	xfs_buf_unlock(log->l_xbuf);
 	xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
 	xfs_buf_free(log->l_xbuf);
 
 	iclog = log->l_iclog;
-	for (i=0; i<log->l_iclog_bufs; i++) {
+	for (i = 0; i < log->l_iclog_bufs; i++) {
 		xfs_buf_free(iclog->ic_bp);
 		next_iclog = iclog->ic_next;
 		kmem_free(iclog);
-- 
cgit v1.1


From 07d5035a289f8bebe0ea86c293b2d5412478c481 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 17 Apr 2014 08:15:27 +1000
Subject: xfs: wrong error sign conversion during failed DIO writes

We negate the error value being returned from a generic function
incorrectly. The code path that it is running in returns negative
errors, so there is no need to negate it to get the correct error
signs here.

This was uncovered by generic/019.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 79e96ce..82afdcb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
 		goto out;
 
 	if (mapping->nrpages) {
-		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						    pos, -1);
 		if (ret)
 			goto out;
-- 
cgit v1.1


From 8d6c121018bf60d631c05a4a2efc468a392b97bb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 17 Apr 2014 08:15:28 +1000
Subject: xfs: fix buffer use after free on IO error

When testing exhaustion of dm snapshots, the following appeared
with CONFIG_DEBUG_OBJECTS_FREE enabled:

ODEBUG: free active (active state 0) object type: work_struct hint: xfs_buf_iodone_work+0x0/0x1d0 [xfs]

indicating that we'd freed a buffer which still had a pending reference,
down this path:

[  190.867975]  [<ffffffff8133e6fb>] debug_check_no_obj_freed+0x22b/0x270
[  190.880820]  [<ffffffff811da1d0>] kmem_cache_free+0xd0/0x370
[  190.892615]  [<ffffffffa02c5924>] xfs_buf_free+0xe4/0x210 [xfs]
[  190.905629]  [<ffffffffa02c6167>] xfs_buf_rele+0xe7/0x270 [xfs]
[  190.911770]  [<ffffffffa034c826>] xfs_trans_read_buf_map+0x7b6/0xac0 [xfs]

At issue is the fact that if IO fails in xfs_buf_iorequest,
we'll queue completion unconditionally, and then call
xfs_buf_rele; but if IO failed, there are no IOs remaining,
and xfs_buf_rele will free the bp while work is still queued.

Fix this by not scheduling completion if the buffer has
an error on it; run it immediately.  The rest is only comment
changes.

Thanks to dchinner for spotting the root cause.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 107f2fd..cb10a0a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1372,21 +1372,29 @@ xfs_buf_iorequest(
 		xfs_buf_wait_unpin(bp);
 	xfs_buf_hold(bp);
 
-	/* Set the count to 1 initially, this will stop an I/O
+	/*
+	 * Set the count to 1 initially, this will stop an I/O
 	 * completion callout which happens before we have started
 	 * all the I/O from calling xfs_buf_ioend too early.
 	 */
 	atomic_set(&bp->b_io_remaining, 1);
 	_xfs_buf_ioapply(bp);
-	_xfs_buf_ioend(bp, 1);
+	/*
+	 * If _xfs_buf_ioapply failed, we'll get back here with
+	 * only the reference we took above.  _xfs_buf_ioend will
+	 * drop it to zero, so we'd better not queue it for later,
+	 * or we'll free it before it's done.
+	 */
+	_xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
 
 	xfs_buf_rele(bp);
 }
 
 /*
  * Waits for I/O to complete on the buffer supplied.  It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer.  It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete.  It returns the I/O error code, if any, or
+ * 0 if there was no error.
  */
 int
 xfs_buf_iowait(
-- 
cgit v1.1


From 330033d697ed8d296fa52b5303db9d802ad901cc Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 17 Apr 2014 08:15:30 +1000
Subject: xfs: fix tmpfile/selinux deadlock and initialize security

xfstests generic/004 reproduces an ilock deadlock using the tmpfile
interface when selinux is enabled. This occurs because
xfs_create_tmpfile() takes the ilock and then calls d_tmpfile(). The
latter eventually calls into xfs_xattr_get() which attempts to get the
lock again. E.g.:

xfs_io          D ffffffff81c134c0  4096  3561   3560 0x00000080
ffff8801176a1a68 0000000000000046 ffff8800b401b540 ffff8801176a1fd8
00000000001d5800 00000000001d5800 ffff8800b401b540 ffff8800b401b540
ffff8800b73a6bd0 fffffffeffffffff ffff8800b73a6bd8 ffff8800b5ddb480
Call Trace:
[<ffffffff8177f969>] schedule+0x29/0x70
[<ffffffff81783a65>] rwsem_down_read_failed+0xc5/0x120
[<ffffffffa05aa97f>] ? xfs_ilock_attr_map_shared+0x1f/0x50 [xfs]
[<ffffffff813b3434>] call_rwsem_down_read_failed+0x14/0x30
[<ffffffff810ed179>] ? down_read_nested+0x89/0xa0
[<ffffffffa05aa7f2>] ? xfs_ilock+0x122/0x250 [xfs]
[<ffffffffa05aa7f2>] xfs_ilock+0x122/0x250 [xfs]
[<ffffffffa05aa97f>] xfs_ilock_attr_map_shared+0x1f/0x50 [xfs]
[<ffffffffa05701d0>] xfs_attr_get+0x90/0xe0 [xfs]
[<ffffffffa0565e07>] xfs_xattr_get+0x37/0x50 [xfs]
[<ffffffff8124842f>] generic_getxattr+0x4f/0x70
[<ffffffff8133fd9e>] inode_doinit_with_dentry+0x1ae/0x650
[<ffffffff81340e0c>] selinux_d_instantiate+0x1c/0x20
[<ffffffff813351bb>] security_d_instantiate+0x1b/0x30
[<ffffffff81237db0>] d_instantiate+0x50/0x70
[<ffffffff81237e85>] d_tmpfile+0xb5/0xc0
[<ffffffffa05add02>] xfs_create_tmpfile+0x362/0x410 [xfs]
[<ffffffffa0559ac8>] xfs_vn_tmpfile+0x18/0x20 [xfs]
[<ffffffff81230388>] path_openat+0x228/0x6a0
[<ffffffff810230f9>] ? sched_clock+0x9/0x10
[<ffffffff8105a427>] ? kvm_clock_read+0x27/0x40
[<ffffffff8124054f>] ? __alloc_fd+0xaf/0x1f0
[<ffffffff8123101a>] do_filp_open+0x3a/0x90
[<ffffffff817845e7>] ? _raw_spin_unlock+0x27/0x40
[<ffffffff8124054f>] ? __alloc_fd+0xaf/0x1f0
[<ffffffff8121e3ce>] do_sys_open+0x12e/0x210
[<ffffffff8121e4ce>] SyS_open+0x1e/0x20
[<ffffffff8178eda9>] system_call_fastpath+0x16/0x1b

xfs_vn_tmpfile() also fails to initialize security on the newly created
inode.

Pull the d_tmpfile() call up into xfs_vn_tmpfile() after the transaction
has been committed and the inode unlocked. Also, initialize security on
the inode based on the parent directory provided via the tmpfile call.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c |  5 +++--
 fs/xfs/xfs_inode.h |  2 +-
 fs/xfs/xfs_iops.c  | 20 +++++++++++++++++---
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5e7a38f..768087b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1334,7 +1334,8 @@ int
 xfs_create_tmpfile(
 	struct xfs_inode	*dp,
 	struct dentry		*dentry,
-	umode_t			mode)
+	umode_t			mode,
+	struct xfs_inode	**ipp)
 {
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
@@ -1402,7 +1403,6 @@ xfs_create_tmpfile(
 	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 
 	ip->i_d.di_nlink--;
-	d_tmpfile(dentry, VFS_I(ip));
 	error = xfs_iunlink(tp, ip);
 	if (error)
 		goto out_trans_abort;
@@ -1415,6 +1415,7 @@ xfs_create_tmpfile(
 	xfs_qm_dqrele(gdqp);
 	xfs_qm_dqrele(pdqp);
 
+	*ipp = ip;
 	return 0;
 
  out_trans_abort:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 396cc1f..f2fcde5 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -334,7 +334,7 @@ int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
 			   umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
 int		xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
-			   umode_t mode);
+			   umode_t mode, struct xfs_inode **ipp);
 int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 			   struct xfs_inode *ip);
 int		xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 89b07e4..ef1ca01 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1053,11 +1053,25 @@ xfs_vn_tmpfile(
 	struct dentry	*dentry,
 	umode_t		mode)
 {
-	int		error;
+	int			error;
+	struct xfs_inode	*ip;
+	struct inode		*inode;
 
-	error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
+	error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+	if (unlikely(error))
+		return -error;
 
-	return -error;
+	inode = VFS_I(ip);
+
+	error = xfs_init_security(inode, dir, &dentry->d_name);
+	if (unlikely(error)) {
+		iput(inode);
+		return -error;
+	}
+
+	d_tmpfile(dentry, inode);
+
+	return 0;
 }
 
 static const struct inode_operations xfs_inode_operations = {
-- 
cgit v1.1