From 0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 24 May 2010 19:15:57 +0200
Subject: pipe: F_SETPIPE_SZ should return -EPERM for non-root

If the passed in size is larger than what has been set as the
system wide limit and the user is not root, we want to return
permission denied (not invalid value).

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index d79872e..01ca9fa 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1170,7 +1170,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	switch (cmd) {
 	case F_SETPIPE_SZ:
 		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
-			return -EINVAL;
+			return -EPERM;
 		/*
 		 * The pipe needs to be at least 2 pages large to
 		 * guarantee POSIX behaviour.
-- 
cgit v1.1


From b9598db3401282bb27b4aef77e3eee12015f7f29 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 24 May 2010 19:34:43 +0200
Subject: pipe: make F_{GET,SET}PIPE_SZ deal with byte sizes

Instead of requiring an exact number of pages as the argument and
return value, change the API to deal with number of bytes instead.

This also relaxes the requirement that the passed in size must
result in a power-of-2 page array size. Round up to the nearest
power-of-2 automatically and return the resulting size of the pipe
on success.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/pipe.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 01ca9fa..bdd3f96 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1112,26 +1112,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
  */
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 {
 	struct pipe_buffer *bufs;
 
 	/*
-	 * Must be a power-of-2 currently
-	 */
-	if (!is_power_of_2(arg))
-		return -EINVAL;
-
-	/*
 	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
 	 * expect a lot of shrink+grow operations, just free and allocate
 	 * again like we would do for growing. If the pipe currently
 	 * contains more buffers than arg, then return busy.
 	 */
-	if (arg < pipe->nrbufs)
+	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (unlikely(!bufs))
 		return -ENOMEM;
 
@@ -1152,8 +1146,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	pipe->curbuf = 0;
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
-	pipe->buffers = arg;
-	return arg;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1168,19 +1162,30 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	mutex_lock(&pipe->inode->i_mutex);
 
 	switch (cmd) {
-	case F_SETPIPE_SZ:
-		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
+	case F_SETPIPE_SZ: {
+		unsigned long nr_pages;
+
+		/*
+		 * Currently the array must be a power-of-2 size, so adjust
+		 * upwards if needed.
+		 */
+		nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		nr_pages = roundup_pow_of_two(nr_pages);
+
+		if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages)
 			return -EPERM;
+
 		/*
 		 * The pipe needs to be at least 2 pages large to
 		 * guarantee POSIX behaviour.
 		 */
-		if (arg < 2)
+		if (nr_pages < 2)
 			return -EINVAL;
-		ret = pipe_set_size(pipe, arg);
+		ret = pipe_set_size(pipe, nr_pages);
 		break;
+		}
 	case F_GETPIPE_SZ:
-		ret = pipe->buffers;
+		ret = pipe->buffers * PAGE_SIZE;
 		break;
 	default:
 		ret = -EINVAL;
-- 
cgit v1.1


From 0ae0b5d0557264bad65e22f1e2da4b83a02c4535 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 25 May 2010 10:25:26 +0200
Subject: fs/splice.c: fix mapping_gfp_mask usage

mapping_gfp_mask() is not supposed to store allocation contex details,
only page location details.  So mapping_gfp_mask should be applied to the
pagecache page allocation, wheras normal (kernel mapped) memory should be
used for surrounding allocations such as radix-tree nodes allocated by
add_to_page_cache.  Context modifiers should be applied on a per-callsite
basis.

So change splice to follow this convention (which is followed in similar
code patterns in core code).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index ac22b00..740e6b9 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
-						mapping_gfp_mask(mapping));
+						GFP_KERNEL);
 			if (unlikely(error)) {
 				page_cache_release(page);
 				if (error == -EEXIST)
-- 
cgit v1.1


From 657a4cffde065beca7d919e867f61e7d322708b6 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 30 Apr 2010 03:42:49 +0000
Subject: xfs: replace E2BIG with EFBIG where appropriate

Many places in the xfs code return E2BIG when they really mean
EFBIG; trying to grow past 16T on a 32 bit machine, for example,
says "Argument list too long" rather than "File too large" which is
not particularly helpful.

Some of these don't make perfect sense as EFBIG either, but still
better than E2BIG IMHO.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c   | 14 +++++++-------
 fs/xfs/xfs_rtalloc.c |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c..2d811e5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
 
 #if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
 	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
-		return E2BIG;
+		return EFBIG;
 #else                  /* Limited by UINT_MAX of sectors */
 	if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
-		return E2BIG;
+		return EFBIG;
 #endif
 	return 0;
 }
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_fs_mount_cmn_err(flags,
 			"file system too large to be mounted on this system.");
-		return XFS_ERROR(E2BIG);
+		return XFS_ERROR(EFBIG);
 	}
 
 	if (unlikely(sbp->sb_inprogress)) {
@@ -1009,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
 		cmn_err(CE_WARN, "XFS: size check 1 failed");
-		return XFS_ERROR(E2BIG);
+		return XFS_ERROR(EFBIG);
 	}
 	error = xfs_read_buf(mp, mp->m_ddev_targp,
 			     d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1019,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 	} else {
 		cmn_err(CE_WARN, "XFS: size check 2 failed");
 		if (error == ENOSPC)
-			error = XFS_ERROR(E2BIG);
+			error = XFS_ERROR(EFBIG);
 		return error;
 	}
 
@@ -1027,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
 			cmn_err(CE_WARN, "XFS: size check 3 failed");
-			return XFS_ERROR(E2BIG);
+			return XFS_ERROR(EFBIG);
 		}
 		error = xfs_read_buf(mp, mp->m_logdev_targp,
 				     d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1037,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 		} else {
 			cmn_err(CE_WARN, "XFS: size check 3 failed");
 			if (error == ENOSPC)
-				error = XFS_ERROR(E2BIG);
+				error = XFS_ERROR(EFBIG);
 			return error;
 		}
 	}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f7..1644551 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
 		cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
 			(unsigned long long) XFS_BB_TO_FSB(mp, d),
 			(unsigned long long) mp->m_sb.sb_rblocks);
-		return XFS_ERROR(E2BIG);
+		return XFS_ERROR(EFBIG);
 	}
 	error = xfs_read_buf(mp, mp->m_rtdev_targp,
 				d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
 		cmn_err(CE_WARN,
 	"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
 		if (error == ENOSPC)
-			return XFS_ERROR(E2BIG);
+			return XFS_ERROR(EFBIG);
 		return error;
 	}
 	xfs_buf_relse(bp);
-- 
cgit v1.1


From 32891b292d6262d1db8e553cf3f4b38a91247b5a Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 30 Apr 2010 16:43:48 +0000
Subject: xfs: be more explicit if RT mount fails due to config

Recent testers were slightly confused that a realtime mount failed
due to missing CONFIG_XFS_RT; we can make that a little more
obvious.

V2: drop the else as suggested by Christoph

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_rtalloc.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67ad..ff614c2 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
-# define xfs_rtmount_init(m)    (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+static inline int		/* error */
+xfs_rtmount_init(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	if (mp->m_sb.sb_rblocks == 0)
+		return 0;
+
+	cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
+	return ENOSYS;
+}
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
 #endif	/* CONFIG_XFS_RT */
-- 
cgit v1.1


From 025101dca4480eff9da948405e872d5115030850 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 4 May 2010 13:53:48 +0000
Subject: xfs: cleanup log reservation calculactions

Instead of having small helper functions calling big macros do the
calculations for the log reservations directly in the functions.
These are mostly 1:1 from the macros execept that the macros kept
the quota calculations in their callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_trans.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_trans.h | 411 ------------------------------------------------
 2 files changed, 400 insertions(+), 457 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ce558ef..28547df 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -48,134 +48,489 @@
 
 kmem_zone_t	*xfs_trans_zone;
 
+
 /*
- * Reservation functions here avoid a huge stack in xfs_trans_init
- * due to register overflow from temporaries in the calculations.
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
  */
 STATIC uint
-xfs_calc_write_reservation(xfs_mount_t *mp)
+xfs_calc_write_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
 
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_itruncate_reservation(xfs_mount_t *mp)
+xfs_calc_itruncate_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
+		     128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+		    (4 * mp->m_sb.sb_sectsize +
+		     4 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 4) +
+		     128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
+		     128 * 5 +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * In renaming a files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_rename_reservation(xfs_mount_t *mp)
+xfs_calc_rename_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((4 * mp->m_sb.sb_inodesize +
+		     2 * XFS_DIROP_LOG_RES(mp) +
+		     128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
+		    (3 * mp->m_sb.sb_sectsize +
+		     3 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 3) +
+		     128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
 }
 
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_link_reservation(xfs_mount_t *mp)
+xfs_calc_link_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     XFS_DIROP_LOG_RES(mp) +
+		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+		    (mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_remove_reservation(xfs_mount_t *mp)
+xfs_calc_remove_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     XFS_DIROP_LOG_RES(mp) +
+		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
 
+/*
+ * For symlink we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: 1 block
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 kB
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_symlink_reservation(xfs_mount_t *mp)
+xfs_calc_symlink_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, 1) +
+		     XFS_DIROP_LOG_RES(mp) +
+		     1024 +
+		     128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_create_reservation(xfs_mount_t *mp)
+xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_FSB_TO_B(mp, 1) +
+		     XFS_DIROP_LOG_RES(mp) +
+		     128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
+		    (3 * mp->m_sb.sb_sectsize +
+		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * Making a new directory is the same as creating a new file.
+ */
 STATIC uint
-xfs_calc_mkdir_reservation(xfs_mount_t *mp)
+xfs_calc_mkdir_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return xfs_calc_create_reservation(mp);
 }
 
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_ifree_reservation(xfs_mount_t *mp)
+xfs_calc_ifree_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize +
+		mp->m_sb.sb_sectsize +
+		XFS_FSB_TO_B(mp, 1) +
+		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+		    XFS_INODE_CLUSTER_SIZE(mp)) +
+		128 * 5 +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
 STATIC uint
-xfs_calc_ichange_reservation(xfs_mount_t *mp)
+xfs_calc_ichange_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize +
+		512;
+
 }
 
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
 STATIC uint
-xfs_calc_growdata_reservation(xfs_mount_t *mp)
+xfs_calc_growdata_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWDATA_LOG_RES(mp);
+	return mp->m_sb.sb_sectsize * 3 +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
 STATIC uint
-xfs_calc_growrtalloc_reservation(xfs_mount_t *mp)
+xfs_calc_growrtalloc_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+	return 2 * mp->m_sb.sb_sectsize +
+		XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+		mp->m_sb.sb_inodesize +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
 STATIC uint
-xfs_calc_growrtzero_reservation(xfs_mount_t *mp)
+xfs_calc_growrtzero_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWRTZERO_LOG_RES(mp);
+	return mp->m_sb.sb_blocksize + 128;
 }
 
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
 STATIC uint
-xfs_calc_growrtfree_reservation(xfs_mount_t *mp)
+xfs_calc_growrtfree_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWRTFREE_LOG_RES(mp);
+	return mp->m_sb.sb_sectsize +
+		2 * mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_blocksize +
+		mp->m_rsumsize +
+		128 * 5;
 }
 
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
 STATIC uint
-xfs_calc_swrite_reservation(xfs_mount_t *mp)
+xfs_calc_swrite_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_SWRITE_LOG_RES(mp);
+	return mp->m_sb.sb_inodesize + 128;
 }
 
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
 STATIC uint
 xfs_calc_writeid_reservation(xfs_mount_t *mp)
 {
-	return XFS_CALC_WRITEID_LOG_RES(mp);
+	return mp->m_sb.sb_inodesize + 128;
 }
 
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
 STATIC uint
-xfs_calc_addafork_reservation(xfs_mount_t *mp)
+xfs_calc_addafork_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize * 2 +
+		mp->m_dirblksize +
+		XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
+		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_attrinval_reservation(xfs_mount_t *mp)
+xfs_calc_attrinval_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ATTRINVAL_LOG_RES(mp);
+	return MAX((mp->m_sb.sb_inodesize +
+		    XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		    128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
+		   (4 * mp->m_sb.sb_sectsize +
+		    4 * mp->m_sb.sb_sectsize +
+		    mp->m_sb.sb_sectsize +
+		    XFS_ALLOCFREE_LOG_RES(mp, 4) +
+		    128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
 }
 
+/*
+ * Setting an attribute.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
 STATIC uint
-xfs_calc_attrset_reservation(xfs_mount_t *mp)
+xfs_calc_attrset_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize +
+		XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+		128 * (2 + XFS_DA_NODE_MAXDEPTH);
 }
 
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_attrrm_reservation(xfs_mount_t *mp)
+xfs_calc_attrrm_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		     128 * (1 + XFS_DA_NODE_MAXDEPTH +
+			    XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
 
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
 STATIC uint
-xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
+xfs_calc_clear_agi_bucket_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+	return mp->m_sb.sb_sectsize + 128;
 }
 
 /*
@@ -184,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
  */
 void
 xfs_trans_init(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_trans_reservations_t	*resp;
+	struct xfs_trans_reservations *resp = &mp->m_reservations;
 
-	resp = &(mp->m_reservations);
 	resp->tr_write = xfs_calc_write_reservation(mp);
 	resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
 	resp->tr_rename = xfs_calc_rename_reservation(mp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8c69e78..e639e8e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -300,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 
 
 /*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call.  This is because the number in the worst case is quite high
- * and quite unusual.  In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-
-/*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
  * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -341,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
 
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_WRITE_LOG_RES(mp) \
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
-	 ((2 * (mp)->m_sb.sb_sectsize) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 
 #define	XFS_WRITE_LOG_RES(mp)	((mp)->m_reservations.tr_write)
-
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *		4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define	XFS_CALC_ITRUNCATE_LOG_RES(mp) \
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
-	  (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
-	 ((4 * (mp)->m_sb.sb_sectsize) + \
-	  (4 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
-	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
-	  (128 * 5) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	   (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	    XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
-
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *	of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_RENAME_LOG_RES(mp) \
-	(MAX( \
-	 ((4 * (mp)->m_sb.sb_inodesize) + \
-	  (2 * XFS_DIROP_LOG_RES(mp)) + \
-	  (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
-	 ((3 * (mp)->m_sb.sb_sectsize) + \
-	  (3 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 3) + \
-	  (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
-
 #define	XFS_RENAME_LOG_RES(mp)	((mp)->m_reservations.tr_rename)
-
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_LINK_LOG_RES(mp) \
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 ((mp)->m_sb.sb_sectsize + \
-	  (mp)->m_sb.sb_sectsize + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_LINK_LOG_RES(mp)	((mp)->m_reservations.tr_link)
-
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_REMOVE_LOG_RES(mp)	\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 ((2 * (mp)->m_sb.sb_sectsize) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
 #define	XFS_REMOVE_LOG_RES(mp)	((mp)->m_reservations.tr_remove)
-
-/*
- * For symlink we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: 1 block
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_SYMLINK_LOG_RES(mp)		\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B(mp, 1) + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  1024 + \
-	  (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 (2 * (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_SYMLINK_LOG_RES(mp)	((mp)->m_reservations.tr_symlink)
-
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define	XFS_CALC_CREATE_LOG_RES(mp)		\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B(mp, 1) + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 (3 * (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-#define	XFS_CALC_MKDIR_LOG_RES(mp)	XFS_CALC_CREATE_LOG_RES(mp)
-
 #define	XFS_MKDIR_LOG_RES(mp)	((mp)->m_reservations.tr_mkdir)
-
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define	XFS_CALC_IFREE_LOG_RES(mp) \
-	((mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_sectsize + \
-	 (mp)->m_sb.sb_sectsize + \
-	 XFS_FSB_TO_B((mp), 1) + \
-	 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
-	 (128 * 5) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
-
 #define	XFS_IFREE_LOG_RES(mp)	((mp)->m_reservations.tr_ifree)
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-#define	XFS_CALC_ICHANGE_LOG_RES(mp)	((mp)->m_sb.sb_inodesize + \
-					 (mp)->m_sb.sb_sectsize + 512)
-
 #define	XFS_ICHANGE_LOG_RES(mp)	((mp)->m_reservations.tr_ichange)
-
-/*
- * Growing the data section of the filesystem.
- *	superblock
- *	agi and agf
- *	allocation btrees
- */
-#define	XFS_CALC_GROWDATA_LOG_RES(mp) \
-	((mp)->m_sb.sb_sectsize * 3 + \
-	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
 #define	XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *	superblock: sector size
- *	agf of the ag from which the extent is allocated: sector size
- *	bmap btree for bitmap/summary inode: max depth * blocksize
- *	bitmap/summary inode: inode size
- *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-#define	XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
-	(2 * (mp)->m_sb.sb_sectsize + \
-	 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
-	 (mp)->m_sb.sb_inodesize + \
-	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * \
-	  (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
 #define	XFS_GROWRTALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_growrtalloc)
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *	one bitmap/summary block: blocksize
- */
-#define	XFS_CALC_GROWRTZERO_LOG_RES(mp) \
-	((mp)->m_sb.sb_blocksize + 128)
-
 #define	XFS_GROWRTZERO_LOG_RES(mp)	((mp)->m_reservations.tr_growrtzero)
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *	superblock: sector size
- *	bitmap inode: inode size
- *	summary inode: inode size
- *	one bitmap block: blocksize
- *	summary blocks: new summary size
- */
-#define	XFS_CALC_GROWRTFREE_LOG_RES(mp) \
-	((mp)->m_sb.sb_sectsize + \
-	 2 * (mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_blocksize + \
-	 (mp)->m_rsumsize + \
-	 (128 * 5))
-
 #define	XFS_GROWRTFREE_LOG_RES(mp)	((mp)->m_reservations.tr_growrtfree)
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *	inode
- */
-#define	XFS_CALC_SWRITE_LOG_RES(mp) \
-	((mp)->m_sb.sb_inodesize + 128)
-
 #define	XFS_SWRITE_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
-
 /*
  * Logging the inode timestamps on an fsync -- same as SWRITE
  * as long as SWRITE logs the entire inode core
  */
 #define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite)
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *	inode
- */
-#define	XFS_CALC_WRITEID_LOG_RES(mp) \
-	((mp)->m_sb.sb_inodesize + 128)
-
 #define	XFS_WRITEID_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
-
-/*
- * Converting the inode from non-attributed to attributed.
- *	the inode being converted: inode size
- *	agf block and superblock (for block allocation)
- *	the new block (directory sized)
- *	bmap blocks for the new directory block
- *	allocation btrees
- */
-#define	XFS_CALC_ADDAFORK_LOG_RES(mp)	\
-	((mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_sectsize * 2 + \
-	 (mp)->m_dirblksize + \
-	 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
-	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
-		 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
 #define	XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
-
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *		4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_ATTRINVAL_LOG_RES(mp)	\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
-	  (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
-	 ((4 * (mp)->m_sb.sb_sectsize) + \
-	  (4 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
-	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
-
 #define	XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
-
-/*
- * Setting an attribute.
- *	the inode getting the attribute
- *	the superblock for allocations
- *	the agfs extents are allocated from
- *	the attribute btree * max depth
- *	the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
- */
-#define	XFS_CALC_ATTRSET_LOG_RES(mp)	\
-	((mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
-	  (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
-
 #define	XFS_ATTRSET_LOG_RES(mp, ext)	\
 	((mp)->m_reservations.tr_attrset + \
 	 (ext * (mp)->m_sb.sb_sectsize) + \
 	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
 	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_ATTRRM_LOG_RES(mp)	\
-	(MAX( \
-	  ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
-	  (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
-	 ((2 * (mp)->m_sb.sb_sectsize) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
 #define	XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-#define	XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
-	((mp)->m_sb.sb_sectsize + 128)
-
 #define	XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
 
 
-- 
cgit v1.1


From fdc07f44c891d3fdee7722a03e3881614a293b3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 10 May 2010 17:28:14 +0000
Subject: xfs: clean up xlog_align

Add suggested cleanups to commit 29db3370a1369541d58d692fbfb168b8a0bd7f41
from review that didn't end up being commited.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 14a69ae..ed0684c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
 	int		nbblks,
 	xfs_buf_t	*bp)
 {
-	xfs_daddr_t	offset;
-	xfs_caddr_t	ptr;
+	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 
-	offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
-	ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
-
-	ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
-
-	return ptr;
+	ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+	return XFS_BUF_PTR(bp) + BBTOB(offset);
 }
 
 
-- 
cgit v1.1


From 07f1a4f5e89cd4e6c79d67d41e8a18c451214ae2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 21 May 2010 05:47:59 +0000
Subject: xfs: Check new inode size is OK before preallocating

The new xfsqa test 228 tries to preallocate more space than the
filesystem contains. it should fail, but instead triggers an assert
about lock flags.  The failure is due to the size extension failing
in vmtruncate() due to rlimit being set. Check this before we start
the preallocation to avoid allocating space that will never be used.

Also the path through xfs_vn_allocate already holds the IO lock, so
it should not be present in the lock flags when the setattr fails.
Hence the assert needs to take this into account. This will prevent
other such callers from hitting this incorrect ASSERT.

(Fixed a reference to "newsize" to read "new_size". -Alex)

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 16 +++++++++++++---
 fs/xfs/xfs_vnodeops.c       |  2 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c..44f0b2d 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
 	bf.l_len = len;
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	/* check the new inode size is valid before allocating */
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		error = inode_newsize_ok(inode, new_size);
+		if (error)
+			goto out_unlock;
+	}
+
 	error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
 				       0, XFS_ATTR_NOLOCK);
-	if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
-	    offset + len > i_size_read(inode))
-		new_size = offset + len;
+	if (error)
+		goto out_unlock;
 
 	/* Change file size if needed */
 	if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
 		error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
 	}
 
+out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 out_error:
 	return error;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be..a06bd62 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
 		if (code) {
 			ASSERT(tp == NULL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
-			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+			ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
 			goto error_return;
 		}
 		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-- 
cgit v1.1


From 292ec4cf3536a5ed8809e8823341b203e497bbaf Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sat, 22 May 2010 17:13:20 +0000
Subject: xfs: xfs_trace.c: remove duplicated #include

Remove duplicated #include('s) in
  fs/xfs/linux-2.6/xfs_trace.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77..d12be84 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
 #include "quota/xfs_dquot_item.h"
 #include "quota/xfs_dquot.h"
 #include "xfs_log_recover.h"
-#include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 
 /*
-- 
cgit v1.1


From f8adb4d574cf8c67f8391367136edc5469258fdc Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 08:25:57 +0000
Subject: xfs: convert more trace events to DEFINE_EVENT

Use DECLARE_EVENT_CLASS, and save ~15K:

   text    data     bss     dec     hex filename
 171949   43028      48  215025   347f1 fs/xfs/linux-2.6/xfs_trace.o.orig
 156521   43028      36  199585   30ba1 fs/xfs/linux-2.6/xfs_trace.o

No change in functionality.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.h | 356 +++++++++++++++++++++++--------------------
 1 file changed, 188 insertions(+), 168 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index ff6bc79..73d5aa1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	)
 )
 
-#define DEFINE_PERAG_REF_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
-		 unsigned long caller_ip), \
-	TP_ARGS(mp, agno, refcount, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_agnumber_t, agno) \
-		__field(int, refcount) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = mp->m_super->s_dev; \
-		__entry->agno = agno; \
-		__entry->refcount = refcount; \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->agno, \
-		  __entry->refcount, \
-		  (char *)__entry->caller_ip) \
-);
-
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
-
 #define DEFINE_ATTR_LIST_EVENT(name) \
 DEFINE_EVENT(xfs_attr_list_class, name, \
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
 
+DECLARE_EVENT_CLASS(xfs_perag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agno, refcount, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->refcount = refcount;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name)	\
+DEFINE_EVENT(xfs_perag_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,	\
+		 unsigned long caller_ip),					\
+	TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
 		 struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
 
-#define DEFINE_RW_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-	TP_ARGS(ip, count, offset, flags), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(xfs_fsize_t, size) \
-		__field(xfs_fsize_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count 0x%zx ioflags %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_file_class,
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+	TP_ARGS(ip, count, offset, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count 0x%zx ioflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
 )
+
+#define DEFINE_RW_EVENT(name)		\
+DEFINE_EVENT(xfs_file_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),	\
+	TP_ARGS(ip, count, offset, flags))
 DEFINE_RW_EVENT(xfs_file_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
-
-#define DEFINE_PAGE_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
-	TP_ARGS(inode, page, off), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(pgoff_t, pgoff) \
-		__field(loff_t, size) \
-		__field(unsigned long, offset) \
-		__field(int, delalloc) \
-		__field(int, unmapped) \
-		__field(int, unwritten) \
-	), \
-	TP_fast_assign( \
-		int delalloc = -1, unmapped = -1, unwritten = -1; \
-	\
-		if (page_has_buffers(page)) \
-			xfs_count_page_state(page, &delalloc, \
-					     &unmapped, &unwritten); \
-		__entry->dev = inode->i_sb->s_dev; \
-		__entry->ino = XFS_I(inode)->i_ino; \
-		__entry->pgoff = page_offset(page); \
-		__entry->size = i_size_read(inode); \
-		__entry->offset = off; \
-		__entry->delalloc = delalloc; \
-		__entry->unmapped = unmapped; \
-		__entry->unwritten = unwritten; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
-		  "delalloc %d unmapped %d unwritten %d", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->pgoff, \
-		  __entry->size, \
-		  __entry->offset, \
-		  __entry->delalloc, \
-		  __entry->unmapped, \
-		  __entry->unwritten) \
+DECLARE_EVENT_CLASS(xfs_page_class,
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+	TP_ARGS(inode, page, off),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(pgoff_t, pgoff)
+		__field(loff_t, size)
+		__field(unsigned long, offset)
+		__field(int, delalloc)
+		__field(int, unmapped)
+		__field(int, unwritten)
+	),
+	TP_fast_assign(
+		int delalloc = -1, unmapped = -1, unwritten = -1;
+
+		if (page_has_buffers(page))
+			xfs_count_page_state(page, &delalloc,
+					     &unmapped, &unwritten);
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = XFS_I(inode)->i_ino;
+		__entry->pgoff = page_offset(page);
+		__entry->size = i_size_read(inode);
+		__entry->offset = off;
+		__entry->delalloc = delalloc;
+		__entry->unmapped = unmapped;
+		__entry->unwritten = unwritten;
+	),
+	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+		  "delalloc %d unmapped %d unwritten %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pgoff,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->delalloc,
+		  __entry->unmapped,
+		  __entry->unwritten)
 )
+
+#define DEFINE_PAGE_EVENT(name)		\
+DEFINE_EVENT(xfs_page_class, name,	\
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),	\
+	TP_ARGS(inode, page, off))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-#define DEFINE_IOMAP_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-		 int flags, struct xfs_bmbt_irec *irec), \
-	TP_ARGS(ip, offset, count, flags, irec), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-		__field(xfs_fileoff_t, startoff) \
-		__field(xfs_fsblock_t, startblock) \
-		__field(xfs_filblks_t, blockcount) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-		__entry->startoff = irec ? irec->br_startoff : 0; \
-		__entry->startblock = irec ? irec->br_startblock : 0; \
-		__entry->blockcount = irec ? irec->br_blockcount : 0; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd flags %s " \
-		  "startoff 0x%llx startblock %lld blockcount 0x%llx", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
-		  __entry->startoff, \
-		  (__int64_t)__entry->startblock, \
-		  __entry->blockcount) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+		 int flags, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, flags, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+		__entry->startoff = irec ? irec->br_startoff : 0;
+		__entry->startblock = irec ? irec->br_startblock : 0;
+		__entry->blockcount = irec ? irec->br_blockcount : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd flags %s "
+		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount)
 )
+
+#define DEFINE_IOMAP_EVENT(name)	\
+DEFINE_EVENT(xfs_iomap_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,	\
+		 int flags, struct xfs_bmbt_irec *irec),		\
+	TP_ARGS(ip, offset, count, flags, irec))
 DEFINE_IOMAP_EVENT(xfs_iomap_enter);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 
-#define DEFINE_SIMPLE_IO_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
-	TP_ARGS(ip, offset, count), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+	TP_ARGS(ip, offset, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count)
 );
+
+#define DEFINE_SIMPLE_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_simple_io_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),	\
+	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 
-- 
cgit v1.1


From 3bd0946eb157e26240ca858d1a42738b095dc6f3 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Tue, 25 May 2010 22:15:26 +0000
Subject: xfs: remove duplicated #include

Remove duplicated #include('s) in
  fs/xfs/linux-2.6/xfs_quotaops.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_quotaops.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 2e73688..a184793 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -23,7 +23,6 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
-#include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-- 
cgit v1.1


From 38e712ab3d28d79725eaade02fe8aba51abac196 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 26 May 2010 15:57:23 +0000
Subject: fs/xfs/quota: Add missing mutex_unlock

Add a mutex_unlock missing on the error path.  The use of this lock
is balanced elsewhere in the file.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression E1;
@@

* mutex_lock(E1,...);
  <+... when != E1
  if (...) {
    ... when != E1
*   return ...;
  }
  ...+>
* mutex_unlock(E1,...);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e7641..2d8b7bc 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
 
 	if (!xfs_Gqm) {
 		xfs_Gqm = xfs_Gqm_init();
-		if (!xfs_Gqm)
+		if (!xfs_Gqm) {
+			mutex_unlock(&xfs_Gqm_lock);
 			return ENOMEM;
+		}
 	}
 
 	/*
-- 
cgit v1.1


From 9b98b6f3e1534bba2efcd5b16318945cf2218d99 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 May 2010 01:58:13 +0000
Subject: xfs: fix might_sleep() warning when initialising per-ag tree

The use of radix_tree_preload() only works if the radix tree was
initialised without the __GFP_WAIT flag. The per-ag tree uses
GFP_NOFS, so does not trigger allocation of new tree nodes from the
preloaded array. Hence it enters the allocator with a spinlock held
and triggers the might_sleep() warnings.

Reported-by; Chris Mason <chris.mason@oracle.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2d811e5..ef1233b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1254,7 +1254,7 @@ xfs_mountfs(
 	 * Allocate and initialize the per-ag data.
 	 */
 	spin_lock_init(&mp->m_perag_lock);
-	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
-- 
cgit v1.1


From fb3b504adeee942e55393396fea8fdf406acf037 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 28 May 2010 19:03:10 +0000
Subject: xfs: fix access to upper inodes without inode64

If a filesystem is mounted without the inode64 mount option we
should still be able to access inodes not fitting into 32 bits, just
not created new ones.  For this to work we need to make sure the
inode cache radix tree is initialized for all allocation groups, not
just those we plan to allocate inodes from.  This patch makes sure
we initialize the inode cache radix tree for all allocation groups,
and also cleans xfs_initialize_perag up a bit to separate the
inode32 logical from the general perag structure setup.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c |  9 --------
 fs/xfs/xfs_ag.h             |  1 -
 fs/xfs/xfs_iget.c           |  3 ---
 fs/xfs/xfs_inode.c          |  2 --
 fs/xfs/xfs_mount.c          | 52 ++++++++++++++++++---------------------------
 5 files changed, 21 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20..ef7f021 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
 		struct xfs_perag	*pag;
 
 		pag = xfs_perag_get(mp, ag);
-		if (!pag->pag_ici_init) {
-			xfs_perag_put(pag);
-			continue;
-		}
 		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
 						exclusive, &nr);
 		xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
 	down_read(&xfs_mount_list_lock);
 	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
 		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-
 			pag = xfs_perag_get(mp, ag);
-			if (!pag->pag_ici_init) {
-				xfs_perag_put(pag);
-				continue;
-			}
 			reclaimable += pag->pag_ici_reclaimable;
 			xfs_perag_put(pag);
 		}
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 401f364..4917d4e 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,6 @@ typedef struct xfs_perag {
 
 	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
-	int		pag_ici_init;	/* incore inode cache initialised */
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db9..f44a367 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
 
 	/* get the perag structure and ensure that it's inode capable */
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
-	if (!pag->pagi_inodeok)
-		return EINVAL;
-	ASSERT(pag->pag_ici_init);
 	agino = XFS_INO_TO_AGINO(mp, ino);
 
 again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d..6ba7765 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2649,8 +2649,6 @@ xfs_iflush_cluster(
 	int			i;
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	ASSERT(pag->pagi_inodeok);
-	ASSERT(pag->pag_ici_init);
 
 	inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
 	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ef1233b..d59f4e8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
 	return 0;
 }
 
-STATIC void
-xfs_initialize_perag_icache(
-	xfs_perag_t	*pag)
-{
-	if (!pag->pag_ici_init) {
-		rwlock_init(&pag->pag_ici_lock);
-		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-		pag->pag_ici_init = 1;
-	}
-}
-
 int
 xfs_initialize_perag(
 	xfs_mount_t	*mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
 	xfs_sb_t	*sbp = &mp->m_sb;
-	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
 	int		error = -ENOMEM;
 
-	/* Check to see if the filesystem can overflow 32 bit inodes */
-	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
-	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
 	/*
 	 * Walk the current per-ag tree so we don't try to initialise AGs
 	 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
 		}
 		if (!first_initialised)
 			first_initialised = index;
+
 		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
 		if (!pag)
 			goto out_unwind;
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
+		rwlock_init(&pag->pag_ici_lock);
+		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
+
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
 			BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
 			error = -EEXIST;
 			goto out_unwind;
 		}
-		pag->pag_agno = index;
-		pag->pag_mount = mp;
 		spin_unlock(&mp->m_perag_lock);
 		radix_tree_preload_end();
 	}
 
-	/* Clear the mount flag if no inode can overflow 32 bits
-	 * on this filesystem, or if specifically requested..
+	/*
+	 * If we mount with the inode64 option, or no inode overflows
+	 * the legacy 32-bit address space clear the inode32 option.
 	 */
-	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) {
+	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
-	} else {
+	else
 		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-	}
 
-	/* If we can overflow then setup the ag headers accordingly */
 	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
-		/* Calculate how much should be reserved for inodes to
-		 * meet the max inode percentage.
+		/*
+		 * Calculate how much should be reserved for inodes to meet
+		 * the max inode percentage.
 		 */
 		if (mp->m_maxicount) {
 			__uint64_t	icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
 		} else {
 			max_metadata = agcount;
 		}
+
 		for (index = 0; index < agcount; index++) {
 			ino = XFS_AGINO_TO_INO(mp, index, agino);
-			if (ino > max_inum) {
+			if (ino > XFS_MAXINUMBER_32) {
 				index++;
 				break;
 			}
 
-			/* This ag is preferred for inodes */
 			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			if (index < max_metadata)
 				pag->pagf_metadata = 1;
-			xfs_initialize_perag_icache(pag);
 			xfs_perag_put(pag);
 		}
 	} else {
-		/* Setup default behavior for smaller filesystems */
 		for (index = 0; index < agcount; index++) {
 			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
-			xfs_initialize_perag_icache(pag);
 			xfs_perag_put(pag);
 		}
 	}
+
 	if (maxagi)
 		*maxagi = index;
 	return 0;
-- 
cgit v1.1


From 84cb0999851e25bc4bd4aaa717cc8f8acbf42b2a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 22 May 2010 12:49:32 +0900
Subject: nilfs2: fix style issue in nilfs_destroy_cachep

This gets rid of unwanted space chars in front of conditional
sentences of nilfs_destroy_cachep().

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 03b34b7..414ef68 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj)
 
 static void nilfs_destroy_cachep(void)
 {
-	 if (nilfs_inode_cachep)
+	if (nilfs_inode_cachep)
 		kmem_cache_destroy(nilfs_inode_cachep);
-	 if (nilfs_transaction_cachep)
+	if (nilfs_transaction_cachep)
 		kmem_cache_destroy(nilfs_transaction_cachep);
-	 if (nilfs_segbuf_cachep)
+	if (nilfs_segbuf_cachep)
 		kmem_cache_destroy(nilfs_segbuf_cachep);
-	 if (nilfs_btree_path_cache)
+	if (nilfs_btree_path_cache)
 		kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
-- 
cgit v1.1


From c29684d6834af7b3792f2feb6bdcf8c906ad8db6 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 23 May 2010 20:40:32 +0900
Subject: nilfs2: remove obsolete declarations of cache constructor and
 destructor

The commit 41c88bd7 ("nilfs2: cleanup multi
kmem_cache_{create,destroy} code") consolidated slab constructors and
destructors used in nilfs, but it left some declarations in header
files.

This gets rid of the obsolete declarations.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btree.h   | 2 --
 fs/nilfs2/segbuf.h  | 2 --
 fs/nilfs2/segment.h | 2 --
 3 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index af638d5..43c8c5b 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -75,8 +75,6 @@ struct nilfs_btree_path {
 
 extern struct kmem_cache *nilfs_btree_path_cache;
 
-int nilfs_btree_path_cache_init(void);
-void nilfs_btree_path_cache_destroy(void);
 int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
 				   const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fdf1c3b..85fbb66 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -127,8 +127,6 @@ struct nilfs_segment_buffer {
 
 extern struct kmem_cache *nilfs_segbuf_cachep;
 
-int __init nilfs_init_segbuf_cache(void);
-void nilfs_destroy_segbuf_cache(void);
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index dca1423..01e20db 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -221,8 +221,6 @@ enum {
 extern struct kmem_cache *nilfs_transaction_cachep;
 
 /* segment.c */
-extern int nilfs_init_transaction_cache(void);
-extern void nilfs_destroy_transaction_cache(void);
 extern void nilfs_relax_pressure_in_lock(struct super_block *);
 
 extern int nilfs_construct_segment(struct super_block *);
-- 
cgit v1.1


From f17625b318d9b151e7bd41e31223e9d89b2aaa77 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 1 Jun 2010 11:05:22 +0200
Subject: Revert "writeback: ensure that WB_SYNC_NONE writeback with sb pinned
 is sync"

This reverts commit 7c8a3554c683f512dbcee26faedb42e4c05f12fa.

We are investigating a hang associated with the WB_SYNC_NONE changes,
so revert them for now.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c4161f..6753912 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -193,8 +193,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args,
-				 int wait)
+				 struct wb_writeback_args *args)
 {
 	struct bdi_work *work;
 
@@ -206,8 +205,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	if (work) {
 		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
-		if (wait)
-			bdi_wait_on_work_clear(work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 
@@ -282,7 +279,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 		args.for_background = 1;
 	}
 
-	bdi_alloc_queue_work(bdi, &args, sb_locked);
+	bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -912,7 +909,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 		struct wb_writeback_args args = work->args;
-		int post_clear;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
@@ -920,13 +916,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
-		post_clear = WB_SYNC_ALL || args.sb_pinned;
-
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (!post_clear)
+		if (args.sync_mode == WB_SYNC_NONE)
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -935,7 +929,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (post_clear)
+		if (args.sync_mode == WB_SYNC_ALL)
 			wb_clear_pending(wb, work);
 	}
 
@@ -1011,7 +1005,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, &args, 0);
+		bdi_alloc_queue_work(bdi, &args);
 	}
 
 	rcu_read_unlock();
-- 
cgit v1.1


From 0e3c9a2284f5417f196e327c254d0b84c9ee8929 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 1 Jun 2010 11:08:43 +0200
Subject: Revert "writeback: fix WB_SYNC_NONE writeback from umount"

This reverts commit e913fc825dc685a444cb4c1d0f9d32f372f59861.

We are investigating a hang associated with the WB_SYNC_NONE changes,
so revert them for now.

Conflicts:

	fs/fs-writeback.c
	mm/page-writeback.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c | 48 +++++++++++-------------------------------------
 fs/sync.c         |  2 +-
 2 files changed, 12 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6753912..408a787 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
-	unsigned int sb_pinned:1;
 };
 
 /*
@@ -231,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
-		/*
-		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
-		 * lets make it explicitly clear.
-		 */
-		.sb_pinned	= 1,
 	};
 	struct bdi_work work;
 
@@ -251,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
- *   completion. Caller specifies whether sb umount sem is held already or not.
+ *   completion. Caller need not hold sb s_umount semaphore.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages, int sb_locked)
+			 long nr_pages)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
-		.sb_pinned	= sb_locked,
 	};
 
 	/*
@@ -592,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return SB_NOT_PINNED;
 	}
@@ -766,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate		= args->for_kupdate,
 		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
-		.sb_pinned		= args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -1214,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1237,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	__writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
 
-/**
- * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
-	__writeback_inodes_sb(sb, 1);
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
 }
+EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
diff --git a/fs/sync.c b/fs/sync.c
index e8cbd41..5a537cc 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 		sync_inodes_sb(sb);
 	else
-		writeback_inodes_sb_locked(sb);
+		writeback_inodes_sb(sb);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
-- 
cgit v1.1


From e30c7c3b306312c157d67eedd6a01920518b756c Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 1 Jun 2010 14:10:47 +0100
Subject: binfmt_elf_fdpic: Fix clear_user() error handling

clear_user() returns the number of bytes that could not be copied rather than
an error code.  So we should return -EFAULT rather than directly returning the
results.

Without this patch, positive values may be returned to elf_fdpic_map_file()
and the following error handlings do not function as expected.

1.
	ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm);
	if (ret < 0)
		return ret;
2.
	ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm);
	if (ret < 0)
		return ret;

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
CC: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf_fdpic.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0..63039ed 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 
 		/* clear any space allocated but not loaded */
 		if (phdr->p_filesz < phdr->p_memsz) {
-			ret = clear_user((void *) (seg->addr + phdr->p_filesz),
-					 phdr->p_memsz - phdr->p_filesz);
-			if (ret)
-				return ret;
+			if (clear_user((void *) (seg->addr + phdr->p_filesz),
+				       phdr->p_memsz - phdr->p_filesz))
+				return -EFAULT;
 		}
 
 		if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset, ret;
+	int loop, dvset;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			ret = clear_user((void __user *) maddr, disp);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr, disp))
+				return -EFAULT;
 			maddr += disp;
 		}
 
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			ret = clear_user((void __user *) maddr + phdr->p_filesz,
-					 excess1);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr + phdr->p_filesz,
+				       excess1))
+				return -EFAULT;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
-			if (ret)
-				return ret;
+			if (clear_user((void *) maddr + phdr->p_filesz, excess))
+				return -EFAULT;
 		}
 #endif
 
-- 
cgit v1.1


From 037776fcbe73236408f6c9ca97c782457efd6b53 Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <dkirjanov@hera.kernel.org>
Date: Tue, 1 Jun 2010 17:15:39 +0100
Subject: AFS: Fix possible null pointer dereference in afs_alloc_server()

Fix a possible null pointer dereference in afs_alloc_server(): the server
pointer is NULL if there was an allocation failure, and under such a
condition, we can't dereference it in the _leave() statement.

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/server.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/server.c b/fs/afs/server.c
index f490995..9fdc7fe 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 
 		memcpy(&server->addr, addr, sizeof(struct in_addr));
 		server->addr.s_addr = addr->s_addr;
+		_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	} else {
+		_leave(" = NULL [nomem]");
 	}
-
-	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
 }
 
-- 
cgit v1.1


From 06b43672a9e665cab18dc7b77d56d36884b90d45 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 1 Jun 2010 10:54:45 -0400
Subject: cifs: fix page refcount leak

Commit 315e995c63a15cb4d4efdbfd70fe2db191917f7a is causing OOM kills
when stress-testing a CIFS filesystem. The VFS readpages operation takes
a page reference. The older code just handed this reference off to the
page cache, but the new code takes an extra one. The simplest fix is to
put the new reference after add_to_page_cache_lru.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index f1ff785..75541af 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1952,6 +1952,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 			bytes_read -= PAGE_CACHE_SIZE;
 			continue;
 		}
+		page_cache_release(page);
 
 		target = kmap_atomic(page, KM_USER0);
 
-- 
cgit v1.1


From 08a66859e69264f3223560d06b88e80c1a6a6387 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 1 Jun 2010 20:58:22 +0100
Subject: FS-Cache: Remove unneeded null checks

fscache_write_op() makes unnecessary checks of the page variable to see if it
is NULL.  It can't be NULL at those points as the kernel would already have
crashed a little higher up where we examined page->index.

Furthermore, unless radix_tree_gang_lookup_tag() can return 1 but no page, a
NULL pointer crash should not be encountered there as we can only get there if
r_t_g_l_t() returned 1.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fscache/page.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd3..723b889 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	}
 
-	if (page) {
-		radix_tree_tag_set(&cookie->stores, page->index,
-				   FSCACHE_COOKIE_STORING_TAG);
-		radix_tree_tag_clear(&cookie->stores, page->index,
-				     FSCACHE_COOKIE_PENDING_TAG);
-	}
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_STORING_TAG);
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_PENDING_TAG);
 
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
-	if (page) {
-		fscache_set_op_state(&op->op, "Store");
-		fscache_stat(&fscache_n_store_pages);
-		fscache_stat(&fscache_n_cop_write_page);
-		ret = object->cache->ops->write_page(op, page);
-		fscache_stat_d(&fscache_n_cop_write_page);
-		fscache_set_op_state(&op->op, "EndWrite");
-		fscache_end_page_write(object, page);
-		if (ret < 0) {
-			fscache_set_op_state(&op->op, "Abort");
-			fscache_abort_object(object);
-		} else {
-			fscache_enqueue_operation(&op->op);
-		}
+	fscache_set_op_state(&op->op, "Store");
+	fscache_stat(&fscache_n_store_pages);
+	fscache_stat(&fscache_n_cop_write_page);
+	ret = object->cache->ops->write_page(op, page);
+	fscache_stat_d(&fscache_n_cop_write_page);
+	fscache_set_op_state(&op->op, "EndWrite");
+	fscache_end_page_write(object, page);
+	if (ret < 0) {
+		fscache_set_op_state(&op->op, "Abort");
+		fscache_abort_object(object);
+	} else {
+		fscache_enqueue_operation(&op->op);
 	}
 
 	_leave("");
-- 
cgit v1.1


From b160fdabe93a8a53094f90f02bf4dcb500782aab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Jun 2010 21:59:18 +0200
Subject: nfsd: nfsd_setattr needs to call commit_metadata

The conversion of write_inode_now calls to commit_metadata in commit
f501912a35c02eadc55ca9396ece55fe36f785d0 missed out the call in nfsd_setattr.

But without this conversion we can't guarantee that a SETATTR request
has actually been commited to disk with XFS, which causes a regression
from 2.6.32 (only for NFSv2, but anyway).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6dd5f19..3440dd8 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	if (size_change)
 		put_write_access(inode);
 	if (!err)
-		if (EX_ISSYNC(fhp->fh_export))
-			write_inode_now(inode, 1);
+		commit_metadata(fhp);
 out:
 	return err;
 
-- 
cgit v1.1


From 13a4214cd9ec14d7b77e98bd3ee51f60f868a6e5 Mon Sep 17 00:00:00 2001
From: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Date: Tue, 1 Jun 2010 11:31:08 -0700
Subject: ceph: fix d_subdirs ordering problem

We misused list_move_tail() to order the dentry in d_subdirs.
This will screw up the d_subdirs order.

This bug can be reliably reproduced by:
1. mount ceph fs.
2. on ceph fs, git clone git://ceph.newdream.net/git/ceph.git
3. Run autogen.sh in ceph directory.
(Note: Errors only occur at the first time you run autogen.sh.)

Signed-off-by: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 226f5a5..ab47f46 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 
 	spin_lock(&dcache_lock);
 	spin_lock(&dn->d_lock);
-	list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+	list_move(&dn->d_u.d_child, &dir->d_subdirs);
 	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
 	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
 	spin_unlock(&dn->d_lock);
-- 
cgit v1.1


From 205475679a74fe40b63a1c7f41110fdb64daa8b9 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 1 Jun 2010 10:37:40 -0700
Subject: ceph: fix memory leak in statfs

Freeing the statfs request structure when required.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 21c62e9..07a5399 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref)
 		ceph_msg_put(req->reply);
 	if (req->request)
 		ceph_msg_put(req->request);
+
+	kfree(req);
 }
 
 static void put_generic_request(struct ceph_mon_generic_request *req)
-- 
cgit v1.1


From 558d3499bd059d4534b1f2b69dc1c562acc733fe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 1 Jun 2010 12:51:12 -0700
Subject: ceph: fix f_namelen reported by statfs

We were setting f_namelen in kstatfs to PATH_MAX instead of NAME_MAX.
That disagrees with ceph_lookup behavior (which checks against NAME_MAX),
and also makes the pjd posix test suite spit out ugly errors because with
can't clean up its temporary files.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4e0bee2..8db88792 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 	buf->f_files = le64_to_cpu(st.num_objects);
 	buf->f_ffree = -1;
-	buf->f_namelen = PATH_MAX;
+	buf->f_namelen = NAME_MAX;
 	buf->f_frsize = PAGE_CACHE_SIZE;
 
 	/* leave fsid little-endian, regardless of host endianness */
-- 
cgit v1.1


From 1f5a81e41f8b1a782c68d3843e9ec1bfaadf7d72 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 2 Jun 2010 22:04:39 -0400
Subject: ext4: Make sure the MOVE_EXT ioctl can't overwrite append-only files

Dan Roseberg has reported a problem with the MOVE_EXT ioctl.  If the
donor file is an append-only file, we should not allow the operation
to proceed, lest we end up overwriting the contents of an append-only
file.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Dan Rosenberg <dan.j.rosenberg@gmail.com>
---
 fs/ext4/move_extent.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3a6c92a..52abfa1 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -960,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
 	/* Ext4 move extent does not support swapfile */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
-- 
cgit v1.1


From 5b257b4a1f9239624c6b5e669763de04e482c2b3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 3 Jun 2010 16:22:29 +1000
Subject: xfs: fix race in inode cluster freeing failing to stale inodes

When an inode cluster is freed, it needs to mark all inodes in memory as
XFS_ISTALE before marking the buffer as stale. This is eeded because the inodes
have a different life cycle to the buffer, and once the buffer is torn down
during transaction completion, we must ensure none of the inodes get written
back (which is what XFS_ISTALE does).

Unfortunately, xfs_ifree_cluster() has some bugs that lead to inodes not being
marked with XFS_ISTALE. This shows up when xfs_iflush() is called on these
inodes either during inode reclaim or tail pushing on the AIL.  The buffer is
read back, but no longer contains inodes and so triggers assert failures and
shutdowns. This was reproducable with at run.dbench10 invocation from xfstests.

There are two main causes of xfs_ifree_cluster() failing. The first is simple -
it checks in-memory inodes it finds in the per-ag icache to see if they are
clean without holding the flush lock. if they are clean it skips them
completely. However, If an inode is flushed delwri, it will
appear clean, but is not guaranteed to be written back until the flush lock has
been dropped. Hence we may have raced on the clean check and the inode may
actually be dirty. Hence always mark inodes found in memory stale before we
check properly if they are clean.

The second is more complex, and makes the first problem easier to hit.
Basically the in-memory inode scan is done with full knowledge it can be racing
with inode flushing and AIl tail pushing, which means that inodes that it can't
get the flush lock on might not be attached to the buffer after then in-memory
inode scan due to IO completion occurring. This is actually documented in the
code as "needs better interlocking". i.e. this is a zero-day bug.

Effectively, the in-memory scan must be done while the inode buffer is locked
and Io cannot be issued on it while we do the in-memory inode scan. This
ensures that inodes we couldn't get the flush lock on are guaranteed to be
attached to the cluster buffer, so we can then catch all in-memory inodes and
mark them stale.

Now that the inode cluster buffer is locked before the in-memory scan is done,
there is no need for the two-phase update of the in-memory inodes, so simplify
the code into two loops and remove the allocation of the temporary buffer used
to hold locked inodes across the phases.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_inode.c | 142 +++++++++++++++++++++++------------------------------
 1 file changed, 62 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6ba7765..d53c39d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
 	int			blks_per_cluster;
 	int			nbufs;
 	int			ninodes;
-	int			i, j, found, pre_flushed;
+	int			i, j;
 	xfs_daddr_t		blkno;
 	xfs_buf_t		*bp;
-	xfs_inode_t		*ip, **ip_found;
+	xfs_inode_t		*ip;
 	xfs_inode_log_item_t	*iip;
 	xfs_log_item_t		*lip;
 	struct xfs_perag	*pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
 		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
 	}
 
-	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
 	for (j = 0; j < nbufs; j++, inum += ninodes) {
+		int	found = 0;
+
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
+		/*
+		 * We obtain and lock the backing buffer first in the process
+		 * here, as we have to ensure that any dirty inode that we
+		 * can't get the flush lock on is attached to the buffer.
+		 * If we scan the in-memory inodes first, then buffer IO can
+		 * complete before we get a lock on it, and hence we may fail
+		 * to mark all the active inodes on the buffer stale.
+		 */
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+					mp->m_bsize * blks_per_cluster,
+					XBF_LOCK);
+
+		/*
+		 * Walk the inodes already attached to the buffer and mark them
+		 * stale. These will all have the flush locks held, so an
+		 * in-memory inode walk can't lock them.
+		 */
+		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		while (lip) {
+			if (lip->li_type == XFS_LI_INODE) {
+				iip = (xfs_inode_log_item_t *)lip;
+				ASSERT(iip->ili_logged == 1);
+				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+				xfs_trans_ail_copy_lsn(mp->m_ail,
+							&iip->ili_flush_lsn,
+							&iip->ili_item.li_lsn);
+				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+				found++;
+			}
+			lip = lip->li_bio_list;
+		}
 
 		/*
-		 * Look for each inode in memory and attempt to lock it,
-		 * we can be racing with flush and tail pushing here.
-		 * any inode we get the locks on, add to an array of
-		 * inode items to process later.
+		 * For each inode in memory attempt to add it to the inode
+		 * buffer and set it up for being staled on buffer IO
+		 * completion.  This is safe as we've locked out tail pushing
+		 * and flushing by locking the buffer.
 		 *
-		 * The get the buffer lock, we could beat a flush
-		 * or tail pushing thread to the lock here, in which
-		 * case they will go looking for the inode buffer
-		 * and fail, we need some other form of interlock
-		 * here.
+		 * We have already marked every inode that was part of a
+		 * transaction stale above, which means there is no point in
+		 * even trying to lock them.
 		 */
-		found = 0;
 		for (i = 0; i < ninodes; i++) {
 			read_lock(&pag->pag_ici_lock);
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or we found it already,
-			 * nothing to do
-			 */
+			/* Inode not in memory or stale, nothing to do */
 			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
 				read_unlock(&pag->pag_ici_lock);
 				continue;
 			}
 
-			if (xfs_inode_clean(ip)) {
-				read_unlock(&pag->pag_ici_lock);
-				continue;
-			}
-
-			/* If we can get the locks then add it to the
-			 * list, otherwise by the time we get the bp lock
-			 * below it will already be attached to the
-			 * inode buffer.
-			 */
-
-			/* This inode will already be locked - by us, lets
-			 * keep it that way.
-			 */
-
-			if (ip == free_ip) {
-				if (xfs_iflock_nowait(ip)) {
-					xfs_iflags_set(ip, XFS_ISTALE);
-					if (xfs_inode_clean(ip)) {
-						xfs_ifunlock(ip);
-					} else {
-						ip_found[found++] = ip;
-					}
-				}
+			/* don't try to lock/unlock the current inode */
+			if (ip != free_ip &&
+			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				read_unlock(&pag->pag_ici_lock);
 				continue;
 			}
+			read_unlock(&pag->pag_ici_lock);
 
-			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				if (xfs_iflock_nowait(ip)) {
-					xfs_iflags_set(ip, XFS_ISTALE);
-
-					if (xfs_inode_clean(ip)) {
-						xfs_ifunlock(ip);
-						xfs_iunlock(ip, XFS_ILOCK_EXCL);
-					} else {
-						ip_found[found++] = ip;
-					}
-				} else {
+			if (!xfs_iflock_nowait(ip)) {
+				if (ip != free_ip)
 					xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				}
+				continue;
 			}
-			read_unlock(&pag->pag_ici_lock);
-		}
 
-		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
-					mp->m_bsize * blks_per_cluster,
-					XBF_LOCK);
-
-		pre_flushed = 0;
-		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-		while (lip) {
-			if (lip->li_type == XFS_LI_INODE) {
-				iip = (xfs_inode_log_item_t *)lip;
-				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-				xfs_trans_ail_copy_lsn(mp->m_ail,
-							&iip->ili_flush_lsn,
-							&iip->ili_item.li_lsn);
-				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-				pre_flushed++;
+			xfs_iflags_set(ip, XFS_ISTALE);
+			if (xfs_inode_clean(ip)) {
+				ASSERT(ip != free_ip);
+				xfs_ifunlock(ip);
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+				continue;
 			}
-			lip = lip->li_bio_list;
-		}
 
-		for (i = 0; i < found; i++) {
-			ip = ip_found[i];
 			iip = ip->i_itemp;
-
 			if (!iip) {
+				/* inode with unlogged changes only */
+				ASSERT(ip != free_ip);
 				ip->i_update_core = 0;
 				xfs_ifunlock(ip);
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 				continue;
 			}
+			found++;
 
 			iip->ili_last_fields = iip->ili_format.ilf_fields;
 			iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
 			xfs_buf_attach_iodone(bp,
 				(void(*)(xfs_buf_t*,xfs_log_item_t*))
 				xfs_istale_done, (xfs_log_item_t *)iip);
-			if (ip != free_ip) {
+
+			if (ip != free_ip)
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			}
 		}
 
-		if (found || pre_flushed)
+		if (found)
 			xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
 
-	kmem_free(ip_found);
 	xfs_perag_put(pag);
 }
 
-- 
cgit v1.1


From 070ecdca54dde9577d2697088e74e45568f48efb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 3 Jun 2010 16:22:29 +1000
Subject: xfs: skip writeback from reclaim context

Allowing writeback from reclaim context causes massive problems with stack
overflows as we can call into the writeback code which tends to be a heavy
stack user both in the generic code and XFS from random contexts that
perform memory allocations.

Follow the example of btrfs (and in slightly different form ext4) and refuse
to write out data from reclaim context.  This issue should really be handled
by the VM so that we can tune better for this case, but until we get it
sorted out there we have to hack around this in each filesystem with a
complex writeback path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca..a0fa3bf 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
 	trace_xfs_writepage(inode, page, 0);
 
 	/*
+	 * Refuse to write the page out if we are called from reclaim context.
+	 *
+	 * This is primarily to avoid stack overflows when called from deep
+	 * used stacks in random callers for direct reclaim, but disabling
+	 * reclaim for kswap is a nice side-effect as kswapd causes rather
+	 * suboptimal I/O patters, too.
+	 *
+	 * This should really be done by the core VM, but until that happens
+	 * filesystems like XFS, btrfs and ext4 have to take care of this
+	 * by themselves.
+	 */
+	if (current->flags & PF_MEMALLOC)
+		goto out_fail;
+
+	/*
 	 * We need a transaction if:
 	 *  1. There are delalloc buffers on the page
 	 *  2. The page is uptodate and we have unmapped buffers
-- 
cgit v1.1


From f9369729496a0f4c607a4cc1ea4dfeddbbfc505a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 3 Jun 2010 16:22:29 +1000
Subject: xfs: improve xfs_isilocked

Use rwsem_is_locked to make the assertations for shared locks work.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_iget.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index f44a367..75df75f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -741,30 +741,24 @@ xfs_ilock_demote(
 }
 
 #ifdef DEBUG
-/*
- * Debug-only routine, without additional rw_semaphore APIs, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- *
- * Note: this means !xfs_isilocked would give false positives, so don't do that.
- */
 int
 xfs_isilocked(
 	xfs_inode_t		*ip,
 	uint			lock_flags)
 {
-	if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) ==
-			XFS_ILOCK_EXCL) {
-		if (!ip->i_lock.mr_writer)
-			return 0;
+	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+		if (!(lock_flags & XFS_ILOCK_SHARED))
+			return !!ip->i_lock.mr_writer;
+		return rwsem_is_locked(&ip->i_lock.mr_lock);
 	}
 
-	if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) ==
-			XFS_IOLOCK_EXCL) {
-		if (!ip->i_iolock.mr_writer)
-			return 0;
+	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+		if (!(lock_flags & XFS_IOLOCK_SHARED))
+			return !!ip->i_iolock.mr_writer;
+		return rwsem_is_locked(&ip->i_iolock.mr_lock);
 	}
 
-	return 1;
+	ASSERT(0);
+	return 0;
 }
 #endif
-- 
cgit v1.1


From f324e4cb2cadd9a42932c8a158e761ae31b88e72 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 3 Jun 2010 08:03:39 +0100
Subject: jffs2: Fix in-core inode leaks on error paths

Pointed out by Al Viro.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/dir.c | 115 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417..cb7ef34 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -360,8 +360,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		/* Eeek. Wave bye bye */
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fn);
+		ret = PTR_ERR(fn);
+		goto fail;
 	}
 
 	/* We use f->target field to store the target path. */
@@ -370,8 +370,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	memcpy(f->target, target, targetlen + 1);
@@ -386,30 +386,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	jffs2_complete_reservation(c);
 
 	ret = jffs2_init_security(inode, dir_i);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
 	ret = jffs2_init_acl_post(inode);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
-	if (ret) {
-		/* Eep. */
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd) {
 		/* Argh. Now we treat it like a normal delete */
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +431,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		jffs2_complete_reservation(c);
 		jffs2_free_raw_dirent(rd);
 		mutex_unlock(&dir_f->sem);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fd);
+		ret = PTR_ERR(fd);
+		goto fail;
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -454,6 +448,11 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 
 	d_instantiate(dentry, inode);
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	iput(inode);
+	return ret;
 }
 
 
@@ -519,8 +518,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 		/* Eeek. Wave bye bye */
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fn);
+		ret = PTR_ERR(fn);
+		goto fail;
 	}
 	/* No data here. Only a metadata node, which will be
 	   obsoleted by the first data write
@@ -531,30 +530,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	jffs2_complete_reservation(c);
 
 	ret = jffs2_init_security(inode, dir_i);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
 	ret = jffs2_init_acl_post(inode);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
-	if (ret) {
-		/* Eep. */
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd) {
 		/* Argh. Now we treat it like a normal delete */
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +575,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 		jffs2_complete_reservation(c);
 		jffs2_free_raw_dirent(rd);
 		mutex_unlock(&dir_f->sem);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fd);
+		ret = PTR_ERR(fd);
+		goto fail;
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -600,6 +593,11 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 
 	d_instantiate(dentry, inode);
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	iput(inode);
+	return ret;
 }
 
 static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +691,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 		/* Eeek. Wave bye bye */
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fn);
+		ret = PTR_ERR(fn);
+		goto fail;
 	}
 	/* No data here. Only a metadata node, which will be
 	   obsoleted by the first data write
@@ -705,30 +703,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	jffs2_complete_reservation(c);
 
 	ret = jffs2_init_security(inode, dir_i);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
 	ret = jffs2_init_acl_post(inode);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
-	if (ret) {
-		/* Eep. */
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd) {
 		/* Argh. Now we treat it like a normal delete */
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +751,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 		jffs2_complete_reservation(c);
 		jffs2_free_raw_dirent(rd);
 		mutex_unlock(&dir_f->sem);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fd);
+		ret = PTR_ERR(fd);
+		goto fail;
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -777,6 +769,11 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	d_instantiate(dentry, inode);
 
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	iput(inode);
+	return ret;
 }
 
 static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
-- 
cgit v1.1


From e72e6497e74811e01d72b4c1b7537b3aea3ee857 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 3 Jun 2010 08:09:12 +0100
Subject: jffs2: Fix NFS race by using insert_inode_locked()

New inodes need to be locked as we're creating them, so they don't get used
by other things (like NFSd) before they're ready.

Pointed out by Al Viro.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/dir.c | 12 ++++++++++--
 fs/jffs2/fs.c  |  7 ++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index cb7ef34..166062a 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
 
 	jffs2_free_raw_inode(ri);
-	d_instantiate(dentry, inode);
 
 	D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
 		  inode->i_ino, inode->i_mode, inode->i_nlink,
 		  f->inocache->pino_nlink, inode->i_mapping->nrpages));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 
  fail:
 	make_bad_inode(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	jffs2_free_raw_inode(ri);
 	return ret;
@@ -447,10 +450,12 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	jffs2_complete_reservation(c);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 
  fail:
 	make_bad_inode(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return ret;
 }
@@ -592,10 +597,12 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	jffs2_complete_reservation(c);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 
  fail:
 	make_bad_inode(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return ret;
 }
@@ -767,11 +774,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	jffs2_complete_reservation(c);
 
 	d_instantiate(dentry, inode);
-
+	unlock_new_inode(inode);
 	return 0;
 
  fail:
 	make_bad_inode(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return ret;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821..26037e2 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 	inode->i_blocks = 0;
 	inode->i_size = 0;
 
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return ERR_PTR(-EINVAL);
+	}
 
 	return inode;
 }
-- 
cgit v1.1


From 6a6ca57de92fcae34603551ac944aa74758c30d4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 3 Jun 2010 12:44:30 +0200
Subject: pipe: adjust minimum pipe size to 1 page

We don't need to pages to guarantee the POSIX requirement
that upto a page size write must be atomic to an empty
pipe.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/pipe.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 541d662..369a024 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1181,13 +1181,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) {
 			ret = -EPERM;
 			goto out;
-		}
-
-		/*
-		 * The pipe needs to be at least 2 pages large to
-		 * guarantee POSIX behaviour.
-		 */
-		if (arg < 2) {
+		} else if (nr_pages < 1) {
 			ret = -EINVAL;
 			goto out;
 		}
-- 
cgit v1.1


From 419f8367ea37e5adc5d95479e8fd5554b92b49fe Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 3 Jun 2010 12:45:28 +0200
Subject: pipe: change the privilege required for growing a pipe beyond system
 max

Change it to CAP_SYS_RESOURCE, as that more accurately models what
we want to control.

Suggested-by: Michael Kerrisk <mtk.manpages@googlemail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 369a024..f98fae3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1178,7 +1178,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		nr_pages = roundup_pow_of_two(nr_pages);
 
-		if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) {
+		if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) {
 			ret = -EPERM;
 			goto out;
 		} else if (nr_pages < 1) {
-- 
cgit v1.1


From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 3 Jun 2010 14:54:39 +0200
Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface

This changes the interface to be based on bytes instead. The API
matches that of F_SETPIPE_SZ in that it rounds up the passed in
size so that the resulting page array is a power-of-2 in size.

The proc file is renamed to /proc/sys/fs/pipe-max-size to
reflect this change.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/pipe.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index f98fae3..69c4c7c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,9 +26,14 @@
 
 /*
  * The max size that a non-root user is allowed to grow the pipe. Can
- * be set by root in /proc/sys/fs/pipe-max-pages
+ * be set by root in /proc/sys/fs/pipe-max-size
  */
-unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
 
 /*
  * We use a start+len construction, which provides full use of the 
@@ -1156,6 +1161,35 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 	return nr_pages * PAGE_SIZE;
 }
 
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
+}
+
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pipe_inode_info *pipe;
@@ -1169,23 +1203,19 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case F_SETPIPE_SZ: {
-		unsigned long nr_pages;
+		unsigned int size, nr_pages;
 
-		/*
-		 * Currently the array must be a power-of-2 size, so adjust
-		 * upwards if needed.
-		 */
-		nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		nr_pages = roundup_pow_of_two(nr_pages);
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
 
-		if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) {
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
 			ret = -EPERM;
 			goto out;
-		} else if (nr_pages < 1) {
+		} else if (nr_pages < PAGE_SIZE) {
 			ret = -EINVAL;
 			goto out;
 		}
-		ret = pipe_set_size(pipe, arg);
+		ret = pipe_set_size(pipe, nr_pages);
 		break;
 		}
 	case F_GETPIPE_SZ:
-- 
cgit v1.1


From 1e5ea23df11c7c90c7e7268dd3a6603bfa5aadf7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Jun 2010 10:05:40 -0700
Subject: ceph: fix lease revocation when seq doesn't match

If the client revokes a lease with a higher seq than what we have, keep
the mds's seq, so that it honors our release.  Otherwise, we can hang
indefinitely.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b49f128..29b4485 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2433,6 +2433,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 	struct ceph_dentry_info *di;
 	int mds = session->s_mds;
 	struct ceph_mds_lease *h = msg->front.iov_base;
+	u32 seq;
 	struct ceph_vino vino;
 	int mask;
 	struct qstr dname;
@@ -2446,6 +2447,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 	vino.ino = le64_to_cpu(h->ino);
 	vino.snap = CEPH_NOSNAP;
 	mask = le16_to_cpu(h->mask);
+	seq = le32_to_cpu(h->seq);
 	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
 	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
 	if (dname.len != get_unaligned_le32(h+1))
@@ -2456,8 +2458,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 
 	/* lookup inode */
 	inode = ceph_find_inode(sb, vino);
-	dout("handle_lease '%s', mask %d, ino %llx %p\n",
-	     ceph_lease_op_name(h->action), mask, vino.ino, inode);
+	dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
+	     ceph_lease_op_name(h->action), mask, vino.ino, inode,
+	     dname.len, dname.name);
 	if (inode == NULL) {
 		dout("handle_lease no inode %llx\n", vino.ino);
 		goto release;
@@ -2482,7 +2485,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 	switch (h->action) {
 	case CEPH_MDS_LEASE_REVOKE:
 		if (di && di->lease_session == session) {
-			h->seq = cpu_to_le32(di->lease_seq);
+			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
+				h->seq = cpu_to_le32(di->lease_seq);
 			__ceph_mdsc_drop_dentry_lease(dentry);
 		}
 		release = 1;
@@ -2496,7 +2500,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 			unsigned long duration =
 				le32_to_cpu(h->duration_ms) * HZ / 1000;
 
-			di->lease_seq = le32_to_cpu(h->seq);
+			di->lease_seq = seq;
 			dentry->d_time = di->lease_renew_from + duration;
 			di->lease_renew_after = di->lease_renew_from +
 				(duration >> 1);
-- 
cgit v1.1


From 75de46b98dda624397ccb17c106e51f478a79c15 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 31 May 2010 17:58:02 +1000
Subject: fix setattr error handling in sysfs, configfs

sysfs and configfs setattr functions have error cases after the generic inode's
attributes have been changed. Fix consistency by changing the generic inode
attributes only when it is guaranteed to succeed.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/configfs/inode.c | 9 ++++-----
 fs/sysfs/inode.c    | 6 ++++--
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4164514..cf78d44 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -72,10 +72,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	if (!sd)
 		return -EINVAL;
 
-	error = simple_setattr(dentry, iattr);
-	if (error)
-		return error;
-
 	sd_iattr = sd->s_iattr;
 	if (!sd_iattr) {
 		/* setting attributes for the first time, allocate now */
@@ -89,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
 		sd->s_iattr = sd_iattr;
 	}
-
 	/* attributes were changed atleast once in past */
 
+	error = simple_setattr(dentry, iattr);
+	if (error)
+		return error;
+
 	if (ia_valid & ATTR_UID)
 		sd_iattr->ia_uid = iattr->ia_uid;
 	if (ia_valid & ATTR_GID)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bde1a4c..0835a3b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,11 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		goto out;
 
+	error = sysfs_sd_setattr(sd, iattr);
+	if (error)
+		goto out;
+
 	/* this ignores size changes */
 	generic_setattr(inode, iattr);
 
-	error = sysfs_sd_setattr(sd, iattr);
-
 out:
 	mutex_unlock(&sysfs_mutex);
 	return error;
-- 
cgit v1.1


From 7d683a09990ff095a91b6e724ecee0ff8733274a Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Thu, 3 Jun 2010 11:58:28 +0200
Subject: wrong type for 'magic' argument in simple_fill_super()

It's used to superblock ->s_magic, which is unsigned long.

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Reviewed-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
CC: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index 09e1016..dcaf972 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -489,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
  * unique inode values later for this filesystem, then you must take care
  * to pass it an appropriate max_reserved value to avoid collisions.
  */
-int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+int simple_fill_super(struct super_block *s, unsigned long magic,
+		      struct tree_descr *files)
 {
 	struct inode *inode;
 	struct dentry *root;
-- 
cgit v1.1


From 5b54470daded19d83ea2bbf5f6bc12662942cd63 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 3 Jun 2010 12:35:42 +0200
Subject: fcntl: return -EFAULT if copy_to_user fails

copy_to_user() returns the number of bytes remaining, but we want to
return -EFAULT.
	ret = fcntl(fd, F_SETOWN_EX, NULL);
With the original code ret would be 8 here.

V2: Takuya Yoshikawa pointed out a similar issue in f_getown_ex()

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fcntl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f74d270..51e11bf 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
 
 	ret = copy_from_user(&owner, owner_p, sizeof(owner));
 	if (ret)
-		return ret;
+		return -EFAULT;
 
 	switch (owner.type) {
 	case F_OWNER_TID:
@@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
 	}
 	read_unlock(&filp->f_owner.lock);
 
-	if (!ret)
+	if (!ret) {
 		ret = copy_to_user(owner_p, &owner, sizeof(owner));
+		if (ret)
+			ret = -EFAULT;
+	}
 	return ret;
 }
 
-- 
cgit v1.1


From 8718d36cf99f5acf0f37487557ec25aee54b930b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 31 May 2010 17:58:02 +1000
Subject: fix setattr error handling in sysfs, configfs

sysfs and configfs setattr functions have error cases after the generic inode's
attributes have been changed. Fix consistency by changing the generic inode
attributes only when it is guaranteed to succeed.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/configfs/inode.c | 9 ++++-----
 fs/sysfs/inode.c    | 6 ++++--
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4164514..cf78d44 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -72,10 +72,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	if (!sd)
 		return -EINVAL;
 
-	error = simple_setattr(dentry, iattr);
-	if (error)
-		return error;
-
 	sd_iattr = sd->s_iattr;
 	if (!sd_iattr) {
 		/* setting attributes for the first time, allocate now */
@@ -89,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
 		sd->s_iattr = sd_iattr;
 	}
-
 	/* attributes were changed atleast once in past */
 
+	error = simple_setattr(dentry, iattr);
+	if (error)
+		return error;
+
 	if (ia_valid & ATTR_UID)
 		sd_iattr->ia_uid = iattr->ia_uid;
 	if (ia_valid & ATTR_GID)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bde1a4c..0835a3b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,11 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		goto out;
 
+	error = sysfs_sd_setattr(sd, iattr);
+	if (error)
+		goto out;
+
 	/* this ignores size changes */
 	generic_setattr(inode, iattr);
 
-	error = sysfs_sd_setattr(sd, iattr);
-
 out:
 	mutex_unlock(&sysfs_mutex);
 	return error;
-- 
cgit v1.1


From af5a30d8cfcfc561336f982b06345d6b815e0bb3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 3 Jun 2010 22:01:46 +1000
Subject: fix truncate inode time modification breakage

mtime and ctime should be changed only if the file size has actually
changed. Patches changing ext2 and tmpfs from vmtruncate to new truncate
sequence has caused regressions where they always update timestamps.

There is some strange cases in POSIX where truncate(2) must not update
times unless the size has acutally changed, see 6e656be89.

This area is all still rather buggy in different ways in a lot of
filesystems and needs a cleanup and audit (ideally the vfs will provide
a simple attribute or call to direct all filesystems exactly which
attributes to change). But coming up with the best solution will take a
while and is not appropriate for rc anyway.

So fix recent regression for now.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1921443..3675088 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1552,7 +1552,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	if (iattr->ia_valid & ATTR_SIZE) {
+	if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
 		error = ext2_setsize(inode, iattr->ia_size);
 		if (error)
 			return error;
-- 
cgit v1.1


From 01afaf61983d08ed1c9e5e8f2fcf4f40e9008033 Mon Sep 17 00:00:00 2001
From: Andrew Hendry <andrew.hendry@gmail.com>
Date: Fri, 4 Jun 2010 22:51:24 +1000
Subject: Minix: Clean up left over label

Remove a left over fail label.

Signed-off-by: Andrew Hendry <andrew.hendry@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/minix/dir.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 9196958..1dbf921 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -75,10 +75,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 	if (!IS_ERR(page))
 		kmap(page);
 	return page;
-
-fail:
-	dir_put_page(page);
-	return ERR_PTR(-EIO);
 }
 
 static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
-- 
cgit v1.1


From 7cbe17701a0379c7b05a79a6df4f24e41d2afde8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 4 Jun 2010 14:14:47 -0700
Subject: fs/compat_rw_copy_check_uvector: add missing compat_ptr call

A call to access_ok is missing a compat_ptr conversion.  Introduced with
b83733639a494d5f42fa00a2506563fbd2d3015d "compat: factor out
compat_rw_copy_check_uvector from compat_do_readv_writev"

fs/compat.c: In function 'compat_rw_copy_check_uvector':
fs/compat.c:629: warning: passing argument 1 of '__access_ok' makes pointer from integer without a cast

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index f0b391c..6490d21 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -626,7 +626,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
 		tot_len += len;
 		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
 			goto out;
-		if (!access_ok(vrfy_dir(type), buf, len)) {
+		if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 			ret = -EFAULT;
 			goto out;
 		}
-- 
cgit v1.1


From 2e94de8acbe524d919f1ea8807913d7b005e1578 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Fri, 4 Jun 2010 14:14:53 -0700
Subject: fs/binfmt_flat.c: split the stack & data alignments

The stack and data have different alignment requirements, so don't force
them to wear the same shoe.  Increase the data alignment to match that
which the elf2flt linker script has always been using: 0x20 bytes.  Not
only does this bring the kernel loader in line with the toolchain, but it
also fixes a swath of gcc tests which try to force larger alignment values
but randomly fail when the FLAT loader fails to deliver.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: David McCullough <davidm@snapgear.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Tested-by: Michal Simek <monstr@monstr.eu>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Jie Zhang <jie@codesourcery.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_flat.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1..b865622 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,15 +56,22 @@
 #endif
 
 /*
- * User data (stack, data section and bss) needs to be aligned
- * for the same reasons as SLAB memory is, and to the same amount.
- * Avoid duplicating architecture specific code by using the same
- * macro as with SLAB allocation:
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
+ */
+#define FLAT_DATA_ALIGN	(0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
  */
 #ifdef ARCH_SLAB_MINALIGN
-#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
+#define FLAT_STACK_ALIGN	(ARCH_SLAB_MINALIGN)
 #else
-#define FLAT_DATA_ALIGN	(sizeof(void *))
+#define FLAT_STACK_ALIGN	(sizeof(void *))
 #endif
 
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
@@ -129,7 +136,7 @@ static unsigned long create_flat_tables(
 
 	sp = (unsigned long *)p;
 	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
 	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
 	envp = argv + (argc + 1);
 
@@ -876,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
+	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 	
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (IS_ERR_VALUE(res))
-- 
cgit v1.1


From 1da083c9b23dafd6bcb08dcfec443e66e90efff0 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Fri, 4 Jun 2010 14:14:55 -0700
Subject: flat: fix unmap len in load error path

The data chunk is mmaped with 'len' which remains unchanged, so use that
when unmapping in the error path rather than trying to recalculate (and
incorrectly so) the value used originally.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Acked-by: David McCullough <davidm@snapgear.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_flat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b865622..b6ab27c 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -596,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read data+bss, errno %d\n", (int)-result);
 			do_munmap(current->mm, textpos, text_len);
-			do_munmap(current->mm, realdatastart, data_len + extra);
+			do_munmap(current->mm, realdatastart, len);
 			ret = result;
 			goto err;
 		}
-- 
cgit v1.1


From 84a8dce2710cc425089a2b92acc354d4fbb5788d Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 5 Jun 2010 11:51:27 -0400
Subject: ext4: Fix remaining racy updates of EXT4_I(inode)->i_flags

A few functions were still modifying i_flags in a racy manner.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 19df61c..42272d6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4942,20 +4942,26 @@ void ext4_set_inode_flags(struct inode *inode)
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
 void ext4_get_inode_flags(struct ext4_inode_info *ei)
 {
-	unsigned int flags = ei->vfs_inode.i_flags;
-
-	ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
-			EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
-	if (flags & S_SYNC)
-		ei->i_flags |= EXT4_SYNC_FL;
-	if (flags & S_APPEND)
-		ei->i_flags |= EXT4_APPEND_FL;
-	if (flags & S_IMMUTABLE)
-		ei->i_flags |= EXT4_IMMUTABLE_FL;
-	if (flags & S_NOATIME)
-		ei->i_flags |= EXT4_NOATIME_FL;
-	if (flags & S_DIRSYNC)
-		ei->i_flags |= EXT4_DIRSYNC_FL;
+	unsigned int vfs_fl;
+	unsigned long old_fl, new_fl;
+
+	do {
+		vfs_fl = ei->vfs_inode.i_flags;
+		old_fl = ei->i_flags;
+		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+				EXT4_DIRSYNC_FL);
+		if (vfs_fl & S_SYNC)
+			new_fl |= EXT4_SYNC_FL;
+		if (vfs_fl & S_APPEND)
+			new_fl |= EXT4_APPEND_FL;
+		if (vfs_fl & S_IMMUTABLE)
+			new_fl |= EXT4_IMMUTABLE_FL;
+		if (vfs_fl & S_NOATIME)
+			new_fl |= EXT4_NOATIME_FL;
+		if (vfs_fl & S_DIRSYNC)
+			new_fl |= EXT4_DIRSYNC_FL;
+	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
 }
 
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5191,7 +5197,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		 */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = 0;
-		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 		return 0;
 	}
 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5204,9 +5210,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		 */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
-		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 	} else {
-		ei->i_flags |= EXT4_HUGE_FILE_FL;
+		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 		/* i_block is stored in file system block size */
 		i_blocks = i_blocks >> (inode->i_blkbits - 9);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
-- 
cgit v1.1


From 1c24d06f8e065023ebb428db5af5514500839ee6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 4 Jun 2010 17:07:55 +0200
Subject: jffs2: update ctime when changing the file's permission by setfacl

jffs2 didn't update the ctime of the file when its permission was changed.

Steps to reproduce:
 # touch aaa
 # stat -c %Z aaa
 1275289822
 # setfacl -m  'u::x,g::x,o::x' aaa
 # stat -c %Z aaa
 1275289822                         <- unchanged

But, according to the spec of the ctime, jffs2 must update it.

Port of ext3 patch by Miao Xie <miaox@cn.fujitsu.com>.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/acl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc319..64d3b4b4 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 			if (inode->i_mode != mode) {
 				struct iattr attr;
 
-				attr.ia_valid = ATTR_MODE;
+				attr.ia_valid = ATTR_MODE | ATTR_CTIME;
 				attr.ia_mode = mode;
+				attr.ia_ctime = CURRENT_TIME_SEC;
 				rc = jffs2_do_setattr(inode, &attr);
 				if (rc < 0)
 					return rc;
-- 
cgit v1.1


From c3935e30495869dd611e1cd62253c94ebc7c6c04 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 4 Jun 2010 16:42:08 -0400
Subject: nfsd4: shut down callback queue outside state lock

This reportedly causes a lockdep warning on nfsd shutdown.  That looks
like a false positive to me, but there's no reason why this needs the
state lock anyway.

Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12f7109..4a27347 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void)
 	nfs4_lock_state();
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
-	nfsd4_destroy_callback_queue();
 	nfs4_unlock_state();
+	nfsd4_destroy_callback_queue();
 }
 
 /*
-- 
cgit v1.1


From 254c8c2dbf0e06a560a5814eb90cb628adb2de66 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 Jun 2010 10:37:19 +1000
Subject: xfs: remove nr_to_write writeback windup.

Now that the background flush code has been fixed, we shouldn't need to
silently multiply the wbc->nr_to_write to get good writeback. Remove
that code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xfs/linux-2.6/xfs_aops.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a0fa3bf..34640d6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1381,14 +1381,6 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 
-
-	/*
-	 *  VM calculation for nr_to_write seems off.  Bump it way
-	 *  up, this gets simple streaming writes zippy again.
-	 *  To be reviewed again after Jens' writeback changes.
-	 */
-	wbc->nr_to_write *= 4;
-
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
-- 
cgit v1.1


From cf3425707ed9ce0d5ebaba20bc3d22dd39e52f2f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 26 May 2010 01:50:21 +1000
Subject: block: bd_start_claiming fix module refcount

bd_start_claiming has an unbalanced module_put introduced in 6b4517a79.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/block_dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7346c96..204a763 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -734,6 +734,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 		return ERR_PTR(-ENXIO);
 
 	whole = bdget_disk(disk, 0);
+	module_put(disk->fops->owner);
 	put_disk(disk);
 	if (!whole)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.1


From b0018361c3f934858592cbbb5e1a4f318c2a70ed Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 26 May 2010 01:51:19 +1000
Subject: block: bd_start_claiming cleanup

I don't like the subtle multi-context code in bd_claim (ie.  detects where it
has been called based on bd_claiming). It seems clearer to just require a new
function to finish a 2-part claim.

Also improve commentary in bd_start_claiming as to how it should
be used.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/block_dev.c | 72 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 204a763..a1642ef 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -706,8 +706,13 @@ retry:
  * @bdev is about to be opened exclusively.  Check @bdev can be opened
  * exclusively and mark that an exclusive open is in progress.  Each
  * successful call to this function must be matched with a call to
- * either bd_claim() or bd_abort_claiming().  If this function
- * succeeds, the matching bd_claim() is guaranteed to succeed.
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
  *
  * CONTEXT:
  * Might sleep.
@@ -783,15 +788,47 @@ static void bd_abort_claiming(struct block_device *whole, void *holder)
 	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */
 }
 
+/* increment holders when we have a legitimate claim. requires bdev_lock */
+static void __bd_claim(struct block_device *bdev, struct block_device *whole,
+					void *holder)
+{
+	/* note that for a whole device bd_holders
+	 * will be incremented twice, and bd_holder will
+	 * be set to bd_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+}
+
+/**
+ * bd_finish_claiming - finish claiming a block device
+ * @bdev: block device of interest (passed to bd_start_claiming())
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Finish a claiming block started by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+				struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(whole->bd_claiming != holder);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	__bd_claim(bdev, whole, holder);
+	__bd_abort_claiming(whole, holder); /* not actually an abort */
+}
+
 /**
  * bd_claim - claim a block device
  * @bdev: block device to claim
  * @holder: holder trying to claim @bdev
  *
- * Try to claim @bdev which must have been opened successfully.  This
- * function may be called with or without preceding
- * blk_start_claiming().  In the former case, this function is always
- * successful and terminates the claiming block.
+ * Try to claim @bdev which must have been opened successfully.
  *
  * CONTEXT:
  * Might sleep.
@@ -807,23 +844,10 @@ int bd_claim(struct block_device *bdev, void *holder)
 	might_sleep();
 
 	spin_lock(&bdev_lock);
-
 	res = bd_prepare_to_claim(bdev, whole, holder);
-	if (res == 0) {
-		/* note that for a whole device bd_holders
-		 * will be incremented twice, and bd_holder will
-		 * be set to bd_claim before being set to holder
-		 */
-		whole->bd_holders++;
-		whole->bd_holder = bd_claim;
-		bdev->bd_holders++;
-		bdev->bd_holder = holder;
-	}
-
-	if (whole->bd_claiming)
-		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
-	else
-		spin_unlock(&bdev_lock);
+	if (res == 0)
+		__bd_claim(bdev, whole, holder);
+	spin_unlock(&bdev_lock);
 
 	return res;
 }
@@ -1477,7 +1501,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 
 	if (whole) {
 		if (res == 0)
-			BUG_ON(bd_claim(bdev, filp) != 0);
+			bd_finish_claiming(bdev, whole, filp);
 		else
 			bd_abort_claiming(whole, filp);
 	}
@@ -1713,7 +1737,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 		goto out_blkdev_put;
 
-	BUG_ON(bd_claim(bdev, holder) != 0);
+	bd_finish_claiming(bdev, whole, holder);
 	return bdev;
 
 out_blkdev_put:
-- 
cgit v1.1


From 3e6c05052c262ebe7fdd85e75e9d4f956cdd8d82 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 7 Jun 2010 20:17:38 +0200
Subject: block: remove duplicate BUG_ON() in bd_finish_claiming()

We do the same BUG_ON() just a line later when calling into
__bd_abort_claiming().

Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/block_dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a1642ef..99d6af8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -817,7 +817,6 @@ static void bd_finish_claiming(struct block_device *bdev,
 				struct block_device *whole, void *holder)
 {
 	spin_lock(&bdev_lock);
-	BUG_ON(whole->bd_claiming != holder);
 	BUG_ON(!bd_may_claim(bdev, whole, holder));
 	__bd_claim(bdev, whole, holder);
 	__bd_abort_claiming(whole, holder); /* not actually an abort */
-- 
cgit v1.1


From 1d862f41222b7f385bada9f85a67ca5592ffd33e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 8 Jun 2010 16:28:45 +0200
Subject: pipe: fix pipe buffer resizing

pipe_set_size() needs to copy pipe bufs from the old circular buffer
to the new.

The current code gets this wrong in multiple ways, resulting in oops.

Test program is available here:
  http://www.kernel.org/pub/linux/kernel/people/mszeredi/piperesize/

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/pipe.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 69c4c7c..f31e2d4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1145,13 +1145,20 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 	 * and adjust the indexes.
 	 */
 	if (pipe->nrbufs) {
-		const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
-		const unsigned int head = pipe->nrbufs - tail;
+		unsigned int tail;
+		unsigned int head;
 
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
 		if (head)
 			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
 		if (tail)
-			memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
 	}
 
 	pipe->curbuf = 0;
-- 
cgit v1.1


From 6db40cf047a8723095caf79f5569d21b388d7b31 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 9 Jun 2010 09:27:57 +0200
Subject: pipe: fix check in "set size" fcntl

As it stands this check compares the number of pages to the page size.
This makes no sense and makes the fcntl fail in almost any sane case.

Fix it by checking if nr_pages is not zero (it can become zero only if
arg is too big and round_pipe_size() overflows).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/pipe.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index f31e2d4..279eef9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1215,12 +1215,13 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		size = round_pipe_size(arg);
 		nr_pages = size >> PAGE_SHIFT;
 
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
 		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
 			ret = -EPERM;
 			goto out;
-		} else if (nr_pages < PAGE_SIZE) {
-			ret = -EINVAL;
-			goto out;
 		}
 		ret = pipe_set_size(pipe, nr_pages);
 		break;
-- 
cgit v1.1


From 00d5643e7c5ed4ae1bb0b385fe2f41bb951cc3cd Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 10 Jun 2010 11:13:58 -0400
Subject: ceph: fix atomic64_t initialization on ia64

bdi_seq is an atomic_long_t but we're using ATOMIC_INIT, which causes
 build failures on ia64. This patch fixes it to use ATOMIC_LONG_INIT.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8db88792..fa87f51 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -926,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
  * construct our own bdi so we can control readahead, etc.
  */
-static atomic_long_t bdi_seq = ATOMIC_INIT(0);
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
 static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
-- 
cgit v1.1


From 9dbd412f56c453f15014396c6024b895c1485ccb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 10 Jun 2010 13:21:20 -0700
Subject: ceph: fix misleading/incorrect debug message

Nothing is released here: the caps message is simply ignored in this case.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ae3e3a3..da2a0e3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2714,7 +2714,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	spin_lock(&inode->i_lock);
 	cap = __get_cap_for_mds(ceph_inode(inode), mds);
 	if (!cap) {
-		dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+		dout(" no cap on %p ino %llx.%llx from mds%d\n",
 		     inode, ceph_ino(inode), ceph_snap(inode), mds);
 		spin_unlock(&inode->i_lock);
 		goto done;
-- 
cgit v1.1


From 3d7ded4d81d807c2f75f310a8d74a5d72be13a1b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 9 Jun 2010 16:47:10 -0700
Subject: ceph: release cap on import if we don't have the inode

If we get an IMPORT that give us a cap, but we don't have the inode, queue
a release (and try to send it immediately) so that the MDS doesn't get
stuck waiting for us.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 90 ++++++++++++++++++++++++++++++++--------------------
 fs/ceph/mds_client.c |  6 ++--
 fs/ceph/mds_client.h |  3 ++
 3 files changed, 61 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da2a0e3..7c692e7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -981,6 +981,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	return 0;
 }
 
+static void __queue_cap_release(struct ceph_mds_session *session,
+				u64 ino, u64 cap_id, u32 migrate_seq,
+				u32 issue_seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	struct ceph_mds_cap_item *item;
+
+	spin_lock(&session->s_cap_lock);
+	BUG_ON(!session->s_num_cap_releases);
+	msg = list_first_entry(&session->s_cap_releases,
+			       struct ceph_msg, list_head);
+
+	dout(" adding %llx release to mds%d msg %p (%d left)\n",
+	     ino, session->s_mds, msg, session->s_num_cap_releases);
+
+	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+	head = msg->front.iov_base;
+	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+	item = msg->front.iov_base + msg->front.iov_len;
+	item->ino = cpu_to_le64(ino);
+	item->cap_id = cpu_to_le64(cap_id);
+	item->migrate_seq = cpu_to_le32(migrate_seq);
+	item->seq = cpu_to_le32(issue_seq);
+
+	session->s_num_cap_releases--;
+
+	msg->front.iov_len += sizeof(*item);
+	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+		dout(" release msg %p full\n", msg);
+		list_move_tail(&msg->list_head, &session->s_cap_releases_done);
+	} else {
+		dout(" release msg %p at %d/%d (%d)\n", msg,
+		     (int)le32_to_cpu(head->num),
+		     (int)CEPH_CAPS_PER_RELEASE,
+		     (int)msg->front.iov_len);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
 /*
  * Queue cap releases when an inode is dropped from our cache.  Since
  * inode is about to be destroyed, there is no need for i_lock.
@@ -994,41 +1034,9 @@ void ceph_queue_caps_release(struct inode *inode)
 	while (p) {
 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
 		struct ceph_mds_session *session = cap->session;
-		struct ceph_msg *msg;
-		struct ceph_mds_cap_release *head;
-		struct ceph_mds_cap_item *item;
 
-		spin_lock(&session->s_cap_lock);
-		BUG_ON(!session->s_num_cap_releases);
-		msg = list_first_entry(&session->s_cap_releases,
-				       struct ceph_msg, list_head);
-
-		dout(" adding %p release to mds%d msg %p (%d left)\n",
-		     inode, session->s_mds, msg, session->s_num_cap_releases);
-
-		BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
-		head = msg->front.iov_base;
-		head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
-		item = msg->front.iov_base + msg->front.iov_len;
-		item->ino = cpu_to_le64(ceph_ino(inode));
-		item->cap_id = cpu_to_le64(cap->cap_id);
-		item->migrate_seq = cpu_to_le32(cap->mseq);
-		item->seq = cpu_to_le32(cap->issue_seq);
-
-		session->s_num_cap_releases--;
-
-		msg->front.iov_len += sizeof(*item);
-		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
-			dout(" release msg %p full\n", msg);
-			list_move_tail(&msg->list_head,
-				       &session->s_cap_releases_done);
-		} else {
-			dout(" release msg %p at %d/%d (%d)\n", msg,
-			     (int)le32_to_cpu(head->num),
-			     (int)CEPH_CAPS_PER_RELEASE,
-			     (int)msg->front.iov_len);
-		}
-		spin_unlock(&session->s_cap_lock);
+		__queue_cap_release(session, ceph_ino(inode), cap->cap_id,
+				    cap->mseq, cap->issue_seq);
 		p = rb_next(p);
 		__ceph_remove_cap(cap);
 	}
@@ -2655,7 +2663,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	struct ceph_mds_caps *h;
 	int mds = session->s_mds;
 	int op;
-	u32 seq;
+	u32 seq, mseq;
 	struct ceph_vino vino;
 	u64 cap_id;
 	u64 size, max_size;
@@ -2675,6 +2683,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	vino.snap = CEPH_NOSNAP;
 	cap_id = le64_to_cpu(h->cap_id);
 	seq = le32_to_cpu(h->seq);
+	mseq = le32_to_cpu(h->migrate_seq);
 	size = le64_to_cpu(h->size);
 	max_size = le64_to_cpu(h->max_size);
 
@@ -2689,6 +2698,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	     vino.snap, inode);
 	if (!inode) {
 		dout(" i don't have ino %llx\n", vino.ino);
+
+		if (op == CEPH_CAP_OP_IMPORT)
+			__queue_cap_release(session, vino.ino, cap_id,
+					    mseq, seq);
+
+		/*
+		 * send any full release message to try to move things
+		 * along for the mds (who clearly thinks we still have this
+		 * cap).
+		 */
+		ceph_send_cap_releases(mdsc, session);
 		goto done;
 	}
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 29b4485..d28b6a9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 /*
  * called under s_mutex
  */
-static void send_cap_releases(struct ceph_mds_client *mdsc,
-		       struct ceph_mds_session *session)
+void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+			    struct ceph_mds_session *session)
 {
 	struct ceph_msg *msg;
 
@@ -2693,7 +2693,7 @@ static void delayed_work(struct work_struct *work)
 		add_cap_releases(mdsc, s, -1);
 		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
 		    s->s_state == CEPH_MDS_SESSION_HUNG)
-			send_cap_releases(mdsc, s);
+			ceph_send_cap_releases(mdsc, s);
 		mutex_unlock(&s->s_mutex);
 		ceph_put_mds_session(s);
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d9936c4..e43752b 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -322,6 +322,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session);
+
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
-- 
cgit v1.1


From 2b2300d62ea413bec631d5b880effa2cc5363acb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 9 Jun 2010 16:52:04 -0700
Subject: ceph: try to send partial cap release on cap message on missing inode

If we have enough memory to allocate a new cap release message, do so, so
that we can send a partial release message immediately.  This keeps us from
making the MDS wait when the cap release it needs is in a partially full
release message.

If we fail because of ENOMEM, oh well, they'll just have to wait a bit
longer.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       |  1 +
 fs/ceph/mds_client.c | 10 +++++-----
 fs/ceph/mds_client.h |  3 +++
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7c692e7..619b616 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2708,6 +2708,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		 * along for the mds (who clearly thinks we still have this
 		 * cap).
 		 */
+		ceph_add_cap_releases(mdsc, session, -1);
 		ceph_send_cap_releases(mdsc, session);
 		goto done;
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d28b6a9..1766947 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
  *
  * Called under s_mutex.
  */
-static int add_cap_releases(struct ceph_mds_client *mdsc,
-			    struct ceph_mds_session *session,
-			    int extra)
+int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+			  struct ceph_mds_session *session,
+			  int extra)
 {
 	struct ceph_msg *msg;
 	struct ceph_mds_cap_release *head;
@@ -1980,7 +1980,7 @@ out_err:
 	}
 	mutex_unlock(&mdsc->mutex);
 
-	add_cap_releases(mdsc, req->r_session, -1);
+	ceph_add_cap_releases(mdsc, req->r_session, -1);
 	mutex_unlock(&session->s_mutex);
 
 	/* kick calling process */
@@ -2690,7 +2690,7 @@ static void delayed_work(struct work_struct *work)
 			send_renew_caps(mdsc, s);
 		else
 			ceph_con_keepalive(&s->s_con);
-		add_cap_releases(mdsc, s, -1);
+		ceph_add_cap_releases(mdsc, s, -1);
 		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
 		    s->s_state == CEPH_MDS_SESSION_HUNG)
 			ceph_send_cap_releases(mdsc, s);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e43752b..b292fa4 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -322,6 +322,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session,
+				 int extra);
 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
 				   struct ceph_mds_session *session);
 
-- 
cgit v1.1


From 4a001071d3549f596c7c3736c5dda8a3a4aba9ed Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 7 Jun 2010 03:38:51 +0000
Subject: Btrfs: fix loop device on top of btrfs

We cannot use the loop device which has been connected to a file in the btrf

The reproduce steps is following:
 # dd if=/dev/zero of=vdev0 bs=1M count=1024
 # losetup /dev/loop0 vdev0
 # mkfs.btrfs /dev/loop0
 ...
 failed to zero device start -5

The reason is that the btrfs don't implement either ->write_begin or ->write
the VFS API, so we fix it by setting ->write to do_sync_write().

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 79437c5..abcb918 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1197,6 +1197,7 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read       = generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
 	.aio_write	= btrfs_file_aio_write,
-- 
cgit v1.1


From 836097797236fd727f82ec2f3f376ac41a430876 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 7 Jun 2010 18:26:37 +0000
Subject: Btrfs: fix fallocate regression

Seems that when btrfs_fallocate was converted to use the new ENOSPC stuff we
dropped passing the mode to the function that actually does the preallocation.
This breaks anybody who wants to use FALLOC_FL_KEEP_SIZE.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2551b80..d999c53 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6893,7 +6893,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
 							last_byte - cur_offset,
 							1 << inode->i_blkbits,
 							offset + len,
-- 
cgit v1.1


From 0e4dcbef1c0c3e29f9c7f824359445d385b2649a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 1 Jun 2010 08:23:11 +0000
Subject: Btrfs: uninitialized data is check_path_shared()

refs can be used with uninitialized data if btrfs_lookup_extent_info()
fails on the first pass through the loop.  In the original code if that
happens then check_path_shared() probably returns 1, this patch
changes it to return 1 for safety.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d999c53..f08427c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2673,7 +2673,7 @@ static int check_path_shared(struct btrfs_root *root,
 	struct extent_buffer *eb;
 	int level;
 	int ret;
-	u64 refs;
+	u64 refs = 1;
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		if (!path->nodes[level])
-- 
cgit v1.1


From 058a457ef0ce28d595af53d6103db73332383cbc Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 20 May 2010 07:21:50 +0000
Subject: Btrfs: fix remap_file_pages error

when we use remap_file_pages() to remap a file, remap_file_pages always return
error. It is because btrfs didn't set VM_CAN_NONLINEAR for vma.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index abcb918..ce0cd29 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1189,8 +1189,15 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
 
 static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 {
-	vma->vm_ops = &btrfs_file_vm_ops;
+	struct address_space *mapping = filp->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+
 	file_accessed(filp);
+	vma->vm_ops = &btrfs_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+
 	return 0;
 }
 
-- 
cgit v1.1


From 046f264f6b3b2cf7e5a1769fc92335d8a9316282 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Mon, 31 May 2010 08:58:47 +0000
Subject: Btrfs: Fix null dereference in relocation.c

Fix a potential null dereference in relocation.c

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Acked-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/relocation.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 05d41e5..b37d723 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -784,16 +784,17 @@ again:
 				struct btrfs_extent_ref_v0 *ref0;
 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
 						struct btrfs_extent_ref_v0);
-				root = find_tree_root(rc, eb, ref0);
-				if (!root->ref_cows)
-					cur->cowonly = 1;
 				if (key.objectid == key.offset) {
+					root = find_tree_root(rc, eb, ref0);
 					if (root && !should_ignore_root(root))
 						cur->root = root;
 					else
 						list_add(&cur->list, &useless);
 					break;
 				}
+				if (is_cowonly_root(btrfs_ref_root_v0(eb,
+								      ref0)))
+					cur->cowonly = 1;
 			}
 #else
 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
-- 
cgit v1.1


From 3bf84a5a834d13e7c5c3e8e5b5c6b26012118dd8 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Mon, 31 May 2010 09:04:46 +0000
Subject: Btrfs: Fix BUG_ON for fs converted from extN

Tree blocks can live in data block groups in FS converted from extN.
So it's easy to trigger the BUG_ON.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6c14101..a46b64d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4360,7 +4360,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
 	block_rsv = get_block_rsv(trans, root);
 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-	BUG_ON(block_rsv->space_info != cache->space_info);
+	if (block_rsv->space_info != cache->space_info)
+		goto out;
 
 	if (btrfs_header_generation(buf) == trans->transid) {
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-- 
cgit v1.1


From fb4f6f910ca6f58564c31a680ef88940d8192713 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:40:57 +0000
Subject: Btrfs: handle error returns from btrfs_lookup_dir_item()

If btrfs_lookup_dir_item() fails, we should can just let the mount fail
with an error.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 574285c..9ea7114 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
 	 */
 	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+	if (IS_ERR(di))
+		return ERR_CAST(di);
 	if (!di) {
 		/*
 		 * Ok the default dir item isn't there.  This is weird since
-- 
cgit v1.1


From 676e4c86391936795c82ccd11ca9671ee6307936 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:43:07 +0000
Subject: Btrfs: handle kzalloc() failure in open_ctree()

Unwind and return -ENOMEM if the allocation fails here.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f3b287c..73895ba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1941,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		     btrfs_level_size(tree_root,
 				      btrfs_super_log_root_level(disk_super));
 
-		log_tree_root = kzalloc(sizeof(struct btrfs_root),
-						      GFP_NOFS);
+		log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+		if (!log_tree_root) {
+			err = -ENOMEM;
+			goto fail_trans_kthread;
+		}
 
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-- 
cgit v1.1


From 4cbd1149fbcc351bdf08ab749867d157905d0d35 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:42:19 +0000
Subject: Btrfs: btrfs_iget() returns ERR_PTR

btrfs_iget() returns an ERR_PTR() on failure and not null.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ea7114..859ddaa 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -392,8 +392,8 @@ setup_root:
 	location.offset = 0;
 
 	inode = btrfs_iget(sb, &location, new_root, &new);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	/*
 	 * If we're just mounting the root most subvol put the inode and return
-- 
cgit v1.1


From d327099a23e3d0c8ec09137e9b4b115449d4eb29 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:46:47 +0000
Subject: Btrfs: unwind after btrfs_start_transaction() errors

This was added by a22285a6a3: "Btrfs: Integrate metadata reservation
with start_transaction".  If we goto out here then we skip all the
unwinding and there are locks still held etc.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4cdb98c..9f9a1d9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1280,7 +1280,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
-		goto out;
+		goto out_up_write;
 	}
 	trans->block_rsv = &root->fs_info->global_block_rsv;
 
-- 
cgit v1.1


From 3140c9a34b44cd4013baae8704fdb34a93a44475 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:44:10 +0000
Subject: Btrfs: btrfs_read_fs_root_no_name() returns ERR_PTRs

btrfs_read_fs_root_no_name() returns ERR_PTRs on error so I added a
check for that.  It's not clear to me if it can also return NULL
pointers or not so I left the original NULL pointer check as is.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73895ba..34f7c37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1985,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
+	if (IS_ERR(fs_info->fs_root)) {
+		err = PTR_ERR(fs_info->fs_root);
+		goto fail_trans_kthread;
+	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
-- 
cgit v1.1


From cf1e99a4e0daa4a5623cd71108509823b9ff2d30 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:47:24 +0000
Subject: Btrfs: btrfs_lookup_dir_item() can return ERR_PTR

btrfs_lookup_dir_item() can return either ERR_PTRs or null.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9f9a1d9..4dbaf89 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1845,7 +1845,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
 	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
 				   dir_id, "default", 7, 1);
-	if (!di) {
+	if (IS_ERR_OR_NULL(di)) {
 		btrfs_free_path(path);
 		btrfs_end_transaction(trans, root);
 		printk(KERN_ERR "Umm, you don't have the default dir item, "
-- 
cgit v1.1


From 2f26afba46f0ebf155cf9be746496a0304a5b7cf Mon Sep 17 00:00:00 2001
From: Shi Weihua <shiwh@cn.fujitsu.com>
Date: Tue, 18 May 2010 00:50:32 +0000
Subject: Btrfs: should add a permission check for setfacl

On btrfs, do the following
------------------
# su user1
# cd btrfs-part/
# touch aaa
# getfacl aaa
  # file: aaa
  # owner: user1
  # group: user1
  user::rw-
  group::rw-
  other::r--
# su user2
# cd btrfs-part/
# setfacl -m u::rwx aaa
# getfacl aaa
  # file: aaa
  # owner: user1
  # group: user1
  user::rwx           <- successed to setfacl
  group::rw-
  other::r--
------------------
but we should prohibit it that user2 changing user1's acl.
In fact, on ext3 and other fs, a message occurs:
  setfacl: aaa: Operation not permitted

This patch fixed it.
Signed-off-by: Shi Weihua <shiwh@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26..6b4d0cc 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -160,6 +160,9 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
+	if (!is_owner_or_cap(dentry->d_inode))
+		return -EPERM;
+
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
-- 
cgit v1.1


From 731e3d1b4348a96d53de6c084774424dedc64a3b Mon Sep 17 00:00:00 2001
From: Shi Weihua <shiwh@cn.fujitsu.com>
Date: Tue, 18 May 2010 00:51:54 +0000
Subject: Btrfs: prohibit a operation of changing acl's mask when noacl mount
 option used

when used Posix File System Test Suite(pjd-fstest) to test btrfs,
some cases about setfacl failed when noacl mount option used.
I simplified used commands in pjd-fstest, and the following steps
can reproduce it.
------------------------
# cd btrfs-part/
# mkdir aaa
# setfacl -m m::rw aaa    <- successed, but not expected by pjd-fstest.
------------------------
I checked ext3, a warning message occured, like as:
  setfacl: aaa/: Operation not supported
Certainly, it's expected by pjd-fstest.

So, i compared acl.c of btrfs and ext3. Based on that, a patch created.
Fortunately, it works.

Signed-off-by: Shi Weihua <shiwh@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6b4d0cc..a372985 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -163,6 +163,9 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	if (!is_owner_or_cap(dentry->d_inode))
 		return -EPERM;
 
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
-- 
cgit v1.1


From 15e7000095e6fc9ad07e476a100c900c72c14225 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 17 May 2010 17:15:27 +0000
Subject: Btrfs: avoid BUG when dropping root and reference in same transaction

If btrfs_ioctl_snap_destroy() deletes a snapshot but finishes
with end_transaction(), the cleaner kthread may come in and
drop the root in the same transaction.  If that's the case, the
root's refs still == 1 in the tree when btrfs_del_root() deletes
the item, because commit_fs_roots() hasn't updated it yet (that
happens during the commit).

This wasn't a problem before only because
btrfs_ioctl_snap_destroy() would commit the transaction before dropping
the dentry reference, so the dead root wouldn't get queued up until
after the fs root item was updated in the btree.

Since it is not an error to drop the root reference and the root in the
same transaction, just drop the BUG_ON() in btrfs_del_root().

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/root-tree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b91ccd9..2d958be 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -330,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	int ret;
-	u32 refs;
 	struct btrfs_root_item *ri;
 	struct extent_buffer *leaf;
 
@@ -344,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	leaf = path->nodes[0];
 	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
-	refs = btrfs_disk_root_refs(leaf, ri);
-	BUG_ON(refs != 0);
 	ret = btrfs_del_item(trans, root, path);
 out:
 	btrfs_free_path(path);
-- 
cgit v1.1


From 834e74759a473f8101a273e843d1edec2778801d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:48:35 +0000
Subject: Btrfs: handle ERR_PTR from posix_acl_from_xattr()

posix_acl_from_xattr() returns both ERR_PTRs and null, but it's OK to
pass null values to set_cached_acl()

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index a372985..1606dc1 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				return acl;
 			set_cached_acl(inode, type, acl);
 		}
 		kfree(value);
-- 
cgit v1.1


From 6f902af400b2499c80865c62a06fbbd15cf804fd Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 29 May 2010 09:49:07 +0000
Subject: Btrfs: The file argument for fsync() is never null

The "file" argument for fsync is never null so we can remove this check.

What drew my attention here is that 7ea8085910e: "drop unused dentry
argument to ->fsync" introduced an unconditional dereference at the
start of the function and that generated a smatch warning.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ce0cd29..7f29464 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1139,7 +1139,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file && file->private_data)
+	if (file->private_data)
 		btrfs_ioctl_trans_end(file);
 
 	trans = btrfs_start_transaction(root, 0);
-- 
cgit v1.1