From a539bd8c86549b545b4ed27a0cfaf53fe649054d Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 17 Dec 2009 00:20:07 +0000
Subject: xfs: kill some warnings on i386 builds

Randy Dunlap Reported printk() format-related warnings reported
on i386 builds in his environment.  Dave Chinner provided this
patch to eliminate them.

Signed-off by: Dave Chinner <david@fromorbit.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>

Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c40834b..d4ded59 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -815,7 +815,7 @@ TRACE_EVENT(name, \
 	), \
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
 		  "offset 0x%llx count %zd flags %s " \
-		  "startoff 0x%llx startblock 0x%llx blockcount 0x%llx", \
+		  "startoff 0x%llx startblock %s blockcount 0x%llx", \
 		  MAJOR(__entry->dev), MINOR(__entry->dev), \
 		  __entry->ino, \
 		  __entry->size, \
@@ -824,7 +824,7 @@ TRACE_EVENT(name, \
 		  __entry->count, \
 		  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
 		  __entry->startoff, \
-		  __entry->startblock, \
+		  xfs_fmtfsblock(__entry->startblock), \
 		  __entry->blockcount) \
 )
 DEFINE_IOMAP_EVENT(xfs_iomap_enter);
@@ -1201,7 +1201,7 @@ TRACE_EVENT(name, \
 	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " \
 		  "prod %u minleft %u total %u alignment %u minalignslop %u " \
 		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " \
-		  "userdata %d firstblock 0x%llx", \
+		  "userdata %d firstblock %s", \
 		  MAJOR(__entry->dev), MINOR(__entry->dev), \
 		  __entry->agno, \
 		  __entry->agbno, \
@@ -1220,7 +1220,7 @@ TRACE_EVENT(name, \
 		  __entry->wasfromfl, \
 		  __entry->isfl, \
 		  __entry->userdata, \
-		  __entry->firstblock) \
+		  xfs_fmtfsblock(__entry->firstblock)) \
 )
 
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
-- 
cgit v1.1


From ea9a48881e093a41a79305fb1545ca0794b203dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 21 Dec 2009 14:03:03 +0000
Subject: xfs: use DECLARE_EVENT_CLASS

Using DECLARE_EVENT_CLASS allows us to to use trace event code
instead of duplicating it in the binary.  This was not available
before 2.6.33 so it had to be done as a separate step once the
prerequisite was merged.

This only requires changes to xfs_trace.h and the results are
rather impressive:

hch@brick:~/work/linux-2.6/obj-kvm$ size fs/xfs/xfs.o*
text	   data	    bss	    dec	    hex	filename
 607732	  41884	   3616	 653232	  9f7b0	fs/xfs/xfs.o
1026732	  41884	   3808	1072424	 105d28	fs/xfs/xfs.o.old

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.h | 1129 ++++++++++++++++++++++--------------------
 1 file changed, 591 insertions(+), 538 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index d4ded59..8cb42b4 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -33,51 +33,55 @@ struct xfs_dquot;
 struct xlog_ticket;
 struct log;
 
+DECLARE_EVENT_CLASS(xfs_attr_list_class,
+	TP_PROTO(struct xfs_attr_list_context *ctx),
+	TP_ARGS(ctx),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(u32, hashval)
+		__field(u32, blkno)
+		__field(u32, offset)
+		__field(void *, alist)
+		__field(int, bufsize)
+		__field(int, count)
+		__field(int, firstu)
+		__field(int, dupcnt)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+		__entry->ino = ctx->dp->i_ino;
+		__entry->hashval = ctx->cursor->hashval;
+		__entry->blkno = ctx->cursor->blkno;
+		__entry->offset = ctx->cursor->offset;
+		__entry->alist = ctx->alist;
+		__entry->bufsize = ctx->bufsize;
+		__entry->count = ctx->count;
+		__entry->firstu = ctx->firstu;
+		__entry->flags = ctx->flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+		  "alist 0x%p size %u count %u firstu %u flags %d %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		   __entry->ino,
+		   __entry->hashval,
+		   __entry->blkno,
+		   __entry->offset,
+		   __entry->dupcnt,
+		   __entry->alist,
+		   __entry->bufsize,
+		   __entry->count,
+		   __entry->firstu,
+		   __entry->flags,
+		   __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+	)
+)
+
 #define DEFINE_ATTR_LIST_EVENT(name) \
-TRACE_EVENT(name, \
+DEFINE_EVENT(xfs_attr_list_class, name, \
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
-	TP_ARGS(ctx), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(u32, hashval) \
-		__field(u32, blkno) \
-		__field(u32, offset) \
-		__field(void *, alist) \
-		__field(int, bufsize) \
-		__field(int, count) \
-		__field(int, firstu) \
-		__field(int, dupcnt) \
-		__field(int, flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; \
-		__entry->ino = ctx->dp->i_ino; \
-		__entry->hashval = ctx->cursor->hashval; \
-		__entry->blkno = ctx->cursor->blkno; \
-		__entry->offset = ctx->cursor->offset; \
-		__entry->alist = ctx->alist; \
-		__entry->bufsize = ctx->bufsize; \
-		__entry->count = ctx->count; \
-		__entry->firstu = ctx->firstu; \
-		__entry->flags = ctx->flags; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " \
-		  "alist 0x%p size %u count %u firstu %u flags %d %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		   __entry->ino, \
-		   __entry->hashval, \
-		   __entry->blkno, \
-		   __entry->offset, \
-		   __entry->dupcnt, \
-		   __entry->alist, \
-		   __entry->bufsize, \
-		   __entry->count, \
-		   __entry->firstu, \
-		   __entry->flags, \
-		   __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) \
-	) \
-)
+	TP_ARGS(ctx))
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
@@ -178,91 +182,99 @@ TRACE_EVENT(xfs_iext_insert,
 		  (char *)__entry->caller_ip)
 );
 
+DECLARE_EVENT_CLASS(xfs_bmap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
+		 unsigned long caller_ip),
+	TP_ARGS(ip, idx, state, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_extnum_t, idx)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+		__field(xfs_exntst_t, state)
+		__field(int, bmap_state)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ?
+						ip->i_afp : &ip->i_df;
+		struct xfs_bmbt_irec	r;
+
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->idx = idx;
+		__entry->startoff = r.br_startoff;
+		__entry->startblock = r.br_startblock;
+		__entry->blockcount = r.br_blockcount;
+		__entry->state = r.br_state;
+		__entry->bmap_state = state;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+		  "offset %lld block %s count %lld flag %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+		  (long)__entry->idx,
+		  __entry->startoff,
+		  xfs_fmtfsblock(__entry->startblock),
+		  __entry->blockcount,
+		  __entry->state,
+		  (char *)__entry->caller_ip)
+)
+
 #define DEFINE_BMAP_EVENT(name) \
-TRACE_EVENT(name, \
+DEFINE_EVENT(xfs_bmap_class, name, \
 	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
 		 unsigned long caller_ip), \
-	TP_ARGS(ip, idx, state, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(xfs_extnum_t, idx) \
-		__field(xfs_fileoff_t, startoff) \
-		__field(xfs_fsblock_t, startblock) \
-		__field(xfs_filblks_t, blockcount) \
-		__field(xfs_exntst_t, state) \
-		__field(int, bmap_state) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ? \
-						ip->i_afp : &ip->i_df; \
-		struct xfs_bmbt_irec	r; \
-	\
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->idx = idx; \
-		__entry->startoff = r.br_startoff; \
-		__entry->startblock = r.br_startblock; \
-		__entry->blockcount = r.br_blockcount; \
-		__entry->state = r.br_state; \
-		__entry->bmap_state = state; \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " \
-		  "offset %lld block %s count %lld flag %d caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), \
-		  (long)__entry->idx, \
-		  __entry->startoff, \
-		  xfs_fmtfsblock(__entry->startblock), \
-		  __entry->blockcount, \
-		  __entry->state, \
-		  (char *)__entry->caller_ip) \
-)
-
+	TP_ARGS(ip, idx, state, caller_ip))
 DEFINE_BMAP_EVENT(xfs_iext_remove);
 DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
 DEFINE_BMAP_EVENT(xfs_bmap_post_update);
 DEFINE_BMAP_EVENT(xfs_extlist);
 
-#define DEFINE_BUF_EVENT(tname) \
-TRACE_EVENT(tname, \
-	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
-	TP_ARGS(bp, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_daddr_t, bno) \
-		__field(size_t, buffer_length) \
-		__field(int, hold) \
-		__field(int, pincount) \
-		__field(unsigned, lockval) \
-		__field(unsigned, flags) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = bp->b_target->bt_dev; \
-		__entry->bno = bp->b_bn; \
-		__entry->buffer_length = bp->b_buffer_length; \
-		__entry->hold = atomic_read(&bp->b_hold); \
-		__entry->pincount = atomic_read(&bp->b_pin_count); \
-		__entry->lockval = xfs_buf_lock_value(bp); \
-		__entry->flags = bp->b_flags; \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \
-		  "lock %d flags %s caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  (unsigned long long)__entry->bno, \
-		  __entry->buffer_length, \
-		  __entry->hold, \
-		  __entry->pincount, \
-		  __entry->lockval, \
-		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \
-		  (void *)__entry->caller_ip) \
+DECLARE_EVENT_CLASS(xfs_buf_class,
+	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
+	TP_ARGS(bp, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, bno)
+		__field(size_t, buffer_length)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned, lockval)
+		__field(unsigned, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = bp->b_target->bt_dev;
+		__entry->bno = bp->b_bn;
+		__entry->buffer_length = bp->b_buffer_length;
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = xfs_buf_lock_value(bp);
+		__entry->flags = bp->b_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d flags %s caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->bno,
+		  __entry->buffer_length,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+		  (void *)__entry->caller_ip)
 )
+
+#define DEFINE_BUF_EVENT(name) \
+DEFINE_EVENT(xfs_buf_class, name, \
+	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
+	TP_ARGS(bp, caller_ip))
 DEFINE_BUF_EVENT(xfs_buf_init);
 DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
@@ -299,41 +311,45 @@ DEFINE_BUF_EVENT(xfs_reset_dqcounts);
 DEFINE_BUF_EVENT(xfs_inode_item_push);
 
 /* pass flags explicitly */
-#define DEFINE_BUF_FLAGS_EVENT(tname) \
-TRACE_EVENT(tname, \
-	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
-	TP_ARGS(bp, flags, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_daddr_t, bno) \
-		__field(size_t, buffer_length) \
-		__field(int, hold) \
-		__field(int, pincount) \
-		__field(unsigned, lockval) \
-		__field(unsigned, flags) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = bp->b_target->bt_dev; \
-		__entry->bno = bp->b_bn; \
-		__entry->buffer_length = bp->b_buffer_length; \
-		__entry->flags = flags; \
-		__entry->hold = atomic_read(&bp->b_hold); \
-		__entry->pincount = atomic_read(&bp->b_pin_count); \
-		__entry->lockval = xfs_buf_lock_value(bp); \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \
-		  "lock %d flags %s caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  (unsigned long long)__entry->bno, \
-		  __entry->buffer_length, \
-		  __entry->hold, \
-		  __entry->pincount, \
-		  __entry->lockval, \
-		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \
-		  (void *)__entry->caller_ip) \
+DECLARE_EVENT_CLASS(xfs_buf_flags_class,
+	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
+	TP_ARGS(bp, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, bno)
+		__field(size_t, buffer_length)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned, lockval)
+		__field(unsigned, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = bp->b_target->bt_dev;
+		__entry->bno = bp->b_bn;
+		__entry->buffer_length = bp->b_buffer_length;
+		__entry->flags = flags;
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = xfs_buf_lock_value(bp);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d flags %s caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->bno,
+		  __entry->buffer_length,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+		  (void *)__entry->caller_ip)
 )
+
+#define DEFINE_BUF_FLAGS_EVENT(name) \
+DEFINE_EVENT(xfs_buf_flags_class, name, \
+	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
+	TP_ARGS(bp, flags, caller_ip))
 DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
 DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
 DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
@@ -376,55 +392,58 @@ TRACE_EVENT(xfs_buf_ioerror,
 		  (void *)__entry->caller_ip)
 );
 
-#define DEFINE_BUF_ITEM_EVENT(tname) \
-TRACE_EVENT(tname, \
-	TP_PROTO(struct xfs_buf_log_item *bip), \
-	TP_ARGS(bip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_daddr_t, buf_bno) \
-		__field(size_t, buf_len) \
-		__field(int, buf_hold) \
-		__field(int, buf_pincount) \
-		__field(int, buf_lockval) \
-		__field(unsigned, buf_flags) \
-		__field(unsigned, bli_recur) \
-		__field(int, bli_refcount) \
-		__field(unsigned, bli_flags) \
-		__field(void *, li_desc) \
-		__field(unsigned, li_flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = bip->bli_buf->b_target->bt_dev; \
-		__entry->bli_flags = bip->bli_flags; \
-		__entry->bli_recur = bip->bli_recur; \
-		__entry->bli_refcount = atomic_read(&bip->bli_refcount); \
-		__entry->buf_bno = bip->bli_buf->b_bn; \
-		__entry->buf_len = bip->bli_buf->b_buffer_length; \
-		__entry->buf_flags = bip->bli_buf->b_flags; \
-		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); \
-		__entry->buf_pincount = \
-			atomic_read(&bip->bli_buf->b_pin_count); \
-		__entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); \
-		__entry->li_desc = bip->bli_item.li_desc; \
-		__entry->li_flags = bip->bli_item.li_flags; \
-	), \
-	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \
-		  "lock %d flags %s recur %d refcount %d bliflags %s " \
-		  "lidesc 0x%p liflags %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  (unsigned long long)__entry->buf_bno, \
-		  __entry->buf_len, \
-		  __entry->buf_hold, \
-		  __entry->buf_pincount, \
-		  __entry->buf_lockval, \
-		  __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), \
-		  __entry->bli_recur, \
-		  __entry->bli_refcount, \
-		  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), \
-		  __entry->li_desc, \
-		  __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_buf_item_class,
+	TP_PROTO(struct xfs_buf_log_item *bip),
+	TP_ARGS(bip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, buf_bno)
+		__field(size_t, buf_len)
+		__field(int, buf_hold)
+		__field(int, buf_pincount)
+		__field(int, buf_lockval)
+		__field(unsigned, buf_flags)
+		__field(unsigned, bli_recur)
+		__field(int, bli_refcount)
+		__field(unsigned, bli_flags)
+		__field(void *, li_desc)
+		__field(unsigned, li_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = bip->bli_buf->b_target->bt_dev;
+		__entry->bli_flags = bip->bli_flags;
+		__entry->bli_recur = bip->bli_recur;
+		__entry->bli_refcount = atomic_read(&bip->bli_refcount);
+		__entry->buf_bno = bip->bli_buf->b_bn;
+		__entry->buf_len = bip->bli_buf->b_buffer_length;
+		__entry->buf_flags = bip->bli_buf->b_flags;
+		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
+		__entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf);
+		__entry->li_desc = bip->bli_item.li_desc;
+		__entry->li_flags = bip->bli_item.li_flags;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d flags %s recur %d refcount %d bliflags %s "
+		  "lidesc 0x%p liflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->buf_bno,
+		  __entry->buf_len,
+		  __entry->buf_hold,
+		  __entry->buf_pincount,
+		  __entry->buf_lockval,
+		  __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
+		  __entry->bli_recur,
+		  __entry->bli_refcount,
+		  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
+		  __entry->li_desc,
+		  __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
 )
+
+#define DEFINE_BUF_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_buf_item_class, name, \
+	TP_PROTO(struct xfs_buf_log_item *bip), \
+	TP_ARGS(bip))
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
@@ -450,78 +469,90 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
 
+DECLARE_EVENT_CLASS(xfs_lock_class,
+	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
+		 unsigned long caller_ip),
+	TP_ARGS(ip,  lock_flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, lock_flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lock_flags = lock_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
+		  (void *)__entry->caller_ip)
+)
+
 #define DEFINE_LOCK_EVENT(name) \
-TRACE_EVENT(name, \
+DEFINE_EVENT(xfs_lock_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
 		 unsigned long caller_ip), \
-	TP_ARGS(ip,  lock_flags, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(int, lock_flags) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->lock_flags = lock_flags; \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), \
-		  (void *)__entry->caller_ip) \
-)
-
+	TP_ARGS(ip,  lock_flags, caller_ip))
 DEFINE_LOCK_EVENT(xfs_ilock);
 DEFINE_LOCK_EVENT(xfs_ilock_nowait);
 DEFINE_LOCK_EVENT(xfs_ilock_demote);
 DEFINE_LOCK_EVENT(xfs_iunlock);
 
+DECLARE_EVENT_CLASS(xfs_iget_class,
+	TP_PROTO(struct xfs_inode *ip),
+	TP_ARGS(ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+	),
+	TP_printk("dev %d:%d ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino)
+)
+
 #define DEFINE_IGET_EVENT(name) \
-TRACE_EVENT(name, \
+DEFINE_EVENT(xfs_iget_class, name, \
 	TP_PROTO(struct xfs_inode *ip), \
-	TP_ARGS(ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino) \
-)
+	TP_ARGS(ip))
 DEFINE_IGET_EVENT(xfs_iget_skip);
 DEFINE_IGET_EVENT(xfs_iget_reclaim);
 DEFINE_IGET_EVENT(xfs_iget_found);
 DEFINE_IGET_EVENT(xfs_iget_alloc);
 
+DECLARE_EVENT_CLASS(xfs_inode_class,
+	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
+	TP_ARGS(ip, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, count)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->count = atomic_read(&VFS_I(ip)->i_count);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->count,
+		  (char *)__entry->caller_ip)
+)
+
 #define DEFINE_INODE_EVENT(name) \
-TRACE_EVENT(name, \
+DEFINE_EVENT(xfs_inode_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
-	TP_ARGS(ip, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(int, count) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->count = atomic_read(&VFS_I(ip)->i_count); \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->count, \
-		  (char *)__entry->caller_ip) \
-)
+	TP_ARGS(ip, caller_ip))
 DEFINE_INODE_EVENT(xfs_ihold);
 DEFINE_INODE_EVENT(xfs_irele);
 /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
@@ -529,55 +560,59 @@ DEFINE_INODE_EVENT(xfs_inode);
 #define xfs_itrace_entry(ip)    \
 	trace_xfs_inode(ip, _THIS_IP_)
 
-#define DEFINE_DQUOT_EVENT(tname) \
-TRACE_EVENT(tname, \
-	TP_PROTO(struct xfs_dquot *dqp), \
-	TP_ARGS(dqp), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(__be32, id) \
-		__field(unsigned, flags) \
-		__field(unsigned, nrefs) \
-		__field(unsigned long long, res_bcount) \
-		__field(unsigned long long, bcount) \
-		__field(unsigned long long, icount) \
-		__field(unsigned long long, blk_hardlimit) \
-		__field(unsigned long long, blk_softlimit) \
-		__field(unsigned long long, ino_hardlimit) \
-		__field(unsigned long long, ino_softlimit) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = dqp->q_mount->m_super->s_dev; \
-		__entry->id = dqp->q_core.d_id; \
-		__entry->flags = dqp->dq_flags; \
-		__entry->nrefs = dqp->q_nrefs; \
-		__entry->res_bcount = dqp->q_res_bcount; \
-		__entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); \
-		__entry->icount = be64_to_cpu(dqp->q_core.d_icount); \
-		__entry->blk_hardlimit = \
-			be64_to_cpu(dqp->q_core.d_blk_hardlimit); \
-		__entry->blk_softlimit = \
-			be64_to_cpu(dqp->q_core.d_blk_softlimit); \
-		__entry->ino_hardlimit = \
-			be64_to_cpu(dqp->q_core.d_ino_hardlimit); \
-		__entry->ino_softlimit = \
-			be64_to_cpu(dqp->q_core.d_ino_softlimit); \
+DECLARE_EVENT_CLASS(xfs_dquot_class,
+	TP_PROTO(struct xfs_dquot *dqp),
+	TP_ARGS(dqp),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(__be32, id)
+		__field(unsigned, flags)
+		__field(unsigned, nrefs)
+		__field(unsigned long long, res_bcount)
+		__field(unsigned long long, bcount)
+		__field(unsigned long long, icount)
+		__field(unsigned long long, blk_hardlimit)
+		__field(unsigned long long, blk_softlimit)
+		__field(unsigned long long, ino_hardlimit)
+		__field(unsigned long long, ino_softlimit)
 	), \
-	TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " \
-		  "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " \
-		  "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  be32_to_cpu(__entry->id), \
-		  __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), \
-		  __entry->nrefs, \
-		  __entry->res_bcount, \
-		  __entry->bcount, \
-		  __entry->blk_hardlimit, \
-		  __entry->blk_softlimit, \
-		  __entry->icount, \
-		  __entry->ino_hardlimit, \
-		  __entry->ino_softlimit) \
+	TP_fast_assign(
+		__entry->dev = dqp->q_mount->m_super->s_dev;
+		__entry->id = dqp->q_core.d_id;
+		__entry->flags = dqp->dq_flags;
+		__entry->nrefs = dqp->q_nrefs;
+		__entry->res_bcount = dqp->q_res_bcount;
+		__entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
+		__entry->icount = be64_to_cpu(dqp->q_core.d_icount);
+		__entry->blk_hardlimit =
+			be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+		__entry->blk_softlimit =
+			be64_to_cpu(dqp->q_core.d_blk_softlimit);
+		__entry->ino_hardlimit =
+			be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+		__entry->ino_softlimit =
+			be64_to_cpu(dqp->q_core.d_ino_softlimit);
+	),
+	TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+		  "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] "
+		  "icnt 0x%llx [hard 0x%llx | soft 0x%llx]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  be32_to_cpu(__entry->id),
+		  __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+		  __entry->nrefs,
+		  __entry->res_bcount,
+		  __entry->bcount,
+		  __entry->blk_hardlimit,
+		  __entry->blk_softlimit,
+		  __entry->icount,
+		  __entry->ino_hardlimit,
+		  __entry->ino_softlimit)
 )
+
+#define DEFINE_DQUOT_EVENT(name) \
+DEFINE_EVENT(xfs_dquot_class, name, \
+	TP_PROTO(struct xfs_dquot *dqp), \
+	TP_ARGS(dqp))
 DEFINE_DQUOT_EVENT(xfs_dqadjust);
 DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
 DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
@@ -610,72 +645,75 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
 DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
 DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
 
+DECLARE_EVENT_CLASS(xfs_loggrant_class,
+	TP_PROTO(struct log *log, struct xlog_ticket *tic),
+	TP_ARGS(log, tic),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned, trans_type)
+		__field(char, ocnt)
+		__field(char, cnt)
+		__field(int, curr_res)
+		__field(int, unit_res)
+		__field(unsigned int, flags)
+		__field(void *, reserve_headq)
+		__field(void *, write_headq)
+		__field(int, grant_reserve_cycle)
+		__field(int, grant_reserve_bytes)
+		__field(int, grant_write_cycle)
+		__field(int, grant_write_bytes)
+		__field(int, curr_cycle)
+		__field(int, curr_block)
+		__field(xfs_lsn_t, tail_lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->trans_type = tic->t_trans_type;
+		__entry->ocnt = tic->t_ocnt;
+		__entry->cnt = tic->t_cnt;
+		__entry->curr_res = tic->t_curr_res;
+		__entry->unit_res = tic->t_unit_res;
+		__entry->flags = tic->t_flags;
+		__entry->reserve_headq = log->l_reserve_headq;
+		__entry->write_headq = log->l_write_headq;
+		__entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
+		__entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
+		__entry->grant_write_cycle = log->l_grant_write_cycle;
+		__entry->grant_write_bytes = log->l_grant_write_bytes;
+		__entry->curr_cycle = log->l_curr_cycle;
+		__entry->curr_block = log->l_curr_block;
+		__entry->tail_lsn = log->l_tail_lsn;
+	),
+	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+		  "t_unit_res %u t_flags %s reserve_headq 0x%p "
+		  "write_headq 0x%p grant_reserve_cycle %d "
+		  "grant_reserve_bytes %d grant_write_cycle %d "
+		  "grant_write_bytes %d curr_cycle %d curr_block %d "
+		  "tail_cycle %d tail_block %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
+		  __entry->ocnt,
+		  __entry->cnt,
+		  __entry->curr_res,
+		  __entry->unit_res,
+		  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
+		  __entry->reserve_headq,
+		  __entry->write_headq,
+		  __entry->grant_reserve_cycle,
+		  __entry->grant_reserve_bytes,
+		  __entry->grant_write_cycle,
+		  __entry->grant_write_bytes,
+		  __entry->curr_cycle,
+		  __entry->curr_block,
+		  CYCLE_LSN(__entry->tail_lsn),
+		  BLOCK_LSN(__entry->tail_lsn)
+	)
+)
 
-#define DEFINE_LOGGRANT_EVENT(tname) \
-TRACE_EVENT(tname, \
+#define DEFINE_LOGGRANT_EVENT(name) \
+DEFINE_EVENT(xfs_loggrant_class, name, \
 	TP_PROTO(struct log *log, struct xlog_ticket *tic), \
-	TP_ARGS(log, tic), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(unsigned, trans_type) \
-		__field(char, ocnt) \
-		__field(char, cnt) \
-		__field(int, curr_res) \
-		__field(int, unit_res) \
-		__field(unsigned int, flags) \
-		__field(void *, reserve_headq) \
-		__field(void *, write_headq) \
-		__field(int, grant_reserve_cycle) \
-		__field(int, grant_reserve_bytes) \
-		__field(int, grant_write_cycle) \
-		__field(int, grant_write_bytes) \
-		__field(int, curr_cycle) \
-		__field(int, curr_block) \
-		__field(xfs_lsn_t, tail_lsn) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = log->l_mp->m_super->s_dev; \
-		__entry->trans_type = tic->t_trans_type; \
-		__entry->ocnt = tic->t_ocnt; \
-		__entry->cnt = tic->t_cnt; \
-		__entry->curr_res = tic->t_curr_res; \
-		__entry->unit_res = tic->t_unit_res; \
-		__entry->flags = tic->t_flags; \
-		__entry->reserve_headq = log->l_reserve_headq; \
-		__entry->write_headq = log->l_write_headq; \
-		__entry->grant_reserve_cycle = log->l_grant_reserve_cycle; \
-		__entry->grant_reserve_bytes = log->l_grant_reserve_bytes; \
-		__entry->grant_write_cycle = log->l_grant_write_cycle; \
-		__entry->grant_write_bytes = log->l_grant_write_bytes; \
-		__entry->curr_cycle = log->l_curr_cycle; \
-		__entry->curr_block = log->l_curr_block; \
-		__entry->tail_lsn = log->l_tail_lsn; \
-	), \
-	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " \
-		  "t_unit_res %u t_flags %s reserve_headq 0x%p " \
-		  "write_headq 0x%p grant_reserve_cycle %d " \
-		  "grant_reserve_bytes %d grant_write_cycle %d " \
-		  "grant_write_bytes %d curr_cycle %d curr_block %d " \
-		  "tail_cycle %d tail_block %d", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), \
-		  __entry->ocnt, \
-		  __entry->cnt, \
-		  __entry->curr_res, \
-		  __entry->unit_res, \
-		  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), \
-		  __entry->reserve_headq, \
-		  __entry->write_headq, \
-		  __entry->grant_reserve_cycle, \
-		  __entry->grant_reserve_bytes, \
-		  __entry->grant_write_cycle, \
-		  __entry->grant_write_bytes, \
-		  __entry->curr_cycle, \
-		  __entry->curr_block, \
-		  CYCLE_LSN(__entry->tail_lsn), \
-		  BLOCK_LSN(__entry->tail_lsn) \
-	) \
-)
+	TP_ARGS(log, tic))
 DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
 DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
 DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
@@ -897,28 +935,32 @@ TRACE_EVENT(xfs_itruncate_start,
 		  __entry->toss_finish)
 );
 
+DECLARE_EVENT_CLASS(xfs_itrunc_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
+	TP_ARGS(ip, new_size),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = new_size;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size)
+)
+
 #define DEFINE_ITRUNC_EVENT(name) \
-TRACE_EVENT(name, \
+DEFINE_EVENT(xfs_itrunc_class, name, \
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
-	TP_ARGS(ip, new_size), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(xfs_fsize_t, size) \
-		__field(xfs_fsize_t, new_size) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = new_size; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size) \
-)
+	TP_ARGS(ip, new_size))
 DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start);
 DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end);
 
@@ -1152,77 +1194,80 @@ TRACE_EVENT(xfs_free_extent,
 
 );
 
-#define DEFINE_ALLOC_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_alloc_arg *args), \
-	TP_ARGS(args), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_agnumber_t, agno) \
-		__field(xfs_agblock_t, agbno) \
-		__field(xfs_extlen_t, minlen) \
-		__field(xfs_extlen_t, maxlen) \
-		__field(xfs_extlen_t, mod) \
-		__field(xfs_extlen_t, prod) \
-		__field(xfs_extlen_t, minleft) \
-		__field(xfs_extlen_t, total) \
-		__field(xfs_extlen_t, alignment) \
-		__field(xfs_extlen_t, minalignslop) \
-		__field(xfs_extlen_t, len) \
-		__field(short, type) \
-		__field(short, otype) \
-		__field(char, wasdel) \
-		__field(char, wasfromfl) \
-		__field(char, isfl) \
-		__field(char, userdata) \
-		__field(xfs_fsblock_t, firstblock) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = args->mp->m_super->s_dev; \
-		__entry->agno = args->agno; \
-		__entry->agbno = args->agbno; \
-		__entry->minlen = args->minlen; \
-		__entry->maxlen = args->maxlen; \
-		__entry->mod = args->mod; \
-		__entry->prod = args->prod; \
-		__entry->minleft = args->minleft; \
-		__entry->total = args->total; \
-		__entry->alignment = args->alignment; \
-		__entry->minalignslop = args->minalignslop; \
-		__entry->len = args->len; \
-		__entry->type = args->type; \
-		__entry->otype = args->otype; \
-		__entry->wasdel = args->wasdel; \
-		__entry->wasfromfl = args->wasfromfl; \
-		__entry->isfl = args->isfl; \
-		__entry->userdata = args->userdata; \
-		__entry->firstblock = args->firstblock; \
-	), \
-	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " \
-		  "prod %u minleft %u total %u alignment %u minalignslop %u " \
-		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " \
-		  "userdata %d firstblock %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->agno, \
-		  __entry->agbno, \
-		  __entry->minlen, \
-		  __entry->maxlen, \
-		  __entry->mod, \
-		  __entry->prod, \
-		  __entry->minleft, \
-		  __entry->total, \
-		  __entry->alignment, \
-		  __entry->minalignslop, \
-		  __entry->len, \
-		  __print_symbolic(__entry->type, XFS_ALLOC_TYPES), \
-		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), \
-		  __entry->wasdel, \
-		  __entry->wasfromfl, \
-		  __entry->isfl, \
-		  __entry->userdata, \
-		  xfs_fmtfsblock(__entry->firstblock)) \
+DECLARE_EVENT_CLASS(xfs_alloc_class,
+	TP_PROTO(struct xfs_alloc_arg *args),
+	TP_ARGS(args),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, minlen)
+		__field(xfs_extlen_t, maxlen)
+		__field(xfs_extlen_t, mod)
+		__field(xfs_extlen_t, prod)
+		__field(xfs_extlen_t, minleft)
+		__field(xfs_extlen_t, total)
+		__field(xfs_extlen_t, alignment)
+		__field(xfs_extlen_t, minalignslop)
+		__field(xfs_extlen_t, len)
+		__field(short, type)
+		__field(short, otype)
+		__field(char, wasdel)
+		__field(char, wasfromfl)
+		__field(char, isfl)
+		__field(char, userdata)
+		__field(xfs_fsblock_t, firstblock)
+	),
+	TP_fast_assign(
+		__entry->dev = args->mp->m_super->s_dev;
+		__entry->agno = args->agno;
+		__entry->agbno = args->agbno;
+		__entry->minlen = args->minlen;
+		__entry->maxlen = args->maxlen;
+		__entry->mod = args->mod;
+		__entry->prod = args->prod;
+		__entry->minleft = args->minleft;
+		__entry->total = args->total;
+		__entry->alignment = args->alignment;
+		__entry->minalignslop = args->minalignslop;
+		__entry->len = args->len;
+		__entry->type = args->type;
+		__entry->otype = args->otype;
+		__entry->wasdel = args->wasdel;
+		__entry->wasfromfl = args->wasfromfl;
+		__entry->isfl = args->isfl;
+		__entry->userdata = args->userdata;
+		__entry->firstblock = args->firstblock;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
+		  "prod %u minleft %u total %u alignment %u minalignslop %u "
+		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
+		  "userdata %d firstblock 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->minlen,
+		  __entry->maxlen,
+		  __entry->mod,
+		  __entry->prod,
+		  __entry->minleft,
+		  __entry->total,
+		  __entry->alignment,
+		  __entry->minalignslop,
+		  __entry->len,
+		  __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
+		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
+		  __entry->wasdel,
+		  __entry->wasfromfl,
+		  __entry->isfl,
+		  __entry->userdata,
+		  __entry->firstblock)
 )
 
+#define DEFINE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_alloc_class, name, \
+	TP_PROTO(struct xfs_alloc_arg *args), \
+	TP_ARGS(args))
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
@@ -1245,92 +1290,100 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
 DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
 DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
 
-#define DEFINE_DIR2_TRACE(tname) \
-TRACE_EVENT(tname, \
+DECLARE_EVENT_CLASS(xfs_dir2_class,
+	TP_PROTO(struct xfs_da_args *args),
+	TP_ARGS(args),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__dynamic_array(char, name, args->namelen)
+		__field(int, namelen)
+		__field(xfs_dahash_t, hashval)
+		__field(xfs_ino_t, inumber)
+		__field(int, op_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		if (args->namelen)
+			memcpy(__get_str(name), args->name, args->namelen);
+		__entry->namelen = args->namelen;
+		__entry->hashval = args->hashval;
+		__entry->inumber = args->inumber;
+		__entry->op_flags = args->op_flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
+		  "inumber 0x%llx op_flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->namelen,
+		  __entry->namelen ? __get_str(name) : NULL,
+		  __entry->namelen,
+		  __entry->hashval,
+		  __entry->inumber,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_DIR2_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_class, name, \
 	TP_PROTO(struct xfs_da_args *args), \
-	TP_ARGS(args), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__dynamic_array(char, name, args->namelen) \
-		__field(int, namelen) \
-		__field(xfs_dahash_t, hashval) \
-		__field(xfs_ino_t, inumber) \
-		__field(int, op_flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(args->dp)->i_sb->s_dev; \
-		__entry->ino = args->dp->i_ino; \
-		if (args->namelen) \
-			memcpy(__get_str(name), args->name, args->namelen); \
-		__entry->namelen = args->namelen; \
-		__entry->hashval = args->hashval; \
-		__entry->inumber = args->inumber; \
-		__entry->op_flags = args->op_flags; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " \
-		  "inumber 0x%llx op_flags %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->namelen, \
-		  __entry->namelen ? __get_str(name) : NULL, \
-		  __entry->namelen, \
-		  __entry->hashval, \
-		  __entry->inumber, \
-		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) \
+	TP_ARGS(args))
+DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
+DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+
+DECLARE_EVENT_CLASS(xfs_dir2_space_class,
+	TP_PROTO(struct xfs_da_args *args, int idx),
+	TP_ARGS(args, idx),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, op_flags)
+		__field(int, idx)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		__entry->op_flags = args->op_flags;
+		__entry->idx = idx;
+	),
+	TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+		  __entry->idx)
 )
-DEFINE_DIR2_TRACE(xfs_dir2_sf_addname);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_create);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_lookup);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_replace);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_removename);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_toino4);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_toino8);
-DEFINE_DIR2_TRACE(xfs_dir2_sf_to_block);
-DEFINE_DIR2_TRACE(xfs_dir2_block_addname);
-DEFINE_DIR2_TRACE(xfs_dir2_block_lookup);
-DEFINE_DIR2_TRACE(xfs_dir2_block_replace);
-DEFINE_DIR2_TRACE(xfs_dir2_block_removename);
-DEFINE_DIR2_TRACE(xfs_dir2_block_to_sf);
-DEFINE_DIR2_TRACE(xfs_dir2_block_to_leaf);
-DEFINE_DIR2_TRACE(xfs_dir2_leaf_addname);
-DEFINE_DIR2_TRACE(xfs_dir2_leaf_lookup);
-DEFINE_DIR2_TRACE(xfs_dir2_leaf_replace);
-DEFINE_DIR2_TRACE(xfs_dir2_leaf_removename);
-DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_block);
-DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_node);
-DEFINE_DIR2_TRACE(xfs_dir2_node_addname);
-DEFINE_DIR2_TRACE(xfs_dir2_node_lookup);
-DEFINE_DIR2_TRACE(xfs_dir2_node_replace);
-DEFINE_DIR2_TRACE(xfs_dir2_node_removename);
-DEFINE_DIR2_TRACE(xfs_dir2_node_to_leaf);
 
-#define DEFINE_DIR2_SPACE_TRACE(tname) \
-TRACE_EVENT(tname, \
+#define DEFINE_DIR2_SPACE_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_space_class, name, \
 	TP_PROTO(struct xfs_da_args *args, int idx), \
-	TP_ARGS(args, idx), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(int, op_flags) \
-		__field(int, idx) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(args->dp)->i_sb->s_dev; \
-		__entry->ino = args->dp->i_ino; \
-		__entry->op_flags = args->op_flags; \
-		__entry->idx = idx; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), \
-		  __entry->idx) \
-)
-DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_add);
-DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_remove);
-DEFINE_DIR2_SPACE_TRACE(xfs_dir2_grow_inode);
-DEFINE_DIR2_SPACE_TRACE(xfs_dir2_shrink_inode);
+	TP_ARGS(args, idx))
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
 
 TRACE_EVENT(xfs_dir2_leafn_moveents,
 	TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
-- 
cgit v1.1


From d6d59bada372bcf8bd36c3bbc71c485c29dd2a4b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 23 Dec 2009 16:09:13 +0000
Subject: xfs: fix timestamp handling in xfs_setattr

We currently have some rather odd code in xfs_setattr for
updating the a/c/mtime timestamps:

 - first we do a non-transaction update if all three are updated
   together
 - second we implicitly update the ctime for various changes
   instead of relying on the ATTR_CTIME flag
 - third we set the timestamps to the current time instead of the
   arguments in the iattr structure in many cases.

This patch makes sure we update it in a consistent way:

 - always transactional
 - ctime is only updated if ATTR_CTIME is set or we do a size
   update, which is a special case
 - always to the times passed in from the caller instead of the
   current time

The only non-size caller of xfs_setattr that doesn't come from
the VFS is updated to set ATTR_CTIME and pass in a valid ctime
value.

Reported-by: Eric Blake <ebb9@byu.net>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_acl.c |  3 +-
 fs/xfs/xfs_vnodeops.c      | 93 +++++++++++++++++++---------------------------
 2 files changed, 41 insertions(+), 55 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 2512125..883ca5a 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -251,8 +251,9 @@ xfs_set_mode(struct inode *inode, mode_t mode)
 	if (mode != inode->i_mode) {
 		struct iattr iattr;
 
-		iattr.ia_valid = ATTR_MODE;
+		iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
 		iattr.ia_mode = mode;
+		iattr.ia_ctime = current_fs_time(inode->i_sb);
 
 		error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
 	}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 6558ffd..6f26875 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -70,7 +70,6 @@ xfs_setattr(
 	uint			commit_flags=0;
 	uid_t			uid=0, iuid=0;
 	gid_t			gid=0, igid=0;
-	int			timeflags = 0;
 	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
 	int			need_iolock = 1;
 
@@ -135,16 +134,13 @@ xfs_setattr(
 	if (flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
 	if (!(mask & ATTR_SIZE)) {
-		if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
-		    (mp->m_flags & XFS_MOUNT_WSYNC)) {
-			tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-			commit_flags = 0;
-			if ((code = xfs_trans_reserve(tp, 0,
-						     XFS_ICHANGE_LOG_RES(mp), 0,
-						     0, 0))) {
-				lock_flags = 0;
-				goto error_return;
-			}
+		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+		commit_flags = 0;
+		code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
+					 0, 0, 0);
+		if (code) {
+			lock_flags = 0;
+			goto error_return;
 		}
 	} else {
 		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
@@ -295,15 +291,23 @@ xfs_setattr(
 		 * or we are explicitly asked to change it. This handles
 		 * the semantic difference between truncate() and ftruncate()
 		 * as implemented in the VFS.
+		 *
+		 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
+		 * is a special case where we need to update the times despite
+		 * not having these flags set.  For all other operations the
+		 * VFS set these flags explicitly if it wants a timestamp
+		 * update.
 		 */
-		if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
-			timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+		if (iattr->ia_size != ip->i_size &&
+		    (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+			iattr->ia_ctime = iattr->ia_mtime =
+				current_fs_time(inode->i_sb);
+			mask |= ATTR_CTIME | ATTR_MTIME;
+		}
 
 		if (iattr->ia_size > ip->i_size) {
 			ip->i_d.di_size = iattr->ia_size;
 			ip->i_size = iattr->ia_size;
-			if (!(flags & XFS_ATTR_DMI))
-				xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
 			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 		} else if (iattr->ia_size <= ip->i_size ||
 			   (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
@@ -374,9 +378,6 @@ xfs_setattr(
 			ip->i_d.di_gid = gid;
 			inode->i_gid = gid;
 		}
-
-		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
-		timeflags |= XFS_ICHGTIME_CHG;
 	}
 
 	/*
@@ -393,51 +394,37 @@ xfs_setattr(
 
 		inode->i_mode &= S_IFMT;
 		inode->i_mode |= mode & ~S_IFMT;
-
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		timeflags |= XFS_ICHGTIME_CHG;
 	}
 
 	/*
 	 * Change file access or modified times.
 	 */
-	if (mask & (ATTR_ATIME|ATTR_MTIME)) {
-		if (mask & ATTR_ATIME) {
-			inode->i_atime = iattr->ia_atime;
-			ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
-			ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
-			ip->i_update_core = 1;
-		}
-		if (mask & ATTR_MTIME) {
-			inode->i_mtime = iattr->ia_mtime;
-			ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
-			ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-			timeflags &= ~XFS_ICHGTIME_MOD;
-			timeflags |= XFS_ICHGTIME_CHG;
-		}
-		if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
-			xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+	if (mask & ATTR_ATIME) {
+		inode->i_atime = iattr->ia_atime;
+		ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+		ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+		ip->i_update_core = 1;
 	}
-
-	/*
-	 * Change file inode change time only if ATTR_CTIME set
-	 * AND we have been called by a DMI function.
-	 */
-
-	if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
+	if (mask & ATTR_CTIME) {
 		inode->i_ctime = iattr->ia_ctime;
 		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
 		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 		ip->i_update_core = 1;
-		timeflags &= ~XFS_ICHGTIME_CHG;
+	}
+	if (mask & ATTR_MTIME) {
+		inode->i_mtime = iattr->ia_mtime;
+		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+		ip->i_update_core = 1;
 	}
 
 	/*
-	 * Send out timestamp changes that need to be set to the
-	 * current time.  Not done when called by a DMI function.
+	 * And finally, log the inode core if any attribute in it
+	 * has been changed.
 	 */
-	if (timeflags && !(flags & XFS_ATTR_DMI))
-		xfs_ichgtime(ip, timeflags);
+	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
+		    ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	XFS_STATS_INC(xs_ig_attrchg);
 
@@ -452,12 +439,10 @@ xfs_setattr(
 	 * mix so this probably isn't worth the trouble to optimize.
 	 */
 	code = 0;
-	if (tp) {
-		if (mp->m_flags & XFS_MOUNT_WSYNC)
-			xfs_trans_set_sync(tp);
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
 
-		code = xfs_trans_commit(tp, commit_flags);
-	}
+	code = xfs_trans_commit(tp, commit_flags);
 
 	xfs_iunlock(ip, lock_flags);
 
-- 
cgit v1.1


From 44e08c45cc14e6190a424be8d450070c8e508fad Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sat, 2 Jan 2010 02:39:40 +0000
Subject: xfs: Don't flush stale inodes

Because inodes remain in cache much longer than inode buffers do
under memory pressure, we can get the situation where we have
stale, dirty inodes being reclaimed but the backing storage has
been freed.  Hence we should never, ever flush XFS_ISTALE inodes
to disk as there is no guarantee that the backing buffer is in
cache and still marked stale when the flush occurs.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_inode.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ce278b3..391d36b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2841,10 +2841,14 @@ xfs_iflush(
 	mp = ip->i_mount;
 
 	/*
-	 * If the inode isn't dirty, then just release the inode
-	 * flush lock and do nothing.
+	 * If the inode isn't dirty, then just release the inode flush lock and
+	 * do nothing. Treat stale inodes the same; we cannot rely on the
+	 * backing buffer remaining stale in cache for the remaining life of
+	 * the stale inode and so xfs_itobp() below may give us a buffer that
+	 * no longer contains inodes below. Doing this stale check here also
+	 * avoids forcing the log on pinned, stale inodes.
 	 */
-	if (xfs_inode_clean(ip)) {
+	if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
 		xfs_ifunlock(ip);
 		return 0;
 	}
-- 
cgit v1.1


From fd45e4784164d1017521086524e3442318c67370 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sat, 2 Jan 2010 02:38:56 +0000
Subject: xfs: Ensure we force all busy extents in range to disk

When we search for and find a busy extent during allocation we
force the log out to ensure the extent free transaction is on
disk before the allocation transaction. The current implementation
has a subtle bug in it--it does not handle multiple overlapping
ranges.

That is, if we free lots of little extents into a single
contiguous extent, then allocate the contiguous extent, the busy
search code stops searching at the first extent it finds that
overlaps the allocated range. It then uses the commit LSN of the
transaction to force the log out to.

Unfortunately, the other busy ranges might have more recent
commit LSNs than the first busy extent that is found, and this
results in xfs_alloc_search_busy() returning before all the
extent free transactions are on disk for the range being
allocated. This can lead to potential metadata corruption or
stale data exposure after a crash because log replay won't replay
all the extent free transactions that cover the allocation range.

Modified-by: Alex Elder <aelder@sgi.com>

(Dropped the "found" argument from the xfs_alloc_busysearch trace
event.)

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.h | 12 ++++++------
 fs/xfs/xfs_alloc.c           | 44 +++++++++++++++++++++-----------------------
 2 files changed, 27 insertions(+), 29 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8cb42b4..c22a608 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1079,28 +1079,28 @@ TRACE_EVENT(xfs_alloc_unbusy,
 
 TRACE_EVENT(xfs_alloc_busysearch,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, int found),
-	TP_ARGS(mp, agno, agbno, len, found),
+		 xfs_extlen_t len, xfs_lsn_t lsn),
+	TP_ARGS(mp, agno, agbno, len, lsn),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, found)
+		__field(xfs_lsn_t, lsn)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->found = found;
+		__entry->lsn = lsn;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+		  __entry->lsn)
 );
 
 TRACE_EVENT(xfs_agf,
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a1c65fc..275b1f4 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2563,43 +2563,41 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	xfs_mount_t		*mp;
 	xfs_perag_busy_t	*bsy;
 	xfs_agblock_t		uend, bend;
-	xfs_lsn_t		lsn;
+	xfs_lsn_t		lsn = 0;
 	int			cnt;
 
 	mp = tp->t_mountp;
 
 	spin_lock(&mp->m_perag[agno].pagb_lock);
-	cnt = mp->m_perag[agno].pagb_count;
 
 	uend = bno + len - 1;
 
-	/* search pagb_list for this slot, skipping open slots */
-	for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
+	/*
+	 * search pagb_list for this slot, skipping open slots. We have to
+	 * search the entire array as there may be multiple overlaps and
+	 * we have to get the most recent LSN for the log force to push out
+	 * all the transactions that span the range.
+	 */
+	for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
+		bsy = &mp->m_perag[agno].pagb_list[cnt];
+		if (!bsy->busy_tp)
+			continue;
 
-		/*
-		 * (start1,length1) within (start2, length2)
-		 */
-		if (bsy->busy_tp != NULL) {
-			bend = bsy->busy_start + bsy->busy_length - 1;
-			if ((bno > bend) || (uend < bsy->busy_start)) {
-				cnt--;
-			} else {
-				break;
-			}
-		}
-	}
+		bend = bsy->busy_start + bsy->busy_length - 1;
+		if (bno > bend || uend < bsy->busy_start)
+			continue;
 
-	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!cnt);
+		/* (start1,length1) within (start2, length2) */
+		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
+			lsn = bsy->busy_tp->t_commit_lsn;
+	}
+	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
 
 	/*
 	 * If a block was found, force the log through the LSN of the
 	 * transaction that freed the block
 	 */
-	if (cnt) {
-		lsn = bsy->busy_tp->t_commit_lsn;
-		spin_unlock(&mp->m_perag[agno].pagb_lock);
+	if (lsn)
 		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
-	} else {
-		spin_unlock(&mp->m_perag[agno].pagb_lock);
-	}
 }
-- 
cgit v1.1


From c8e20be020f234c8d492927a424a7d8bbefd5b5d Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sun, 10 Jan 2010 23:51:45 +0000
Subject: xfs: reclaim inodes under a write lock

Make the inode tree reclaim walk exclusive to avoid races with
concurrent sync walkers and lookups. This is a version of a patch
posted by Christoph Hellwig that avoids all the code duplication.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c    | 154 ++++++++++++++++++-----------------------
 fs/xfs/linux-2.6/xfs_sync.h    |   2 +-
 fs/xfs/quota/xfs_qm_syscalls.c |   2 +-
 3 files changed, 71 insertions(+), 87 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6fed97a..e19d255 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
 	 * as the tree is sparse and a gang lookup walks to find
 	 * the number of objects requested.
 	 */
-	read_lock(&pag->pag_ici_lock);
 	if (tag == XFS_ICI_NO_TAG) {
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 				(void **)&ip, *first_index, 1);
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
 				(void **)&ip, *first_index, 1, tag);
 	}
 	if (!nr_found)
-		goto unlock;
+		return NULL;
 
 	/*
 	 * Update the index for the next lookup. Catch overflows
@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
 	 */
 	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-		goto unlock;
-
+		return NULL;
 	return ip;
-
-unlock:
-	read_unlock(&pag->pag_ici_lock);
-	return NULL;
 }
 
 STATIC int
@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive)
 {
 	struct xfs_perag	*pag = &mp->m_perag[ag];
 	uint32_t		first_index;
@@ -114,10 +109,20 @@ restart:
 		int		error = 0;
 		xfs_inode_t	*ip;
 
+		if (exclusive)
+			write_lock(&pag->pag_ici_lock);
+		else
+			read_lock(&pag->pag_ici_lock);
 		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-		if (!ip)
+		if (!ip) {
+			if (exclusive)
+				write_unlock(&pag->pag_ici_lock);
+			else
+				read_unlock(&pag->pag_ici_lock);
 			break;
+		}
 
+		/* execute releases pag->pag_ici_lock */
 		error = execute(ip, pag, flags);
 		if (error == EAGAIN) {
 			skipped++;
@@ -125,9 +130,8 @@ restart:
 		}
 		if (error)
 			last_error = error;
-		/*
-		 * bail out if the filesystem is corrupted.
-		 */
+
+		/* bail out if the filesystem is corrupted.  */
 		if (error == EFSCORRUPTED)
 			break;
 
@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive)
 {
 	int			error = 0;
 	int			last_error = 0;
@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
 		if (!mp->m_perag[ag].pag_ici_init)
 			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+						exclusive);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
@@ -181,11 +187,7 @@ xfs_sync_inode_valid(
 		return EFSCORRUPTED;
 	}
 
-	/*
-	 * If we can't get a reference on the inode, it must be in reclaim.
-	 * Leave it for the reclaim code to flush. Also avoid inodes that
-	 * haven't been fully initialised.
-	 */
+	/* If we can't get a reference on the inode, it must be in reclaim. */
 	if (!igrab(inode)) {
 		read_unlock(&pag->pag_ici_lock);
 		return ENOENT;
@@ -282,7 +284,7 @@ xfs_sync_data(
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG);
+				      XFS_ICI_NO_TAG, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -304,7 +306,7 @@ xfs_sync_attr(
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG);
+				     XFS_ICI_NO_TAG, 0);
 }
 
 STATIC int
@@ -664,60 +666,6 @@ xfs_syncd_stop(
 	kthread_stop(mp->m_sync_task);
 }
 
-STATIC int
-xfs_reclaim_inode(
-	xfs_inode_t	*ip,
-	int		sync_mode)
-{
-	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
-	/* The hash lock here protects a thread in xfs_iget_core from
-	 * racing with us on linking the inode back with a vnode.
-	 * Once we have the XFS_IRECLAIM flag set it will not touch
-	 * us.
-	 */
-	write_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		return -EAGAIN;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(ip->i_mount, pag);
-
-	/*
-	 * If the inode is still dirty, then flush it out.  If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
-
-	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
-	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_ireclaim(ip);
-	return 0;
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
@@ -760,19 +708,55 @@ __xfs_inode_clear_reclaim_tag(
 }
 
 STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
-	int			flags)
+	int			sync_mode)
 {
-	/* ignore if already under reclaim */
-	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		read_unlock(&pag->pag_ici_lock);
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
 		return 0;
 	}
-	read_unlock(&pag->pag_ici_lock);
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
+
+	/*
+	 * If the inode is still dirty, then flush it out.  If the inode
+	 * is not in the AIL, then it will be OK to flush it delwri as
+	 * long as xfs_iflush() does not keep any references to the inode.
+	 * We leave that decision up to xfs_iflush() since it has the
+	 * knowledge of whether it's OK to simply do a delwri flush of
+	 * the inode or whether we need to wait until the inode is
+	 * pulled from the AIL.
+	 * We get the flush lock regardless, though, just to make sure
+	 * we don't free it while it is being flushed.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_iflock(ip);
 
-	return xfs_reclaim_inode(ip, flags);
+	/*
+	 * In the case of a forced shutdown we rely on xfs_iflush() to
+	 * wait for the inode to be unpinned before returning an error.
+	 */
+	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+		/* synchronize with xfs_iflush_done */
+		xfs_iflock(ip);
+		xfs_ifunlock(ip);
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_ireclaim(ip);
+	return 0;
 }
 
 int
@@ -780,6 +764,6 @@ xfs_reclaim_inodes(
 	xfs_mount_t	*mp,
 	int		mode)
 {
-	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
-					XFS_ICI_RECLAIM_TAG);
+	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+					XFS_ICI_RECLAIM_TAG, 1);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d..ea932b4 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag);
+	int flags, int tag, int write_lock);
 
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76f..873e07e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
 }
 
 /*------------------------------------------------------------------------*/
-- 
cgit v1.1


From 018027be90a6946e8cf3f9b17b5582384f7ed117 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sun, 10 Jan 2010 23:51:46 +0000
Subject: xfs: Avoid inodes in reclaim when flushing from inode cache

The reclaim code will handle flushing of dirty inodes before reclaim
occurs, so avoid them when determining whether an inode is a
candidate for flushing to disk when walking the radix trees.  This
is based on a test patch from Christoph Hellwig.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e19d255..1f5e4bb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -180,26 +180,31 @@ xfs_sync_inode_valid(
 	struct xfs_perag	*pag)
 {
 	struct inode		*inode = VFS_I(ip);
+	int			error = EFSCORRUPTED;
 
 	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		read_unlock(&pag->pag_ici_lock);
-		return EFSCORRUPTED;
-	}
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_unlock;
 
-	/* If we can't get a reference on the inode, it must be in reclaim. */
-	if (!igrab(inode)) {
-		read_unlock(&pag->pag_ici_lock);
-		return ENOENT;
-	}
-	read_unlock(&pag->pag_ici_lock);
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	error = ENOENT;
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock;
 
-	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		goto out_unlock;
+
+	if (is_bad_inode(inode)) {
 		IRELE(ip);
-		return ENOENT;
+		goto out_unlock;
 	}
 
-	return 0;
+	/* inode is valid */
+	error = 0;
+out_unlock:
+	read_unlock(&pag->pag_ici_lock);
+	return error;
 }
 
 STATIC int
-- 
cgit v1.1


From 57817c68229984818fea9e614d6f95249c3fb098 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sun, 10 Jan 2010 23:51:47 +0000
Subject: xfs: reclaim all inodes by background tree walks

We cannot do direct inode reclaim without taking the flush lock to
ensure that we do not reclaim an inode under IO. We check the inode
is clean before doing direct reclaim, but this is not good enough
because the inode flush code marks the inode clean once it has
copied the in-core dirty state to the backing buffer.

It is the flush lock that determines whether the inode is still
under IO, even though it is marked clean, and the inode is still
required at IO completion so we can't reclaim it even though it is
clean in core. Hence the requirement that we need to take the flush
lock even on clean inodes because this guarantees that the inode
writeback IO has completed and it is safe to reclaim the inode.

With delayed write inode flushing, we coul dend up waiting a long
time on the flush lock even for a clean inode. The background
reclaim already handles this efficiently, so avoid all the problems
by killing the direct reclaim path altogether.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 09783cc..77414db 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -954,16 +954,14 @@ xfs_fs_destroy_inode(
 	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
 
 	/*
-	 * If we have nothing to flush with this inode then complete the
-	 * teardown now, otherwise delay the flush operation.
+	 * We always use background reclaim here because even if the
+	 * inode is clean, it still may be under IO and hence we have
+	 * to take the flush lock. The background reclaim path handles
+	 * this more efficiently than we can here, so simply let background
+	 * reclaim tear down all inodes.
 	 */
-	if (!xfs_inode_clean(ip)) {
-		xfs_inode_set_reclaim_tag(ip);
-		return;
-	}
-
 out_reclaim:
-	xfs_ireclaim(ip);
+	xfs_inode_set_reclaim_tag(ip);
 }
 
 /*
-- 
cgit v1.1


From 126976c7c17d3bdfbc1fe9e0af8bee9f62d14cc6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sun, 10 Jan 2010 23:51:48 +0000
Subject: xfs: Remove inode iolock held check during allocation

lockdep complains about a the lock not being initialised as we do an
ASSERT based check that the lock is not held before we initialise it
to catch inodes freed with the lock held.

lockdep does this check for us in the lock initialisation code, so
remove the ASSERT to stop the lockdep warning.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_iget.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index fa402a6..155e798 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -73,7 +73,6 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
-	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 
-- 
cgit v1.1


From 4b6a46882cca8349e8942e2650c33b11bc571c92 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:45:21 +0000
Subject: xfs: fix stale inode flush avoidance

When reclaiming stale inodes, we need to guarantee that inodes are
unpinned before returning with a "clean" status. If we don't we can
reclaim inodes that are pinned, leading to use after free in the
transaction subsystem as transactions complete.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_inode.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 391d36b..ef77fd8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2842,13 +2842,9 @@ xfs_iflush(
 
 	/*
 	 * If the inode isn't dirty, then just release the inode flush lock and
-	 * do nothing. Treat stale inodes the same; we cannot rely on the
-	 * backing buffer remaining stale in cache for the remaining life of
-	 * the stale inode and so xfs_itobp() below may give us a buffer that
-	 * no longer contains inodes below. Doing this stale check here also
-	 * avoids forcing the log on pinned, stale inodes.
+	 * do nothing.
 	 */
-	if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+	if (xfs_inode_clean(ip)) {
 		xfs_ifunlock(ip);
 		return 0;
 	}
@@ -2872,6 +2868,19 @@ xfs_iflush(
 	xfs_iunpin_wait(ip);
 
 	/*
+	 * For stale inodes we cannot rely on the backing buffer remaining
+	 * stale in cache for the remaining life of the stale inode and so
+	 * xfs_itobp() below may give us a buffer that no longer contains
+	 * inodes below. We have to check this after ensuring the inode is
+	 * unpinned so that it is safe to reclaim the stale inode after the
+	 * flush call.
+	 */
+	if (xfs_iflags_test(ip, XFS_ISTALE)) {
+		xfs_ifunlock(ip);
+		return 0;
+	}
+
+	/*
 	 * This may have been unpinned because the filesystem is shutting
 	 * down forcibly. If that's the case we must not write this inode
 	 * to disk, because the log record didn't make it to disk!
-- 
cgit v1.1


From 3daeb42c13567e1505f233f6a699cc0e23c8ab5a Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 14 Jan 2010 08:44:46 +0000
Subject: xfs: fix missing error check in xfs_rtfree_range

When xfs_rtfind_forw() returns an error, the block is returned
uninitialised.  xfs_rtfree_range() is not checking the error return,
so could be using an uninitialised block number for modifying bitmap
summary info.

The problem was found by gcc when compiling the *userspace* libxfs
code - it is an copy of the kernel code with the exact same bug.
gcc gives an uninitialised variable warning on the userspace code
but not on the kernel code. You gotta love the consistency (Mmmm,
slightly chewy today!).

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_rtalloc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 9e15a11..6be05f7 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1517,6 +1517,8 @@ xfs_rtfree_range(
 	 */
 	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
 		&postblock);
+	if (error)
+		return error;
 	/*
 	 * If there are blocks not being freed at the front of the
 	 * old extent, add summary data for them to be allocated.
-- 
cgit v1.1


From e09f98606dcc156de1146c209d45a0d6d5f51c3f Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 14 Jan 2010 01:33:54 +0000
Subject: xfs: xfs_swap_extents needs to handle dynamic fork offsets

When swapping extents, we can corrupt inodes by swapping data forks
that are in incompatible formats.  This is caused by the two indoes
having different fork offsets due to the presence of an attribute
fork on an attr2 filesystem.  xfs_fsr tries to be smart about
setting the fork offset, but the trick it plays only works on attr1
(old fixed format attribute fork) filesystems.

Changing the way xfs_fsr sets up the attribute fork will prevent
this situation from ever occurring, so in the kernel code we can get
by with a preventative fix - check that the data fork in the
defragmented inode is in a format valid for the inode it is being
swapped into.  This will lead to files that will silently and
potentially repeatedly fail defragmentation, so issue a warning to
the log when this particular failure occurs to let us know that
xfs_fsr needs updating/fixing.

To help identify how to improve xfs_fsr to avoid this issue, add
trace points for the inodes being swapped so that we can determine
why the swap was rejected and to confirm that the code is making the
right decisions and modifications when swapping forks.

A further complication is even when the swap is allowed to proceed
when the fork offset is different between the two inodes then value
for the maximum number of extents the data fork can hold can be
wrong. Make sure these are also set correctly after the swap occurs.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_dfrag.c | 106 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 90 insertions(+), 16 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d1483a4..84ca1cf 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -114,10 +114,82 @@ xfs_swapext(
 	return error;
 }
 
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip)	/* tmp inode */
+{
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return EINVAL;
+
+	/*
+	 * if the target inode has less extents that then temporary inode then
+	 * why did userspace call us?
+	 */
+	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+		return EINVAL;
+
+	/*
+	 * if the target inode is in extent form and the temp inode is in btree
+	 * form then we will end up with the target inode in the wrong format
+	 * as we already know there are less extents in the temp inode.
+	 */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+		return EINVAL;
+
+	/* Check temp in extent form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+		return EINVAL;
+
+	/* Check target in extent form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+		return EINVAL;
+
+	/* Check root block of temp in btree form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_BOFF(ip) &&
+	    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+		return EINVAL;
+
+	/* Check root block of target in btree form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_BOFF(tip) &&
+	    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+		return EINVAL;
+
+	return 0;
+}
+
 int
 xfs_swap_extents(
-	xfs_inode_t	*ip,
-	xfs_inode_t	*tip,
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
 	xfs_swapext_t	*sxp)
 {
 	xfs_mount_t	*mp;
@@ -161,13 +233,6 @@ xfs_swap_extents(
 		goto out_unlock;
 	}
 
-	/* Should never get a local format */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
-	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		error = XFS_ERROR(EINVAL);
-		goto out_unlock;
-	}
-
 	if (VN_CACHED(VFS_I(tip)) != 0) {
 		error = xfs_flushinval_pages(tip, 0, -1,
 				FI_REMAPF_LOCKED);
@@ -189,13 +254,12 @@ xfs_swap_extents(
 		goto out_unlock;
 	}
 
-	/*
-	 * If the target has extended attributes, the tmp file
-	 * must also in order to ensure the correct data fork
-	 * format.
-	 */
-	if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
-		error = XFS_ERROR(EINVAL);
+	/* check inode formats now that data is flushed */
+	error = xfs_swap_extents_check_format(ip, tip);
+	if (error) {
+		xfs_fs_cmn_err(CE_NOTE, mp,
+		    "%s: inode 0x%llx format is incompatible for exchanging.",
+				__FILE__, ip->i_ino);
 		goto out_unlock;
 	}
 
@@ -276,6 +340,16 @@ xfs_swap_extents(
 	*tifp = *tempifp;	/* struct copy */
 
 	/*
+	 * Fix the in-memory data fork values that are dependent on the fork
+	 * offset in the inode. We can't assume they remain the same as attr2
+	 * has dynamic fork offsets.
+	 */
+	ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+					(uint)sizeof(xfs_bmbt_rec_t);
+	tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+					(uint)sizeof(xfs_bmbt_rec_t);
+
+	/*
 	 * Fix the on-disk inode values
 	 */
 	tmp = (__uint64_t)ip->i_d.di_nblocks;
-- 
cgit v1.1