From 785c4bcc0d88ff006a0b2120815a71e86ecf21ce Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 23 May 2011 18:33:01 +0200 Subject: ext3: Add fixed tracepoints This commit adds fixed tracepoints to the ext3 code. It is based on ext4 tracepoints, however due to the differences of both file systems, there are some tracepoints missing (those for delaloc and for multi-block allocator) and there are some ext3 specific as well (for reservation windows). Here is a list: ext3_free_inode ext3_request_inode ext3_allocate_inode ext3_evict_inode ext3_drop_inode ext3_mark_inode_dirty ext3_write_begin ext3_ordered_write_end ext3_writeback_write_end ext3_journalled_write_end ext3_ordered_writepage ext3_writeback_writepage ext3_journalled_writepage ext3_readpage ext3_releasepage ext3_invalidatepage ext3_discard_blocks ext3_request_blocks ext3_allocate_blocks ext3_free_blocks ext3_sync_file_enter ext3_sync_file_exit ext3_sync_fs ext3_rsv_window_add ext3_discard_reservation ext3_alloc_new_reservation ext3_reserved ext3_forget ext3_read_block_bitmap ext3_direct_IO_enter ext3_direct_IO_exit ext3_unlink_enter ext3_unlink_exit ext3_truncate_enter ext3_truncate_exit ext3_get_blocks_enter ext3_get_blocks_exit ext3_load_inode Signed-off-by: Lukas Czerner Cc: Jan Kara Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 34 +- fs/ext3/fsync.c | 15 +- fs/ext3/ialloc.c | 4 + fs/ext3/inode.c | 29 ++ fs/ext3/namei.c | 3 + fs/ext3/super.c | 13 + include/trace/events/ext3.h | 864 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 946 insertions(+), 16 deletions(-) create mode 100644 include/trace/events/ext3.h diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index fe52297..f7d111e 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * balloc.c contains the blocks allocation and deallocation routines @@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -683,14 +688,10 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) dquot_free_block(inode, dquot_freed_blocks); @@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1230,8 +1232,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1742,6 +1745,10 @@ allocated: brelse(bitmap_bh); dquot_free_block(inode, *count-num); *count = num; + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1996,6 +2003,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, if ((next - start) < minblocks) goto free_extent; + trace_ext3_discard_blocks(sb, discard_block, next - start); /* Send the TRIM command down to the device */ err = sb_issue_discard(sb, discard_block, next - start, GFP_NOFS, 0); diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 09b13bb..06a4394 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * akpm: A new design for ext3_sync_file(). @@ -51,10 +52,13 @@ int ext3_sync_file(struct file *file, int datasync) int ret, needs_barrier = 0; tid_t commit_tid; + J_ASSERT(ext3_journal_current_handle() == NULL); + + trace_ext3_sync_file_enter(file, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) return 0; - J_ASSERT(ext3_journal_current_handle() == NULL); /* * data=writeback,ordered: @@ -70,8 +74,10 @@ int ext3_sync_file(struct file *file, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext3_should_journal_data(inode)) - return ext3_force_commit(inode->i_sb); + if (ext3_should_journal_data(inode)) { + ret = ext3_force_commit(inode->i_sb); + goto out; + } if (datasync) commit_tid = atomic_read(&ei->i_datasync_tid); @@ -91,5 +97,8 @@ int ext3_sync_file(struct file *file, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + +out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc4..bf09cbf 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); @@ -426,6 +428,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -601,6 +604,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3451d23..3aa05ee 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -70,6 +71,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -198,6 +200,7 @@ void ext3_evict_inode (struct inode *inode) handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; @@ -842,6 +845,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -970,6 +974,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1217,6 +1224,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1341,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1377,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1395,6 +1406,7 @@ static int ext3_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1577,6 +1589,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1647,6 +1660,7 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1689,6 +1703,7 @@ static int ext3_journalled_writepage(struct page *page, if (ext3_journal_current_handle()) goto no_write; + trace_ext3_journalled_writepage(page); handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1739,6 +1754,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1753,6 +1769,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ @@ -1766,6 +1784,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1794,6 +1813,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_length(iov, nr_segs); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1868,6 +1889,8 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); return ret; } @@ -2446,6 +2469,8 @@ void ext3_truncate(struct inode *inode) unsigned blocksize = inode->i_sb->s_blocksize; struct page *page; + trace_ext3_truncate_enter(inode); + if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2597,6 +2622,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2605,6 +2631,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2746,6 +2773,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -3372,6 +3400,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 34b6d9b..51736a4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "namei.h" #include "xattr.h" @@ -2144,6 +2145,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2189,6 +2191,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index aad153e..662290f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,9 @@ #include "acl.h" #include "namei.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA #else @@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -788,6 +799,7 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, .sync_fs = ext3_sync_fs, @@ -2507,6 +2519,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_ext3_sync_fs(sb, wait); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h new file mode 100644 index 0000000..7b53c05 --- /dev/null +++ b/include/trace/events/ext3.h @@ -0,0 +1,864 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ext3 + +#if !defined(_TRACE_EXT3_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EXT3_H + +#include + +TRACE_EVENT(ext3_free_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( uid_t, uid ) + __field( gid_t, gid ) + __field( blkcnt_t, blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->uid = inode->i_uid; + __entry->gid = inode->i_gid; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->uid, __entry->gid, + (unsigned long) __entry->blocks) +); + +TRACE_EVENT(ext3_request_inode, + TP_PROTO(struct inode *dir, int mode), + + TP_ARGS(dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d dir %lu mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext3_allocate_inode, + TP_PROTO(struct inode *inode, struct inode *dir, int mode), + + TP_ARGS(inode, dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext3_evict_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, nlink ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nlink = inode->i_nlink; + ), + + TP_printk("dev %d,%d ino %lu nlink %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->nlink) +); + +TRACE_EVENT(ext3_drop_inode, + TP_PROTO(struct inode *inode, int drop), + + TP_ARGS(inode, drop), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, drop ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->drop = drop; + ), + + TP_printk("dev %d,%d ino %lu drop %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->drop) +); + +TRACE_EVENT(ext3_mark_inode_dirty, + TP_PROTO(struct inode *inode, unsigned long IP), + + TP_ARGS(inode, IP), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d ino %lu caller %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, (void *)__entry->ip) +); + +TRACE_EVENT(ext3_write_begin, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->flags) +); + +DECLARE_EVENT_CLASS(ext3__write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->copied) +); + +DEFINE_EVENT(ext3__write_end, ext3_ordered_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied) +); + +DEFINE_EVENT(ext3__write_end, ext3_writeback_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied) +); + +DEFINE_EVENT(ext3__write_end, ext3_journalled_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied) +); + +DECLARE_EVENT_CLASS(ext3__page_op, + TP_PROTO(struct page *page), + + TP_ARGS(page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + + ), + + TP_fast_assign( + __entry->index = page->index; + __entry->ino = page->mapping->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu page_index %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->index) +); + +DEFINE_EVENT(ext3__page_op, ext3_ordered_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_writeback_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_journalled_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_readpage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_releasepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +TRACE_EVENT(ext3_invalidatepage, + TP_PROTO(struct page *page, unsigned long offset), + + TP_ARGS(page, offset), + + TP_STRUCT__entry( + __field( pgoff_t, index ) + __field( unsigned long, offset ) + __field( ino_t, ino ) + __field( dev_t, dev ) + + ), + + TP_fast_assign( + __entry->index = page->index; + __entry->offset = offset; + __entry->ino = page->mapping->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->index, __entry->offset) +); + +TRACE_EVENT(ext3_discard_blocks, + TP_PROTO(struct super_block *sb, unsigned long blk, + unsigned long count), + + TP_ARGS(sb, blk, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, blk ) + __field( unsigned long, count ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blk = blk; + __entry->count = count; + ), + + TP_printk("dev %d,%d blk %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blk, __entry->count) +); + +TRACE_EVENT(ext3_request_blocks, + TP_PROTO(struct inode *inode, unsigned long goal, + unsigned long count), + + TP_ARGS(inode, goal, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned long, count ) + __field( unsigned long, goal ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->count = count; + __entry->goal = goal; + ), + + TP_printk("dev %d,%d ino %lu count %lu goal %lu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->count, __entry->goal) +); + +TRACE_EVENT(ext3_allocate_blocks, + TP_PROTO(struct inode *inode, unsigned long goal, + unsigned long count, unsigned long block), + + TP_ARGS(inode, goal, count, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned long, block ) + __field( unsigned long, count ) + __field( unsigned long, goal ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->block = block; + __entry->count = count; + __entry->goal = goal; + ), + + TP_printk("dev %d,%d ino %lu count %lu block %lu goal %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->count, __entry->block, + __entry->goal) +); + +TRACE_EVENT(ext3_free_blocks, + TP_PROTO(struct inode *inode, unsigned long block, + unsigned long count), + + TP_ARGS(inode, block, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( unsigned long, block ) + __field( unsigned long, count ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->block = block; + __entry->count = count; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o block %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->block, __entry->count) +); + +TRACE_EVENT(ext3_sync_file_enter, + TP_PROTO(struct file *file, int datasync), + + TP_ARGS(file, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + ), + + TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; + __entry->datasync = datasync; + __entry->parent = dentry->d_parent->d_inode->i_ino; + ), + + TP_printk("dev %d,%d ino %lu parent %ld datasync %d ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) +); + +TRACE_EVENT(ext3_sync_file_exit, + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret), + + TP_STRUCT__entry( + __field( int, ret ) + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->ret = ret; + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->ret) +); + +TRACE_EVENT(ext3_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +TRACE_EVENT(ext3_rsv_window_add, + TP_PROTO(struct super_block *sb, + struct ext3_reserve_window_node *rsv_node), + + TP_ARGS(sb, rsv_node), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->start = rsv_node->rsv_window._rsv_start; + __entry->end = rsv_node->rsv_window._rsv_end; + ), + + TP_printk("dev %d,%d start %lu end %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start, __entry->end) +); + +TRACE_EVENT(ext3_discard_reservation, + TP_PROTO(struct inode *inode, + struct ext3_reserve_window_node *rsv_node), + + TP_ARGS(inode, rsv_node), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->start = rsv_node->rsv_window._rsv_start; + __entry->end = rsv_node->rsv_window._rsv_end; + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu start %lu end %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long)__entry->ino, __entry->start, + __entry->end) +); + +TRACE_EVENT(ext3_alloc_new_reservation, + TP_PROTO(struct super_block *sb, unsigned long goal), + + TP_ARGS(sb, goal), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, goal ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->goal = goal; + ), + + TP_printk("dev %d,%d goal %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->goal) +); + +TRACE_EVENT(ext3_reserved, + TP_PROTO(struct super_block *sb, unsigned long block, + struct ext3_reserve_window_node *rsv_node), + + TP_ARGS(sb, block, rsv_node), + + TP_STRUCT__entry( + __field( unsigned long, block ) + __field( unsigned long, start ) + __field( unsigned long, end ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->block = block; + __entry->start = rsv_node->rsv_window._rsv_start; + __entry->end = rsv_node->rsv_window._rsv_end; + __entry->dev = sb->s_dev; + ), + + TP_printk("dev %d,%d block %lu, start %lu end %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->block, __entry->start, __entry->end) +); + +TRACE_EVENT(ext3_forget, + TP_PROTO(struct inode *inode, int is_metadata, unsigned long block), + + TP_ARGS(inode, is_metadata, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( int, is_metadata ) + __field( unsigned long, block ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->is_metadata = is_metadata; + __entry->block = block; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->is_metadata, __entry->block) +); + +TRACE_EVENT(ext3_read_block_bitmap, + TP_PROTO(struct super_block *sb, unsigned int group), + + TP_ARGS(sb, group), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( __u32, group ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->group = group; + ), + + TP_printk("dev %d,%d group %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->group) +); + +TRACE_EVENT(ext3_direct_IO_enter, + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), + + TP_ARGS(inode, offset, len, rw), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned long, len ) + __field( int, rw ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->rw) +); + +TRACE_EVENT(ext3_direct_IO_exit, + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, + int rw, int ret), + + TP_ARGS(inode, offset, len, rw, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned long, len ) + __field( int, rw ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->rw, __entry->ret) +); + +TRACE_EVENT(ext3_unlink_enter, + TP_PROTO(struct inode *parent, struct dentry *dentry), + + TP_ARGS(parent, dentry), + + TP_STRUCT__entry( + __field( ino_t, parent ) + __field( ino_t, ino ) + __field( loff_t, size ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->parent = parent->i_ino; + __entry->ino = dentry->d_inode->i_ino; + __entry->size = dentry->d_inode->i_size; + __entry->dev = dentry->d_inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu size %lld parent %ld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long)__entry->size, + (unsigned long) __entry->parent) +); + +TRACE_EVENT(ext3_unlink_exit, + TP_PROTO(struct dentry *dentry, int ret), + + TP_ARGS(dentry, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = dentry->d_inode->i_ino; + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->ret) +); + +DECLARE_EVENT_CLASS(ext3__truncate, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( blkcnt_t, blocks ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu blocks %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, (unsigned long) __entry->blocks) +); + +DEFINE_EVENT(ext3__truncate, ext3_truncate_enter, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(ext3__truncate, ext3_truncate_exit, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +TRACE_EVENT(ext3_get_blocks_enter, + TP_PROTO(struct inode *inode, unsigned long lblk, + unsigned long len, int create), + + TP_ARGS(inode, lblk, len, create), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( unsigned long, lblk ) + __field( unsigned long, len ) + __field( int, create ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + __entry->create = create; + ), + + TP_printk("dev %d,%d ino %lu lblk %lu len %lu create %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->len, __entry->create) +); + +TRACE_EVENT(ext3_get_blocks_exit, + TP_PROTO(struct inode *inode, unsigned long lblk, + unsigned long pblk, unsigned long len, int ret), + + TP_ARGS(inode, lblk, pblk, len, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( unsigned long, lblk ) + __field( unsigned long, pblk ) + __field( unsigned long, len ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu lblk %lu pblk %lu len %lu ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->pblk, + __entry->len, __entry->ret) +); + +TRACE_EVENT(ext3_load_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) +); + +#endif /* _TRACE_EXT3_H */ + +/* This part must be outside protection */ +#include -- cgit v1.1 From 99cb1a318c37bf462c53d43f4dacb7b4896ce0c9 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 23 May 2011 18:33:02 +0200 Subject: jbd: Add fixed tracepoints This commit adds fixed tracepoint for jbd. It has been based on fixed tracepoints for jbd2, however there are missing those for collecting statistics, since I think that it will require more intrusive patch so I should have its own commit, if someone decide that it is needed. Also there are new tracepoints in __journal_drop_transaction() and journal_update_superblock(). The list of jbd tracepoints: jbd_checkpoint jbd_start_commit jbd_commit_locking jbd_commit_flushing jbd_commit_logging jbd_drop_transaction jbd_end_commit jbd_do_submit_data jbd_cleanup_journal_tail jbd_update_superblock_end Signed-off-by: Lukas Czerner Cc: Jan Kara Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 4 + fs/jbd/commit.c | 11 +++ fs/jbd/journal.c | 4 + include/trace/events/jbd.h | 203 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+) create mode 100644 include/trace/events/jbd.h diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e4b87bc..dea7503b 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Unlink a buffer from a transaction checkpoint list. @@ -358,6 +359,7 @@ int log_do_checkpoint(journal_t *journal) * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -503,6 +505,7 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %u), " "freeing %u\n", @@ -752,6 +755,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 72ffa97..eedd201 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -204,6 +205,8 @@ write_out_data: if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; @@ -236,6 +239,8 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; @@ -266,6 +271,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); return err; @@ -316,12 +322,14 @@ void journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -392,6 +400,7 @@ void journal_commit_transaction(journal_t *journal) */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -493,6 +502,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -946,6 +956,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e2d4285..ab019ee 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -38,6 +38,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include #include @@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait) } else write_dirty_buffer(bh, WRITE); + trace_jbd_update_superblock_end(journal, wait); out: /* If we have just flushed the log (by marking s_start==0), then * any future commit will have to be careful to update the diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h new file mode 100644 index 0000000..aff64d8 --- /dev/null +++ b/include/trace/events/jbd.h @@ -0,0 +1,203 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd + +#if !defined(_TRACE_JBD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_JBD_H + +#include +#include + +TRACE_EVENT(jbd_checkpoint, + + TP_PROTO(journal_t *journal, int result), + + TP_ARGS(journal, result), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, result ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->result = result; + ), + + TP_printk("dev %d,%d result %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->result) +); + +DECLARE_EVENT_CLASS(jbd_commit, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) +); + +DEFINE_EVENT(jbd_commit, jbd_start_commit, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +DEFINE_EVENT(jbd_commit, jbd_commit_locking, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +DEFINE_EVENT(jbd_commit, jbd_commit_flushing, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +DEFINE_EVENT(jbd_commit, jbd_commit_logging, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +TRACE_EVENT(jbd_drop_transaction, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) +); + +TRACE_EVENT(jbd_end_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + __field( int, head ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + __entry->head = journal->j_tail_sequence; + ), + + TP_printk("dev %d,%d transaction %d sync %d head %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit, __entry->head) +); + +TRACE_EVENT(jbd_do_submit_data, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) +); + +TRACE_EVENT(jbd_cleanup_journal_tail, + + TP_PROTO(journal_t *journal, tid_t first_tid, + unsigned long block_nr, unsigned long freed), + + TP_ARGS(journal, first_tid, block_nr, freed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( tid_t, tail_sequence ) + __field( tid_t, first_tid ) + __field(unsigned long, block_nr ) + __field(unsigned long, freed ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->tail_sequence = journal->j_tail_sequence; + __entry->first_tid = first_tid; + __entry->block_nr = block_nr; + __entry->freed = freed; + ), + + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) +); + +TRACE_EVENT(jbd_update_superblock_end, + TP_PROTO(journal_t *journal, int wait), + + TP_ARGS(journal, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +#endif /* _TRACE_JBD_H */ + +/* This part must be outside protection */ +#include -- cgit v1.1 From 40680f2fa4670ab35ee554822a69dda1a118f966 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 May 2011 22:24:47 +0200 Subject: ext3: Convert ext3 to new truncate calling convention Mostly trivial conversion. We fix a bug that IS_IMMUTABLE and IS_APPEND files could not be truncated during failed writes as we change the code. In fact the test is not needed at all because both IS_IMMUTABLE and IS_APPEND is tested in upper layers in do_sys_[f]truncate(), may_write(), etc. Signed-off-by: Jan Kara --- fs/ext3/file.c | 1 - fs/ext3/inode.c | 27 +++++++++++---------------- include/linux/ext3_fs.h | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index f55df0e..86c8ab3 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = { }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3aa05ee..b4051c9 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -234,12 +234,10 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); EXT3_I(inode)->i_dtime = get_seconds(); @@ -890,6 +888,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -938,9 +939,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -1849,7 +1847,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1863,7 +1861,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_write(inode); ret = PTR_ERR(handle); goto out; } @@ -2414,8 +2412,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -3264,9 +3260,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 5e06acf..9aaa3a8 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -913,7 +913,7 @@ extern void ext3_dirty_inode(struct inode *, int); extern int ext3_change_inode_journal_flag(struct inode *, int); extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); extern int ext3_can_truncate(struct inode *inode); -extern void ext3_truncate (struct inode *); +extern void ext3_truncate(struct inode *inode); extern void ext3_set_inode_flags(struct inode *); extern void ext3_get_inode_flags(struct ext3_inode_info *); extern void ext3_set_aops(struct inode *inode); -- cgit v1.1 From 05713082ab7690a2b22b044cfc867f346c39cd2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 May 2011 17:17:18 +0200 Subject: jbd: remove dependency on __GFP_NOFAIL The callers of start_this_handle() (or better ext3_journal_start()) are not really prepared to handle allocation failures. Such failures can for example result in silent data loss when it happens in ext3_..._writepage(). OTOH __GFP_NOFAIL is going away so we just retry allocation in start_this_handle(). This loop is potentially dangerous because the oom killer cannot be invoked for GFP_NOFS allocation, so there is a potential for infinitely looping. But still this is better than silent data loss. Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index f7ee81a..83a6618 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,6 +26,7 @@ #include #include #include +#include static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); if (!new_transaction) { - ret = -ENOMEM; - goto out; + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; } } -- cgit v1.1 From bd5c9e1854e13d0c62a3de29a5fbc15dd6a4d8c6 Mon Sep 17 00:00:00 2001 From: Ding Dinghua Date: Thu, 26 May 2011 10:29:01 +0800 Subject: jbd: fix a bug of leaking jh->b_jcount journal_get_create_access should drop jh->b_jcount in error handling path Signed-off-by: Ding Dinghua Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 83a6618..dc39efd 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -844,8 +844,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } -- cgit v1.1 From ad95c5e9bc8b5885f94dce720137cac8fa8da4c9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 30 May 2011 13:29:20 +0200 Subject: ext3: Fix oops in ext3_try_to_allocate_with_rsv() Block allocation is called from two places: ext3_get_blocks_handle() and ext3_xattr_block_set(). These two callers are not necessarily synchronized because xattr code holds only xattr_sem and i_mutex, and ext3_get_blocks_handle() may hold only truncate_mutex when called from writepage() path. Block reservation code does not expect two concurrent allocations to happen to the same inode and thus assertions can be triggered or reservation structure corruption can occur. Fix the problem by taking truncate_mutex in xattr code to serialize allocations. CC: Sage Weil CC: stable@kernel.org Reported-by: Fyodor Ustinov Signed-off-by: Jan Kara --- fs/ext3/xattr.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 32e6cc2..d565759 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -803,8 +803,16 @@ inserted: /* We need to allocate a new block */ ext3_fsblk_t goal = ext3_group_first_block_no(sb, EXT3_I(inode)->i_block_group); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); -- cgit v1.1 From fbcc9e624b8dbc7f740fac3906aa261b83398100 Mon Sep 17 00:00:00 2001 From: Petr Uzel Date: Tue, 31 May 2011 11:36:06 +0200 Subject: ext2: include fs.h into ext2_fs.h AC_CHECK_HEADERS([linux/ext2_fs.h]) fails with configure:34666: checking linux/ext2_fs.h usability configure:34666: gcc -std=gnu99 -c -ggdb3 -O0 -Wunreachable-code conftest.c >&5 In file included from conftest.c:406:0: /usr/include/linux/ext2_fs.h: In function 'ext2_mask_flags': /usr/include/linux/ext2_fs.h:182:21: error: 'FS_DIRSYNC_FL' undeclared (first use in this function) /usr/include/linux/ext2_fs.h:182:21: note: each undeclared identifier is reported only once for each function it appears in /usr/include/linux/ext2_fs.h:182:37: error: 'FS_TOPDIR_FL' undeclared (first use in this function) /usr/include/linux/ext2_fs.h:184:19: error: 'FS_NODUMP_FL' undeclared (first use in this function) /usr/include/linux/ext2_fs.h:184:34: error: 'FS_NOATIME_FL' undeclared (first use in this function) It's reasonable to have headers that include all necessary definitions. So fix this by including fs.h into ext2_fs.h. Signed-off-by: Petr Uzel Signed-off-by: Jan Kara --- include/linux/ext2_fs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index 2dfa707..53792bf 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -18,6 +18,7 @@ #include #include +#include /* * The second extended filesystem constants/structures -- cgit v1.1 From 9008593017069ad513cc7dc78a6c94e8dfddba31 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 1 Jun 2011 23:34:04 +0900 Subject: ext3: use proper little-endian bitops ext3_{set,clear}_bit() is defined as __test_and_{set,clear}_bit_le() for ext3. But all ext3_{set,clear}_bit() calls ignore return values. So these can be replaced with __{set,clear}_bit_le(). This changes ext3_{set,clear}_bit safely, because if someone uses these macros without noticing the change, new ext3_{set,clear}_bit don't have return value and causes compiler errors where the return value is used. This also removes unused ext3_find_first_zero_bit(). Signed-off-by: Akinobu Mita Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Cc: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara --- include/linux/ext3_fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 9aaa3a8..8f1f908 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -418,12 +418,11 @@ struct ext3_inode { #define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS #endif -#define ext3_set_bit __test_and_set_bit_le +#define ext3_set_bit __set_bit_le #define ext3_set_bit_atomic ext2_set_bit_atomic -#define ext3_clear_bit __test_and_clear_bit_le +#define ext3_clear_bit __clear_bit_le #define ext3_clear_bit_atomic ext2_clear_bit_atomic #define ext3_test_bit test_bit_le -#define ext3_find_first_zero_bit find_first_zero_bit_le #define ext3_find_next_zero_bit find_next_zero_bit_le /* -- cgit v1.1 From ee3e77f18010679a889b3831c2dd931238c12d09 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 3 Jun 2011 21:58:11 +0200 Subject: ext3: Improve truncate error handling New truncate calling convention allows us to handle errors from ext3_block_truncate_page(). So reorganize the code so that ext3_block_truncate_page() is called before we change inode size. This also removes unnecessary block zeroing from error recovery after failed buffered writes (zeroing isn't needed because we could have never written non-zero data to disk). We have to be careful and keep zeroing in direct IO write error recovery because there we might have already overwritten end of the last file block. Signed-off-by: Jan Kara --- fs/ext3/inode.c | 101 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b4051c9..d2e4547 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -43,6 +43,7 @@ #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -1207,6 +1208,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1847,7 +1858,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - ext3_truncate_failed_write(inode); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1861,7 +1872,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate_failed_write(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); goto out; } @@ -1971,17 +1982,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2026,11 +2044,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2044,6 +2074,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2455,7 +2488,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2463,7 +2495,6 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; trace_ext3_truncate_enter(inode); @@ -2473,37 +2504,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -3251,11 +3257,30 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && -- cgit v1.1 From ad434017718a725b1695fb2ebfff312cf3693d3b Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 7 Jun 2011 12:27:05 +0200 Subject: ext3/ext4 Documentation: remove bh/nobh since it has been deprecated Bh and nobh mount option has been deprecated in ext4 (206f7ab4f49a2021fcb8687f25395be77711ddee) and in ext3 (4c4d3901225518ed1a4c938ba15ba09842a00770) so remove those options from documentation. Signed-off-by: Lukas Czerner Reviewed-by: Eric Sandeen Signed-off-by: Jan Kara --- Documentation/filesystems/ext3.txt | 9 --------- Documentation/filesystems/ext4.txt | 23 +++++++---------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 272f80d..aee5560 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -147,15 +147,6 @@ grpjquota= during journal replay. They replace the above package for more details (http://sourceforge.net/projects/linuxquota). -bh (*) ext3 associates buffer heads to data pages to -nobh (a) cache disk block mapping information - (b) link pages into transaction to provide - ordering guarantees. - "bh" option forces use of buffer heads. - "nobh" option tries to avoid associating buffer - heads (supported only for "writeback" mode). - - Specification ============= Ext3 shares all disk implementation with the ext2 filesystem, and adds diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 3ae9bc9..232a575 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -68,12 +68,12 @@ Note: More extensive information for getting started with ext4 can be '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems for a fair comparison. When tuning ext3 for best benchmark numbers, it is often worthwhile to try changing the data journaling mode; '-o - data=writeback,nobh' can be faster for some workloads. (Note - however that running mounted with data=writeback can potentially - leave stale data exposed in recently written files in case of an - unclean shutdown, which could be a security exposure in some - situations.) Configuring the filesystem with a large journal can - also be helpful for metadata-intensive workloads. + data=writeback' can be faster for some workloads. (Note however that + running mounted with data=writeback can potentially leave stale data + exposed in recently written files in case of an unclean shutdown, + which could be a security exposure in some situations.) Configuring + the filesystem with a large journal can also be helpful for + metadata-intensive workloads. 2. Features =========== @@ -272,14 +272,6 @@ grpjquota= during journal replay. They replace the above package for more details (http://sourceforge.net/projects/linuxquota). -bh (*) ext4 associates buffer heads to data pages to -nobh (a) cache disk block mapping information - (b) link pages into transaction to provide - ordering guarantees. - "bh" option forces use of buffer heads. - "nobh" option tries to avoid associating buffer - heads (supported only for "writeback" mode). - stripe=n Number of filesystem blocks that mballoc will try to use for allocation size and alignment. For RAID5/6 systems this should be the number of data @@ -393,8 +385,7 @@ dioread_nolock locking. If the dioread_nolock option is specified write and convert the extent to initialized after IO completes. This approach allows ext4 code to avoid using inode mutex, which improves scalability on high - speed storages. However this does not work with nobh - option and the mount will fail. Nor does it work with + speed storages. However this does not work with data journaling and dioread_nolock option will be ignored with kernel warning. Note that dioread_nolock code path is only used for extent-based files. -- cgit v1.1 From 81fe8c62febade6b5d0915269b06a0c50448da27 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 10 Jun 2011 14:59:05 -0700 Subject: ext3/ioctl.c: silence sparse warnings about different address spaces The 'from' argument for copy_from_user and the 'to' argument for copy_to_user should both be tagged as __user address space. Signed-off-by: H Hartley Sweeten Cc: Andrew Morton Cc: Andreas Dilger Signed-off-by: Jan Kara --- fs/ext3/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index f4090bd..c7f4394 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -285,7 +285,7 @@ group_add_out: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -293,7 +293,7 @@ group_add_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; -- cgit v1.1 From 2c2ea9451fc2a12ee57c8346f0da26969d07ee7f Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 22 Jun 2011 10:51:09 +0200 Subject: ext3: Return -EINVAL when start is beyond the end of fs in ext3_trim_fs() We should return -EINVAL when the FITRIM parameters are not sane, but currently we are exiting silently if start is beyond the end of the file system. This commit fixes this so we return -EINVAL as other file systems do. Signed-off-by: Lukas Czerner CC: Jan Kara Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index f7d111e..6386d76 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -2108,7 +2108,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) return -EINVAL; if (start >= max_blks) - goto out; + return -EINVAL; if (start + len > max_blks) len = max_blks - start; @@ -2156,8 +2156,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (ret >= 0) ret = 0; - -out: range->len = trimmed * sb->s_blocksize; return ret; -- cgit v1.1 From bb189247f35688a3353545902c56290fb7d7754a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 24 Jun 2011 23:11:59 +0200 Subject: jbd: Fix oops in journal_remove_journal_head() journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 journal_commit_transaction() ... processing t_forget list __journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); journal_try_to_free_buffers() journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() journal_put_journal_head(jh) journal_remove_journal_head(bh); journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]journal_refile_buffer(), [__]journal_unfile_buffer(), and __journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 27 ++++++++------- fs/jbd/commit.c | 46 ++++++++++++------------- fs/jbd/journal.c | 95 +++++++++++++++++----------------------------------- fs/jbd/transaction.c | 73 ++++++++++++++++++++-------------------- include/linux/jbd.h | 1 - 5 files changed, 104 insertions(+), 138 deletions(-) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index dea7503b..61655a3 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -96,10 +96,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -221,8 +225,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -241,7 +245,6 @@ restart: */ released = __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } @@ -305,12 +308,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -526,9 +529,9 @@ int cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and release + * them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of bufers reaped (for debug) */ @@ -635,8 +638,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -655,13 +658,14 @@ int __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -672,10 +676,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -687,7 +689,6 @@ int __journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -706,6 +707,8 @@ void __journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index eedd201..8799207 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -258,10 +258,6 @@ write_out_data: jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); - journal_remove_journal_head(bh); - /* One for our safety reference, other for - * journal_remove_journal_head() */ - put_bh(bh); release_data_buffer(bh); } @@ -455,14 +451,9 @@ void journal_commit_transaction(journal_t *journal) } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) { + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } + jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } @@ -807,10 +798,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -868,28 +865,27 @@ restart_loop: __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index ab019ee..9fe061f 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1803,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1819,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1893,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1957,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index dc39efd..7e59c6e 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -696,7 +696,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; /* first access by this transaction */ jh->b_modified = 0; @@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) ret = -EIO; goto no_journal; } - - if (jh->b_transaction != NULL) { + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } @@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { @@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/include/linux/jbd.h b/include/linux/jbd.h index e069650..e6a5e34 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -940,7 +940,6 @@ extern int journal_force_commit(journal_t *); */ struct journal_head *journal_add_journal_head(struct buffer_head *bh); struct journal_head *journal_grab_journal_head(struct buffer_head *bh); -void journal_remove_journal_head(struct buffer_head *bh); void journal_put_journal_head(struct journal_head *jh); /* -- cgit v1.1 From a212d1a71deea10ba4f6de2aaac0221be34ddb29 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Jun 2011 11:56:50 +0800 Subject: jbd: Use WRITE_SYNC in journal checkpoint. In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, and the process will hang waiting for the log space. So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Reported-by: Robin Dong Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 61655a3..f94fc48 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* @@ -257,9 +258,12 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; -- cgit v1.1 From d12dc256547cec4fe62dad6e94252dced4ee2d58 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 Jul 2011 14:31:47 +0200 Subject: quota: Remove unused declaration There is no point in declaring quotactl() syscall prototype in kernel header and 'make headers_check' complains about it. So just remove those lines. Signed-off-by: Jan Kara --- include/linux/quota.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/quota.h b/include/linux/quota.h index 9a85412..313b7de 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -415,13 +415,5 @@ struct quota_module_name { {QFMT_VFS_V0, "quota_v2"},\ {0, NULL}} -#else - -# /* nodep */ include - -__BEGIN_DECLS -long quotactl __P ((unsigned int, const char *, int, caddr_t)); -__END_DECLS - #endif /* __KERNEL__ */ #endif /* _QUOTA_ */ -- cgit v1.1 From c878c73f8dda48f64bcec24f78e80e154cbc5cf8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 20 Jul 2011 23:16:33 +0200 Subject: ext3: Fix compilation with -DDX_DEBUG Compilation of ext3/namei.c brought up an error and warning messages when compiled with -DDX_DEBUG. Signed-off-by: Bernd Schubert Signed-off-by: Jan Kara --- fs/ext3/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 51736a4..0465368 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -288,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -1014,7 +1014,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } -- cgit v1.1 From 03b5bb342978f99f75fb36d69cd29bab32109fd4 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 22 Jul 2011 08:50:13 -0500 Subject: ext2: check xattr name_len before acquiring xattr_sem in ext2_xattr_get In ext2_xattr_get(), the code will acquire xattr_sem first, later checks the length of xattr name_len > 255. It's unnecessarily time consuming and also ext2_xattr_set() checks the length before other checks. So move the check before acquiring xattr_sem to make these two functions consistent. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jan Kara --- fs/ext2/xattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 5299706..d27b71f 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + name_len = strlen(name); + if (name_len > 255) + return -ERANGE; + down_read(&EXT2_I(inode)->xattr_sem); error = -ENODATA; if (!EXT2_I(inode)->i_file_acl) @@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", error = -EIO; goto cleanup; } - /* find named attribute */ - name_len = strlen(name); - error = -ERANGE; - if (name_len > 255) - goto cleanup; + /* find named attribute */ entry = FIRST_ENTRY(bh); while (!IS_LAST_ENTRY(entry)) { struct ext2_xattr_entry *next = -- cgit v1.1 From b22570d9abb3d844e65c15c8bc0d57a78129e3b4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 23 Jul 2011 01:21:38 +0200 Subject: ext3: Fix data corruption in inodes with journalled data When journalling data for an inode (either because it is a symlink or because the filesystem is mounted in data=journal mode), ext3_evict_inode() can discard unwritten data by calling truncate_inode_pages(). This is because we don't mark the buffer / page dirty when journalling data but only add the buffer to the running transaction and thus mm does not know there are still unwritten data. Fix the problem by carefully tracking transaction containing inode's data, committing this transaction, and writing uncheckpointed buffers when inode should be reaped. Signed-off-by: Jan Kara --- fs/ext3/inode.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d2e4547..f57c87b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -197,6 +197,7 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; @@ -207,11 +208,36 @@ void ext3_evict_inode (struct inode *inode) want_delete = 1; } + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -241,7 +267,7 @@ void ext3_evict_inode (struct inode *inode) * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -1411,6 +1437,7 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; @@ -1440,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1739,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); } else { /* -- cgit v1.1 From 2b76aa074e9de718b31ba48699e99191b7697bf2 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Sat, 23 Jul 2011 09:08:49 +0800 Subject: ext3.txt: update the links in the section "useful links" to the latest ones In Documentation/filesystems/ext3.txt, the section "useful links" provides two links which link to ibm developerworks articles. While the second one can be redirected to the latest one, the first one http://www.ibm.com/developerworks/library/l-fs7.html fails to be redirected. Update the 2 links to the latest ones. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jan Kara --- Documentation/filesystems/ext3.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index aee5560..22f3a0e 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -218,5 +218,5 @@ kernel source: programs: http://e2fsprogs.sourceforge.net/ http://ext2resize.sourceforge.net -useful links: http://www.ibm.com/developerworks/library/l-fs7.html - http://www.ibm.com/developerworks/library/l-fs8.html +useful links: http://www.ibm.com/developerworks/library/l-fs7/index.html + http://www.ibm.com/developerworks/library/l-fs8/index.html -- cgit v1.1 From 5cf49d763eb141d236e92be6d4a0dc94e31fa886 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Mon, 25 Jul 2011 21:02:25 +0800 Subject: jbd: change the field "b_cow_tid" of struct journal_head from type unsigned to tid_t In the definition of struct journal_head, the comment for the field "unsigned b_cow_tid" says the field tracks the last transaction id in which this buffer has been cowed. In the header part of file journal-head.h, it defines typedef unsigned int tid_t; We should use type tid_t to define transaction id fields. Change the field "b_cow_tid" of struct journal_head from type unsigned to tid_t. Signed-off-by: Wang Sheng-Hui Acked-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/journal-head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index 44e95d0..423cb6d 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -45,7 +45,7 @@ struct journal_head { * has been cowed * [jbd_lock_bh_state()] */ - unsigned b_cow_tid; + tid_t b_cow_tid; /* * Copy of the buffer data frozen for writing to the log. -- cgit v1.1