From 0390131ba84fd3f726f9e24fc4553828125700bb Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Wed, 7 Jan 2009 00:06:22 -0500 Subject: ext4: Allow ext4 to run without a journal A few weeks ago I posted a patch for discussion that allowed ext4 to run without a journal. Since that time I've integrated the excellent comments from Andreas and fixed several serious bugs. We're currently running with this patch and generating some performance numbers against both ext2 (with backported reservations code) and ext4 with and without a journal. It just so happens that running without a journal is slightly faster for most everything. We did iozone -T -t 4 s 2g -r 256k -T -I -i0 -i1 -i2 which creates 4 threads, each of which create and do reads and writes on a 2G file, with a buffer size of 256K, using O_DIRECT for all file opens to bypass the page cache. Results: ext2 ext4, default ext4, no journal initial writes 13.0 MB/s 15.4 MB/s 15.7 MB/s rewrites 13.1 MB/s 15.6 MB/s 15.9 MB/s reads 15.2 MB/s 16.9 MB/s 17.2 MB/s re-reads 15.3 MB/s 16.9 MB/s 17.2 MB/s random readers 5.6 MB/s 5.6 MB/s 5.7 MB/s random writers 5.1 MB/s 5.3 MB/s 5.4 MB/s So it seems that, so far, this was a useful exercise. 
Signed-off-by: Frank Mayhar Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6e60528..9dd21b7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -253,12 +253,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) spin_unlock(sb_bgl_lock(sbi, flex_group)); } } - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh2); if (!fatal) fatal = err; } - BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; sb->s_dirt = 1; @@ -656,15 +656,16 @@ repeat_in_this_group: ino, bitmap_bh->b_data)) { /* we won it */ BUFFER_TRACE(bitmap_bh, - "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, + inode, bitmap_bh); if (err) goto fail; goto got; } /* we lost it */ - jbd2_journal_release_buffer(handle, bitmap_bh); + ext4_handle_release_buffer(handle, bitmap_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -726,7 +727,8 @@ got: /* Don't need to dirty bitmap block if we didn't change it */ if (free) { BUFFER_TRACE(block_bh, "dirty block bitmap"); - err = ext4_journal_dirty_metadata(handle, block_bh); + err = ext4_handle_dirty_metadata(handle, + NULL, block_bh); } brelse(block_bh); @@ -771,8 +773,8 @@ got: } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); + 
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh2); if (err) goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); @@ -825,7 +827,7 @@ got: ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { err = -EINVAL; goto fail_drop; @@ -1028,4 +1030,3 @@ unsigned long ext4_count_dirs(struct super_block * sb) } return count; } - -- cgit v1.1 From fde4d95ad8711c84a36735a17136c45b19746af9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:17:35 -0500 Subject: ext4: remove extraneous newlines from calls to ext4_error() and ext4_warning() This removes annoying blank syslog entries emitted by ext4_error() or ext4_warning(), since these functions add their own newline. Signed-off-by: Nick Warne Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9dd21b7..4794d2c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -74,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %lu\n", + ext4_error(sb, __func__, "Checksum bad for group %lu", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; -- cgit v1.1 From a9df9a49102f3578909cba7bd33784eb3b9caaa4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:18:16 -0500 Subject: ext4: Make ext4_group_t be an unsigned int Nearly all places in the ext3/4 code which uses "unsigned long" is probably a bug, since on 32-bit systems a ulong is 32 bits, which means we are wasting stack space on 64-bit systems. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4794d2c..cac3617 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -74,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %lu", + ext4_error(sb, __func__, "Checksum bad for group %u", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; @@ -111,7 +111,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -133,7 +133,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) put_bh(bh); ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -690,7 +690,7 @@ got: ino > EXT4_INODES_PER_GROUP(sb)) { ext4_error(sb, __func__, "reserved inode or inode > inodes count - " - "block_group = %lu, inode=%lu", group, + "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); err = -EIO; goto fail; -- cgit v1.1 From 560671a0d3c9ad2d647fa6d09375a262e1f19c4f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 22:20:24 -0500 Subject: ext4: Use high 16 bits of the block group descriptor's free counts fields Rename the lower bits with suffix _lo and add helper to access the values. Also rename bg_itable_unused_hi to bg_pad as in e2fsprogs. 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 83 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 39 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index cac3617..11c4f6f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,9 +76,9 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, "Checksum bad for group %u", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } @@ -168,7 +168,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err; + int fatal = 0, err, count; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -236,9 +236,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (gdp) { spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_inodes_count, 1); - if (is_directory) - le16_add_cpu(&gdp->bg_used_dirs_count, -1); + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); @@ -291,13 +294,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, for (group = 0; group < ngroups; group++) { desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) 
< avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { + (ext4_free_blks_count(sb, desc) > + ext4_free_blks_count(sb, best_desc))) { *best_group = group; best_desc = desc; ret = 0; @@ -369,7 +372,7 @@ found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { desc = ext4_get_group_desc(sb, i, &bh); - if (le16_to_cpu(desc->bg_free_inodes_count)) { + if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; } @@ -443,17 +446,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + if (ext4_used_dirs_count(sb, desc) >= best_ndir) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + if (ext4_free_blks_count(sb, desc) < avefreeb) continue; *group = grp; ret = 0; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_ndir = ext4_used_dirs_count(sb, desc); } if (ret == 0) return ret; @@ -479,13 +482,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + if (ext4_used_dirs_count(sb, desc) >= max_dirs) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + if (ext4_free_inodes_count(sb, desc) < min_inodes) continue; - if 
(le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + if (ext4_free_blks_count(sb, desc) < min_blocks) continue; return 0; } @@ -494,8 +497,8 @@ fallback: for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && desc->bg_free_inodes_count && - le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_inodes_count(sb, desc) >= avefreei) return 0; } @@ -524,8 +527,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; /* @@ -548,8 +551,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; } @@ -562,7 +565,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && ext4_free_inodes_count(sb, desc)) return 0; } @@ -591,7 +594,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; - int ret2, err = 0; + int ret2, err = 0, count; struct inode *ret; ext4_group_t i; int free = 0; @@ -718,7 +721,7 @@ got: if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); - 
gdp->bg_free_blocks_count = cpu_to_le16(free); + ext4_free_blks_set(sb, gdp, free); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } @@ -753,7 +756,7 @@ got: free = 0; } else { free = EXT4_INODES_PER_GROUP(sb) - - le16_to_cpu(gdp->bg_itable_unused); + ext4_itable_unused_count(sb, gdp); } /* @@ -763,13 +766,15 @@ got: * */ if (ino > free) - gdp->bg_itable_unused = - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); } - le16_add_cpu(&gdp->bg_free_inodes_count, -1); + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); if (S_ISDIR(mode)) { - le16_add_cpu(&gdp->bg_used_dirs_count, 1); + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); @@ -987,7 +992,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) @@ -995,7 +1000,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); + i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); @@ -1009,7 +1014,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; @@ -1026,7 +1031,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - count += 
le16_to_cpu(gdp->bg_used_dirs_count); + count += ext4_used_dirs_count(sb, gdp); } return count; } -- cgit v1.1 From 3300beda523136f9f87821e4fba85c5c9e319645 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 3 Jan 2009 22:33:39 -0500 Subject: ext4: code cleanup Rename some variables. We also unlock locks in the reverse order we acquired as a part of cleanup. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 65 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 30 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 11c4f6f..b47427a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -124,8 +124,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); @@ -585,8 +585,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) { struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; ext4_group_t group = 0; unsigned long ino = 0; struct inode *inode; @@ -634,41 +634,44 @@ got_group: for (i = 0; i < sbi->s_groups_count; i++) { err = -EIO; - gdp = ext4_get_group_desc(sb, group, &bh2); + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto fail; - brelse(bitmap_bh); - bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!bitmap_bh) + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) goto fail; ino = 0; repeat_in_this_group: ino = 
ext4_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino < EXT4_INODES_PER_GROUP(sb)) { - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + inode_bitmap_bh); if (err) goto fail; if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { + ino, inode_bitmap_bh->b_data)) { /* we won it */ - BUFFER_TRACE(bitmap_bh, + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, - inode, - bitmap_bh); + inode, + inode_bitmap_bh); if (err) goto fail; goto got; } /* we lost it */ - ext4_handle_release_buffer(handle, bitmap_bh); + ext4_handle_release_buffer(handle, inode_bitmap_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -699,19 +702,21 @@ got: goto fail; } - BUFFER_TRACE(bh2, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh2); - if (err) goto fail; + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) + goto fail; /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); + struct buffer_head *block_bitmap_bh; - BUFFER_TRACE(block_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bh); + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { - brelse(block_bh); + brelse(block_bitmap_bh); goto fail; } @@ -719,8 +724,8 @@ got: spin_lock(sb_bgl_lock(sbi, group)); /* 
recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, free); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); @@ -729,12 +734,12 @@ got: /* Don't need to dirty bitmap block if we didn't change it */ if (free) { - BUFFER_TRACE(block_bh, "dirty block bitmap"); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, - NULL, block_bh); + NULL, block_bitmap_bh); } - brelse(block_bh); + brelse(block_bitmap_bh); if (err) goto fail; } @@ -778,8 +783,8 @@ got: } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bh2); + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); @@ -881,7 +886,7 @@ out: iput(inode); ret = ERR_PTR(err); really_out: - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ret; fail_free_drop: @@ -893,7 +898,7 @@ fail_drop: inode->i_nlink = 0; unlock_new_inode(inode); iput(inode); - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ERR_PTR(err); } -- cgit v1.1 From 393418676a7602e1d7d3f6e560159c65c8cbd50e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:38:14 -0500 Subject: ext4: Fix the race between read_inode_bitmap() and ext4_new_inode() We need to make sure we update the inode bitmap and clear EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held, since ext4_read_inode_bitmap() looks at EXT4_BG_INODE_UNINIT to decide whether to initialize the inode bitmap each time it is called. (introduced by commit c806e68f.) 
ext4_read_inode_bitmap does: spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); and ext4_new_inode does if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), ino, inode_bitmap_bh->b_data)) ...... ... spin_lock(sb_bgl_lock(sbi, group)); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); i.e., on allocation we update the bitmap then we take the sb_bgl_lock and clear the EXT4_BG_INODE_UNINIT flag. What can happen is a parallel ext4_read_inode_bitmap can zero out the bitmap in between the above ext4_set_bit_atomic and spin_lock(sb_bg_lock..) The race results in below user visible errors EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449 EXT4-fs warning (device sdb1): ext4_unlink: Deleting nonexistent file ... EXT4-fs warning (device sdb1): ext4_rmdir: empty directory has too many links ... # ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71 ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ialloc.c | 146 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 86 insertions(+), 60 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b47427a..d4e544f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -573,6 +573,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the groups's sb_bgl_lock + * and clear the uninit flag. 
The inode bitmap update + * and group desc uninit flag clear should be done + * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0, count; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + spin_lock(sb_bgl_lock(sbi, group)); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + spin_unlock(sb_bgl_lock(sbi, group)); + ext4_error(sb, __func__, + "reserved inode or inode > inodes count - " + "block_group = %u, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. + * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. 
if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + } + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); + if (S_ISDIR(mode)) { + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + spin_unlock(sb_bgl_lock(sbi, group)); + return retval; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -594,7 +667,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; - int ret2, err = 0, count; + int ret2, err = 0; struct inode *ret; ext4_group_t i; int free = 0; @@ -658,8 +731,13 @@ repeat_in_this_group: if (err) goto fail; - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, inode_bitmap_bh->b_data)) { + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + group_desc_bh); + if (err) + goto fail; + if (!ext4_claim_inode(sb, inode_bitmap_bh, + ino, group, mode)) { /* we won it */ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); @@ -668,10 +746,13 @@ repeat_in_this_group: inode_bitmap_bh); if (err) goto fail; + /* zero bit is inode number 1*/ + ino++; goto got; } /* we lost it */ ext4_handle_release_buffer(handle, inode_bitmap_bh); + ext4_handle_release_buffer(handle, group_desc_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -691,22 +772,6 @@ repeat_in_this_group: goto out; got: - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " - 
"block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - err = -EIO; - goto fail; - } - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; - /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -743,49 +808,10 @@ got: if (err) goto fail; } - - spin_lock(sb_bgl_lock(sbi, group)); - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. 
if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) goto fail; + if (err) + goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) -- cgit v1.1 From 2ccb5fb9f113dae969d1ae9b6c10e80fa34f8cd3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:49:55 -0500 Subject: ext4: Use new buffer_head flag to check uninit group bitmaps initialization For uninit block group, the on-disk bitmap is not initialized. That implies we cannot depend on the uptodate flag on the bitmap buffer_head to find bitmap validity. Use a new buffer_head flag which would be set after we properly initialize the bitmap. This also prevents (re-)initializing the uninit group bitmap every time we call ext4_read_block_bitmap(). 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ialloc.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d4e544f..7b12aed 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -115,20 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, -- cgit v1.1 From 648f5879f5892dddd3ba71cd0d285599f40f2512 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:46:04 -0500 Subject: ext4: mark the blocks/inode bitmap beyond end of group as used We need to mark the block/inode bitmap beyond the end of the group with '1'. 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7b12aed..e3aa3fa 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); return EXT4_INODES_PER_GROUP(sb); -- cgit v1.1 From ba80b1019aa722b24506db1ee755e0bb2f513022 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 20:03:21 -0500 Subject: ext4: Add markers for better debuggability Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e3aa3fa..369c34c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -210,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); + trace_mark(ext4_free_inode, + "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", + sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, + (unsigned long long) inode->i_blocks); /* * Note: we must free any quota before locking the superblock, @@ -698,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, + dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -925,6 +932,8 @@ got: } ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", + sb->s_id, 
inode->i_ino, dir->i_ino, mode); goto really_out; fail: ext4_std_error(sb, err); -- cgit v1.1 From 83982b6f47201c4c7767210d24d7d8c99567a0b3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 14:53:16 -0500 Subject: ext4: Remove "extents" mount option This mount option is largely superfluous, and in fact the way it was implemented was buggy; if a filesystem which did not have the extents feature flag was mounted -o extents, the filesystem would attempt to create and use extents-based files even though the extents feature flag was not enabled. The simplest thing to do is to nuke the mount option entirely. It's not all that useful to force the non-creation of new extent-based files if the filesystem can support it. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 369c34c..4fb86a0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -917,7 +917,7 @@ got: if (err) goto fail_free_drop; - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; -- cgit v1.1