From cab893d9096a2cfe604fcad84c745def3a31a721 Mon Sep 17 00:00:00 2001 From: Martin Michlmayr Date: Fri, 17 Oct 2008 15:03:38 -0400 Subject: ext4: Remove an old reference to ext4dev in Makefile comment Remove an old reference to ext4dev. Signed-off-by: Martin Michlmayr Signed-off-by: Theodore Ts'o --- fs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Makefile b/fs/Makefile index d0c69f5..2168c90 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ -- cgit v1.1 From 473dc8eddb049055ef823e000ad968ebd56cdaca Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Mon, 13 Oct 2008 09:01:02 -0400 Subject: ext4: Fix Kconfig typo for ext4dev Looks like there is one more instance where ext4dev should be changed to ext4 because the module name will be "ext4" unless EXT4DEV_COMPAT is selected. Signed-off-by: Manish Katiyar Signed-off-by: Theodore Ts'o --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index 9e9d70c..d0a1174 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -160,7 +160,7 @@ config EXT4_FS filesystem initially. To compile this file system support as a module, choose M here. The - module will be called ext4dev. + module will be called ext4. If unsure, say N. 
-- cgit v1.1 From 688f05a01983711a4e715b1d6e15a89a89c96a66 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 13 Oct 2008 12:14:14 -0400 Subject: ext4: Free ext4_prealloc_space using kmem_cache_free We should use kmem_cache_free to free memory allocated via kmem_cache_alloc Signed-off-by: Aneesh Kumar K.V Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b580714..154f8de 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug("mballoc: %u PAs left\n", count); -- cgit v1.1 From c2774d84fd6cab2bfa2a2fae0b1ca8d8ebde48a2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 10 Oct 2008 20:07:20 -0400 Subject: ext4: Do mballoc init before doing filesystem recovery During filesystem recovery we may be doing a truncate which expects some of the mballoc data structures to be initialized. So do ext4_mb_init before recovery. 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dea8f13..4f41107 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2456,6 +2456,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "available.\n"); } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2475,21 +2490,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - - ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); - if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); - goto failed_mount4; - } - lock_kernel(); return 0; -- cgit v1.1 From c894058d66637c7720569fbe12957f4de64d9991 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 16 Oct 2008 10:14:27 -0400 Subject: ext4: Use an rbtree for tracking blocks freed during transaction. 
With this patch we track the block freed during a transaction using red-black tree. We also make sure contiguous blocks freed are collected in one node in the tree. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 184 ++++++++++++++++++++++++++++++++++-------------------- fs/ext4/mballoc.h | 26 +++++--- 2 files changed, 133 insertions(+), 77 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 154f8de..bd9b011 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2647,13 +2648,11 @@ int ext4_mb_release(struct super_block *sb) static noinline_for_stack void ext4_mb_free_committed_blocks(struct super_block *sb) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; struct ext4_buddy e4b; + struct ext4_group_info *db; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; if (list_empty(&sbi->s_committed_transaction)) return; @@ -2661,44 +2660,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb) /* there is committed blocks to be freed yet */ do { /* get next array of blocks */ - md = NULL; + entry = NULL; spin_lock(&sbi->s_md_lock); if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); + entry = list_entry(sbi->s_committed_transaction.next, + struct ext4_free_data, list); + list_del(&entry->list); } spin_unlock(&sbi->s_md_lock); - if (md == NULL) + if (entry == NULL) break; mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = 
ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); + ext4_unlock_group(sb, entry->group); - kfree(md); + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } while (1); mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2771,6 +2772,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } @@ -2779,6 +2790,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -4415,6 +4427,21 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb, ext4_mb_free_committed_blocks(sb); } +/* + * We can merge 
two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; +} + static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group, ext4_grpblk_t block, int count) @@ -4422,57 +4449,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. 
We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no 
more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to active_transaction list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828..9e815c4 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -98,23 +98,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; -- cgit v1.1 From a1aebc1e2da9a7bee4ff8cce510b08f469d1929e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 10 Oct 2008 20:13:31 -0400 Subject: ext4: Don't reuse released data blocks until transaction commits We need to 
make sure we don't reuse the data blocks released during the transaction until the transaction commits. We force this mode only for ordered and journalled mode. Writeback mode already doesn't provide data consistency. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd2ece2..b9821be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, /* this isn't the right place to decide whether block is metadata * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * block released untill the transaction commit. + * writeback mode have weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. + */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) metadata = 1; sb = inode->i_sb; -- cgit v1.1 From 8a0aba733db1adb5e1f0e828889a18f4c1c512de Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Oct 2008 10:06:27 -0400 Subject: ext4: let the block device know when unused blocks can be discarded Let the block device know when unused blocks can be discarded, using the new sb_issue_discard() interface.
Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 7 +++++++ fs/ext4/mballoc.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bd9b011..815a22e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2653,6 +2653,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); int err, count = 0, count2 = 0; struct ext4_free_data *entry; + ext4_fsblk_t discard_block; if (list_empty(&sbi->s_committed_transaction)) return; @@ -2696,6 +2697,12 @@ ext4_mb_free_committed_blocks(struct super_block *sb) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9e815c4..94cb7b9 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" -- cgit v1.1 From af6f029d3836eb7264cd3fbb13a6baf0e5fdb5ea Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 14 Oct 2008 09:20:19 -0400 Subject: ext4: Use tag dirty lookup during mpage_da_submit_io This enables us to drop the range_cont writeback mode use from ext4_da_writepages. 
Signed-off-by: Aneesh Kumar K.V --- fs/ext4/inode.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b4ec9d..4ee3f069 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1656,17 +1656,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) while (index <= end) { /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. + */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - err = mapping->a_ops->writepage(page, mpd->wbc); if (!err) mpd->pages_written++; @@ -2361,7 +2367,6 @@ static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { handle_t *handle = NULL; - loff_t range_start = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; int needed_blocks, ret = 0, nr_to_writebump = 0; @@ -2386,14 +2391,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = sbi->s_mb_stream_request; } - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - range_start = wbc->range_start; pages_skipped = wbc->pages_skipped; mpd.wbc = wbc; @@ -2452,9 +2450,8 @@ restart_loop: wbc->nr_to_write = to_write; } - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { + if (!wbc->range_cyclic && (pages_skipped != 
wbc->pages_skipped)) { /* We skipped pages in this loop */ - wbc->range_start = range_start; wbc->nr_to_write = to_write + wbc->pages_skipped - pages_skipped; wbc->pages_skipped = pages_skipped; @@ -2463,7 +2460,6 @@ restart_loop: out_writepages: wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; return ret; } -- cgit v1.1 From 74baaaaec8b4f22e1ae279f5ecca4ff705b28912 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 14 Oct 2008 09:21:02 -0400 Subject: vfs: Remove the range_cont writeback mode. Ext4 was the only user of range_cont writeback mode and ext4 switched to a different method. So remove the range_cont mode which is not used in the kernel. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" CC: linux-fsdevel@vger.kernel.org --- include/linux/writeback.h | 1 - mm/page-writeback.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 12b15c5..bd91987 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,7 +63,6 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ - unsigned range_cont:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c130a13..e373f14 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -961,8 +961,6 @@ retry: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; - if (wbc->range_cont) - wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); -- cgit v1.1 From 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 16 Oct 2008 10:09:17 -0400 Subject: vfs: Add no_nrwrite_index_update writeback control flag If no_nrwrite_index_update is set we don't update nr_to_write and address space writeback_index in 
write_cache_pages. This change enables a file system to skip these updates in write_cache_pages and do them in the writepages() callback. This patch will be followed by an ext4 patch that make use of these new flags. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" CC: linux-fsdevel@vger.kernel.org --- include/linux/writeback.h | 9 +++++++++ mm/page-writeback.c | 10 +++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index bd91987..e585657 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,6 +63,15 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + /* + * write_cache_pages() won't update wbc->nr_to_write and + * mapping->writeback_index if no_nrwrite_index_update + * is set. write_cache_pages() may write more than we + * requested and we want to make sure nr_to_write and + * writeback_index are updated in a consistent manner + * so we use a single control to update them + */ + unsigned no_nrwrite_index_update:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e373f14..b40f6d5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t end; /* Inclusive */ int scanned = 0; int range_whole = 0; + long nr_to_write = wbc->nr_to_write; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -939,7 +940,7 @@ retry: unlock_page(page); ret = 0; } - if (ret || (--(wbc->nr_to_write) <= 0)) + if (ret || (--nr_to_write <= 0)) done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -958,8 +959,11 @@ retry: index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if 
(!wbc->no_nrwrite_index_update) { + if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) + mapping->writeback_index = index; + wbc->nr_to_write = nr_to_write; + } return ret; } -- cgit v1.1 From 22208dedbd7626e5fc4339c417f8d24cc21f79d7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 16 Oct 2008 10:10:36 -0400 Subject: ext4: Fix file fragmentation during large file write. The range_cyclic writeback mode uses the address_space writeback_index as the start index for writeback. With delayed allocation we were updating writeback_index wrongly, resulting in a highly fragmented file. This patch reduces the number of extents from 4000 to 27 for a 3GB file. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 91 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4ee3f069..27fc6b9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ /* * We can use PAGECACHE_TAG_DIRTY lookup here because * even though we have cleared the dirty flag on the page @@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue because @@ -2110,7 +2115,6 @@ static
int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2125,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* * Handle last extent of pages */ if (!mpd->io_done && mpd->next_page != mpd->first_page) { if (mpage_da_map_blocks(mpd) == 0) mpage_da_submit_io(mpd); - } - wbc->nr_to_write = to_write - mpd->pages_written; + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2366,11 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2390,16 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = 
wbc->pages_skipped; + + while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. So we need @@ -2420,46 +2436,53 @@ restart_loop: dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; - } - - if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } -- 
cgit v1.1 From 5128273a326679970b9196a27ff812670927c1c4 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 17 Oct 2008 09:16:19 -0400 Subject: ext4: Add missing newlines to printk messages There are some newlines missing in ext4_check_descriptors, which cause the printk level to be printed out when the next printk call is made: [ 778.847265] EXT4-fs: ext4_check_descriptors: Block bitmap for group 0 not in group (block 1509949442)!<3>EXT4-fs: group descriptors corrupted! [ 802.646630] EXT4-fs: ext4_check_descriptors: Inode bitmap for group 0 not in group (block 9043971)!<3>EXT4-fs: group descriptors corrupted! Signed-off-by: Eric Sesterhenn Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4f41107..a97e9eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1618,14 +1618,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " - "(block %llu)!", i, block_bitmap); + "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " - "(block %llu)!", i, inode_bitmap); + "(block %llu)!\n", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); @@ -1633,7 +1633,7 @@ static int ext4_check_descriptors(struct super_block *sb) inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " - "(block %llu)!", i, inode_table); + "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); -- cgit v1.1 From 0b09923eabd92f11a8b272dd3fd0347332d0e1e2 Mon Sep 17 00:00:00 2001 From: Manish Katiyar 
Date: Fri, 17 Oct 2008 14:58:45 -0400 Subject: ext4: Remove compile warnings when building w/o CONFIG_PROC_FS Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 815a22e..da1da1f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2720,6 +2720,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) static int ext4_mb_init_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; @@ -2743,10 +2744,14 @@ err_out: remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; +#else + return 0; +#endif } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc == NULL) @@ -2758,7 +2763,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - +#endif return 0; } -- cgit v1.1 From 01436ef2e4710317f826c4893b31c07b2d8df88c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Oct 2008 07:22:35 -0400 Subject: ext4: Remove unused mount options: nomballoc, mballoc, nocheck These mount options don't actually do anything any more, so remove them. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 - fs/ext4/super.c | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6690a41..4880cc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,7 +511,6 @@ do { \ /* * Mount flags */ -#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a97e9eb..95127f0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -904,7 +904,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, @@ -915,7 +915,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks }; @@ -933,8 +933,6 @@ static const match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, {Opt_oldalloc, "oldalloc"}, {Opt_orlov, "orlov"}, @@ -973,8 +971,6 @@ static const match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, 
"mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, @@ -1073,9 +1069,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_nouid32: set_opt(sbi->s_mount_opt, NO_UID32); break; - case Opt_nocheck: - clear_opt(sbi->s_mount_opt, CHECK); - break; case Opt_debug: set_opt(sbi->s_mount_opt, DEBUG); break; -- cgit v1.1 From 22359f5745eb26bd3205a1ede7968c8944398220 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Fri, 17 Oct 2008 09:15:14 -0400 Subject: ext4: Update Documentation/filesystems/ext4.txt Since Ext4 is supposed to be stable in 2.6.28-rc, ext4's documentation file should be updated. [ More updates also added by Theodore Ts'o. ] Signed-off-by: Diego Calleja Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index eb154ef..174eaff 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -2,19 +2,24 @@ Ext4 Filesystem =============== -This is a development version of the ext4 filesystem, an advanced level -of the ext3 filesystem which incorporates scalability and reliability -enhancements for supporting large filesystems (64 bit) in keeping with -increasing disk capacities and state-of-the-art feature requirements. +Ext4 is an advanced level of the ext3 filesystem which incorporates +scalability and reliability enhancements for supporting large filesystems +(64 bit) in keeping with increasing disk capacities and state-of-the-art +feature requirements. -Mailing list: linux-ext4@vger.kernel.org +Mailing list: linux-ext4@vger.kernel.org +Web site: http://ext4.wiki.kernel.org 1.
Quick usage instructions: =========================== +Note: More extensive information for getting started with ext4 can be + found at the ext4 wiki site at the URL: + http://ext4.wiki.kernel.org/index.php/Ext4_Howto + - Compile and install the latest version of e2fsprogs (as of this - writing version 1.41) from: + writing version 1.41.3) from: http://sourceforge.net/project/showfiles.php?group_id=2406 @@ -36,11 +41,9 @@ Mailing list: linux-ext4@vger.kernel.org # mke2fs -t ext4 /dev/hda1 - Or configure an existing ext3 filesystem to support extents and set - the test_fs flag to indicate that it's ok for an in-development - filesystem to touch this filesystem: + Or to configure an existing ext3 filesystem to support extents: - # tune2fs -O extents -E test_fs /dev/hda1 + # tune2fs -O extents /dev/hda1 If the filesystem was created with 128 byte inodes, it can be converted to use 256 byte for greater efficiency via: @@ -104,8 +107,8 @@ exist yet so I'm not sure they're in the near-term roadmap. The big performance win will come with mballoc, delalloc and flex_bg grouping of bitmaps and inode tables. Some test results available here: - - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html - - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html + - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html + - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html 3. Options ========== @@ -214,9 +217,6 @@ noreservation bsddf (*) Make 'df' act like BSD. minixdf Make 'df' act like Minix. -check=none Don't do extra checking of bitmaps on mount. -nocheck - debug Extra debugging information is sent to syslog. errors=remount-ro(*) Remount the filesystem read-only on an error. @@ -253,8 +253,6 @@ nobh (a) cache disk block mapping information "nobh" option tries to avoid associating buffer heads (supported only for "writeback" mode). 
-mballoc (*) Use the multiple block allocator for block allocation -nomballoc disabled multiple block allocator for block allocation. stripe=n Number of filesystem blocks that mballoc will try to use for allocation size and alignment. For RAID5/6 systems this should be the number of data -- cgit v1.1 From 3e624fc72fba09b6f999a9fbb87b64efccd38036 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Oct 2008 20:00:24 -0400 Subject: ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback The multiblock allocator needs to be able to release blocks (and issue a blkdev discard request) when the transaction which freed those blocks is committed. Previously this was done via a polling mechanism when blocks are allocated or freed. A much better way of doing things is to create a jbd2 callback function and attach the list of blocks to be freed directly to the transaction structure. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 3 -- fs/ext4/mballoc.c | 85 +++++++++------------------------------------------ fs/ext4/mballoc.h | 3 +- fs/jbd2/commit.c | 3 ++ fs/jbd2/transaction.c | 1 + include/linux/jbd2.h | 9 ++++++ 6 files changed, 29 insertions(+), 75 deletions(-) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6a0b40d..445fde6 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -99,9 +99,6 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index da1da1f..dfe17a1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2523,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); -
INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2554,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2583,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2645,36 +2635,25 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
+ */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - struct ext4_sb_info *sbi = EXT4_SB(sb); int err, count = 0, count2 = 0; struct ext4_free_data *entry; ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - entry = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - entry = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_data, list); - list_del(&entry->list); - } - spin_unlock(&sbi->s_md_lock); - - if (entry == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - entry->count, entry->group, entry); + entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2706,7 +2685,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - } while (1); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -4348,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4408,36 +4385,6 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. 
we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); -} /* * We can merge two free data extents only if the physical blocks @@ -4531,9 +4478,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, kmem_cache_free(ext4_free_ext_cachep, entry); } } - /* Add the extent to active_transaction list */ + /* Add the extent to transaction's private list */ spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &sbi->s_active_transaction); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; @@ -4562,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 94cb7b9..b5dff1f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -269,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static 
void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -278,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0abe02c..8b119e1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -995,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d5405..39b7805 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. 
*/ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 463d6f1..c7d106e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -641,6 +641,11 @@ struct transaction_s */ int t_handle_count; + /* + * For use by the filesystem to store fs-specific data + * structures associated with the transaction + */ + struct list_head t_private_list; }; struct transaction_run_stats_s { @@ -935,6 +940,10 @@ struct journal_s pid_t j_last_sync_writer; + /* This function is called when a transaction is closed */ + void (*j_commit_callback)(journal_t *, + transaction_t *); + /* * Journal statistics */ -- cgit v1.1 From f287a1a56130be5fdb96a4a62d1290bd064f308e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Oct 2008 22:50:48 -0400 Subject: ext4: Remove automatic enabling of the HUGE_FILE feature flag If the HUGE_FILE feature flag is not set, don't allow the creation of large files, instead of automatically enabling the feature flag. Recent versions of mke2fs will set the HUGE_FILE flag automatically anyway for ext4 filesystems. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 24 +++++----------- fs/ext4/super.c | 85 ++++++++++----------------------------------------------- 2 files changed, 21 insertions(+), 88 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 27fc6b9..8dbf695 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4194,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* @@ -4204,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle, raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ei->i_flags &= ~EXT4_HUGE_FILE_FL; } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; ei->i_flags |= EXT4_HUGE_FILE_FL; /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 95127f0..9b2b2bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -374,66 +374,6 @@ void 
ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ @@ -1771,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? 
*/ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* * CONFIG_LSF is not enabled implies the inode * i_block represent total blocks in 512 bytes @@ -1807,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; @@ -1820,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits) * total number of 512 bytes blocks of the file */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1933,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize; int db_count; int i; - int needs_recovery; + int needs_recovery, has_huge_files; __le32 features; __u64 blocks_count; int err; @@ -2074,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_id, le32_to_cpu(features)); goto failed_mount; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { /* * Large file size enabled file system can only be * mount if kernel is build with CONFIG_LSF @@ -2124,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = 
ext4_max_size(sb->s_blocksize_bits); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; -- cgit v1.1