From 659c6009ca2e3a01acc9881bafe5f55ef09c965b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:03 -0400 Subject: ext4: stop looping in ext4_num_dirty_pages when max_pages reached Today we simply break out of the inner loop when we have accumulated max_pages; this keeps scanning forwad and doing pagevec_lookup_tag() in the while (!done) loop, this does potentially a lot of work with no net effect. When we have accumulated max_pages, just clean up and return. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b8debe..d88ba4a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1207,8 +1207,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, break; idx++; num++; - if (num >= max_pages) + if (num >= max_pages) { + done = 1; break; + } } pagevec_release(&pvec); } -- cgit v1.1 From b443e7339aa08574d30b0819b344618459c76214 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:03 -0400 Subject: ext4: don't bump up LONG_MAX nr_to_write by a factor of 8 I'm uneasy with lots of stuff going on in ext4_da_writepages(), but bumping nr_to_write from LLONG_MAX to -8 clearly isn't making anything better, so avoid the multiplier in that case. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d88ba4a..50f3bba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3004,9 +3004,12 @@ static int ext4_da_writepages(struct address_space *mapping, * sbi->max_writeback_mb_bump whichever is smaller. */ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) - desired_nr_to_write = wbc->nr_to_write * 8; - else + if (!range_cyclic && range_whole) { + if (wbc->nr_to_write == LONG_MAX) + desired_nr_to_write = wbc->nr_to_write; + else + desired_nr_to_write = wbc->nr_to_write * 8; + } else desired_nr_to_write = ext4_num_dirty_pages(inode, index, max_pages); if (desired_nr_to_write > max_pages) -- cgit v1.1 From 0c9169ccad4aed233fdd49e95da4eada2536a06d Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Wed, 27 Oct 2010 21:30:07 -0400 Subject: ext4: fix potential infinite loop in ext4_da_writepages() On linux-2.6.36-rc2, if we execute the following script, we can hang the system when the /bin/sync command is executed: ======================================================================== #!/bin/sh echo -n "HANG UP TEST: " /bin/dd if=/dev/zero of=/tmp/img bs=1k count=1 seek=1M 2> /dev/null /sbin/mkfs.ext4 -Fq /tmp/img /bin/mount -o loop -t ext4 /tmp/img /mnt /bin/dd if=/dev/zero of=/mnt/file bs=1 count=1 \ seek=$((16*1024*1024*1024*1024-4096)) 2> /dev/null /bin/sync /bin/umount /mnt echo "DONE" exit 0 ======================================================================== We can see the following backtrace if we get the kdump when this hangup occurs: ====================================================================== kthread() => bdi_writeback_thread() => wb_do_writeback() => wb_writeback() => writeback_inodes_wb() => writeback_sb_inodes() => writeback_single_inode() => ext4_da_writepages() ---+ ^ infinite | | loop | +-------------+ ====================================================================== The reason why this hangup happens is described as follows: 1) We write the last extent block of the file whose size is the filesystem maximum size. 2) "BH_Delay" flag is set on the buffer_head of its block. 3) - the member, "m_lblk" of struct mpage_da_data is 4294967295 (UINT_MAX) - the member, "m_len" of struct mpage_da_data is 1 mpage_put_bnr_to_bhs() which is called via ext4_da_writepages() cannot clear "BH_Delay" flag of the buffer_head because the type of m_lblk is ext4_lblk_t and then m_lblk + m_len is overflow. Therefore an infinite loop occurs because ext4_da_writepages() cannot write the page (which corresponds to the block) since "BH_Delay" flag isn't cleared. ---------------------------------------------------------------------- static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, struct ext4_map_blocks *map) { ... int blocks = map->m_len; ... do { // cur_logical = 4294967295 // map->m_lblk = 4294967295 // blocks = 1 // *** map->m_lblk + blocks == 0 (OVERFLOW!) *** // (cur_logical >= map->m_lblk + blocks) => true if (cur_logical >= map->m_lblk + blocks) break; ---------------------------------------------------------------------- NOTE: Mounting with the nodelalloc option will avoid this codepath, and thus, avoid this hang Signed-off-by: Toshiyuki Okajima Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 50f3bba..1e824a3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2105,7 +2105,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, } while ((bh = bh->b_this_page) != head); do { - if (cur_logical >= map->m_lblk + blocks) + if (cur_logical > map->m_lblk + (blocks - 1)) break; if (buffer_delay(bh) || buffer_unwritten(bh)) { -- cgit v1.1 From c999af2b347a55174f702702e0df814d05ef5491 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:07 -0400 Subject: ext4: queue conversion after adding to inode's completed IO list By queuing the io end on the unwritten workqueue before adding it to our inode's list of completed IOs, I think we run the risk of the work getting completed, and the IO freed, before we try to add it to the inode's i_completed_io_list. It should be safe to add it to the inode's list of completed IOs, and -then- queue it for completion, I think. Thanks to Dave Chinner for pointing out the race. Signed-off-by: Eric Sandeen Reviewed-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e824a3..670ab15 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3850,14 +3850,14 @@ out: } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); - /* Add the io_end to per-inode completed aio dio list*/ ei = EXT4_I(io_end->inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); iocb->private = NULL; } -- cgit v1.1 From 5a87b7a5da250c9be6d757758425dfeaf8ed3179 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:09 -0400 Subject: ext4: call mpage_da_submit_io() from mpage_da_map_blocks() Eventually we need to completely reorganize the ext4 writepage callpath, but for now, we simplify things a little by calling mpage_da_submit_io() from mpage_da_map_blocks(), since all of the places where we call mpage_da_map_blocks() it is followed up by a call to mpage_da_submit_io(). We're also a wee bit better with respect to error handling, but there are still a number of issues where it's not clear what the right thing is to do with ext4 functions deep in the writeback codepath fails. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 66 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 670ab15..55961ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -60,6 +60,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, } static void ext4_invalidatepage(struct page *page, unsigned long offset); +static int ext4_writepage(struct page *page, struct writeback_control *wbc); /* * Test whether an inode is a fast symlink. @@ -2033,7 +2034,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) BUG_ON(PageWriteback(page)); pages_skipped = mpd->wbc->pages_skipped; - err = mapping->a_ops->writepage(page, mpd->wbc); + err = ext4_writepage(page, mpd->wbc); if (!err && (pages_skipped == mpd->wbc->pages_skipped)) /* * have successfully written the page @@ -2189,14 +2190,15 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * mpage_da_map_blocks - go through given space + * mpage_da_map_and_submit - go through given space, map them + * if necessary, and then submit them for I/O * * @mpd - bh describing space * * The function skips space we know is already mapped to disk blocks. * */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static void mpage_da_map_and_submit(struct mpage_da_data *mpd) { int err, blks, get_blocks_flags; struct ext4_map_blocks map; @@ -2206,18 +2208,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) handle_t *handle = NULL; /* - * We consider only non-mapped and non-allocated blocks + * If the blocks are mapped already, or we couldn't accumulate + * any blocks, then proceed immediately to the submission stage. */ - if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten))) - return 0; - - /* - * If we didn't accumulate anything to write simply return - */ - if (!mpd->b_size) - return 0; + if ((mpd->b_size == 0) || + ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten)))) + goto submit_io; handle = ext4_journal_current_handle(); BUG_ON(!handle); @@ -2254,17 +2252,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) err = blks; /* - * If get block returns with error we simply - * return. Later writepage will redirty the page and - * writepages will find the dirty page again + * If get block returns EAGAIN or ENOSPC and there + * appears to be free blocks we will call + * ext4_writepage() for all of the pages which will + * just redirty the pages. */ if (err == -EAGAIN) - return 0; + goto submit_io; if (err == -ENOSPC && ext4_count_free_blocks(sb)) { mpd->retval = err; - return 0; + goto submit_io; } /* @@ -2289,7 +2288,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); - return err; + return; } BUG_ON(blks == 0); @@ -2312,7 +2311,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); if (err) - return err; + /* This only happens if the journal is aborted */ + return; } /* @@ -2323,10 +2323,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) disksize = i_size_read(mpd->inode); if (disksize > EXT4_I(mpd->inode)->i_disksize) { ext4_update_i_disksize(mpd->inode, disksize); - return ext4_mark_inode_dirty(handle, mpd->inode); + err = ext4_mark_inode_dirty(handle, mpd->inode); + if (err) + ext4_error(mpd->inode->i_sb, + "Failed to mark inode %lu dirty", + mpd->inode->i_ino); } - return 0; +submit_io: + mpage_da_submit_io(mpd); + mpd->io_done = 1; } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ @@ -2403,9 +2409,7 @@ flush_it: * We couldn't merge the block to our extent, so we * need to flush current extent and start new one */ - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); - mpd->io_done = 1; + mpage_da_map_and_submit(mpd); return; } @@ -2437,15 +2441,13 @@ static int __mpage_da_writepage(struct page *page, if (mpd->next_page != page->index) { /* * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using writepage() + * and start IO on them */ if (mpd->next_page != mpd->first_page) { - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); + mpage_da_map_and_submit(mpd); /* * skip rest of the page in the page_vec */ - mpd->io_done = 1; redirty_page_for_writepage(wbc, page); unlock_page(page); return MPAGE_DA_EXTENT_TAIL; @@ -3071,9 +3073,7 @@ retry: * them for I/O. */ if (!mpd.io_done && mpd.next_page != mpd.first_page) { - if (mpage_da_map_blocks(&mpd) == 0) - mpage_da_submit_io(&mpd); - mpd.io_done = 1; + mpage_da_map_and_submit(&mpd); ret = MPAGE_DA_EXTENT_TAIL; } trace_ext4_da_write_pages(inode, &mpd); -- cgit v1.1 From a42afc5f56f319107e987aa6adf2f65d93d527c7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:09 -0400 Subject: ext4: simplify ext4_writepage() The actual code in ext4_writepage() is unnecessarily convoluted. Simplify it so it is easier to understand, but otherwise logically equivalent. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 73 ++++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 48 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 55961ff..a08ec79 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2704,7 +2704,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int ext4_writepage(struct page *page, struct writeback_control *wbc) { - int ret = 0; + int ret = 0, commit_write = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; @@ -2717,60 +2717,37 @@ static int ext4_writepage(struct page *page, else len = PAGE_CACHE_SIZE; - if (page_has_buffers(page)) { - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation - * So redirty the page and return - * We may reach here when we do a journal commit - * via journal_submit_inode_data_buffers. - * If we don't have mapping block we just ignore - * them. We can also reach here via shrink_page_list - */ + /* + * If the page does not have buffers (for whatever reason), + * try to create them using block_prepare_write. If this + * fails, redirty the page and move on. + */ + if (!page_buffers(page)) { + if (block_prepare_write(page, 0, len, + noalloc_get_block_write)) { + redirty_page: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } - } else { + commit_write = 1; + } + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* - * The test for page_has_buffers() is subtle: - * We know the page is dirty but it lost buffers. That means - * that at some moment in time after write_begin()/write_end() - * has been called all buffers have been clean and thus they - * must have been written at least once. So they are all - * mapped and we can happily proceed with mapping them - * and writing the page. - * - * Try to initialize the buffer_heads and check whether - * all are mapped and non delay. We don't want to - * do block allocation here. + * We don't want to do block allocation So redirty the + * page and return We may reach here when we do a + * journal commit via + * journal_submit_inode_data_buffers. If we don't + * have mapping block we just ignore them. We can also + * reach here via shrink_page_list */ - ret = block_prepare_write(page, 0, len, - noalloc_get_block_write); - if (!ret) { - page_bufs = page_buffers(page); - /* check whether all are mapped and non delay */ - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - } else { - /* - * We can't do block allocation here - * so just redity the page and unlock - * and return - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } + goto redirty_page; + } + if (commit_write) /* now mark the buffer_heads as dirty and uptodate */ block_commit_write(page, 0, len); - } if (PageChecked(page) && ext4_should_journal_data(inode)) { /* @@ -2781,7 +2758,7 @@ static int ext4_writepage(struct page *page, return __ext4_journalled_writepage(page, len); } - if (page_bufs && buffer_uninit(page_bufs)) { + if (buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); ret = block_write_full_page_endio(page, noalloc_get_block_write, wbc, ext4_end_io_buffer_write); -- cgit v1.1 From cb20d5188366f04d96d2e07b1240cc92170ade40 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:09 -0400 Subject: ext4: inline ext4_writepage() into mpage_da_submit_io() As a prepratory step to switching to bio_submit, inline ext4_writepage() into mpage_da_submit() and then simplify things a bit. This makes it clearer what mpage_da_submit needs to do. Also, move the ClearPageChecked(page) call into __ext4_journalled_writepage(), as a minor bit of cleanup refactoring. This also allows us to pull i_size_read() and ext4_should_journal_data() out of the loop, which should be a very minor CPU savings. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 11 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a08ec79..97a0c35 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -60,7 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, } static void ext4_invalidatepage(struct page *page, unsigned long offset); -static int ext4_writepage(struct page *page, struct writeback_control *wbc); +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); /* * Test whether an inode is a fast symlink. @@ -2000,12 +2005,15 @@ static void ext4_da_page_release_reservation(struct page *page, */ static int mpage_da_submit_io(struct mpage_da_data *mpd) { - long pages_skipped; struct pagevec pvec; unsigned long index, end; int ret = 0, err, nr_pages, i; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + loff_t size = i_size_read(inode); + unsigned int len; + struct buffer_head *page_bufs = NULL; + int journal_data = ext4_should_journal_data(inode); BUG_ON(mpd->next_page <= mpd->first_page); /* @@ -2023,28 +2031,69 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { + int commit_write = 0; struct page *page = pvec.pages[i]; index = page->index; if (index > end) break; + + if (index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; index++; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - pages_skipped = mpd->wbc->pages_skipped; - err = ext4_writepage(page, mpd->wbc); - if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * If the page does not have buffers (for + * whatever reason), try to create them using + * block_prepare_write. If this fails, + * redirty the page and move on. + */ + if (!page_has_buffers(page)) { + if (block_prepare_write(page, 0, len, + noalloc_get_block_write)) { + redirty_page: + redirty_page_for_writepage(mpd->wbc, + page); + unlock_page(page); + continue; + } + commit_write = 1; + } + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* - * have successfully written the page - * without skipping the same + * We couldn't do block allocation for + * some reason. */ + goto redirty_page; + } + + if (commit_write) + /* mark the buffer_heads as dirty & uptodate */ + block_commit_write(page, 0, len); + + if (journal_data && PageChecked(page)) + err = __ext4_journalled_writepage(page, len); + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); + + if (!err) mpd->pages_written++; /* * In error case, we have to continue because * remaining pages are still locked - * XXX: unlock and re-dirty them? */ if (ret == 0) ret = err; @@ -2627,6 +2676,7 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; + ClearPageChecked(page); page_bufs = page_buffers(page); BUG_ON(!page_bufs); walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); @@ -2749,14 +2799,12 @@ static int ext4_writepage(struct page *page, /* now mark the buffer_heads as dirty and uptodate */ block_commit_write(page, 0, len); - if (PageChecked(page) && ext4_should_journal_data(inode)) { + if (PageChecked(page) && ext4_should_journal_data(inode)) /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ - ClearPageChecked(page); return __ext4_journalled_writepage(page, len); - } if (buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); -- cgit v1.1 From 3ecdb3a193a5f224f084c04a63aa28cdccf4d7d0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:10 -0400 Subject: ext4: inline walk_page_buffers() into mpage_da_submit_io Expand the call: if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) goto redirty_page into mpage_da_submit_io(). This will allow us to merge in mpage_put_bnr_to_bhs() in the next patch. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 97a0c35..5da6cfc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2011,8 +2011,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; loff_t size = i_size_read(inode); - unsigned int len; - struct buffer_head *page_bufs = NULL; + unsigned int len, block_start; + struct buffer_head *bh, *page_bufs = NULL; int journal_data = ext4_should_journal_data(inode); BUG_ON(mpd->next_page <= mpd->first_page); @@ -2064,15 +2064,17 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) } commit_write = 1; } - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We couldn't do block allocation for - * some reason. - */ - goto redirty_page; - } + + bh = page_bufs = page_buffers(page); + block_start = 0; + do { + /* redirty page if block allocation undone */ + if (!bh || buffer_delay(bh) || + buffer_unwritten(bh)) + goto redirty_page; + bh = bh->b_this_page; + block_start += bh->b_size; + } while ((bh != page_bufs) && (block_start < len)); if (commit_write) /* mark the buffer_heads as dirty & uptodate */ -- cgit v1.1 From 1de3e3df917459422cb2aecac440febc8879d410 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:10 -0400 Subject: ext4: move mpage_put_bnr_to_bhs()'s functionality to mpage_da_submit_io() This massively simplifies the ext4_da_writepages() code path by completely removing mpage_put_bnr_bhs(), which is almost 100 lines of code iterating over a set of pages using pagevec_lookup(), and folds that functionality into mpage_da_submit_io()'s existing pagevec_lookup() loop. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 139 ++++++++++++++++---------------------------------------- 1 file changed, 38 insertions(+), 101 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5da6cfc..c65d647 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2003,7 +2003,8 @@ static void ext4_da_page_release_reservation(struct page *page, * * As pages are already locked by write_cache_pages(), we can't use it */ -static int mpage_da_submit_io(struct mpage_da_data *mpd) +static int mpage_da_submit_io(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) { struct pagevec pvec; unsigned long index, end; @@ -2014,6 +2015,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) unsigned int len, block_start; struct buffer_head *bh, *page_bufs = NULL; int journal_data = ext4_should_journal_data(inode); + sector_t pblock = 0, cur_logical = 0; BUG_ON(mpd->next_page <= mpd->first_page); /* @@ -2031,7 +2033,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - int commit_write = 0; + int commit_write = 0, redirty_page = 0; struct page *page = pvec.pages[i]; index = page->index; @@ -2042,6 +2044,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; + if (map) { + cur_logical = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + pblock = map->m_pblk + (cur_logical - + map->m_lblk); + } index++; BUG_ON(!PageLocked(page)); @@ -2068,13 +2076,34 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) bh = page_bufs = page_buffers(page); block_start = 0; do { - /* redirty page if block allocation undone */ - if (!bh || buffer_delay(bh) || - buffer_unwritten(bh)) + if (!bh) goto redirty_page; + if (map && (cur_logical >= map->m_lblk) && + (cur_logical <= (map->m_lblk + + (map->m_len - 1)))) { + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } + if (buffer_unwritten(bh) || + buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + if (map->m_flags & EXT4_MAP_UNINIT) + set_buffer_uninit(bh); + clear_buffer_unwritten(bh); + } + + /* redirty page if block allocation undone */ + if (buffer_delay(bh) || buffer_unwritten(bh)) + redirty_page = 1; bh = bh->b_this_page; block_start += bh->b_size; - } while ((bh != page_bufs) && (block_start < len)); + cur_logical++; + pblock++; + } while (bh != page_bufs); + + if (redirty_page) + goto redirty_page; if (commit_write) /* mark the buffer_heads as dirty & uptodate */ @@ -2105,91 +2134,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) return ret; } -/* - * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers - * - * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten - */ -static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - int blocks = map->m_len; - sector_t pblock = map->m_pblk, cur_logical; - struct buffer_head *head, *bh; - pgoff_t index, end; - struct pagevec pvec; - int nr_pages, i; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - - while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(!page_has_buffers(page)); - - bh = page_buffers(page); - head = bh; - - /* skip blocks out of the range */ - do { - if (cur_logical >= map->m_lblk) - break; - cur_logical++; - } while ((bh = bh->b_this_page) != head); - - do { - if (cur_logical > map->m_lblk + (blocks - 1)) - break; - - if (buffer_delay(bh) || buffer_unwritten(bh)) { - - BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); - - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock; - } else { - /* - * unwritten already should have - * blocknr assigned. Verify that - */ - clear_buffer_unwritten(bh); - BUG_ON(bh->b_blocknr != pblock); - } - - } else if (buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); - cur_logical++; - pblock++; - } while ((bh = bh->b_this_page) != head); - } - pagevec_release(&pvec); - } -} - - static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, sector_t logical, long blk_cnt) { @@ -2252,7 +2196,7 @@ static void ext4_print_free_blocks(struct inode *inode) static void mpage_da_map_and_submit(struct mpage_da_data *mpd) { int err, blks, get_blocks_flags; - struct ext4_map_blocks map; + struct ext4_map_blocks map, *mapp = NULL; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; loff_t disksize = EXT4_I(mpd->inode)->i_disksize; @@ -2343,6 +2287,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) } BUG_ON(blks == 0); + mapp = ↦ if (map.m_flags & EXT4_MAP_NEW) { struct block_device *bdev = mpd->inode->i_sb->s_bdev; int i; @@ -2351,14 +2296,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) unmap_underlying_metadata(bdev, map.m_pblk + i); } - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if ((mpd->b_state & (1 << BH_Delay)) || - (mpd->b_state & (1 << BH_Unwritten))) - mpage_put_bnr_to_bhs(mpd, &map); - if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); if (err) @@ -2382,7 +2319,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) } submit_io: - mpage_da_submit_io(mpd); + mpage_da_submit_io(mpd, mapp); mpd->io_done = 1; } -- cgit v1.1 From bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:10 -0400 Subject: ext4: use bio layer instead of buffer layer in mpage_da_submit_io Call the block I/O layer directly instad of going through the buffer layer. This should give us much better performance and scalability, as well as lowering our CPU utilization when doing buffered writeback. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 118 +++++++------------------------------------------------- 1 file changed, 14 insertions(+), 104 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c65d647..58604fe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2016,8 +2016,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, struct buffer_head *bh, *page_bufs = NULL; int journal_data = ext4_should_journal_data(inode); sector_t pblock = 0, cur_logical = 0; + struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); + memset(&io_submit, 0, sizeof(io_submit)); /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. @@ -2109,16 +2111,16 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, /* mark the buffer_heads as dirty & uptodate */ block_commit_write(page, 0, len); - if (journal_data && PageChecked(page)) + /* + * Delalloc doesn't support data journalling, + * but eventually maybe we'll lift this + * restriction. + */ + if (unlikely(journal_data && PageChecked(page))) err = __ext4_journalled_writepage(page, len); - else if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - err = block_write_full_page_endio(page, - noalloc_get_block_write, - mpd->wbc, ext4_end_io_buffer_write); - } else - err = block_write_full_page(page, - noalloc_get_block_write, mpd->wbc); + else + err = ext4_bio_write_page(&io_submit, page, + len, mpd->wbc); if (!err) mpd->pages_written++; @@ -2131,6 +2133,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, } pagevec_release(&pvec); } + ext4_io_submit(&io_submit); return ret; } @@ -3426,15 +3429,6 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - if (io->page) - put_page(io->page); - iput(io->inode); - kfree(io); -} - static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) { struct buffer_head *head, *bh; @@ -3640,68 +3634,6 @@ static void dump_completed_IO(struct inode * inode) } /* - * check a range of space and convert unwritten extents to written. - */ -static int ext4_end_io_nolock(ext4_io_end_t *io) -{ - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - int ret = 0; - - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," - "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); - - if (list_empty(&io->list)) - return ret; - - if (io->flag != EXT4_IO_UNWRITTEN) - return ret; - - ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten" - "extents to written extents, error is %d" - " io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; - } - - if (io->iocb) - aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - io->flag = 0; - return ret; -} - -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) -{ - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret; - - mutex_lock(&inode->i_mutex); - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - mutex_unlock(&inode->i_mutex); - ext4_free_io_end(io); -} - -/* * This function is called from ext4_sync_file(). * * When IO is completed, the work to convert unwritten extents to @@ -3756,28 +3688,6 @@ int flush_completed_IO(struct inode *inode) return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) -{ - ext4_io_end_t *io = NULL; - - io = kmalloc(sizeof(*io), flags); - - if (io) { - igrab(inode); - io->inode = inode; - io->flag = 0; - io->offset = 0; - io->size = 0; - io->page = NULL; - io->iocb = NULL; - io->result = 0; - INIT_WORK(&io->work, ext4_end_io_work); - INIT_LIST_HEAD(&io->list); - } - - return io; -} - static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private, int ret, bool is_async) @@ -3797,7 +3707,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != EXT4_IO_UNWRITTEN){ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); iocb->private = NULL; out: @@ -3842,7 +3752,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_UNWRITTEN; + io_end->flag = EXT4_IO_END_UNWRITTEN; inode = io_end->inode; /* Add the io_end to per-inode completed io list*/ -- cgit v1.1 From 877836905da55e8f2426234f42a89287184949e9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 21:30:11 -0400 Subject: ext4: Check return value of sb_getblk() and friends Fail block allocation if sb_getblk() returns NULL. In that case, sb_find_get_block() also likely to fail so that it should skip calling ext4_forget(). Signed-off-by: Namhyung Kim Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 58604fe..077c3c9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -761,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); -- cgit v1.1 From bbd08344e3df8c7c1d7aa04bc0c8c9367806e12d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:12 -0400 Subject: ext4: tidy up a void argument in inode.c This doesn't fix anything at all, it just removes a vestige of prior use from __mpage_da_writepage() __mpage_da_writepage() had a *void argument leftover from its previous life as a callback; make it reflect the actual type. Fixing this up makes it slightly more obvious to read, and enables proper typechecking. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 077c3c9..6671fcb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2424,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) * The function finds extents of pages and scan them for all blocks. */ static int __mpage_da_writepage(struct page *page, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, + struct mpage_da_data *mpd) { - struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; struct buffer_head *bh, *head; sector_t logical; -- cgit v1.1 From 5b41d92437f1ae19b3f3ffa3b16589fd5df50ac0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:13 -0400 Subject: ext4: implement writeback livelock avoidance using page tagging This is analogous to Jan Kara's commit, f446daaea9d4a420d16c606f755f3689dcb2d0ce mm: implement writeback livelock avoidance using page tagging but since we forked write_cache_pages, we need to reimplement it there (and in ext4_da_writepages, since range_cyclic handling was moved to there) If you start a large buffered IO to a file, and then set fsync after it, you'll find that fsync does not complete until the other IO stops. If you continue re-dirtying the file (say, putting dd with conv=notrunc in a loop), when fsync finally completes (after all IO is done), it reports via tracing that it has written many more pages than the file contains; in other words it has synced and re-synced pages in the file multiple times. This then leads to problems with our writeback_index update, since it advances it by pages written, and essentially sets writeback_index off the end of the file... With the following patch, we only sync as much as was dirty at the time of the sync. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6671fcb..c9ea95b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2809,16 +2809,21 @@ static int write_cache_pages_da(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ long nr_to_write = wbc->nr_to_write; + int tag; pagevec_init(&pvec, 0); index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -2923,6 +2928,7 @@ static int ext4_da_writepages(struct address_space *mapping, long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t end; trace_ext4_da_writepages(inode, wbc); @@ -2958,8 +2964,11 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->range_start = index << PAGE_CACHE_SHIFT; wbc->range_end = LLONG_MAX; wbc->range_cyclic = 0; - } else + end = -1; + } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } /* * This works around two forms of stupidity. The first is in @@ -3000,6 +3009,9 @@ static int ext4_da_writepages(struct address_space *mapping, pages_skipped = wbc->pages_skipped; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!ret && wbc->nr_to_write > 0) { /* -- cgit v1.1 From 72f84e6560d18d60a091df27edf81409be6641cb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:13 -0400 Subject: ext4: update writeback_index based on last page scanned As pointed out in a prior patch, updating the mapping's writeback_index based on pages written isn't quite right; what the writeback index is really supposed to reflect is the next page which should be scanned for writeback during periodic flush. As in write_cache_pages(), write_cache_pages_da() does this scanning for us as we assemble the mpd for later writeout. If we keep track of the next page after the current scan, we can easily update writeback_index without worrying about pages written vs. pages skipped, etc. Without this, an fsync will reset writeback_index to 0 (its starting index) + however many pages it wrote, which can mess up the progress of periodic flush. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c9ea95b..45fc5bd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2800,12 +2800,13 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) */ static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, - struct mpage_da_data *mpd) + struct mpage_da_data *mpd, + pgoff_t *done_index) { int ret = 0; int done = 0; struct pagevec pvec; - int nr_pages; + unsigned nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ long nr_to_write = wbc->nr_to_write; @@ -2820,6 +2821,7 @@ static int write_cache_pages_da(struct address_space *mapping, else tag = PAGECACHE_TAG_DIRTY; + *done_index = index; while (!done && (index <= end)) { int i; @@ -2843,6 +2845,8 @@ static int write_cache_pages_da(struct address_space *mapping, break; } + *done_index = page->index + 1; + lock_page(page); /* @@ -2928,6 +2932,7 @@ static int ext4_da_writepages(struct address_space *mapping, long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t done_index = 0; pgoff_t end; trace_ext4_da_writepages(inode, wbc); @@ -3050,7 +3055,7 @@ retry: mpd.io_done = 0; mpd.pages_written = 0; mpd.retval = 0; - ret = write_cache_pages_da(mapping, wbc, &mpd); + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -3104,14 +3109,13 @@ retry: __func__, wbc->nr_to_write, ret); /* Update index */ - index += pages_written; wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = index; + mapping->writeback_index = done_index; out_writepages: wbc->nr_to_write -= nr_to_writebump; -- cgit v1.1 From 1f109d5a17b438c4a54cbf6fd87a249e3d72fb21 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:14 -0400 Subject: ext4: make various ext4 functions be static These functions have no need to be exported beyond file context. No functions needed to be moved for this commit; just some function declarations changed to be static and removed from header files. (A similar patch was submitted by Eric Sandeen, but I wanted to handle code movement in separate patches to make sure code changes didn't accidentally get dropped.) Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 45fc5bd..7a83c27 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5543,7 +5543,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * * Also account for superblock, inode, quota and xattr blocks */ -int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; -- cgit v1.1 From 4a873a472b3bbcfd425d7ae210afdec28c04e2e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:14 -0400 Subject: ext4: move flush_completed_IO to fs/ext4/fsync.c and make it static Fix a namespace leak by moving the function to the file where it is used and making it static. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 83 --------------------------------------------------------- 1 file changed, 83 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7a83c27..9e60d0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3626,89 +3626,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - if (list_empty(&ei->i_completed_io_list)) - return ret; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * convertion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (ret < 0) - ret2 = ret; - else - list_del_init(&io->list); - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? ret2 : 0; -} - static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private, int ret, bool is_async) -- cgit v1.1 From 3d287de3b828226e5a450c7fd5bf4283792dc2b0 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Oct 2010 22:08:46 -0400 Subject: ext4: optimize orphan_list handling for ext4_setattr Surprisingly chown() on ext4 is not SMP scalable operation. Due to unconditional orphan_del(NULL, inode) in ext4_setattr() result in significant performance overhead because of global orphan mutex, especially in no-journal mode (where orphan_add() is noop). It is possible to skip explicit orphan_del if possible. Results of fchown() micro-benchmark in no-journal mode while (1) { iteration++; fchown(fd, uid, gid); fchown(fd, uid + 1, gid + 1) } measured: iterations per millisecond | nr_tasks | w/o patch | with patch | | 1 | 142 | 185 | | 4 | 109 | 642 | Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9e60d0b..3ba237b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5281,6 +5281,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; int error, rc = 0; + int orphan = 0; const unsigned int ia_valid = attr->ia_valid; error = inode_change_ok(inode, attr); @@ -5336,8 +5337,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - - error = ext4_orphan_add(handle, inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) @@ -5355,6 +5358,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } ext4_orphan_del(handle, inode); + orphan = 0; ext4_journal_stop(handle); goto err_out; } @@ -5377,7 +5381,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ - if (inode->i_nlink) + if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) -- cgit v1.1