diff options
-rw-r--r-- | fs/ext4/ext4.h | 8 | ||||
-rw-r--r-- | fs/ext4/extents.c | 9 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 17 | ||||
-rw-r--r-- | fs/ext4/extents_status.h | 3 | ||||
-rw-r--r-- | fs/ext4/file.c | 4 | ||||
-rw-r--r-- | fs/ext4/inode.c | 85 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 6 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 121 | ||||
-rw-r--r-- | include/linux/journal-head.h | 8 | ||||
-rw-r--r-- | include/trace/events/ext4.h | 4 |
10 files changed, 119 insertions, 146 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0aabb34..5aae3d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -209,7 +209,6 @@ typedef struct ext4_io_end { ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { @@ -2651,14 +2650,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); +extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); extern void ext4_ioend_shutdown(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); -extern int ext4_put_io_end(ext4_io_end_t *io_end); -extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); -extern void ext4_io_submit_init(struct ext4_io_submit *io, - struct writeback_control *wbc); extern void ext4_end_io_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 107936d..bc0f1910 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3642,7 +3642,7 @@ int ext4_find_delalloc_range(struct inode *inode, { struct extent_status es; - ext4_es_find_delayed_extent(inode, lblk_start, &es); + ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ else if (es.es_lblk <= lblk_start && @@ -4608,9 +4608,10 @@ static int ext4_find_delayed_extent(struct inode *inode, struct extent_status es; ext4_lblk_t block, next_del; - ext4_es_find_delayed_extent(inode, newes->es_lblk, &es); - if (newes->es_pblk == 0) { + ext4_es_find_delayed_extent_range(inode, newes->es_lblk, + newes->es_lblk + newes->es_len - 1, &es); + /* * No extent in extent-tree contains block @newes->es_pblk, * then the block may stay in 1)a hole or 2)delayed-extent. @@ -4630,7 +4631,7 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent(inode, block, &es); + ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index fe3337a..e6941e6 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -232,14 +232,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk - * if it exists, otherwise, the next extent after @es->lblk. + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents * @lblk: the offset where we start to search + * @end: the offset where we stop to search * @es: delayed extent that we found */ -void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { struct ext4_es_tree *tree = NULL; @@ -247,7 +249,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct rb_node *node; BUG_ON(es == NULL); - trace_ext4_es_find_delayed_extent_enter(inode, lblk); + BUG_ON(end < lblk); + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -270,6 +273,10 @@ out: if (es1 && !ext4_es_is_delayed(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } if (ext4_es_is_delayed(es1)) break; } @@ -285,7 +292,7 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_lru_add(inode); - trace_ext4_es_find_delayed_extent_exit(inode, es); + trace_ext4_es_find_delayed_extent_range_exit(inode, es); } static struct extent_status * diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index d8e2d4d..f740eb03 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -62,7 +62,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, unsigned long long status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4959e29..b1b4d51 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -465,7 +465,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - ext4_es_find_delayed_extent(inode, last, &es); + ext4_es_find_delayed_extent_range(inode, last, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) dataoff = last << blkbits; @@ -548,7 +548,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - ext4_es_find_delayed_extent(inode, last, &es); + ext4_es_find_delayed_extent_range(inode, last, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { last = es.es_lblk + es.es_len; holeoff = last << blkbits; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0723774..d6382b8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1488,10 +1488,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); - ext4_io_submit_init(&io_submit, mpd->wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) - return -ENOMEM; + memset(&io_submit, 0, sizeof(io_submit)); /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. @@ -1579,8 +1576,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, pagevec_release(&pvec); } ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -2239,16 +2234,9 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - ext4_io_submit_init(&io_submit, wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) { - redirty_page_for_writepage(wbc, page); - return -ENOMEM; - } + memset(&io_submit, 0, sizeof(io_submit)); ret = ext4_bio_write_page(&io_submit, page, len, wbc); ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -3079,13 +3067,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; - /* if not async direct IO just return */ - if (!io_end) { - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); - return; - } + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3093,13 +3077,25 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; + + /* if not aio dio with unwritten extents, just free io and return */ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); +out: + inode_dio_done(inode); + if (is_async) + aio_complete(iocb, ret, 0); + return; + } + io_end->offset = offset; io_end->size = size; if (is_async) { io_end->iocb = iocb; io_end->result = ret; } - ext4_put_io_end_defer(io_end); + + ext4_add_complete_io(io_end); } /* @@ -3133,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; - ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) @@ -3172,16 +3167,13 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); + ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) { ret = -ENOMEM; goto retake_lock; } io_end->flag |= EXT4_IO_END_DIRECT; - /* - * Grab reference for DIO. Will be dropped in ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); + iocb->private = io_end; /* * we save the io structure for current async direct * IO, so that later ext4_map_blocks() could flag the @@ -3205,27 +3197,26 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, dio_flags); + if (iocb->private) + ext4_inode_aio_set(inode, NULL); /* - * Put our reference to io_end. This can free the io_end structure e.g. - * in sync IO case or in case of error. It can even perform extent - * conversion if all bios we submitted finished before we got here. - * Note that in that case iocb->private can be already set to NULL - * here. + * The io_end structure takes a reference to the inode, that + * structure needs to be destroyed and the reference to the + * inode need to be dropped, when IO is complete, even with 0 + * byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will + * be destroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since VFS + * direct IO won't invoke the end_io call back function, we + * need to free the end_io structure here. */ - if (io_end) { - ext4_inode_aio_set(inode, NULL); - ext4_put_io_end(io_end); - /* - * In case of error or no write ext4_end_io_dio() was not - * called so we have to put iocb's reference. - */ - if (ret <= 0 && ret != -EIOCBQUEUED) { - WARN_ON(iocb->private != io_end); - ext4_put_io_end(io_end); - iocb->private = NULL; - } - } - if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b1ed9e0..def8408 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2105,7 +2105,11 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - if (group == ngroups) + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + if (group >= ngroups) group = 0; /* This now checks without needing the buddy page */ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 19599bd..4acf1f7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -62,28 +62,15 @@ void ext4_ioend_shutdown(struct inode *inode) cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } -static void ext4_release_io_end(ext4_io_end_t *io_end) +void ext4_free_io_end(ext4_io_end_t *io) { - BUG_ON(!list_empty(&io_end->list)); - BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - - if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io_end->inode)); - if (io_end->flag & EXT4_IO_END_DIRECT) - inode_dio_done(io_end->inode); - if (io_end->iocb) - aio_complete(io_end->iocb, io_end->result, 0); - kmem_cache_free(io_end_cachep, io_end); -} - -static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) -{ - struct inode *inode = io_end->inode; + BUG_ON(!io); + BUG_ON(!list_empty(&io->list)); + BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); - io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io->inode)); + kmem_cache_free(io_end_cachep, io); } /* check a range of space and convert unwritten extents to written. */ @@ -106,8 +93,13 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - ext4_clear_io_unwritten_flag(io); - ext4_release_io_end(io); + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + if (io->flag & EXT4_IO_END_DIRECT) + inode_dio_done(inode); + if (io->iocb) + aio_complete(io->iocb, io->result, 0); return ret; } @@ -138,7 +130,7 @@ static void dump_completed_IO(struct inode *inode) } /* Add the io_end to per-inode completed end_io list. */ -static void ext4_add_complete_io(ext4_io_end_t *io_end) +void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct workqueue_struct *wq; @@ -175,6 +167,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode) err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; + io->flag &= ~EXT4_IO_END_UNWRITTEN; + ext4_free_io_end(io); } return ret; } @@ -206,43 +200,10 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); - atomic_set(&io->count, 1); } return io; } -void ext4_put_io_end_defer(ext4_io_end_t *io_end) -{ - if (atomic_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { - ext4_release_io_end(io_end); - return; - } - ext4_add_complete_io(io_end); - } -} - -int ext4_put_io_end(ext4_io_end_t *io_end) -{ - int err = 0; - - if (atomic_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->inode, - io_end->offset, io_end->size); - ext4_clear_io_unwritten_flag(io_end); - } - ext4_release_io_end(io_end); - } - return err; -} - -ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) -{ - atomic_inc(&io_end->count); - return io_end; -} - /* * Print an buffer I/O error compatible with the fs/buffer.c. This * provides compatibility with dmesg scrapers that look for a specific @@ -325,7 +286,12 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } - ext4_put_io_end_defer(io_end); + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + return; + } + + ext4_add_complete_io(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -339,37 +305,40 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; -} - -void ext4_io_submit_init(struct ext4_io_submit *io, - struct writeback_control *wbc) -{ - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); - io->io_bio = NULL; + io->io_op = 0; io->io_end = NULL; } -static int io_submit_init_bio(struct ext4_io_submit *io, - struct buffer_head *bh) +static int io_submit_init(struct ext4_io_submit *io, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) { + ext4_io_end_t *io_end; + struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) + return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - bio->bi_private = ext4_get_io_end(io->io_end); - if (!io->io_end->size) - io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) - + bh_offset(bh); + + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + io->io_bio = bio; + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, + struct writeback_control *wbc, struct buffer_head *bh) { ext4_io_end_t *io_end; @@ -380,18 +349,18 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init_bio(io, bh); + ret = io_submit_init(io, inode, wbc, bh); if (ret) return ret; } - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) - goto submit_and_retry; io_end = io->io_end; if (test_clear_buffer_uninit(bh)) ext4_set_io_unwritten_flag(inode, io_end); - io_end->size += bh->b_size; + io->io_end->size += bh->b_size; io->io_next_block++; + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; return 0; } @@ -463,7 +432,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bh); + ret = io_submit_add_bh(io, inode, wbc, bh); if (ret) { /* * We only get here on ENOMEM. Not much else diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index 13a3da2..98cd41b 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -30,15 +30,19 @@ struct journal_head { /* * Journalling list for this buffer [jbd_lock_bh_state()] + * NOTE: We *cannot* combine this with b_modified into a bitfield + * as gcc would then (which the C standard allows but which is + * very unuseful) make 64-bit accesses to the bitfield and clobber + * b_jcount if its update races with bitfield modification. */ - unsigned b_jlist:4; + unsigned b_jlist; /* * This flag signals the buffer has been modified by * the currently running transaction * [jbd_lock_bh_state()] */ - unsigned b_modified:1; + unsigned b_modified; /* * Copy of the buffer data frozen for writing to the log. diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d0e6864..8ee15b9 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2139,7 +2139,7 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->lblk, __entry->len) ); -TRACE_EVENT(ext4_es_find_delayed_extent_enter, +TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), @@ -2161,7 +2161,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_enter, (unsigned long) __entry->ino, __entry->lblk) ); -TRACE_EVENT(ext4_es_find_delayed_extent_exit, +TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), |