From bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:10 -0400 Subject: ext4: use bio layer instead of buffer layer in mpage_da_submit_io Call the block I/O layer directly instad of going through the buffer layer. This should give us much better performance and scalability, as well as lowering our CPU utilization when doing buffered writeback. Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 430 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 fs/ext4/page-io.c (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c new file mode 100644 index 0000000..b972ca5 --- /dev/null +++ b/fs/ext4/page-io.c @@ -0,0 +1,430 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +static struct kmem_cache *io_page_cachep, *io_end_cachep; + +int __init init_ext4_pageio(void) +{ + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) + return -ENOMEM; + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) { + kmem_cache_destroy(io_page_cachep); + return -ENOMEM; + } + + return 0; +} + +void exit_ext4_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_page_cachep); +} + +void ext4_free_io_end(ext4_io_end_t *io) +{ + int i; + + BUG_ON(!io); + if (io->page) + put_page(io->page); + for (i = 0; i < io->num_io_pages; i++) { + if (--io->pages[i]->p_count == 0) { + struct page *page = io->pages[i]->p_page; + + end_page_writeback(page); + put_page(page); + kmem_cache_free(io_page_cachep, io->pages[i]); + } + } + io->num_io_pages = 0; + iput(io->inode); + kmem_cache_free(io_end_cachep, io); +} + +/* + * check a range of space and convert unwritten extents to written. + */ +int ext4_end_io_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) + return ret; + + ret = ext4_convert_unwritten_extents(inode, offset, size); + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten " + "extents to written extents, error is %d " + "io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + if (io->iocb) + aio_complete(io->iocb, io->result, 0); + /* clear the DIO AIO unwritten flag */ + io->flag &= ~EXT4_IO_END_UNWRITTEN; + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; + + mutex_lock(&inode->i_mutex); + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; + } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = NULL; + + io = kmem_cache_alloc(io_end_cachep, flags); + if (io) { + memset(io, 0, sizeof(*io)); + io->inode = igrab(inode); + BUG_ON(!io->inode); + INIT_WORK(&io->work, ext4_end_io_work); + INIT_LIST_HEAD(&io->list); + } + return io; +} + +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... + */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + ext4_fsblk_t err_block; + int i; + + BUG_ON(!io_end); + inode = io_end->inode; + bio->bi_private = NULL; + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + err_block = bio->bi_sector >> (inode->i_blkbits - 9); + bio_put(bio); + + if (!(inode->i_sb->s_flags & MS_ACTIVE)) { + pr_err("sb umounted, discard end_io request for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + return; + } + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) err_block); + } + + for (i = 0; i < io_end->num_io_pages; i++) { + struct page *page = io_end->pages[i]->p_page; + struct buffer_head *bh, *head; + int partial_write = 0; + + head = page_buffers(page); + if (error) + SetPageError(page); + BUG_ON(!head); + if (head->b_size == PAGE_CACHE_SIZE) + clear_buffer_dirty(head); + else { + loff_t offset; + loff_t io_end_offset = io_end->offset + io_end->size; + + offset = (sector_t) page->index << PAGE_CACHE_SHIFT; + bh = head; + do { + if ((offset >= io_end->offset) && + (offset+bh->b_size <= io_end_offset)) { + if (error) + buffer_io_error(bh); + + clear_buffer_dirty(bh); + } + if (buffer_delay(bh)) + partial_write = 1; + else if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + else if (buffer_dirty(bh)) + partial_write = 1; + offset += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + if (--io_end->pages[i]->p_count == 0) { + struct page *page = io_end->pages[i]->p_page; + + end_page_writeback(page); + put_page(page); + kmem_cache_free(io_page_cachep, io_end->pages[i]); + } + + /* + * If this is a partial write which happened to make + * all buffers uptodate then we can optimize away a + * bogus readpage() for the next read(). Here we + * 'discover' whether the page went uptodate as a + * result of this (potentially partial) write. + */ + if (!partial_write) + SetPageUptodate(page); + } + + io_end->num_io_pages = 0; + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = 0; + io->io_op = 0; + io->io_end = 0; +} + +static int io_submit_init(struct ext4_io_submit *io, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) + return -ENOMEM; + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (bio == NULL); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_private = io->io_end = io_end; + bio->bi_end_io = ext4_end_bio; + + io_end->inode = inode; + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + + io->io_bio = bio; + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE); + io->io_next_block = bh->b_blocknr; + return 0; +} + +static int io_submit_add_bh(struct ext4_io_submit *io, + struct ext4_io_page *io_page, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + int ret; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (!buffer_mapped(bh) || buffer_delay(bh)) { + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + return 0; + } + + if (io->io_bio && bh->b_blocknr != io->io_next_block) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + ret = io_submit_init(io, inode, wbc, bh); + if (ret) + return ret; + } + io_end = io->io_end; + if ((io_end->num_io_pages >= MAX_IO_PAGES) && + (io_end->pages[io_end->num_io_pages-1] != io_page)) + goto submit_and_retry; + if (buffer_uninit(bh)) + io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + io->io_end->size += bh->b_size; + io->io_next_block++; + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; + if ((io_end->num_io_pages == 0) || + (io_end->pages[io_end->num_io_pages-1] != io_page)) { + io_end->pages[io_end->num_io_pages++] = io_page; + io_page->p_count++; + } + return 0; +} + +int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + struct ext4_io_page *io_page; + struct buffer_head *bh, *head; + int ret = 0; + + blocksize = 1 << inode->i_blkbits; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + ClearPageError(page); + + io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); + if (!io_page) { + set_page_dirty(page); + unlock_page(page); + return -ENOMEM; + } + io_page->p_page = page; + io_page->p_count = 0; + get_page(page); + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_start >= len) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + set_page_dirty(page); + break; + } + } + unlock_page(page); + /* + * If the page was truncated before we could do the writeback, + * or we had a memory allocation error while trying to write + * the first buffer head, we won't have submitted any pages for + * I/O. In that case we need to make sure we've cleared the + * PageWriteback bit from the page to prevent the system from + * wedging later on. + */ + if (io_page->p_count == 0) { + put_page(page); + end_page_writeback(page); + kmem_cache_free(io_page_cachep, io_page); + } + return ret; +} -- cgit v1.1