diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-14 17:09:00 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-14 17:09:00 -0800 |
commit | 80eabba70260dcb55b05098f6c1fecbe5c0e518b (patch) | |
tree | c9f5d19803a3387d2b9d8a6998eb9c58bad2a15a | |
parent | 852d21ae1fcdf0e4de6b5bfa730d29cb013c7ff3 (diff) | |
parent | ce98321bf7d274a470642ef99e1d82512673ce7c (diff) | |
download | op-kernel-dev-80eabba70260dcb55b05098f6c1fecbe5c0e518b.zip op-kernel-dev-80eabba70260dcb55b05098f6c1fecbe5c0e518b.tar.gz |
Merge branch 'for-4.10/fs-unmap' of git://git.kernel.dk/linux-block
Pull fs meta data unmap optimization from Jens Axboe:
"A series from Jan Kara, providing a more efficient way for unmapping
meta data from in the buffer cache than doing it block-by-block.
Provide a general helper that existing callers can use"
* 'for-4.10/fs-unmap' of git://git.kernel.dk/linux-block:
fs: Remove unmap_underlying_metadata
fs: Add helper to clean bdev aliases under a bh and use it
ext2: Use clean_bdev_aliases() instead of iteration
ext4: Use clean_bdev_aliases() instead of iteration
direct-io: Use clean_bdev_aliases() instead of handmade iteration
fs: Provide function to unmap metadata for a range of blocks
-rw-r--r-- | fs/buffer.c | 104 | ||||
-rw-r--r-- | fs/direct-io.c | 28 | ||||
-rw-r--r-- | fs/ext2/inode.c | 9 | ||||
-rw-r--r-- | fs/ext4/extents.c | 13 | ||||
-rw-r--r-- | fs/ext4/inode.c | 18 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 2 | ||||
-rw-r--r-- | fs/mpage.c | 3 | ||||
-rw-r--r-- | fs/ntfs/aops.c | 2 | ||||
-rw-r--r-- | fs/ntfs/file.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 2 | ||||
-rw-r--r-- | fs/ufs/balloc.c | 3 | ||||
-rw-r--r-- | fs/ufs/inode.c | 3 | ||||
-rw-r--r-- | include/linux/buffer_head.h | 7 |
13 files changed, 104 insertions, 95 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index a3bfd57..d21771f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -43,6 +43,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <linux/pagevec.h> #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page, } EXPORT_SYMBOL(create_empty_buffers); -/* - * We are taking a block for data and we don't want any output from any - * buffer-cache aliases starting from return from that function and - * until the moment when something will explicitly mark the buffer - * dirty (hopefully that will not happen until we will free that block ;-) - * We don't even need to mark it not-uptodate - nobody can expect - * anything from a newly allocated buffer anyway. We used to used - * unmap_buffer() for such invalidation, but that was wrong. We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - * - * Also.. Note that bforget() doesn't lock the buffer. So there can - * be writeout I/O going on against recently-freed buffers. We don't - * wait on that I/O in bforget() - it's more efficient to wait on the I/O - * only if we really need to. That happens here. - */ -void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +/** + * clean_bdev_aliases: clean a range of buffers in block device + * @bdev: Block device to clean buffers in + * @block: Start of a range of blocks to clean + * @len: Number of blocks to clean + * + * We are taking a range of blocks for data and we don't want writeback of any + * buffer-cache aliases starting from return from this function and until the + * moment when something will explicitly mark the buffer dirty (hopefully that + * will not happen until we will free that block ;-) We don't even need to mark + * it not-uptodate - nobody can expect anything from a newly allocated buffer + * anyway. We used to use unmap_buffer() for such invalidation, but that was + * wrong. We definitely don't want to mark the alias unmapped, for example - it + * would confuse anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can be + * writeout I/O going on against recently-freed buffers. We don't wait on that + * I/O in bforget() - it's more efficient to wait on the I/O only if we really + * need to. That happens here. + */ +void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { - struct buffer_head *old_bh; + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t end; + int i; + struct buffer_head *bh; + struct buffer_head *head; - might_sleep(); + end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + pagevec_init(&pvec, 0); + while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - old_bh = __find_get_block_slow(bdev, block); - if (old_bh) { - clear_buffer_dirty(old_bh); - wait_on_buffer(old_bh); - clear_buffer_req(old_bh); - __brelse(old_bh); + index = page->index; + if (index > end) + break; + if (!page_has_buffers(page)) + continue; + /* + * We use page lock instead of bd_mapping->private_lock + * to pin buffers here since we can afford to sleep and + * it scales better than a global spinlock lock. + */ + lock_page(page); + /* Recheck when the page is locked which pins bhs */ + if (!page_has_buffers(page)) + goto unlock_page; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh)) + goto next; + if (bh->b_blocknr >= block + len) + break; + clear_buffer_dirty(bh); + wait_on_buffer(bh); + clear_buffer_req(bh); +next: + bh = bh->b_this_page; + } while (bh != head); +unlock_page: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + index++; } } -EXPORT_SYMBOL(unmap_underlying_metadata); +EXPORT_SYMBOL(clean_bdev_aliases); /* * Size is a power-of-two in the range 512..PAGE_SIZE, @@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; @@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, } if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping, if (!buffer_mapped(bh)) is_mapped_to_disk = 0; if (buffer_new(bh)) - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; diff --git a/fs/direct-io.c b/fs/direct-io.c index 835e23a..86aa798 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -843,24 +843,6 @@ out: } /* - * Clean any dirty buffers in the blockdev mapping which alias newly-created - * file blocks. Only called for S_ISREG files - blockdevs do not set - * buffer_new - */ -static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) -{ - unsigned i; - unsigned nblocks; - - nblocks = map_bh->b_size >> dio->inode->i_blkbits; - - for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(map_bh->b_bdev, - map_bh->b_blocknr + i); - } -} - -/* * If we are not writing the entire block and get_block() allocated * the block for us, we need to fill-in the unused portion of the * block with zeros. This happens only if user-buffer, fileoffset or @@ -960,11 +942,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, goto do_holes; sdio->blocks_available = - map_bh->b_size >> sdio->blkbits; + map_bh->b_size >> blkbits; sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio, map_bh); + if (buffer_new(map_bh)) { + clean_bdev_aliases( + map_bh->b_bdev, + map_bh->b_blocknr, + map_bh->b_size >> blkbits); + } if (!sdio->blkfactor) goto do_holes; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 046b642..e173afe 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { - int i; - /* * We must unmap blocks before zeroing so that writeback cannot * overwrite zeros with stale data from block device page cache. */ - for (i = 0; i < count; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key), + count); /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e1014f..b1f8416 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3777,14 +3777,6 @@ out: return err; } -static void unmap_underlying_metadata_blocks(struct block_device *bdev, - sector_t block, int count) -{ - int i; - for (i = 0; i < count; i++) - unmap_underlying_metadata(bdev, block + i); -} - /* * Handle EOFBLOCKS_FL flag, clearing it if necessary */ @@ -4121,9 +4113,8 @@ out: * new. */ if (allocated > map->m_len) { - unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + map->m_len, - allocated - map->m_len); + clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, + allocated - map->m_len); allocated = map->m_len; } map->m_len = allocated; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 72d593f..88d57af 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -661,12 +661,8 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - ext4_lblk_t i; - - for (i = 0; i < map->m_len; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - map->m_pblk + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1137,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2371,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) BUG_ON(map->m_len == 0); if (map->m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map->m_len; i++) - unmap_underlying_metadata(bdev, map->m_pblk + i); + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); } return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e2332a6..d83b0f3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } set_buffer_async_write(bh); nr_to_submit++; @@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) - unmap_underlying_metadata(map_bh.b_bdev, - map_bh.b_blocknr); + clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index d0cf6fe..cc91856 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -765,7 +765,7 @@ lock_retry_remap: } // TODO: Instantiate the hole. // clear_buffer_new(bh); - // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + // clean_bdev_bh_alias(bh); ntfs_error(vol->sb, "Writing into sparse regions is " "not supported yet. Sorry."); err = -EOPNOTSUPP; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index bf72a2c..99510d8 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -740,8 +740,7 @@ map_buffer_cached: set_buffer_uptodate(bh); if (unlikely(was_hole)) { /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (bh_end <= pos || bh_pos >= end) mark_buffer_dirty(bh); else @@ -784,7 +783,7 @@ map_buffer_cached: continue; } /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * If the buffer is fully outside the write, zero it, * set it uptodate, and mark it dirty so it gets diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 9a88984..4d9c6f5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -630,7 +630,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } if (PageUptodate(page)) { diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b035af5..a0376a2 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -307,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, (unsigned long long)(pos + newb), pos); bh->b_blocknr = newb + pos; - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); mark_buffer_dirty(bh); ++j; bh = bh->b_this_page; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64b..45ceb94 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * we do not zeroize fragment, because of * if it maped to hole, it already contains zeroes diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index ebbacd1..d67ab83 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -168,7 +168,12 @@ int inode_has_buffers(struct inode *); void invalidate_inode_buffers(struct inode *); int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); -void unmap_underlying_metadata(struct block_device *bdev, sector_t block); +void clean_bdev_aliases(struct block_device *bdev, sector_t block, + sector_t len); +static inline void clean_bdev_bh_alias(struct buffer_head *bh) +{ + clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1); +} void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); |