From 890871be854b5f5e43e7ba2475f706209906cc24 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Wed, 2 Sep 2009 16:24:52 -0400
Subject: Btrfs: switch extent_map to a rw lock

There are two main users of the extent_map tree. The first is regular file inodes, where it is evenly spread between readers and writers. The second is the chunk allocation tree, which maps blocks from logical addresses to physical ones, and it is 99.99% reads.

The mapping tree is a point of lock contention during heavy IO workloads, so this commit switches things to a rw lock.

Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c..edd86ae 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5396,9 +5396,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
 	while (1) {
 		int ret;
-		spin_lock(&em_tree->lock);
+		write_lock(&em_tree->lock);
 		ret = add_extent_mapping(em_tree, em);
-		spin_unlock(&em_tree->lock);
+		write_unlock(&em_tree->lock);
 		if (ret != -EEXIST) {
 			free_extent_map(em);
 			break;
-- 
cgit v1.1

From 11833d66be94b514652466802100378046c16b72 Mon Sep 17 00:00:00 2001
From: Yan Zheng
Date: Fri, 11 Sep 2009 16:11:19 -0400
Subject: Btrfs: improve async block group caching

This patch gets rid of two limitations of async block group caching. The old code delayed handling of pinned extents while a block group was still being cached, and allocating logged file extents had to wait until the block group was fully cached.

To get rid of these limitations, this patch introduces a data structure to track the progress of caching. Based on the caching progress, we know which extents should be added to the free space cache when handling the pinned extents. The logged file extents are also handled in a similar way.

This patch also changes how pinned extents are tracked. The old code used one tree to track pinned extents and copied that tree at transaction commit time. This patch uses two trees: one for extents that are pinned in the running transaction, and one for extents that can be unpinned. At transaction commit time, we swap the two trees.
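The two-tree swap is easiest to see in isolation. Below is a minimal userspace sketch of the idea, not kernel code and not taken from the patch: the two sets stand in for fs_info->freed_extents[0] and [1], the pointer flip models btrfs_prepare_extent_commit(), and draining the old set models btrfs_finish_extent_commit(). The helper names (pin_extent, swap_pinned, unpin_all) are invented for illustration.

/* minimal userspace model of the freed_extents[0]/[1] double buffering */
#include <stdio.h>

struct extent { unsigned long long start, len; };

struct extent_set {
	struct extent items[64];
	int count;
};

static struct extent_set freed[2];               /* stands in for fs_info->freed_extents[] */
static struct extent_set *pinned = &freed[0];    /* stands in for fs_info->pinned_extents */

/* extent freed in the running transaction: pin it in the current set */
static void pin_extent(unsigned long long start, unsigned long long len)
{
	pinned->items[pinned->count].start = start;
	pinned->items[pinned->count].len = len;
	pinned->count++;
}

/* commit time: flip the pointer so new frees land in the other set */
static struct extent_set *swap_pinned(void)
{
	struct extent_set *unpin = pinned;

	pinned = (pinned == &freed[0]) ? &freed[1] : &freed[0];
	return unpin;
}

/* drain the set that belonged to the committing transaction */
static void unpin_all(struct extent_set *unpin)
{
	for (int i = 0; i < unpin->count; i++)
		printf("unpin %llu+%llu -> back to free space\n",
		       unpin->items[i].start, unpin->items[i].len);
	unpin->count = 0;
}

int main(void)
{
	struct extent_set *unpin;

	pin_extent(4096, 4096);            /* freed during transaction N */
	unpin = swap_pinned();             /* commit of N begins */
	pin_extent(8192, 8192);            /* frees from N+1 go to the other set */
	unpin_all(unpin);                  /* extents from N become allocatable again */
	return 0;
}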
Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 586 +++++++++++++++++++++++++++++-------------------- 1 file changed, 352 insertions(+), 234 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index edd86ae..9bcb9c0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,12 +32,12 @@ #include "locking.h" #include "free-space-cache.h" -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -57,10 +57,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins); - static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, + struct extent_buffer **must_clean); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -153,34 +160,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, return ret; } -/* - * We always set EXTENT_LOCKED for the super mirror extents so we don't - * overwrite them, so those bits need to be unset. Also, if we are unmounting - * with pinned extents still sitting there because we had a block group caching, - * we need to clear those now, since we are done. 
- */ -void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) { - u64 start, end, last = 0; - int ret; + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} - while (1) { - ret = find_first_extent_bit(&info->pinned_extents, last, - &start, &end, - EXTENT_LOCKED|EXTENT_DIRTY); - if (ret) - break; +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; - clear_extent_bits(&info->pinned_extents, start, end, - EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); - last = end+1; - } + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); } -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 *logical; int stripe_len; @@ -192,17 +199,41 @@ static int remove_sb_from_cache(struct btrfs_root *root, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); BUG_ON(ret); + while (nr--) { - try_lock_extent(&fs_info->pinned_extents, - logical[nr], - logical[nr] + stripe_len - 1, GFP_NOFS); + ret = add_excluded_extent(root, logical[nr], + stripe_len); + BUG_ON(ret); } + kfree(logical); } - return 0; } +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_STARTED) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -215,9 +246,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, int ret; while (start < end) { - ret = find_first_extent_bit(&info->pinned_extents, start, + ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY|EXTENT_LOCKED); + EXTENT_DIRTY | EXTENT_UPTODATE); if (ret) break; @@ -249,22 +280,24 @@ static int caching_kthread(void *data) { struct btrfs_block_group_cache *block_group = data; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 last = 0; + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; + struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_path *path; - int ret = 0; - struct btrfs_key key; struct extent_buffer *leaf; - int slot; + struct btrfs_key key; u64 total_found = 0; - - BUG_ON(!fs_info); + u64 last = 0; + u32 nritems; + int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - atomic_inc(&block_group->space_info->caching_threads); + exclude_super_stripes(extent_root, block_group); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + /* * We don't want to 
deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -277,74 +310,64 @@ static int caching_kthread(void *data) key.objectid = last; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; again: + mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { smp_mb(); - if (block_group->fs_info->closing > 1) { + if (fs_info->closing > 1) { last = (u64)-1; break; } - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret < 0) - goto err; - else if (ret) + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) break; - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - /* this shouldn't happen, but if the - * leaf is empty just move on. - */ - if (btrfs_header_nritems(leaf) == 0) - break; - /* - * we need to copy the key out so that - * we are sure the next search advances - * us forward in the btree. - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(fs_info->extent_root, path); - up_read(&fs_info->extent_commit_sem); + caching_ctl->progress = last; + btrfs_release_path(extent_root, path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); + if (btrfs_transaction_in_commit(fs_info)) schedule_timeout(1); - goto again; - } + else + cond_resched(); + goto again; + } + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; continue; } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid < block_group->key.objectid) - goto next; if (key.objectid >= block_group->key.objectid + block_group->key.offset) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); last = key.objectid + key.offset; - } - if (total_found > (1024 * 1024 * 2)) { - total_found = 0; - wake_up(&block_group->caching_q); + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } } -next: path->slots[0]++; } ret = 0; @@ -352,33 +375,65 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); + caching_ctl->progress = (u64)-1; spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); err: btrfs_free_path(path); up_read(&fs_info->extent_commit_sem); - atomic_dec(&block_group->space_info->caching_threads); - wake_up(&block_group->caching_q); + free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); return 0; } static int cache_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; struct task_struct *tsk; int ret = 0; + smp_mb(); + if (cache->cached != BTRFS_CACHE_NO) + return 
0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + /* one for caching kthread, one for caching block group list */ + atomic_set(&caching_ctl->count, 2); + spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); - return ret; + kfree(caching_ctl); + return 0; } + cache->caching_ctl = caching_ctl; cache->cached = BTRFS_CACHE_STARTED; spin_unlock(&cache->lock); + down_write(&fs_info->extent_commit_sem); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); if (IS_ERR(tsk)) { @@ -1656,7 +1711,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, parent, ref_root, flags, ref->objectid, ref->offset, &ins, node->ref_mod); - update_reserved_extents(root, ins.objectid, ins.offset, 0); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, @@ -1782,7 +1836,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, extent_op->flags_to_set, &extent_op->key, ref->level, &ins); - update_reserved_extents(root, ins.objectid, ins.offset, 0); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, @@ -1817,16 +1870,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { + int mark_free = 0; + struct extent_buffer *must_clean = NULL; + + ret = pin_down_bytes(trans, root, NULL, + node->bytenr, node->num_bytes, + head->is_data, 1, &must_clean); + if (ret > 0) + mark_free = 1; + + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - btrfs_update_pinned_extents(root, node->bytenr, - node->num_bytes, 1); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); + if (mark_free) { + ret = btrfs_free_reserved_extent(root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } } mutex_unlock(&head->mutex); return 0; @@ -3008,10 +3077,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; cache->space_info->bytes_used += num_bytes; + cache->space_info->bytes_reserved -= num_bytes; if (cache->ro) cache->space_info->bytes_readonly -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { @@ -3056,127 +3127,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin) +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) { - u64 len; - struct btrfs_block_group_cache *cache; struct 
btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache; - if (pin) - set_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - - while (num > 0) { - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); - if (pin) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += len; - cache->space_info->bytes_pinned += len; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fs_info->total_pinned += len; - } else { - int unpin = 0; + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); - /* - * in order to not race with the block group caching, we - * only want to unpin the extent if we are cached. If - * we aren't cached, we want to start async caching this - * block group so we can free the extent the next time - * around. - */ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - unpin = (cache->cached == BTRFS_CACHE_FINISHED); - if (likely(unpin)) { - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - fs_info->total_pinned -= len; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); - if (likely(unpin)) - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + len -1, - GFP_NOFS); - else - cache_block_group(cache); + btrfs_put_block_group(cache); - if (unpin) - btrfs_add_free_space(cache, bytenr, len); - } - btrfs_put_block_group(cache); - bytenr += len; - num -= len; + set_extent_dirty(fs_info->pinned_extents, + bytenr, bytenr + num_bytes - 1, GFP_NOFS); + return 0; +} + +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += num_bytes; + cache->space_info->bytes_reserved += num_bytes; + } else { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); return 0; } -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve) +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - u64 len; - struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; - while (num > 0) { - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); + down_write(&fs_info->extent_commit_sem); - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += len; - cache->space_info->bytes_reserved += len; + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); } else { - cache->reserved -= len; - cache->space_info->bytes_reserved -= len; + 
cache->last_byte_to_unpin = caching_ctl->progress; } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); - bytenr += len; - num -= len; } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->extent_commit_sem); return 0; } -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - u64 last = 0; - u64 start; - u64 end; - struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; - int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 len; - while (1) { - ret = find_first_extent_bit(pinned_extents, last, - &start, &end, EXTENT_DIRTY); - if (ret) - break; + while (start <= end) { + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); + } + + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + btrfs_add_free_space(cache, start, len); + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); - set_extent_dirty(copy, start, end, GFP_NOFS); - last = end + 1; + start += len; } + + if (cache) + btrfs_put_block_group(cache); return 0; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_io_tree *unpin) + struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; u64 start; u64 end; int ret; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -3185,10 +3265,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); - /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - + unpin_extent_range(root, start, end); cond_resched(); } @@ -3198,7 +3276,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - u64 bytenr, u64 num_bytes, int is_data, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, struct extent_buffer **must_clean) { int err = 0; @@ -3230,15 +3309,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, } free_extent_buffer(buf); pinit: - btrfs_set_path_blocking(path); + if (path) + btrfs_set_path_blocking(path); /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + btrfs_pin_extent(root, bytenr, num_bytes, reserved); BUG_ON(err < 0); return 0; } - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -3412,7 +3491,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } ret = pin_down_bytes(trans, root, path, bytenr, 
- num_bytes, is_data, &must_clean); + num_bytes, is_data, 0, &must_clean); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); @@ -3543,8 +3622,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - update_reserved_extents(root, bytenr, num_bytes, 0); + btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, @@ -3584,19 +3662,33 @@ static noinline int wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { + struct btrfs_caching_control *caching_ctl; DEFINE_WAIT(wait); - prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); - - if (block_group_cache_done(cache)) { - finish_wait(&cache->caching_q, &wait); + caching_ctl = get_caching_control(cache); + if (!caching_ctl) return 0; - } - schedule(); - finish_wait(&cache->caching_q, &wait); - wait_event(cache->caching_q, block_group_cache_done(cache) || + wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space >= num_bytes)); + + put_caching_control(caching_ctl); + return 0; +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + + put_caching_control(caching_ctl); return 0; } @@ -3880,6 +3972,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); + update_reserved_extents(block_group, num_bytes, 1); + /* we are all good, lets return */ break; loop: @@ -3972,12 +4066,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) up_read(&info->groups_sem); } -static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) { int ret; u64 search_start = 0; @@ -4043,25 +4137,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); + update_reserved_extents(cache, len, 0); btrfs_put_block_group(cache); - update_reserved_extents(root, start, len, 0); - - return ret; -} - -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) -{ - int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, - empty_size, hint_byte, search_end, ins, - data); - if (!ret) - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -4222,15 +4299,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + u64 start = ins->objectid; + u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); cache_block_group(block_group); - 
wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + caching_ctl = get_caching_control(block_group); - ret = btrfs_remove_free_space(block_group, ins->objectid, - ins->offset); - BUG_ON(ret); + if (!caching_ctl) { + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + BUG_ON(ret); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + + start = caching_ctl->progress; + num_bytes = ins->objectid + ins->offset - + caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } + + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + + update_reserved_extents(block_group, ins->offset, 1); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -4254,9 +4362,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, int ret; u64 flags = 0; - ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); + ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); if (ret) return ret; @@ -4267,7 +4375,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, } else BUG_ON(parent > 0); - update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); @@ -7164,8 +7271,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; struct rb_node *n; + down_write(&info->extent_commit_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->extent_commit_sem); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, @@ -7179,8 +7296,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); @@ -7250,7 +7366,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); cache->fs_info = info; - init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7272,8 +7387,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - remove_sb_from_cache(root, cache); - /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't @@ -7282,13 
+7395,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; } else if (btrfs_block_group_used(&cache->item) == 0) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, found_key.objectid, found_key.objectid + found_key.offset); + free_excluded_extents(root, cache); } ret = update_space_info(info, cache->flags, found_key.offset, @@ -7345,7 +7462,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7354,12 +7470,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - remove_sb_from_cache(root, cache); + exclude_super_stripes(root, cache); add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); + free_excluded_extents(root, cache); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); @@ -7428,8 +7547,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); -- cgit v1.1 From 1c4850e21df8b441164d910bc611ef46a01d5d75 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 21 Sep 2009 15:55:59 -0400 Subject: Btrfs: speed up snapshot dropping This patch contains two changes to avoid unnecessary tree block reads during snapshot dropping. First, check tree block's reference count and flags before reading the tree block. if reference count > 1 and there is no need to update backrefs, we can avoid reading the tree block. Second, save when snapshot was created in root_key.offset. we can compare block pointer's generation with snapshot's creation generation during updating backrefs. If a given block was created before snapshot was created, the snapshot can't be the tree block's owner. So we can avoid reading the block. 
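Both checks boil down to simple comparisons against the snapshot's creation generation (saved in root_key.offset). A rough, self-contained sketch of the skip decision, with invented names and without the key-range comparison against wc->update_progress that the real code also performs:

#include <stdbool.h>
#include <stdio.h>

enum stage { DROP_REFERENCE, UPDATE_BACKREF };

/*
 * Decide whether a child tree block can be skipped without reading it.
 * snapshot_gen plays the role of root_key.offset.
 */
static bool can_skip_child(enum stage stage, bool update_ref,
			   unsigned long long refs,
			   unsigned long long block_gen,
			   unsigned long long snapshot_gen)
{
	/* a block created before the snapshot cannot be owned by it,
	 * so there are no backrefs to update underneath it */
	if (stage == UPDATE_BACKREF)
		return block_gen <= snapshot_gen;

	/* DROP_REFERENCE: a shared block only has to be read when its
	 * backrefs may still need updating for this snapshot */
	return refs > 1 && (!update_ref || block_gen <= snapshot_gen);
}

int main(void)
{
	/* shared block, no backref update wanted: drop our ref, skip the read */
	printf("%d\n", can_skip_child(DROP_REFERENCE, false, 3, 120, 100));
	/* block older than the snapshot while updating backrefs: skip */
	printf("%d\n", can_skip_child(UPDATE_BACKREF, true, 1, 90, 100));
	/* exclusively owned block: must be read and walked */
	printf("%d\n", can_skip_child(DROP_REFERENCE, false, 1, 120, 100));
	return 0;
}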
Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 253 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 205 insertions(+), 48 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bcb9c0..8fc92298 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4886,19 +4886,90 @@ struct walk_control { int shared_level; int update_ref; int keep_locks; + int reada_slot; + int reada_count; }; #define DROP_REFERENCE 1 #define UPDATE_BACKREF 2 +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + u64 bytenr; + u64 generation; + u64 refs; + u64 last = 0; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; + + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); + } + + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); + + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; + + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); + + if (slot == path->slots[wc->level]) + goto reada; + + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; + + if (wc->stage == DROP_REFERENCE) { + ret = btrfs_lookup_extent_info(trans, root, + bytenr, blocksize, + &refs, NULL); + BUG_ON(ret); + BUG_ON(refs == 0); + if (refs == 1) + goto reada; + + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) + break; + last = bytenr + blocksize; + nread++; + } + wc->reada_slot = slot; +} + /* * hepler to process tree block while walking down the tree. * - * when wc->stage == DROP_REFERENCE, this function checks - * reference count of the block. if the block is shared and - * we need update back refs for the subtree rooted at the - * block, this function changes wc->stage to UPDATE_BACKREF - * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. 
* @@ -4911,7 +4982,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; - struct btrfs_key key; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; @@ -4934,21 +5004,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(wc->refs[level] == 0); } - if (wc->stage == DROP_REFERENCE && - wc->update_ref && wc->refs[level] > 1) { - BUG_ON(eb == root->node); - BUG_ON(path->slots[level] > 0); - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[level]); - else - btrfs_node_key_to_cpu(eb, &key, path->slots[level]); - if (btrfs_header_owner(eb) == root->root_key.objectid && - btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { - wc->stage = UPDATE_BACKREF; - wc->shared_level = level; - } - } - if (wc->stage == DROP_REFERENCE) { if (wc->refs[level] > 1) return 1; @@ -4985,6 +5040,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + return 1; + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + if (wc->stage == DROP_REFERENCE) { + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + + if (wc->refs[level - 1] > 1) { + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot 
= 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + btrfs_tree_unlock(next); + free_extent_buffer(next); + return 1; +} + +/* * hepler to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops @@ -5011,7 +5183,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (level < wc->shared_level) goto out; - BUG_ON(wc->refs[level] <= 1); ret = find_next_key(path, level + 1, &wc->update_progress); if (ret > 0) wc->update_ref = 0; @@ -5042,8 +5213,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = 0; return 1; } - } else { - BUG_ON(level != 0); } } @@ -5096,17 +5265,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - struct extent_buffer *next; - struct extent_buffer *cur; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; int level = wc->level; int ret; while (level >= 0) { - cur = path->nodes[level]; - BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; ret = walk_down_proc(trans, root, path, wc); if (ret > 0) @@ -5115,20 +5280,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, if (level == 0) break; - bytenr = btrfs_node_blockptr(cur, path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); - - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - - level--; - BUG_ON(level != btrfs_header_level(next)); - path->nodes[level] = next; - path->slots[level] = 0; - path->locks[level] = 1; - wc->level = level; + ret = do_walk_down(trans, root, path, wc); + if (ret > 0) { + path->slots[level]++; + continue; + } + level = wc->level; } return 0; } @@ -5218,9 +5375,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) err = ret; goto out; } - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); + WARN_ON(ret > 0); /* * unlock our path, this is safe because only this @@ -5255,6 +5410,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { ret = walk_down_tree(trans, root, path, wc); @@ -5361,6 +5517,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { wret = walk_down_tree(trans, root, path, wc); -- cgit v1.1 From 76dda93c6ae2c1dc3e6cde34569d6aca26b0c918 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 21 Sep 2009 16:00:26 -0400 Subject: Btrfs: add snapshot/subvolume destroy ioctl This patch adds snapshot/subvolume destroy ioctl. A subvolume that isn't being used and doesn't contains links to other subvolumes can be destroyed. 
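From userspace the subvolume is destroyed by name, relative to an open file descriptor for its parent directory. A rough sketch, assuming the definitions now exported through linux/btrfs.h (BTRFS_IOC_SNAP_DESTROY and struct btrfs_ioctl_vol_args; when this patch went in they lived in btrfs's private ioctl.h), and a made-up layout with a subvolume named snap1 under /mnt; the call normally requires root:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int dirfd = open("/mnt", O_RDONLY | O_DIRECTORY);    /* parent directory */

	if (dirfd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	strncpy(args.name, "snap1", sizeof(args.name) - 1);  /* subvolume name, not a path */

	if (ioctl(dirfd, BTRFS_IOC_SNAP_DESTROY, &args) < 0)
		perror("BTRFS_IOC_SNAP_DESTROY");

	close(dirfd);
	return 0;
}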
Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8fc92298..4bd04f3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5463,9 +5463,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) ret = btrfs_del_root(trans, tree_root, &root->root_key); BUG_ON(ret); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root); + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + ret = btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } out: btrfs_end_transaction(trans, tree_root); kfree(wc); -- cgit v1.1 From ba1bf4818baf68d914ef9e3b06fbea6acb674fe4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Sep 2009 16:11:19 -0400 Subject: Btrfs: make balance code choose more wisely when relocating Currently, we can panic the box if the first block group we go to move is of a type where there is no space left to move those extents. For example, if we fill the disk up with data, and then we try to balance and we have no room to move the data nor room to allocate new chunks, we will panic. Change this by checking to see if we have room to move this chunk around, and if not, return -ENOSPC and move on to the next chunk. This will make sure we remove block groups that are moveable, like if we have alot of empty metadata block groups, and then that way we make room to be able to balance our data chunks as well. Tested this with an fs that would panic on btrfs-vol -b normally, but no longer panics with this patch. V1->V2: -actually search for a free extent on the device to make sure we can allocate a chunk if need be. -fix btrfs_shrink_device to make sure we actually try to relocate all the chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r we don't remove the device with data still on it. -check to make sure the block group we are going to relocate isn't the last one in that particular space -fix a bug in btrfs_shrink_device where we would change the device's size and not fix it if we fail to do our relocate Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4bd04f3..4c7c946 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7402,6 +7402,93 @@ out: } #endif +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. 
+ */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; + + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) + goto out; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + + full = space_info->full; + + /* + * if this is the last block group we have in this space, we can't + * relocate it. + */ + if (space_info->total_bytes == block_group->key.offset) { + ret = -1; + spin_unlock(&space_info->lock); + goto out; + } + + /* + * need to make sure we have room in the space to handle all of the + * extents from this block group. If we can, we're good + */ + if (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. + */ + ret = -1; + if (full) + goto out; + + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset, max_avail; + + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, &max_avail); + if (!ret) + break; + ret = -1; + } + } + mutex_unlock(&root->fs_info->chunk_mutex); +out: + btrfs_put_block_group(block_group); + return ret; +} + static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) { -- cgit v1.1 From 0a24325e6d8cfb150eba0aa279615ef27b5f6aec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Sep 2009 16:11:20 -0400 Subject: Btrfs: don't keep retrying a block group if we fail to allocate a cluster The box can get locked up in the allocator if we happen upon a block group under these conditions: 1) During a commit, so caching threads cannot make progress 2) Our block group currently is in the middle of being cached 3) Our block group currently has plenty of free space in it 4) Our block group is so fragmented that it ends up having no free space chunks larger than min_bytes calculated by btrfs_find_space_cluster. What happens is we try and do btrfs_find_space_cluster, which fails because it is unable to find enough free space chunks that are large than min_bytes and are close enough together. Since the block group is not cached we do a wait_block_group_cache_progress, which waits for the number of bytes we need, except the block group already has _plenty_ of free space, its just severely fragmented, so we loop and try again, ad infinitum. This patch keeps us from waiting on the block group to finish caching if we failed to find a free space cluster before. 
It also makes sure that we don't even try to find a free space cluster if we are on our last loop in the allocator, since we will have tried everything at this point at it is futile. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4c7c946..0f41da2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3726,6 +3726,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; + bool failed_cluster_refill = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3823,7 +3824,16 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; - if (last_ptr) { + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -3898,9 +3908,11 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { spin_unlock(&last_ptr->refill_lock); + failed_cluster_refill = true; wait_block_group_cache_progress(block_group, num_bytes + empty_cluster + empty_size); goto have_block_group; @@ -3912,13 +3924,9 @@ refill_cluster: * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < LOOP_NO_EMPTY_SIZE) { - btrfs_return_cluster_to_free_space(NULL, - last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } + btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); + goto loop; } offset = btrfs_find_space_for_alloc(block_group, search_start, @@ -3977,6 +3985,7 @@ checks: /* we are all good, lets return */ break; loop: + failed_cluster_refill = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.1 From f61408b81cd040a594dc0b65171230c4d5cc917d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Sep 2009 16:11:20 -0400 Subject: Btrfs: remove dead code This patch removes a bunch of dead code from the snapshot removal stuff. It was confusing me when doing the metadata ENOSPC stuff so I killed it. 
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 706 ------------------------------------------------- 1 file changed, 706 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f41da2..93e376a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4462,430 +4462,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } -#if 0 -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *leaf) -{ - u64 disk_bytenr; - u64 num_bytes; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u32 nritems; - int i; - int ret; - - BUG_ON(!btrfs_is_leaf(leaf)); - nritems = btrfs_header_nritems(leaf); - - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - - /* only extents have references, skip everything else */ - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - - /* inline extents live in the btree, they don't have refs */ - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - - /* holes don't have refs */ - if (disk_bytenr == 0) - continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, - leaf->start, 0, key.objectid, 0); - BUG_ON(ret); - } - return 0; -} - -static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_leaf_ref *ref) -{ - int i; - int ret; - struct btrfs_extent_info *info; - struct refsort *sorted; - - if (ref->nritems == 0) - return 0; - - sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); - for (i = 0; i < ref->nritems; i++) { - sorted[i].bytenr = ref->extents[i].bytenr; - sorted[i].slot = i; - } - sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); - - /* - * the items in the ref were sorted when the ref was inserted - * into the ref cache, so this is already in order - */ - for (i = 0; i < ref->nritems; i++) { - info = ref->extents + sorted[i].slot; - ret = btrfs_free_extent(trans, root, info->bytenr, - info->num_bytes, ref->bytenr, - ref->owner, ref->generation, - info->objectid, 0); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - BUG_ON(ret); - info++; - } - - kfree(sorted); - return 0; -} - - -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 start, - u64 len, u32 *refs) -{ - int ret; - - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); - BUG_ON(ret); - -#if 0 /* some debugging code in case we see problems here */ - /* if the refs count is one, it won't get increased again. But - * if the ref count is > 1, someone may be decreasing it at - * the same time we are. 
- */ - if (*refs != 1) { - struct extent_buffer *eb = NULL; - eb = btrfs_find_create_tree_block(root, start, len); - if (eb) - btrfs_tree_lock(eb); - - mutex_lock(&root->fs_info->alloc_mutex); - ret = lookup_extent_ref(NULL, root, start, len, refs); - BUG_ON(ret); - mutex_unlock(&root->fs_info->alloc_mutex); - - if (eb) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - if (*refs == 1) { - printk(KERN_ERR "btrfs block %llu went down to one " - "during drop_snap\n", (unsigned long long)start); - } - - } -#endif - - cond_resched(); - return ret; -} - - -/* - * this is used while deleting old snapshots, and it drops the refs - * on a whole subtree starting from a level 1 node. - * - * The idea is to sort all the leaf pointers, and then drop the - * ref on all the leaves in order. Most of the time the leaves - * will have ref cache entries, so no leaf IOs will be required to - * find the extents they have references on. - * - * For each leaf, any references it has are also dropped in order - * - * This ends up dropping the references in something close to optimal - * order for reading and modifying the extent allocation tree. - */ -static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) -{ - u64 bytenr; - u64 root_owner; - u64 root_gen; - struct extent_buffer *eb = path->nodes[1]; - struct extent_buffer *leaf; - struct btrfs_leaf_ref *ref; - struct refsort *sorted = NULL; - int nritems = btrfs_header_nritems(eb); - int ret; - int i; - int refi = 0; - int slot = path->slots[1]; - u32 blocksize = btrfs_level_size(root, 0); - u32 refs; - - if (nritems == 0) - goto out; - - root_owner = btrfs_header_owner(eb); - root_gen = btrfs_header_generation(eb); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); - - /* - * step one, sort all the leaf pointers so we don't scribble - * randomly into the extent allocation tree - */ - for (i = slot; i < nritems; i++) { - sorted[refi].bytenr = btrfs_node_blockptr(eb, i); - sorted[refi].slot = i; - refi++; - } - - /* - * nritems won't be zero, but if we're picking up drop_snapshot - * after a crash, slot might be > 0, so double check things - * just in case. - */ - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - /* - * the first loop frees everything the leaves point to - */ - for (i = 0; i < refi; i++) { - u64 ptr_gen; - - bytenr = sorted[i].bytenr; - - /* - * check the reference count on this leaf. If it is > 1 - * we just decrement it below and don't update any - * of the refs the leaf points to. - */ - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - if (refs != 1) - continue; - - ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); - - /* - * the leaf only had one reference, which means the - * only thing pointing to this leaf is the snapshot - * we're deleting. It isn't possible for the reference - * count to increase again later - * - * The reference cache is checked for the leaf, - * and if found we'll be able to drop any refs held by - * the leaf without needing to read it in. - */ - ref = btrfs_lookup_leaf_ref(root, bytenr); - if (ref && ref->generation != ptr_gen) { - btrfs_free_leaf_ref(root, ref); - ref = NULL; - } - if (ref) { - ret = cache_drop_leaf_ref(trans, root, ref); - BUG_ON(ret); - btrfs_remove_leaf_ref(root, ref); - btrfs_free_leaf_ref(root, ref); - } else { - /* - * the leaf wasn't in the reference cache, so - * we have to read it. 
- */ - leaf = read_tree_block(root, bytenr, blocksize, - ptr_gen); - ret = btrfs_drop_leaf_ref(trans, root, leaf); - BUG_ON(ret); - free_extent_buffer(leaf); - } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } - - /* - * run through the loop again to free the refs on the leaves. - * This is faster than doing it in the loop above because - * the leaves are likely to be clustered together. We end up - * working in nice chunks on the extent allocation tree. - */ - for (i = 0; i < refi; i++) { - bytenr = sorted[i].bytenr; - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, eb->start, - root_owner, root_gen, 0, 1); - BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } -out: - kfree(sorted); - - /* - * update the path to show we've processed the entire level 1 - * node. This will get saved into the root's drop_snapshot_progress - * field so these drops are not repeated again if this transaction - * commits. - */ - path->slots[1] = nritems; - return 0; -} - -/* - * helper function for drop_snapshot, this walks down the tree dropping ref - * counts as it goes. - */ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) -{ - u64 root_owner; - u64 root_gen; - u64 bytenr; - u64 ptr_gen; - struct extent_buffer *next; - struct extent_buffer *cur; - struct extent_buffer *parent; - u32 blocksize; - int ret; - u32 refs; - - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, - path->nodes[*level]->len, &refs); - BUG_ON(ret); - if (refs > 1) - goto out; - - /* - * walk down to the last node level and free all the leaves - */ - while (*level >= 0) { - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - cur = path->nodes[*level]; - - if (btrfs_header_level(cur) != *level) - WARN_ON(1); - - if (path->slots[*level] >= - btrfs_header_nritems(cur)) - break; - - /* the new code goes down to level 1 and does all the - * leaves pointed to that node in bulk. So, this check - * for level 0 will always be false. - * - * But, the disk format allows the drop_snapshot_progress - * field in the root to leave things in a state where - * a leaf will need cleaning up here. If someone crashes - * with the old code and then boots with the new code, - * we might find a leaf here. - */ - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, cur); - BUG_ON(ret); - break; - } - - /* - * once we get to level one, process the whole node - * at once, including everything below it. - */ - if (*level == 1) { - ret = drop_level_one_refs(trans, root, path); - BUG_ON(ret); - break; - } - - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - - /* - * if there is more than one reference, we don't need - * to read that node to drop any references it has. We - * just drop the ref we hold on that node and move on to the - * next slot in this level. 
- */ - if (refs != 1) { - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - path->slots[*level]++; - - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - root_owner, root_gen, - *level - 1, 1); - BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - continue; - } - - /* - * we need to keep freeing things in the next level down. - * read the block and loop around to process it - */ - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - WARN_ON(*level <= 0); - if (path->nodes[*level-1]) - free_extent_buffer(path->nodes[*level-1]); - path->nodes[*level-1] = next; - *level = btrfs_header_level(next); - path->slots[*level] = 0; - cond_resched(); - } -out: - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - - if (path->nodes[*level] == root->node) { - parent = path->nodes[*level]; - bytenr = path->nodes[*level]->start; - } else { - parent = path->nodes[*level + 1]; - bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); - } - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - /* - * cleanup and free the reference on the last node - * we processed - */ - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, root_owner, root_gen, - *level, 1); - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - - *level += 1; - BUG_ON(ret); - - cond_resched(); - return 0; -} -#endif - struct walk_control { u64 refs[BTRFS_MAX_LEVEL]; u64 flags[BTRFS_MAX_LEVEL]; @@ -7129,288 +6705,6 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, return 0; } -#if 0 -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); -out: - btrfs_free_path(path); - return ret; -} - -static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) -{ - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key root_key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; - - root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) - return ERR_CAST(root); - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); - if (err) - goto out; - - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, 
group->key.offset, - 0, 0, 0); - BUG_ON(err); - - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - BUG_ON(is_bad_inode(inode)); - } else { - BUG_ON(1); - } - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - btrfs_end_transaction(trans, root); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct list_head list; - size_t offset; - int ret; - u64 disk_bytenr; - - INIT_LIST_HEAD(&list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } - - btrfs_add_ordered_sum(inode, ordered, sums); - } - btrfs_put_ordered_extent(ordered); - return 0; -} - -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_fs_info *info = root->fs_info; - struct extent_buffer *leaf; - struct inode *reloc_inode; - struct btrfs_block_group_cache *block_group; - struct btrfs_key key; - u64 skipped; - u64 cur_byte; - u64 total_found; - u32 nritems; - int ret; - int progress; - int pass = 0; - - root = root->fs_info->extent_root; - - block_group = btrfs_lookup_block_group(info, group_start); - BUG_ON(!block_group); - - printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->flags); - - path = btrfs_alloc_path(); - BUG_ON(!path); - - reloc_inode = create_reloc_inode(info, block_group); - BUG_ON(IS_ERR(reloc_inode)); - - __alloc_chunk_for_shrink(root, block_group, 1); - set_block_group_readonly(block_group); - - btrfs_start_delalloc_inodes(info->tree_root); - btrfs_wait_ordered_extents(info->tree_root, 0); -again: - skipped = 0; - total_found = 0; - progress = 0; - key.objectid = block_group->key.objectid; - key.offset = 0; - key.type = 0; - cur_byte = key.objectid; - - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(info->tree_root); - btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); - mutex_unlock(&root->fs_info->cleaner_mutex); - - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - - while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; -next: - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= 
nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - break; - } - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (progress && need_resched()) { - btrfs_release_path(root, path); - cond_resched(); - progress = 0; - continue; - } - progress = 1; - - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= cur_byte) { - path->slots[0]++; - goto next; - } - - total_found++; - cur_byte = key.objectid + key.offset; - btrfs_release_path(root, path); - - __alloc_chunk_for_shrink(root, block_group, 0); - ret = relocate_one_extent(root, path, &key, block_group, - reloc_inode, pass); - BUG_ON(ret < 0); - if (ret > 0) - skipped++; - - key.objectid = cur_byte; - key.type = 0; - key.offset = 0; - } - - btrfs_release_path(root, path); - - if (pass == 0) { - btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); - invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); - } - - if (total_found > 0) { - printk(KERN_INFO "btrfs found %llu extents in pass %d\n", - (unsigned long long)total_found, pass); - pass++; - if (total_found == skipped && pass > 2) { - iput(reloc_inode); - reloc_inode = create_reloc_inode(info, block_group); - pass = 0; - } - goto again; - } - - /* delete reloc_inode */ - iput(reloc_inode); - - /* unpin extents in this range */ - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - - spin_lock(&block_group->lock); - WARN_ON(block_group->pinned > 0); - WARN_ON(block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&block_group->item) > 0); - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - ret = 0; -out: - btrfs_free_path(path); - return ret; -} -#endif - /* * checks to see if its even possible to relocate this block group. * -- cgit v1.1 From 1b2da372b0324b5c604fc8790e70a7efbeacb0b6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Sep 2009 16:11:20 -0400 Subject: Btrfs: account for space used by the super mirrors As we get closer to proper -ENOSPC handling in btrfs, we need more accurate space accounting for the space infos. Currently we exclude the free space for the super mirrors, but the space they take up isn't accounted for in any of the counters. This patch introduces bytes_super, which keeps track of the amount of bytes used for a super mirror in the block group cache and space info. This makes sure that our free space calculations will be completely accurate.
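For illustration only (not part of the patch): the accounting described above adds one more term to the free space arithmetic. A minimal sketch, assuming the btrfs_space_info fields used by the hunks below; the helper name sinfo_bytes_free() is hypothetical and does not exist in the kernel:

	/*
	 * Hypothetical helper: how many bytes of a space_info are still
	 * usable once the super mirror copies are counted.  This mirrors
	 * the data free space check that the patch extends with
	 * bytes_super.
	 */
	static u64 sinfo_bytes_free(struct btrfs_space_info *sinfo)
	{
		return sinfo->total_bytes - sinfo->bytes_used -
		       sinfo->bytes_delalloc - sinfo->bytes_reserved -
		       sinfo->bytes_pinned - sinfo->bytes_readonly -
		       sinfo->bytes_may_use - sinfo->bytes_super;
	}

Each block group also folds its own bytes_super into its space_info under space_info->lock, which is what the hunks in caching_kthread(), btrfs_read_block_groups() and btrfs_make_block_group() below do.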
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 93e376a..5f3544e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -201,6 +201,7 @@ static int exclude_super_stripes(struct btrfs_root *root, BUG_ON(ret); while (nr--) { + cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); BUG_ON(ret); @@ -295,6 +296,9 @@ static int caching_kthread(void *data) return -ENOMEM; exclude_super_stripes(extent_root, block_group); + spin_lock(&block_group->space_info->lock); + block_group->space_info->bytes_super += block_group->bytes_super; + spin_unlock(&block_group->space_info->lock); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -2785,7 +2789,8 @@ again: do_div(thresh, 100); if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super > thresh) { struct btrfs_trans_handle *trans; if (!meta_sinfo->full) { meta_sinfo->force_alloc = 1; @@ -2839,7 +2844,7 @@ again: if (data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - - data_sinfo->bytes_may_use < bytes) { + data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { struct btrfs_trans_handle *trans; /* @@ -6957,8 +6962,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { + exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; @@ -6975,6 +6982,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) &space_info); BUG_ON(ret); cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&space_info->groups_sem); list_add_tail(&cache->list, &space_info->block_groups); up_write(&space_info->groups_sem); @@ -7044,6 +7055,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&cache->space_info->groups_sem); list_add_tail(&cache->list, &cache->space_info->block_groups); up_write(&cache->space_info->groups_sem); -- cgit v1.1 From 33b4d47f5e24b986f486d7de9a2df915ad1fdfbc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 22 Sep 2009 14:45:50 -0400 Subject: Btrfs: deal with NULL space info After a balance it is briefly possible for the space info field in the inode to be NULL. This adds some checks to make sure things properly deal with the NULL value. 
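The shape of the fix, condensed for readability (a sketch distilled from the data path in the diff below; the metadata path in btrfs_check_metadata_free_space() follows the same pattern, and the "..." elisions stand for the existing free space checks):

	data_sinfo = BTRFS_I(inode)->space_info;
	if (!data_sinfo)
		goto alloc;	/* no space_info yet: go allocate a chunk */
again:
	spin_lock(&data_sinfo->lock);
	/* ... usual free space checks, unlock, return if satisfied ... */
alloc:
	/* allocate a chunk, then (re)derive the space_info and retry */
	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
			     2 * 1024 * 1024, alloc_target, 0);
	if (!data_sinfo) {
		btrfs_set_inode_space_info(root, inode);
		data_sinfo = BTRFS_I(inode)->space_info;
	}
	goto again;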
Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5f3544e..1b9b878 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2778,6 +2778,8 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root) /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); + if (!meta_sinfo) + goto alloc; again: spin_lock(&meta_sinfo->lock); @@ -2795,7 +2797,7 @@ again: if (!meta_sinfo->full) { meta_sinfo->force_alloc = 1; spin_unlock(&meta_sinfo->lock); - +alloc: trans = btrfs_start_transaction(root, 1); if (!trans) return -ENOMEM; @@ -2803,6 +2805,10 @@ again: ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); + if (!meta_sinfo) { + meta_sinfo = __find_space_info(info, + alloc_target); + } goto again; } spin_unlock(&meta_sinfo->lock); @@ -2838,6 +2844,9 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); @@ -2856,7 +2865,7 @@ again: data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); - +alloc: alloc_target = btrfs_get_alloc_profile(root, 1); trans = btrfs_start_transaction(root, 1); if (!trans) @@ -2868,6 +2877,11 @@ again: btrfs_end_transaction(trans, root); if (ret) return ret; + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } goto again; } spin_unlock(&data_sinfo->lock); -- cgit v1.1 From 7ce618db9878689f87897b673fa3329070860fc7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 22 Sep 2009 14:48:44 -0400 Subject: Btrfs: fix early enospc during balancing We now do extra checks before a balance to make sure there is room for the balance to take place. One of the checks was testing to see if we were trying to balance away the last block group of a given type. If there is no space available for new chunks, we should not try to balance away the last block group of a given type. But the code wasn't checking for available chunk space, and so it was exiting too soon. The fix here is to combine some of the checks and make sure we try to allocate new chunks when we're balancing the last block group. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1b9b878..90d314e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6756,22 +6756,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) /* * if this is the last block group we have in this space, we can't - * relocate it. + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. 
If we can, we're good */ - if (space_info->total_bytes == block_group->key.offset) { - ret = -1; - spin_unlock(&space_info->lock); - goto out; - } - - /* - * need to make sure we have room in the space to handle all of the - * extents from this block group. If we can, we're good - */ - if (space_info->bytes_used + space_info->bytes_reserved + + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + btrfs_block_group_used(&block_group->item) < - space_info->total_bytes) { + space_info->total_bytes)) { spin_unlock(&space_info->lock); goto out; } -- cgit v1.1
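Read together, the combined test in btrfs_can_relocate() now behaves as sketched here (an illustrative restatement of the hunk above, with the condition split into named temporaries; the fall-through path that checks whether a new chunk could be allocated lives in the rest of the function, which this diff does not show):

	/*
	 * Relocation can proceed right away only when this is not the
	 * last block group of its type and the remaining groups of that
	 * type have room for its used bytes.  Otherwise fall through and
	 * see whether a brand new chunk could be allocated, instead of
	 * bailing out early as the old code did.
	 */
	int last_of_type = space_info->total_bytes == block_group->key.offset;
	int have_room = space_info->bytes_used + space_info->bytes_reserved +
			space_info->bytes_pinned + space_info->bytes_readonly +
			btrfs_block_group_used(&block_group->item) <
			space_info->total_bytes;

	if (!last_of_type && have_room) {
		spin_unlock(&space_info->lock);
		goto out;	/* we're good, relocation can go ahead */
	}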