summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2008-10-29 14:49:05 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:05 -0400
commitf82d02d9d8222183b7945e893111a6d1bf67ae4a (patch)
tree70be1bb231f4cc2e673920774e759359f3dcf1a5 /fs/btrfs/extent-tree.c
parentc8b978188c9a0fd3d535c13debd19d522b726f1f (diff)
downloadop-kernel-dev-f82d02d9d8222183b7945e893111a6d1bf67ae4a.zip
op-kernel-dev-f82d02d9d8222183b7945e893111a6d1bf67ae4a.tar.gz
Btrfs: Improve space balancing code
This patch improves the space balancing code to keep more sharing of tree blocks. The only case that breaks sharing of tree blocks is data extents get fragmented during balancing. The main changes in this patch are: Add a 'drop sub-tree' function. This solves the problem in old code that BTRFS_HEADER_FLAG_WRITTEN check breaks sharing of tree block. Remove relocation mapping tree. Relocation mappings are stored in struct btrfs_ref_path and updated dynamically during walking up/down the reference path. This reduces CPU usage and simplifies code. This patch also fixes a bug. Root items for reloc trees should be updated in btrfs_free_reloc_root. Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c364
1 files changed, 223 insertions, 141 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bbf04e8..56e4136 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3032,13 +3032,103 @@ out:
}
/*
+ * helper function for drop_subtree, this function is similar to
+ * walk_down_tree. The main difference is that it checks reference
+ * counts while tree blocks are locked.
+ */
+static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level)
+{
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ struct extent_buffer *parent;
+ u64 bytenr;
+ u64 ptr_gen;
+ u32 blocksize;
+ u32 refs;
+ int ret;
+
+ cur = path->nodes[*level];
+ ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
+ &refs);
+ BUG_ON(ret);
+ if (refs > 1)
+ goto out;
+
+ while (*level >= 0) {
+ cur = path->nodes[*level];
+ if (*level == 0) {
+ ret = btrfs_drop_leaf_ref(trans, root, cur);
+ BUG_ON(ret);
+ clean_tree_block(trans, root, cur);
+ break;
+ }
+ if (path->slots[*level] >= btrfs_header_nritems(cur)) {
+ clean_tree_block(trans, root, cur);
+ break;
+ }
+
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ blocksize = btrfs_level_size(root, *level - 1);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+
+ next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ btrfs_tree_lock(next);
+
+ ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
+ &refs);
+ BUG_ON(ret);
+ if (refs > 1) {
+ parent = path->nodes[*level];
+ ret = btrfs_free_extent(trans, root, bytenr,
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ btrfs_header_generation(parent),
+ *level - 1, 1);
+ BUG_ON(ret);
+ path->slots[*level]++;
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ continue;
+ }
+
+ *level = btrfs_header_level(next);
+ path->nodes[*level] = next;
+ path->slots[*level] = 0;
+ path->locks[*level] = 1;
+ cond_resched();
+ }
+out:
+ parent = path->nodes[*level + 1];
+ bytenr = path->nodes[*level]->start;
+ blocksize = path->nodes[*level]->len;
+
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+ parent->start, btrfs_header_owner(parent),
+ btrfs_header_generation(parent), *level, 1);
+ BUG_ON(ret);
+
+ if (path->locks[*level]) {
+ btrfs_tree_unlock(path->nodes[*level]);
+ path->locks[*level] = 0;
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level += 1;
+ cond_resched();
+ return 0;
+}
+
+/*
* helper for dropping snapshots. This walks back up the tree in the path
* to find the first node higher up where we haven't yet gone through
* all the slots
*/
static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path, int *level)
+ struct btrfs_path *path,
+ int *level, int max_level)
{
u64 root_owner;
u64 root_gen;
@@ -3047,7 +3137,7 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
int slot;
int ret;
- for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+ for (i = *level; i < max_level && path->nodes[i]; i++) {
slot = path->slots[i];
if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
struct extent_buffer *node;
@@ -3070,12 +3160,18 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
root_owner = btrfs_header_owner(parent);
root_gen = btrfs_header_generation(parent);
+
+ clean_tree_block(trans, root, path->nodes[*level]);
ret = btrfs_free_extent(trans, root,
path->nodes[*level]->start,
path->nodes[*level]->len,
parent->start, root_owner,
root_gen, *level, 1);
BUG_ON(ret);
+ if (path->locks[*level]) {
+ btrfs_tree_unlock(path->nodes[*level]);
+ path->locks[*level] = 0;
+ }
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
*level = i + 1;
@@ -3145,7 +3241,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
if (wret < 0)
ret = wret;
- wret = walk_up_tree(trans, root, path, &level);
+ wret = walk_up_tree(trans, root, path, &level,
+ BTRFS_MAX_LEVEL);
if (wret > 0)
break;
if (wret < 0)
@@ -3168,6 +3265,50 @@ out:
return ret;
}
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent)
+{
+ struct btrfs_path *path;
+ int level;
+ int parent_level;
+ int ret = 0;
+ int wret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ BUG_ON(!btrfs_tree_locked(parent));
+ parent_level = btrfs_header_level(parent);
+ extent_buffer_get(parent);
+ path->nodes[parent_level] = parent;
+ path->slots[parent_level] = btrfs_header_nritems(parent);
+
+ BUG_ON(!btrfs_tree_locked(node));
+ level = btrfs_header_level(node);
+ extent_buffer_get(node);
+ path->nodes[level] = node;
+ path->slots[level] = 0;
+
+ while (1) {
+ wret = walk_down_subtree(trans, root, path, &level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+
+ wret = walk_up_tree(trans, root, path, &level, parent_level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
static unsigned long calc_ra(unsigned long start, unsigned long last,
unsigned long nr)
{
@@ -3312,6 +3453,10 @@ struct btrfs_ref_path {
u32 num_refs;
int lowest_level;
int current_level;
+ int shared_level;
+
+ struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
+ u64 new_nodes[BTRFS_MAX_LEVEL];
};
struct disk_extent {
@@ -3360,6 +3505,7 @@ static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
if (first_time) {
ref_path->lowest_level = -1;
ref_path->current_level = -1;
+ ref_path->shared_level = -1;
goto walk_up;
}
walk_down:
@@ -3403,8 +3549,11 @@ walk_down:
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid == bytenr &&
- found_key.type == BTRFS_EXTENT_REF_KEY)
+ found_key.type == BTRFS_EXTENT_REF_KEY) {
+ if (level < ref_path->shared_level)
+ ref_path->shared_level = level;
goto found;
+ }
next:
level--;
btrfs_release_path(extent_root, path);
@@ -3992,51 +4141,6 @@ out:
return ret;
}
-int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
- u64 num_bytes, u64 new_bytenr)
-{
- set_extent_bits(&root->fs_info->reloc_mapping_tree,
- orig_bytenr, orig_bytenr + num_bytes - 1,
- EXTENT_LOCKED, GFP_NOFS);
- set_state_private(&root->fs_info->reloc_mapping_tree,
- orig_bytenr, new_bytenr);
- return 0;
-}
-
-int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
- u64 num_bytes, u64 *new_bytenr)
-{
- u64 bytenr;
- u64 cur_bytenr = orig_bytenr;
- u64 prev_bytenr = orig_bytenr;
- int ret;
-
- while (1) {
- ret = get_state_private(&root->fs_info->reloc_mapping_tree,
- cur_bytenr, &bytenr);
- if (ret)
- break;
- prev_bytenr = cur_bytenr;
- cur_bytenr = bytenr;
- }
-
- if (orig_bytenr == cur_bytenr)
- return -ENOENT;
-
- if (prev_bytenr != orig_bytenr) {
- set_state_private(&root->fs_info->reloc_mapping_tree,
- orig_bytenr, cur_bytenr);
- }
- *new_bytenr = cur_bytenr;
- return 0;
-}
-
-void btrfs_free_reloc_mappings(struct btrfs_root *root)
-{
- clear_extent_bits(&root->fs_info->reloc_mapping_tree,
- 0, (u64)-1, -1, GFP_NOFS);
-}
-
int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf, u64 orig_start)
@@ -4222,15 +4326,30 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_free_reloc_root(struct btrfs_root *root)
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
+ int ret;
if (root->reloc_root) {
reloc_root = root->reloc_root;
root->reloc_root = NULL;
list_add(&reloc_root->dead_list,
&root->fs_info->dead_reloc_roots);
+
+ btrfs_set_root_bytenr(&reloc_root->root_item,
+ reloc_root->node->start);
+ btrfs_set_root_level(&root->root_item,
+ btrfs_header_level(reloc_root->node));
+ memset(&reloc_root->root_item.drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ reloc_root->root_item.drop_level = 0;
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key,
+ &reloc_root->root_item);
+ BUG_ON(ret);
}
return 0;
}
@@ -4356,8 +4475,6 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_refs(root_item, 0);
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
- memset(&root_item->drop_progress, 0, sizeof(root_item->drop_progress));
- root_item->drop_level = 0;
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -4382,15 +4499,19 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
* Core function of space balance.
*
* The idea is using reloc trees to relocate tree blocks in reference
- * counted roots. There is one reloc tree for each subvol, all reloc
- * trees share same key objectid. Reloc trees are snapshots of the
- * latest committed roots (subvol root->commit_root). To relocate a tree
- * block referenced by a subvol, the code COW the block through the reloc
- * tree, then update pointer in the subvol to point to the new block.
- * Since all reloc trees share same key objectid, we can easily do special
- * handing to share tree blocks between reloc trees. Once a tree block has
- * been COWed in one reloc tree, we can use the result when the same block
- * is COWed again through other reloc trees.
+ * counted roots. There is one reloc tree for each subvol, and all
+ * reloc trees share same root key objectid. Reloc trees are snapshots
+ * of the latest committed roots of subvols (root->commit_root).
+ *
+ * To relocate a tree block referenced by a subvol, there are two steps.
+ * COW the block through subvol's reloc tree, then update block pointer
+ * in the subvol to point to the new block. Since all reloc trees share
+ * same root key objectid, doing special handing for tree blocks owned
+ * by them is easy. Once a tree block has been COWed in one reloc tree,
+ * we can use the resulting new block directly when the same block is
+ * required to COW again through other reloc trees. By this way, relocated
+ * tree blocks are shared between reloc trees, so they are also shared
+ * between subvols.
*/
static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -4405,15 +4526,14 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
struct btrfs_key *keys;
u64 *nodes;
int level;
- int lowest_merge;
+ int shared_level;
int lowest_level = 0;
- int update_refs;
int ret;
if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
lowest_level = ref_path->owner_objectid;
- if (is_cowonly_root(ref_path->root_objectid)) {
+ if (!root->ref_cows) {
path->lowest_level = lowest_level;
ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
BUG_ON(ret < 0);
@@ -4422,91 +4542,49 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
return 0;
}
- keys = kzalloc(sizeof(*keys) * BTRFS_MAX_LEVEL, GFP_NOFS);
- BUG_ON(!keys);
- nodes = kzalloc(sizeof(*nodes) * BTRFS_MAX_LEVEL, GFP_NOFS);
- BUG_ON(!nodes);
-
mutex_lock(&root->fs_info->tree_reloc_mutex);
ret = init_reloc_tree(trans, root);
BUG_ON(ret);
reloc_root = root->reloc_root;
- path->lowest_level = lowest_level;
- ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 0);
- BUG_ON(ret);
- /*
- * get relocation mapping for tree blocks in the path
- */
- lowest_merge = BTRFS_MAX_LEVEL;
- for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
- u64 new_bytenr;
- eb = path->nodes[level];
- if (!eb || eb == reloc_root->node)
- continue;
- ret = btrfs_get_reloc_mapping(reloc_root, eb->start, eb->len,
- &new_bytenr);
- if (ret)
- continue;
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &keys[level], 0);
- else
- btrfs_node_key_to_cpu(eb, &keys[level], 0);
- nodes[level] = new_bytenr;
- lowest_merge = level;
- }
+ shared_level = ref_path->shared_level;
+ ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
- update_refs = 0;
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- eb = path->nodes[0];
- if (btrfs_header_generation(eb) < trans->transid)
- update_refs = 1;
- }
+ keys = ref_path->node_keys;
+ nodes = ref_path->new_nodes;
+ memset(&keys[shared_level + 1], 0,
+ sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
+ memset(&nodes[shared_level + 1], 0,
+ sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
- btrfs_release_path(reloc_root, path);
- /*
- * merge tree blocks that already relocated in other reloc trees
- */
- if (lowest_merge != BTRFS_MAX_LEVEL) {
+ if (nodes[lowest_level] == 0) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 1);
+ BUG_ON(ret);
+ for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+ eb = path->nodes[level];
+ if (!eb || eb == reloc_root->node)
+ break;
+ nodes[level] = eb->start;
+ if (level == 0)
+ btrfs_item_key_to_cpu(eb, &keys[level], 0);
+ else
+ btrfs_node_key_to_cpu(eb, &keys[level], 0);
+ }
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ eb = path->nodes[0];
+ ret = replace_extents_in_leaf(trans, reloc_root, eb,
+ group, reloc_inode);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(reloc_root, path);
+ } else {
ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
- lowest_merge);
- BUG_ON(ret < 0);
- }
- /*
- * cow any tree blocks that still haven't been relocated
- */
- ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 1);
- BUG_ON(ret);
- /*
- * if we are relocating data block group, update extent pointers
- * in the newly created tree leaf.
- */
- eb = path->nodes[0];
- if (update_refs && nodes[0] != eb->start) {
- ret = replace_extents_in_leaf(trans, reloc_root, eb, group,
- reloc_inode);
+ lowest_level);
BUG_ON(ret);
}
- memset(keys, 0, sizeof(*keys) * BTRFS_MAX_LEVEL);
- memset(nodes, 0, sizeof(*nodes) * BTRFS_MAX_LEVEL);
- for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
- eb = path->nodes[level];
- if (!eb || eb == reloc_root->node)
- continue;
- BUG_ON(btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID);
- nodes[level] = eb->start;
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &keys[level], 0);
- else
- btrfs_node_key_to_cpu(eb, &keys[level], 0);
- }
-
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- eb = path->nodes[0];
- extent_buffer_get(eb);
- }
- btrfs_release_path(reloc_root, path);
/*
* replace tree blocks in the fs tree with tree blocks in
* the reloc tree.
@@ -4515,15 +4593,19 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 0);
+ BUG_ON(ret);
+ extent_buffer_get(path->nodes[0]);
+ eb = path->nodes[0];
+ btrfs_release_path(reloc_root, path);
ret = invalidate_extent_cache(reloc_root, eb, group, root);
BUG_ON(ret);
free_extent_buffer(eb);
}
- mutex_unlock(&root->fs_info->tree_reloc_mutex);
+ mutex_unlock(&root->fs_info->tree_reloc_mutex);
path->lowest_level = 0;
- kfree(nodes);
- kfree(keys);
return 0;
}
OpenPOWER on IntegriCloud