summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fb.com>2014-01-23 10:54:11 -0500
committerChris Mason <clm@fb.com>2014-01-28 13:20:26 -0800
commit0a2b2a844af616addc87cac3cc18dcaba2a9d0fb (patch)
treed81e13b3388df4a66e3a2af6ff2df82f532d5c9e /fs
parentd7df2c796d7eedd72a334dc89c65e1fec8171431 (diff)
downloadop-kernel-dev-0a2b2a844af616addc87cac3cc18dcaba2a9d0fb.zip
op-kernel-dev-0a2b2a844af616addc87cac3cc18dcaba2a9d0fb.tar.gz
Btrfs: throttle delayed refs better
On one of our gluster clusters we noticed some pretty big lag spikes. This turned out to be because our transaction commit was taking like 3 minutes to complete. This is because we have like 30 gigs of metadata, so our global reserve would end up being the max which is like 512 mb. So our throttling code would allow a ridiculous amount of delayed refs to build up and then they'd all get run at transaction commit time, and for a cold mounted file system that could take up to 3 minutes to run. So fix the throttling to be based on both the size of the global reserve and how long it takes us to run delayed refs. This patch tracks the time it takes to run delayed refs and then only allows 1 seconds worth of outstanding delayed refs at a time. This way it will auto-tune itself from cold cache up to when everything is in memory and it no longer has to go to disk. This makes our transaction commits take much less time to run. Thanks, Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c41
-rw-r--r--fs/btrfs/transaction.c4
4 files changed, 46 insertions, 4 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3cebb4a..ca6bcc3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1360,6 +1360,7 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
+ u64 avg_delayed_ref_runtime;
/*
* this is updated to the current trans every time a full commit
@@ -3172,6 +3173,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ed23127..f0e7bbe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb,
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-
+ fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
spin_lock_init(&fs_info->reada_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c77156c..b532259 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_fs_info *fs_info = root->fs_info;
+ ktime_t start = ktime_get();
int ret;
unsigned long count = 0;
+ unsigned long actual_count = 0;
int must_insert_reserved = 0;
delayed_refs = &trans->transaction->delayed_refs;
@@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
&delayed_refs->href_root);
spin_unlock(&delayed_refs->lock);
} else {
+ actual_count++;
ref->in_tree = 0;
rb_erase(&ref->rb_node, &locked_ref->ref_root);
}
@@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
count++;
cond_resched();
}
+
+ /*
+ * We don't want to include ref heads since we can have empty ref heads
+ * and those will drastically skew our runtime down since we just do
+ * accounting, no actual extent tree updates.
+ */
+ if (actual_count > 0) {
+ u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+ u64 avg;
+
+ /*
+ * We weigh the current average higher than our current runtime
+ * to avoid large swings in the average.
+ */
+ spin_lock(&delayed_refs->lock);
+ avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+ avg = div64_u64(avg, 4);
+ fs_info->avg_delayed_ref_runtime = avg;
+ spin_unlock(&delayed_refs->lock);
+ }
return 0;
}
@@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_rsv *global_rsv;
@@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
}
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 num_entries =
+ atomic_read(&trans->transaction->delayed_refs.num_entries);
+ u64 avg_runtime;
+
+ smp_mb();
+ avg_runtime = fs_info->avg_delayed_ref_runtime;
+ if (num_entries * avg_runtime >= NSEC_PER_SEC)
+ return 1;
+
+ return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fd14464..5e2bfda 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -645,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
if (root->fs_info->global_block_rsv.space_info->full &&
- btrfs_should_throttle_delayed_refs(trans, root))
+ btrfs_check_space_for_delayed_refs(trans, root))
return 1;
return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
@@ -710,7 +710,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
- cur = max_t(unsigned long, cur, 1);
+ cur = max_t(unsigned long, cur, 32);
trans->delayed_ref_updates = 0;
btrfs_run_delayed_refs(trans, root, cur);
}
OpenPOWER on IntegriCloud