-rw-r--r--  Documentation/block/cfq-iosched.txt         |  58
-rw-r--r--  Documentation/cgroups/blkio-controller.txt  |  35
-rw-r--r--  block/Kconfig                               |   1
-rw-r--r--  block/blk-cgroup.c                          | 277
-rw-r--r--  block/blk-cgroup.h                          |  68
-rw-r--r--  block/blk-core.c                            |  18
-rw-r--r--  block/blk-exec.c                            |   4
-rw-r--r--  block/blk-flush.c                           |   2
-rw-r--r--  block/blk-lib.c                             |   6
-rw-r--r--  block/blk-sysfs.c                           |   9
-rw-r--r--  block/blk.h                                 |   2
-rw-r--r--  block/cfq-iosched.c                         | 629
-rw-r--r--  block/elevator.c                            |  23
-rw-r--r--  drivers/block/swim3.c                       |   5
-rw-r--r--  drivers/md/dm.c                             |   1
-rw-r--r--  drivers/md/raid5.c                          |  11
-rw-r--r--  fs/bio.c                                    |   2
-rw-r--r--  fs/block_dev.c                              |   6
-rw-r--r--  fs/buffer.c                                 |  10
-rw-r--r--  fs/fs-writeback.c                           |  16
-rw-r--r--  include/linux/blkdev.h                      |   3
-rw-r--r--  include/linux/blktrace_api.h                |   1
-rw-r--r--  include/linux/buffer_head.h                 |   2
-rw-r--r--  include/linux/completion.h                  |   3
-rw-r--r--  include/linux/elevator.h                    |   5
-rw-r--r--  include/trace/events/block.h                | 104
-rw-r--r--  include/trace/events/writeback.h            | 116
-rw-r--r--  kernel/sched/core.c                         |  57
-rw-r--r--  kernel/trace/blktrace.c                     |  28
-rw-r--r--  mm/page-writeback.c                         |   2
30 files changed, 1246 insertions(+), 258 deletions(-)
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index d89b4fe..a5eb7d1 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -102,6 +102,64 @@ processing of request.
 Therefore, increasing the value can improve the performance, although
 this can cause the latency of some I/O to increase due to a larger
 number of requests.
 
+CFQ Group scheduling
+====================
+
+CFQ supports blkio cgroup and has "blkio." prefixed files in each
+blkio cgroup directory.  It is weight-based and there are four knobs
+for configuration - weight[_device] and leaf_weight[_device].
+Internal cgroup nodes (the ones with children) can also have tasks in
+them, so the former two configure how much proportion the cgroup as a
+whole is entitled to at its parent's level while the latter two
+configure how much proportion the tasks in the cgroup have compared to
+its direct children.
+
+Another way to think about it is assuming that each internal node has
+an implicit leaf child node which hosts all the tasks, and whose weight
+is configured by leaf_weight[_device].  Let's assume a blkio hierarchy
+composed of five cgroups - root, A, B, AA and AB - with the following
+weights where the names represent the hierarchy.
+
+               weight leaf_weight
+        root :  125    125
+        A    :  500    750
+        B    :  250    500
+        AA   :  500    500
+        AB   : 1000    500
+
+root never has a parent, making its weight meaningless.  For backward
+compatibility, weight is always kept in sync with leaf_weight.  B, AA
+and AB have no children and thus their tasks have no child cgroups to
+compete with.  They always get 100% of what the cgroup won at the
+parent level.  Considering only the weights which matter, the hierarchy
+looks like the following.
+
+                root
+             /   |   \
+            A    B   leaf
+           500  250   125
+         /  |  \
+        AA  AB  leaf
+       500 1000  750
+
+If all cgroups have active IOs and are competing with each other, disk
+time will be distributed like the following.
+
+Distribution below root.  The total active weight at this level is
+A:500 + B:250 + root-leaf:125 = 875.
+
+        root-leaf :  125 /  875       =~ 14%
+        A         :  500 /  875       =~ 57%
+        B(-leaf)  :  250 /  875       =~ 28%
+
+A has children and further distributes its 57% among the children and
+the implicit leaf node.  The total active weight at this level is
+AA:500 + AB:1000 + A-leaf:750 = 2250.
+
+        A-leaf    : ( 750 / 2250) * A =~ 19%
+        AA(-leaf) : ( 500 / 2250) * A =~ 12%
+        AB(-leaf) : (1000 / 2250) * A =~ 25%
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority
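[Editorial note: to make the two-level arithmetic in the hunk above easy to
verify, here is a small standalone C sketch. It is an illustration, not part
of the patch; the cgroup names and weights mirror the example table.]

/* Editorial sketch: recompute the CFQ group-scheduling example shares. */
#include <stdio.h>

int main(void)
{
	/* Level below root: A:500, B:250 and root's implicit leaf:125. */
	double root_level = 500 + 250 + 125;	/* = 875 */
	double a  = 500 / root_level;		/* =~ 57.1% */
	double b  = 250 / root_level;		/* =~ 28.6% */
	double rl = 125 / root_level;		/* =~ 14.3% */

	/* A redistributes its share below: AA:500, AB:1000, A-leaf:750. */
	double a_level = 500 + 1000 + 750;	/* = 2250 */
	double aa     = a *  500 / a_level;	/* =~ 12.7% */
	double ab     = a * 1000 / a_level;	/* =~ 25.4% */
	double a_leaf = a *  750 / a_level;	/* =~ 19.0% */

	printf("root-leaf %.1f%%  A %.1f%%  B %.1f%%\n",
	       rl * 100, a * 100, b * 100);
	printf("A-leaf %.1f%%  AA %.1f%%  AB %.1f%%\n",
	       a_leaf * 100, aa * 100, ab * 100);
	return 0;
}

[The five leaf-level percentages (14.3 + 28.6 + 12.7 + 25.4 + 19.0) sum to
100%, matching the figures derived in the documentation.]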
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index a794ce9..da272c8 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,13 +94,11 @@ Throttling/Upper Limit policy
 Hierarchical Cgroups
 ====================
-- Currently none of the IO control policy supports hierarchical groups. But
-  cgroup interface does allow creation of hierarchical cgroups and internally
-  IO policies treat them as flat hierarchy.
+- Currently only CFQ supports hierarchical groups.  For throttling,
+  the cgroup interface does allow creation of hierarchical cgroups and
+  internally it treats them as a flat hierarchy.
 
-  So this patch will allow creation of cgroup hierarchcy but at the backend
-  everything will be treated as flat. So if somebody created a hierarchy like
-  as follows.
+  If somebody creates a hierarchy like the following:
 
 			root
 			/  \
@@ -108,16 +106,20 @@ Hierarchical Cgroups
 			|
 		     test3
 
-  CFQ and throttling will practically treat all groups at same level.
+  CFQ will handle the hierarchy correctly but throttling will
+  practically treat all groups as being at the same level.  For details
+  on CFQ hierarchy support, refer to
+  Documentation/block/cfq-iosched.txt.  Throttling will treat the
+  hierarchy as if it looked like the following.
 
 			  pivot
 		       /  /  \  \
 		   root test1 test2 test3
 
-  Down the line we can implement hierarchical accounting/control support
-  and also introduce a new cgroup file "use_hierarchy" which will control
-  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
-  This is how memory controller also has implemented the things.
+  Nesting cgroups, while allowed, isn't officially supported and blkio
+  generates a warning when cgroups nest.  Once throttling implements
+  hierarchy support, nested cgroups will be fully supported and the
+  warning will be removed.
 
 Various user visible config options
 ===================================
@@ -172,6 +174,12 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+	  deciding how much weight the tasks in the given cgroup have
+	  while competing with the cgroup's child cgroups.  For details,
+	  please refer to Documentation/block/cfq-iosched.txt.
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -279,6 +287,11 @@ Proportional weight policy files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.*_recursive
+	- Recursive versions of various stats.  These files show the
+	  same information as their non-recursive counterparts but
+	  include stats from all the descendant cgroups.
+
 Throttling/Upper limit policy files
 -----------------------------------
 - blkio.throttle.read_bps_device
diff --git a/block/Kconfig b/block/Kconfig
index 4a85ccf8..a7e40a7 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,7 +4,6 @@ menuconfig BLOCK
 	bool "Enable the block layer" if EXPERT
 	default y
-	select PERCPU_RWSEM
 	help
 	  Provide block layer support for the kernel.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8bdebb6..b2b9837 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -26,11 +26,32 @@
 
 static DEFINE_MUTEX(blkcg_pol_mutex);
 
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
+struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
+			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+				      struct request_queue *q, bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip a
+ * subtree.
+ */ +#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ + cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ + (p_blkg)->q, false))) + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->pd[i] = pd; pd->blkg = blkg; + pd->plid = i; /* invoke per-policy init */ - if (blkcg_policy_enabled(blkg->q, pol)) + if (pol->pd_init_fn) pol->pd_init_fn(blkg); } @@ -125,8 +147,19 @@ err_free: return NULL; } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; @@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, return blkg; /* - * Hint didn't match. Look up from the radix tree. Note that we - * may not be holding queue_lock and thus are not sure whether - * @blkg from blkg_tree has already been removed or not, so we - * can't update hint to the lookup result. Leave it to the caller. + * Hint didn't match. Look up from the radix tree. Note that the + * hint can only be updated under queue_lock as otherwise @blkg + * could have already been removed from blkg_tree. The caller is + * responsible for grabbing queue_lock if @update_hint. */ blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) + if (blkg && blkg->q == q) { + if (update_hint) { + lockdep_assert_held(q->queue_lock); + rcu_assign_pointer(blkcg->blkg_hint, blkg); + } return blkg; + } return NULL; } @@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) if (unlikely(blk_queue_bypass(q))) return NULL; - return __blkg_lookup(blkcg, q); + return __blkg_lookup(blkcg, q, false); } EXPORT_SYMBOL_GPL(blkg_lookup); @@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup); * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 
*/ -static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, - struct blkcg_gq *new_blkg) +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, + struct request_queue *q, + struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - int ret; + int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); - /* lookup and update hint on success, see __blkg_lookup() for details */ - blkg = __blkg_lookup(blkcg, q); - if (blkg) { - rcu_assign_pointer(blkcg->blkg_hint, blkg); - goto out_free; - } - /* blkg holds a reference to blkcg */ if (!css_tryget(&blkcg->css)) { - blkg = ERR_PTR(-EINVAL); - goto out_free; + ret = -EINVAL; + goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { - blkg = ERR_PTR(-ENOMEM); - goto out_put; + ret = -ENOMEM; + goto err_put_css; } } blkg = new_blkg; - /* insert */ + /* link parent and insert */ + if (blkcg_parent(blkcg)) { + blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + if (WARN_ON_ONCE(!blkg->parent)) { + blkg = ERR_PTR(-EINVAL); + goto err_put_css; + } + blkg_get(blkg->parent); + } + spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &q->blkg_list); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_online_fn) + pol->pd_online_fn(blkg); + } } + blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; - blkg = ERR_PTR(ret); -out_put: + /* @blkg failed fully initialized, use the usual release path */ + blkg_put(blkg); + return ERR_PTR(ret); + +err_put_css: css_put(&blkcg->css); -out_free: +err_free_blkg: blkg_free(new_blkg); - return blkg; + return ERR_PTR(ret); } +/** + * blkg_lookup_create - lookup blkg, try to create one if not there + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * create one. blkg creation is performed recursively from blkcg_root such + * that all non-root blkg's have access to the parent blkg. This function + * should be called under RCU read lock and @q->queue_lock. + * + * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). + */ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { + struct blkcg_gq *blkg; + + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + /* * This could be the first entry point of blkcg implementation and * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); - return __blkg_lookup_create(blkcg, q, NULL); + + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + return blkg; + + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. 
+ */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent = blkcg_parent(blkcg); + + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } + + blkg = blkg_create(pos, q, NULL); + if (pos == blkcg || IS_ERR(blkg)) + return blkg; + } } EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + int i; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); @@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_offline_fn) + pol->pd_offline_fn(blkg); + } + blkg->online = false; + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); @@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head) void __blkg_release(struct blkcg_gq *blkg) { - /* release the extra blkcg reference this blkg has been holding */ + /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); /* * A group is freed in rcu manner. But having an rcu lock does not @@ -401,8 +503,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg) * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the - * policy data and @data. If @show_total is %true, the sum of the return - * values from @prfill is printed with "Total" label at the end. + * policy data and @data and the matching queue lock held. If @show_total + * is %true, the sum of the return values from @prfill is printed with + * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. @@ -416,11 +519,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, struct blkcg_gq *blkg; u64 total = 0; - spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(&blkcg->lock); + spin_unlock_irq(blkg->q->queue_lock); + } + rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); @@ -479,6 +585,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); /** * blkg_prfill_stat - prfill callback for blkg_stat @@ -512,6 +619,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); /** + * blkg_stat_recursive_sum - collect hierarchical blkg_stat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_stat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. 
+ */ +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup *pos_cgrp; + u64 sum; + + lockdep_assert_held(pd->blkg->q->queue_lock); + + sum = blkg_stat_read((void *)pd + off); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_stat *stat = (void *)pos_pd + off; + + if (pos_blkg->online) + sum += blkg_stat_read(stat); + } + rcu_read_unlock(); + + return sum; +} +EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_rwstat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup *pos_cgrp; + struct blkg_rwstat sum; + int i; + + lockdep_assert_held(pd->blkg->q->queue_lock); + + sum = blkg_rwstat_read((void *)pd + off); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_rwstat *rwstat = (void *)pos_pd + off; + struct blkg_rwstat tmp; + + if (!pos_blkg->online) + continue; + + tmp = blkg_rwstat_read(rwstat); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum.cnt[i] += tmp.cnt[i]; + } + rcu_read_unlock(); + + return sum; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); + +/** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy @@ -656,6 +839,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; + blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ done: spin_lock_init(&blkcg->lock); @@ -775,7 +959,7 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - struct blkcg_gq *blkg; + struct blkcg_gq *blkg, *new_blkg; struct blkg_policy_data *pd, *n; int cnt = 0, ret; bool preloaded; @@ -784,19 +968,27 @@ int blkcg_activate_policy(struct request_queue *q, return 0; /* preallocations for root blkg */ - blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!blkg) + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); blk_queue_bypass_start(q); - /* make sure the root blkg exists and count the existing blkgs */ + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code it. 
+ */ spin_lock_irq(q->queue_lock); rcu_read_lock(); - blkg = __blkg_lookup_create(&blkcg_root, q, blkg); + blkg = __blkg_lookup(&blkcg_root, q, false); + if (blkg) + blkg_free(new_blkg); + else + blkg = blkg_create(&blkcg_root, q, new_blkg); rcu_read_unlock(); if (preloaded) @@ -844,6 +1036,7 @@ int blkcg_activate_policy(struct request_queue *q, blkg->pd[pol->plid] = pd; pd->blkg = blkg; + pd->plid = pol->plid; pol->pd_init_fn(blkg); spin_unlock(&blkg->blkcg->lock); @@ -890,6 +1083,8 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg); if (pol->pd_exit_fn) pol->pd_exit_fn(blkg); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2459730..f2b2929 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -54,6 +54,7 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ + unsigned int cfq_leaf_weight; }; struct blkg_stat { @@ -80,8 +81,9 @@ struct blkg_rwstat { * beginning and pd_size can't be smaller than pd. */ struct blkg_policy_data { - /* the blkg this per-policy data belongs to */ + /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; + int plid; /* used during policy activation */ struct list_head alloc_node; @@ -94,17 +96,27 @@ struct blkcg_gq { struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + /* request allocation list for this blkcg-q pair */ struct request_list rl; + /* reference count */ int refcnt; + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; }; typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -117,6 +129,8 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; @@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; @@ -181,6 +199,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) } /** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + struct cgroup *pcg = blkcg->css.cgroup->parent; + + return pcg ? 
cgroup_to_blkcg(pcg) : NULL; +} + +/** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest @@ -387,6 +418,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat) } /** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + +/** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat * @rw: mask of REQ_{WRITE|SYNC} @@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) } /** - * blkg_rwstat_sum - read the total count of a blkg_rwstat + * blkg_rwstat_total - read the total count of a blkg_rwstat * @rwstat: blkg_rwstat to read * * Return the total count of @rwstat regardless of the IO direction. This * function can be called without synchronization and takes care of u64 * atomicity. */ -static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); @@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); } +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. + */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + #else /* CONFIG_BLK_CGROUP */ struct cgroup; diff --git a/block/blk-core.c b/block/blk-core.c index 277134c..074b758 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,7 +39,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); -EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); @@ -1348,7 +1347,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, if (!ll_back_merge_fn(q, req, bio)) return false; - trace_block_bio_backmerge(q, bio); + trace_block_bio_backmerge(q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -1370,7 +1369,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, if (!ll_front_merge_fn(q, req, bio)) return false; - trace_block_bio_frontmerge(q, bio); + trace_block_bio_frontmerge(q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -1553,13 +1552,6 @@ get_rq: if (list_empty(&plug->list)) trace_block_plug(q); else { - if (!plug->should_sort) { - struct request *__rq; - - __rq = list_entry_rq(plug->list.prev); - if (__rq->q != q) - plug->should_sort = 1; - } if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); trace_block_plug(q); @@ -2890,7 +2882,6 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); - plug->should_sort = 0; /* * If this is a nested plug, don't actually assign it. 
It will be @@ -2992,10 +2983,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) list_splice_init(&plug->list, &list); - if (plug->should_sort) { - list_sort(NULL, &list, plug_rq_cmp); - plug->should_sort = 0; - } + list_sort(NULL, &list, plug_rq_cmp); q = NULL; depth = 0; diff --git a/block/blk-exec.c b/block/blk-exec.c index c88202f..e706213 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -121,9 +121,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; if (hang_check) - while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else - wait_for_completion(&wait); + wait_for_completion_io(&wait); if (rq->errors) err = -EIO; diff --git a/block/blk-flush.c b/block/blk-flush.c index 720ad60..db8f1b5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -436,7 +436,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio_get(bio); submit_bio(WRITE_FLUSH, bio); - wait_for_completion(&wait); + wait_for_completion_io(&wait); /* * The driver must store the error location in ->bi_sector, if diff --git a/block/blk-lib.c b/block/blk-lib.c index b3a1f2b7..d6f50d5 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -126,7 +126,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* Wait for bios in-flight */ if (!atomic_dec_and_test(&bb.done)) - wait_for_completion(&wait); + wait_for_completion_io(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) ret = -EIO; @@ -200,7 +200,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, /* Wait for bios in-flight */ if (!atomic_dec_and_test(&bb.done)) - wait_for_completion(&wait); + wait_for_completion_io(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) ret = -ENOTSUPP; @@ -262,7 +262,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, /* Wait for bios in-flight */ if (!atomic_dec_and_test(&bb.done)) - wait_for_completion(&wait); + wait_for_completion_io(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) /* One of bios in the batch was completed with error.*/ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7881477..6206a93 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, return res; } +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + struct request_queue *q = container_of(rcu_head, struct request_queue, + rcu_head); + kmem_cache_free(blk_requestq_cachep, q); +} + /** * blk_release_queue: - release a &struct request_queue when it is no longer needed * @kobj: the kobj belonging to the request queue to be released @@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj) bdi_destroy(&q->backing_dev_info); ida_simple_remove(&blk_queue_ida, q->id); - kmem_cache_free(blk_requestq_cachep, q); + call_rcu(&q->rcu_head, blk_free_queue_rcu); } static const struct sysfs_ops queue_sysfs_ops = { diff --git a/block/blk.h b/block/blk.h index 47fdfdd..e837b8f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -61,7 +61,7 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +#define ELV_ON_HASH(rq) hash_hashed(&(rq)->hash) void blk_insert_flush(struct request *rq); void blk_abort_flushes(struct request_queue *q); diff --git 
a/block/cfq-iosched.c b/block/cfq-iosched.c index ec52807..4f0ade7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -85,7 +85,6 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; - unsigned total_weight; u64 min_vdisktime; struct cfq_ttime ttime; }; @@ -155,7 +154,7 @@ struct cfq_queue { * First index in the service_trees. * IDLE is handled separately, so it has negative index */ -enum wl_prio_t { +enum wl_class_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, @@ -223,10 +222,45 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; + + /* + * The number of active cfqgs and sum of their weights under this + * cfqg. This covers this cfqg's leaf_weight and all children's + * weights, but does not cover weights of further descendants. + * + * If a cfqg is on the service tree, it's active. An active cfqg + * also activates its parent and contributes to the children_weight + * of the parent. + */ + int nr_active; + unsigned int children_weight; + + /* + * vfraction is the fraction of vdisktime that the tasks in this + * cfqg are entitled to. This is determined by compounding the + * ratios walking up from this cfqg to the root. + * + * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all + * vfractions on a service tree is approximately 1. The sum may + * deviate a bit due to rounding errors and fluctuations caused by + * cfqgs entering and leaving the service tree. + */ + unsigned int vfraction; + + /* + * There are two weights - (internal) weight is the weight of this + * cfqg against the sibling cfqgs. leaf_weight is the wight of + * this cfqg against the child cfqgs. For the root cfqg, both + * weights are kept in sync for backward compatibility. + */ unsigned int weight; unsigned int new_weight; unsigned int dev_weight; + unsigned int leaf_weight; + unsigned int new_leaf_weight; + unsigned int dev_leaf_weight; + /* number of cfqq currently on this group */ int nr_cfqq; @@ -248,14 +282,15 @@ struct cfq_group { struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; - unsigned long saved_workload_slice; - enum wl_type_t saved_workload; - enum wl_prio_t saved_serving_prio; + unsigned long saved_wl_slice; + enum wl_type_t saved_wl_type; + enum wl_class_t saved_wl_class; /* number of requests that are on the dispatch list or inside driver */ int dispatched; struct cfq_ttime ttime; - struct cfqg_stats stats; + struct cfqg_stats stats; /* stats for this cfqg */ + struct cfqg_stats dead_stats; /* stats pushed from dead children */ }; struct cfq_io_cq { @@ -280,8 +315,8 @@ struct cfq_data { /* * The priority currently being served */ - enum wl_prio_t serving_prio; - enum wl_type_t serving_type; + enum wl_class_t serving_wl_class; + enum wl_type_t serving_wl_type; unsigned long workload_expires; struct cfq_group *serving_group; @@ -353,17 +388,17 @@ struct cfq_data { static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); -static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, - enum wl_prio_t prio, +static struct cfq_rb_root *st_for(struct cfq_group *cfqg, + enum wl_class_t class, enum wl_type_t type) { if (!cfqg) return NULL; - if (prio == IDLE_WORKLOAD) + if (class == IDLE_WORKLOAD) return &cfqg->service_tree_idle; - return &cfqg->service_trees[prio][type]; + return &cfqg->service_trees[class][type]; } enum cfqq_state_flags { @@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { struct cfqg_stats *stats = &cfqg->stats; - if 
(blkg_rwstat_sum(&stats->queued)) + if (blkg_rwstat_total(&stats->queued)) return; /* @@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) struct cfqg_stats *stats = &cfqg->stats; blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_sum(&stats->queued)); + blkg_rwstat_total(&stats->queued)); blkg_stat_add(&stats->avg_queue_size_samples, 1); cfqg_stats_update_group_wait_time(stats); } @@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) +{ + struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; + + return pblkg ? blkg_to_cfqg(pblkg) : NULL; +} + static inline void cfqg_get(struct cfq_group *cfqg) { return blkg_get(cfqg_to_blkg(cfqg)); @@ -586,8 +628,9 @@ static inline void cfqg_put(struct cfq_group *cfqg) char __pbuf[128]; \ \ blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ __pbuf, ##args); \ } while (0) @@ -646,11 +689,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, io_start_time - start_time); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +/* @stats = 0 */ +static void cfqg_stats_reset(struct cfqg_stats *stats) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfqg_stats *stats = &cfqg->stats; - /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->service_bytes); blkg_rwstat_reset(&stats->serviced); @@ -669,13 +710,58 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg) #endif } +/* @to += @from */ +static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); + blkg_rwstat_merge(&to->serviced, &from->serviced); + blkg_rwstat_merge(&to->merged, &from->merged); + blkg_rwstat_merge(&to->service_time, &from->service_time); + blkg_rwstat_merge(&to->wait_time, &from->wait_time); + blkg_stat_merge(&from->time, &from->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_merge(&to->dequeue, &from->dequeue); + blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); + blkg_stat_merge(&to->idle_time, &from->idle_time); + blkg_stat_merge(&to->empty_time, &from->empty_time); +#endif +} + +/* + * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * recursive stats can still account for the amount used by this cfqg after + * it's gone. 
+ */ +static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) +{ + struct cfq_group *parent = cfqg_parent(cfqg); + + lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); + cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_reset(&cfqg->stats); + cfqg_stats_reset(&cfqg->dead_stats); +} + #else /* CONFIG_CFQ_GROUP_IOSCHED */ +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } static inline void cfqg_get(struct cfq_group *cfqg) { } static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ + ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, @@ -732,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd) return false; } -static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) return IDLE_WORKLOAD; @@ -751,23 +837,23 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) return SYNC_WORKLOAD; } -static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, +static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, struct cfq_data *cfqd, struct cfq_group *cfqg) { - if (wl == IDLE_WORKLOAD) + if (wl_class == IDLE_WORKLOAD) return cfqg->service_tree_idle.count; - return cfqg->service_trees[wl][ASYNC_WORKLOAD].count - + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count - + cfqg->service_trees[wl][SYNC_WORKLOAD].count; + return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; } static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, struct cfq_group *cfqg) { - return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count - + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -847,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +/** + * cfqg_scale_charge - scale disk time charge according to cfqg weight + * @charge: disk time being charged + * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT + * + * Scale @charge according to @vfraction, which is in range (0, 1]. The + * scaling is inversely proportional. + * + * scaled = charge / vfraction + * + * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. 
+ */ +static inline u64 cfqg_scale_charge(unsigned long charge, + unsigned int vfraction) { - u64 d = delta << CFQ_SERVICE_SHIFT; + u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ - d = d * CFQ_WEIGHT_DEFAULT; - do_div(d, cfqg->weight); - return d; + /* charge / vfraction */ + c <<= CFQ_SERVICE_SHIFT; + do_div(c, vfraction); + return c; } static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) @@ -909,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, static inline unsigned cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) { - struct cfq_rb_root *st = &cfqd->grp_service_tree; - - return cfqd->cfq_target_latency * cfqg->weight / st->total_weight; + return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; } static inline unsigned @@ -1178,20 +1276,61 @@ static void cfq_update_group_weight(struct cfq_group *cfqg) { BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } + + if (cfqg->new_leaf_weight) { + cfqg->leaf_weight = cfqg->new_leaf_weight; + cfqg->new_leaf_weight = 0; + } } static void cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) { + unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ + struct cfq_group *pos = cfqg; + struct cfq_group *parent; + bool propagate; + + /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); cfq_update_group_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); - st->total_weight += cfqg->weight; + + /* + * Activate @cfqg and calculate the portion of vfraction @cfqg is + * entitled to. vfraction is calculated by walking the tree + * towards the root calculating the fraction it has at each level. + * The compounded ratio is how much vfraction @cfqg owns. + * + * Start with the proportion tasks in this cfqg has against active + * children cfqgs - its leaf_weight against children_weight. + */ + propagate = !pos->nr_active++; + pos->children_weight += pos->leaf_weight; + vfr = vfr * pos->leaf_weight / pos->children_weight; + + /* + * Compound ->weight walking up the tree. Both activation and + * vfraction calculation are done in the same loop. Propagation + * stops once an already activated node is met. vfraction + * calculation should always continue to the root. + */ + while ((parent = cfqg_parent(pos))) { + if (propagate) { + propagate = !parent->nr_active++; + parent->children_weight += pos->weight; + } + vfr = vfr * pos->weight / parent->children_weight; + pos = parent; + } + + cfqg->vfraction = max_t(unsigned, vfr, 1); } static void @@ -1222,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) static void cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) { - st->total_weight -= cfqg->weight; + struct cfq_group *pos = cfqg; + bool propagate; + + /* + * Undo activation from cfq_group_service_tree_add(). Deactivate + * @cfqg and propagate deactivation upwards. 
+ */ + propagate = !--pos->nr_active; + pos->children_weight -= pos->leaf_weight; + + while (propagate) { + struct cfq_group *parent = cfqg_parent(pos); + + /* @pos has 0 nr_active at this point */ + WARN_ON_ONCE(pos->children_weight); + pos->vfraction = 0; + + if (!parent) + break; + + propagate = !--parent->nr_active; + parent->children_weight -= pos->weight; + pos = parent; + } + + /* remove from the service tree */ if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); } @@ -1241,7 +1405,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfq_group_service_tree_del(st, cfqg); - cfqg->saved_workload_slice = 0; + cfqg->saved_wl_slice = 0; cfqg_stats_update_dequeue(cfqg); } @@ -1284,6 +1448,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, unsigned int used_sl, charge, unaccounted_sl = 0; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; + unsigned int vfr; BUG_ON(nr_sync < 0); used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); @@ -1293,20 +1458,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, else if (!cfq_cfqq_sync(cfqq) && !nr_sync) charge = cfqq->allocated_slice; - /* Can't update vdisktime while group is on service tree */ + /* + * Can't update vdisktime while on service tree and cfqg->vfraction + * is valid only while on it. Cache vfr, leave the service tree, + * update vdisktime and go back on. The re-addition to the tree + * will also update the weights as necessary. + */ + vfr = cfqg->vfraction; cfq_group_service_tree_del(st, cfqg); - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); - /* If a new weight was requested, update now, off tree */ + cfqg->vdisktime += cfqg_scale_charge(charge, vfr); cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ if (time_after(cfqd->workload_expires, jiffies)) { - cfqg->saved_workload_slice = cfqd->workload_expires + cfqg->saved_wl_slice = cfqd->workload_expires - jiffies; - cfqg->saved_workload = cfqd->serving_type; - cfqg->saved_serving_prio = cfqd->serving_prio; + cfqg->saved_wl_type = cfqd->serving_wl_type; + cfqg->saved_wl_class = cfqd->serving_wl_class; } else - cfqg->saved_workload_slice = 0; + cfqg->saved_wl_slice = 0; cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); @@ -1344,6 +1514,52 @@ static void cfq_pd_init(struct blkcg_gq *blkg) cfq_init_cfqg_base(cfqg); cfqg->weight = blkg->blkcg->cfq_weight; + cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; +} + +static void cfq_pd_offline(struct blkcg_gq *blkg) +{ + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... 
+ */ + cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); +} + +/* offset delta from cfqg->stats to cfqg->dead_stats */ +static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - + offsetof(struct cfq_group, stats); + +/* to be used by recursive prfill, sums live and dead stats recursively */ +static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +{ + u64 sum = 0; + + sum += blkg_stat_recursive_sum(pd, off); + sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); + return sum; +} + +/* to be used by recursive prfill, sums live and dead rwstats recursively */ +static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat a, b; + + a = blkg_rwstat_recursive_sum(pd, off); + b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); + blkg_rwstat_merge(&a, &b); + return a; +} + +static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +{ + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + + cfqg_stats_reset(&cfqg->stats); + cfqg_stats_reset(&cfqg->dead_stats); } /* @@ -1400,6 +1616,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, return 0; } +static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + + if (!cfqg->dev_leaf_weight) + return 0; + return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); +} + +static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, + struct cftype *cft, + struct seq_file *sf) +{ + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, + false); + return 0; +} + static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -1407,8 +1643,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + seq_printf(sf, "%u\n", + cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + return 0; +} + +static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf, bool is_leaf_weight) { struct blkcg *blkcg = cgroup_to_blkcg(cgrp); struct blkg_conf_ctx ctx; @@ -1422,8 +1666,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, ret = -EINVAL; cfqg = blkg_to_cfqg(ctx.blkg); if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; + if (!is_leaf_weight) { + cfqg->dev_weight = ctx.v; + cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + } else { + cfqg->dev_leaf_weight = ctx.v; + cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + } ret = 0; } @@ -1431,7 +1680,20 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return __cfqg_set_weight_device(cgrp, cft, buf, false); +} + +static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return __cfqg_set_weight_device(cgrp, cft, buf, true); +} + +static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, + bool is_leaf_weight) { struct blkcg *blkcg = cgroup_to_blkcg(cgrp); struct 
blkcg_gq *blkg; @@ -1440,19 +1702,41 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) return -EINVAL; spin_lock_irq(&blkcg->lock); - blkcg->cfq_weight = (unsigned int)val; + + if (!is_leaf_weight) + blkcg->cfq_weight = val; + else + blkcg->cfq_leaf_weight = val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); - if (cfqg && !cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + if (!cfqg) + continue; + + if (!is_leaf_weight) { + if (!cfqg->dev_weight) + cfqg->new_weight = blkcg->cfq_weight; + } else { + if (!cfqg->dev_leaf_weight) + cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + } } spin_unlock_irq(&blkcg->lock); return 0; } +static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + return __cfq_set_weight(cgrp, cft, val, false); +} + +static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + return __cfq_set_weight(cgrp, cft, val, true); +} + static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -1473,6 +1757,42 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, return 0; } +static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = cfqg_stat_pd_recursive_sum(pd, off); + + return __blkg_prfill_u64(sf, pd, sum); +} + +static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); + + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, + &blkcg_policy_cfq, cft->private, false); + return 0; +} + +static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, + &blkcg_policy_cfq, cft->private, true); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1502,17 +1822,49 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_DEBUG_BLK_CGROUP */ static struct cftype cfq_blkcg_files[] = { + /* on root, weight is mapped to leaf_weight */ + { + .name = "weight_device", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cfqg_print_leaf_weight_device, + .write_string = cfqg_set_leaf_weight_device, + .max_write_len = 256, + }, + { + .name = "weight", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cfq_print_leaf_weight, + .write_u64 = cfq_set_leaf_weight, + }, + + /* no such mapping necessary for !roots */ { .name = "weight_device", + .flags = CFTYPE_NOT_ON_ROOT, .read_seq_string = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, .max_write_len = 256, }, { .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, .read_seq_string = cfq_print_weight, .write_u64 = cfq_set_weight, }, + + { + .name = "leaf_weight_device", + .read_seq_string = cfqg_print_leaf_weight_device, + .write_string = cfqg_set_leaf_weight_device, + .max_write_len = 256, + }, + { + .name = "leaf_weight", + .read_seq_string = cfq_print_leaf_weight, + .write_u64 = cfq_set_leaf_weight, + }, + + /* statistics, covers only the tasks in the cfqg */ { .name = "time", 
.private = offsetof(struct cfq_group, stats.time), @@ -1553,6 +1905,48 @@ static struct cftype cfq_blkcg_files[] = { .private = offsetof(struct cfq_group, stats.queued), .read_seq_string = cfqg_print_rwstat, }, + + /* the same statictics which cover the cfqg and its descendants */ + { + .name = "time_recursive", + .private = offsetof(struct cfq_group, stats.time), + .read_seq_string = cfqg_print_stat_recursive, + }, + { + .name = "sectors_recursive", + .private = offsetof(struct cfq_group, stats.sectors), + .read_seq_string = cfqg_print_stat_recursive, + }, + { + .name = "io_service_bytes_recursive", + .private = offsetof(struct cfq_group, stats.service_bytes), + .read_seq_string = cfqg_print_rwstat_recursive, + }, + { + .name = "io_serviced_recursive", + .private = offsetof(struct cfq_group, stats.serviced), + .read_seq_string = cfqg_print_rwstat_recursive, + }, + { + .name = "io_service_time_recursive", + .private = offsetof(struct cfq_group, stats.service_time), + .read_seq_string = cfqg_print_rwstat_recursive, + }, + { + .name = "io_wait_time_recursive", + .private = offsetof(struct cfq_group, stats.wait_time), + .read_seq_string = cfqg_print_rwstat_recursive, + }, + { + .name = "io_merged_recursive", + .private = offsetof(struct cfq_group, stats.merged), + .read_seq_string = cfqg_print_rwstat_recursive, + }, + { + .name = "io_queued_recursive", + .private = offsetof(struct cfq_group, stats.queued), + .read_seq_string = cfqg_print_rwstat_recursive, + }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", @@ -1611,15 +2005,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; - struct cfq_rb_root *service_tree; + struct cfq_rb_root *st; int left; int new_cfqq = 1; - service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), - cfqq_type(cfqq)); + st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&service_tree->rb); + parent = rb_last(&st->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -1637,7 +2030,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(service_tree); + __cfqq = cfq_rb_first(st); rb_key += __cfqq ? __cfqq->rb_key : jiffies; } @@ -1646,8 +2039,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key && - cfqq->service_tree == service_tree) + if (rb_key == cfqq->rb_key && cfqq->service_tree == st) return; cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); @@ -1656,11 +2048,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, left = 1; parent = NULL; - cfqq->service_tree = service_tree; - p = &service_tree->rb.rb_node; + cfqq->service_tree = st; + p = &st->rb.rb_node; while (*p) { - struct rb_node **n; - parent = *p; __cfqq = rb_entry(parent, struct cfq_queue, rb_node); @@ -1668,22 +2058,20 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, * sort by key, that represents service time. 
*/ if (time_before(rb_key, __cfqq->rb_key)) - n = &(*p)->rb_left; + p = &parent->rb_left; else { - n = &(*p)->rb_right; + p = &parent->rb_right; left = 0; } - - p = n; } if (left) - service_tree->left = &cfqq->rb_node; + st->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &service_tree->rb); - service_tree->count++; + rb_insert_color(&cfqq->rb_node, &st->rb); + st->count++; if (add_front || !new_cfqq) return; cfq_group_notify_queue_add(cfqd, cfqq->cfqg); @@ -2029,8 +2417,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", - cfqd->serving_prio, cfqd->serving_type); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", + cfqd->serving_wl_class, cfqd->serving_wl_type); cfqg_stats_update_avg_queue_size(cfqq->cfqg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; @@ -2116,19 +2504,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_group, cfqd->serving_prio, - cfqd->serving_type); + struct cfq_rb_root *st = st_for(cfqd->serving_group, + cfqd->serving_wl_class, cfqd->serving_wl_type); if (!cfqd->rq_queued) return NULL; /* There is nothing to dispatch */ - if (!service_tree) + if (!st) return NULL; - if (RB_EMPTY_ROOT(&service_tree->rb)) + if (RB_EMPTY_ROOT(&st->rb)) return NULL; - return cfq_rb_first(service_tree); + return cfq_rb_first(st); } static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) @@ -2284,17 +2671,17 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - enum wl_prio_t prio = cfqq_prio(cfqq); - struct cfq_rb_root *service_tree = cfqq->service_tree; + enum wl_class_t wl_class = cfqq_class(cfqq); + struct cfq_rb_root *st = cfqq->service_tree; - BUG_ON(!service_tree); - BUG_ON(!service_tree->count); + BUG_ON(!st); + BUG_ON(!st->count); if (!cfqd->cfq_slice_idle) return false; /* We never do for idle class queues. */ - if (prio == IDLE_WORKLOAD) + if (wl_class == IDLE_WORKLOAD) return false; /* We do for queues that were marked with idle window flag. */ @@ -2306,11 +2693,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && - !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) + if (st->count == 1 && cfq_cfqq_sync(cfqq) && + !cfq_io_thinktime_big(cfqd, &st->ttime, false)) return true; - cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", - service_tree->count); + cfq_log_cfqq(cfqd, cfqq, "Not idling. 
st->count:%d", st->count); return false; } @@ -2493,8 +2879,8 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } -static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, - struct cfq_group *cfqg, enum wl_prio_t prio) +static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_class_t wl_class) { struct cfq_queue *queue; int i; @@ -2504,7 +2890,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, for (i = 0; i <= SYNC_WORKLOAD; ++i) { /* select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); + queue = cfq_rb_first(st_for(cfqg, wl_class, i)); if (queue && (!key_valid || time_before(queue->rb_key, lowest_key))) { lowest_key = queue->rb_key; @@ -2516,26 +2902,27 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, return cur_best; } -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) +static void +choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) { unsigned slice; unsigned count; struct cfq_rb_root *st; unsigned group_slice; - enum wl_prio_t original_prio = cfqd->serving_prio; + enum wl_class_t original_class = cfqd->serving_wl_class; /* Choose next priority. RT > BE > IDLE */ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) - cfqd->serving_prio = RT_WORKLOAD; + cfqd->serving_wl_class = RT_WORKLOAD; else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) - cfqd->serving_prio = BE_WORKLOAD; + cfqd->serving_wl_class = BE_WORKLOAD; else { - cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->serving_wl_class = IDLE_WORKLOAD; cfqd->workload_expires = jiffies + 1; return; } - if (original_prio != cfqd->serving_prio) + if (original_class != cfqd->serving_wl_class) goto new_workload; /* @@ -2543,7 +2930,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload * expiration time */ - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); count = st->count; /* @@ -2554,9 +2941,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) new_workload: /* otherwise select new workload type */ - cfqd->serving_type = - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, + cfqd->serving_wl_class); + st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); count = st->count; /* @@ -2567,10 +2954,11 @@ new_workload: group_slice = cfq_group_slice(cfqd, cfqg); slice = group_slice * count / - max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], - cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], + cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, + cfqg)); - if (cfqd->serving_type == ASYNC_WORKLOAD) { + if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { unsigned int tmp; /* @@ -2616,14 +3004,14 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) cfqd->serving_group = cfqg; /* Restore the workload type data */ - if (cfqg->saved_workload_slice) { - cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; - cfqd->serving_type = cfqg->saved_workload; - cfqd->serving_prio = cfqg->saved_serving_prio; + if (cfqg->saved_wl_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_wl_slice; + cfqd->serving_wl_type = 
@@ -3205,6 +3593,8 @@ retry:
 		spin_lock_irq(cfqd->queue->queue_lock);
 		if (new_cfqq)
 			goto retry;
+		else
+			return &cfqd->oom_cfqq;
 	} else {
 		cfqq = kmem_cache_alloc_node(cfq_pool,
 					     gfp_mask | __GFP_ZERO,
@@ -3402,7 +3792,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		return true;
 
 	/* Allow preemption only if we are idling on sync-noidle tree */
-	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
 	    new_cfqq->service_tree->count == 2 &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3454,7 +3844,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * doesn't happen
 	 */
 	if (old_type != cfqq_type(cfqq))
-		cfqq->cfqg->saved_workload_slice = 0;
+		cfqq->cfqg->saved_wl_slice = 0;
 
 	/*
 	 * Put the new queue at the front of the current list,
@@ -3636,16 +4026,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
-		struct cfq_rb_root *service_tree;
+		struct cfq_rb_root *st;
 
 		RQ_CIC(rq)->ttime.last_end_request = now;
 
 		if (cfq_cfqq_on_rr(cfqq))
-			service_tree = cfqq->service_tree;
+			st = cfqq->service_tree;
 		else
-			service_tree = service_tree_for(cfqq->cfqg,
-				cfqq_prio(cfqq), cfqq_type(cfqq));
-		service_tree->ttime.last_end_request = now;
+			st = st_for(cfqq->cfqg, cfqq_class(cfqq),
+					cfqq_type(cfqq));
+
+		st->ttime.last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
@@ -3992,6 +4383,7 @@ static int cfq_init_queue(struct request_queue *q)
 	cfq_init_cfqg_base(cfqd->root_group);
 #endif
 	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
+	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4176,6 +4568,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
 	.cftypes		= cfq_blkcg_files,
 
 	.pd_init_fn		= cfq_pd_init,
+	.pd_offline_fn		= cfq_pd_offline,
 	.pd_reset_stats_fn	= cfq_pd_reset_stats,
 };
 #endif
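
choose_wl_class_and_type() selects the serving class by strict priority: any busy RT queue wins over BE, which wins over IDLE. A tiny C sketch of that selection, with a hypothetical per-class busy count standing in for cfq_group_busy_queues_wl():

    #include <stdio.h>

    enum wl_class { RT_WORKLOAD, BE_WORKLOAD, IDLE_WORKLOAD };

    /* hypothetical per-class busy counts; the kernel derives these
     * per group from its service trees
     */
    static int busy[3];

    /* strict priority: serve RT while any RT queue is busy, then BE,
     * and fall back to IDLE only when nothing else is pending
     */
    static enum wl_class choose_class(void)
    {
        if (busy[RT_WORKLOAD])
            return RT_WORKLOAD;
        if (busy[BE_WORKLOAD])
            return BE_WORKLOAD;
        return IDLE_WORKLOAD;
    }

    int main(void)
    {
        busy[BE_WORKLOAD] = 2;
        printf("serving class: %d\n", choose_class()); /* 1 == BE */
        return 0;
    }
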
diff --git a/block/elevator.c b/block/elevator.c
index d0acb31..a0ffdd9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -46,11 +46,6 @@ static LIST_HEAD(elv_list);
 /*
  * Merge hash stuff.
  */
-static const int elv_hash_shift = 6;
-#define ELV_HASH_BLOCK(sec)	((sec) >> 3)
-#define ELV_HASH_FN(sec)	\
-		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
-#define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
 #define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
 
 /*
@@ -158,7 +153,6 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
 				  struct elevator_type *e)
 {
 	struct elevator_queue *eq;
-	int i;
 
 	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (unlikely(!eq))
@@ -167,14 +161,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
 	eq->type = e;
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
-
-	eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
-					GFP_KERNEL, q->node);
-	if (!eq->hash)
-		goto err;
-
-	for (i = 0; i < ELV_HASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&eq->hash[i]);
+	hash_init(eq->hash);
 
 	return eq;
 err:
@@ -189,7 +176,6 @@ static void elevator_release(struct kobject *kobj)
 
 	e = container_of(kobj, struct elevator_queue, kobj);
 	elevator_put(e->type);
-	kfree(e->hash);
 	kfree(e);
 }
 
@@ -261,7 +247,7 @@ EXPORT_SYMBOL(elevator_exit);
 
 static inline void __elv_rqhash_del(struct request *rq)
 {
-	hlist_del_init(&rq->hash);
+	hash_del(&rq->hash);
 }
 
 static void elv_rqhash_del(struct request_queue *q, struct request *rq)
@@ -275,7 +261,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 	struct elevator_queue *e = q->elevator;
 
 	BUG_ON(ELV_ON_HASH(rq));
-	hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
+	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
 }
 
 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
@@ -287,11 +273,10 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
 	struct elevator_queue *e = q->elevator;
-	struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
 	struct hlist_node *next;
 	struct request *rq;
 
-	hlist_for_each_entry_safe(rq, next, hash_list, hash) {
+	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
 		BUG_ON(!ELV_ON_HASH(rq));
 
 		if (unlikely(!rq_mergeable(rq))) {
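
The conversion above replaces the hand-rolled bucket array with the generic hashtable.h API: hash_init() zeroes a fixed power-of-two bucket array, hash_add() hashes the key and chains the entry at the bucket head, and hash_for_each_possible() walks only the one candidate bucket. A userspace C approximation of that pattern (the multiplicative hash below is a stand-in, not the kernel's hash_long()):

    #include <stdio.h>

    #define HASH_BITS 6
    #define HASH_SIZE (1 << HASH_BITS)

    struct entry {
        unsigned long key;
        struct entry *next;
    };

    static struct entry *table[HASH_SIZE]; /* hash_init: all-NULL buckets */

    /* crude stand-in for hash_long(): multiplicative hash into 6 bits */
    static unsigned bucket(unsigned long key)
    {
        return (key * 2654435761u) >> (32 - HASH_BITS) & (HASH_SIZE - 1);
    }

    static void hash_add_entry(struct entry *e, unsigned long key)
    {
        unsigned b = bucket(key);

        e->key = key;
        e->next = table[b];  /* head insertion, like hash_add() */
        table[b] = e;
    }

    static struct entry *hash_find(unsigned long key)
    {
        /* walk just the one bucket, like hash_for_each_possible() */
        for (struct entry *e = table[bucket(key)]; e; e = e->next)
            if (e->key == key)
                return e;
        return NULL;
    }

    int main(void)
    {
        struct entry a, b;

        hash_add_entry(&a, 4096);
        hash_add_entry(&b, 8192);
        printf("found 8192: %s\n", hash_find(8192) ? "yes" : "no");
        return 0;
    }
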
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 57763c5..758f2ac 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1090,10 +1090,13 @@ static const struct block_device_operations floppy_fops = {
 static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
 {
 	struct floppy_state *fs = macio_get_drvdata(mdev);
-	struct swim3 __iomem *sw = fs->swim3;
+	struct swim3 __iomem *sw;
 
 	if (!fs)
 		return;
+
+	sw = fs->swim3;
+
 	if (mb_state != MB_FD)
 		return;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e67a4be..bb2cd3c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -626,7 +626,6 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			trace_block_bio_complete(md->queue, bio, io_error);
 			bio_endio(bio, io_error);
 		}
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 697f026..5af2d27 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -184,8 +184,6 @@ static void return_io(struct bio *return_bi)
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_size = 0;
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
 		bio_endio(bi, 0);
 		bi = return_bi;
 	}
@@ -3916,8 +3914,6 @@ static void raid5_align_endio(struct bio *bi, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error && uptodate) {
-		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
-					 raid_bi, 0);
 		bio_endio(raid_bi, 0);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_stripe);
@@ -4376,8 +4372,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
 		bio_endio(bi, 0);
 	}
 }
@@ -4754,11 +4748,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		handled++;
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0) {
-		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-					 raid_bio, 0);
+	if (remaining == 0)
 		bio_endio(raid_bio, 0);
-	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;
diff --git a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error)
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
+	trace_block_bio_complete(bio, error);
+
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 53f5fae..aea605c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 {
 	unsigned bsize = bdev_logical_block_size(bdev);
 
-	bdev->bd_inode->i_size = size;
+	mutex_lock(&bdev->bd_inode->i_mutex);
+	i_size_write(bdev->bd_inode, size);
+	mutex_unlock(&bdev->bd_inode->i_mutex);
 	while (bsize < PAGE_CACHE_SIZE) {
 		if (size & bsize)
 			break;
@@ -1118,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			}
 		}
 
-		if (!ret && !bdev->bd_openers) {
+		if (!ret) {
 			bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
 			bdi = blk_get_backing_dev_info(bdev);
 			if (bdi == NULL)
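
bd_set_size() now publishes the new size with i_size_write() under i_mutex. On 32-bit SMP kernels i_size is a seqcount-protected 64-bit value, so writers have to be serialized against each other while lockless readers retry across torn updates. A userspace C model of that read/write protocol, with a pthread mutex standing in for i_mutex (a sketch of the idea under those assumptions, not the kernel's implementation):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    /* userspace model of a seqcount-protected 64-bit size; the mutex
     * plays the part of the writer serialization (i_mutex) the hunk
     * above adds around i_size_write()
     */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static volatile unsigned seq;
    static volatile uint64_t size;

    static void size_write(uint64_t new_size)  /* like i_size_write() */
    {
        pthread_mutex_lock(&lock);  /* writers must not race each other */
        seq++;                      /* odd: update in progress */
        __sync_synchronize();
        size = new_size;
        __sync_synchronize();
        seq++;                      /* even again: update done */
        pthread_mutex_unlock(&lock);
    }

    static uint64_t size_read(void)  /* like i_size_read(): lockless */
    {
        unsigned s;
        uint64_t v;

        do {
            s = seq;
            __sync_synchronize();
            v = size;
            __sync_synchronize();
        } while (s & 1 || s != seq);  /* retry if a write was in flight */
        return v;
    }

    int main(void)
    {
        size_write(1ULL << 40);
        printf("size: %llu\n", (unsigned long long)size_read());
        return 0;
    }
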
diff --git a/fs/buffer.c b/fs/buffer.c
index 8e18281..b4dcb34 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
+inline void touch_buffer(struct buffer_head *bh)
+{
+	trace_block_touch_buffer(bh);
+	mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
+
 static int sleep_on_buffer(void *word)
 {
 	io_schedule();
@@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
 {
 	WARN_ON_ONCE(!buffer_uptodate(bh));
 
+	trace_block_dirty_buffer(bh);
+
 	/*
 	 * Very *carefully* optimize the it-is-already-dirty case.
 	 *
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 310972b..359494e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 
 static int write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, wbc);
+	int ret;
+
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
+		trace_writeback_write_inode_start(inode, wbc);
+		ret = inode->i_sb->s_op->write_inode(inode, wbc);
+		trace_writeback_write_inode(inode, wbc);
+		return ret;
+	}
 	return 0;
 }
 
@@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	WARN_ON(!(inode->i_state & I_SYNC));
 
+	trace_writeback_single_inode_start(inode, wbc, nr_to_write);
+
 	ret = do_writepages(mapping, wbc);
 
 	/*
@@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	 * dirty the inode itself
 	 */
 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		trace_writeback_dirty_inode_start(inode, flags);
+
 		if (sb->s_op->dirty_inode)
 			sb->s_op->dirty_inode(inode, flags);
+
+		trace_writeback_dirty_inode(inode, flags);
 	}
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94bc83..78feda9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -19,6 +19,7 @@
 #include <linux/gfp.h>
 #include <linux/bsg.h>
 #include <linux/smp.h>
+#include <linux/rcupdate.h>
 
 #include <asm/scatterlist.h>
 
@@ -437,6 +438,7 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+	struct rcu_head		rcu_head;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
@@ -974,7 +976,6 @@ struct blk_plug {
 	unsigned long magic; /* detect uninitialized use-cases */
 	struct list_head list; /* requests */
 	struct list_head cb_list; /* md requires an unplug callback */
-	unsigned int should_sort; /* list to be sorted before flushing? */
 };
 #define BLK_MAX_REQUEST_COUNT 16
 
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 7c2e030..0ea61e0 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -12,6 +12,7 @@
 
 struct blk_trace {
 	int trace_state;
+	bool rq_based;
 	struct rchan *rchan;
 	unsigned long __percpu *sequence;
 	unsigned char __percpu *msg_data;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 458f497..5afc4f9 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,7 +126,6 @@ BUFFER_FNS(Write_EIO, write_io_error)
 BUFFER_FNS(Unwritten, unwritten)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
-#define touch_buffer(bh)	mark_page_accessed(bh->b_page)
 
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
@@ -142,6 +141,7 @@ BUFFER_FNS(Unwritten, unwritten)
 
 void mark_buffer_dirty(struct buffer_head *bh);
 void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
+void touch_buffer(struct buffer_head *bh);
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset);
 int try_to_free_buffers(struct page *);
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 51494e6..33f0280 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -77,10 +77,13 @@ static inline void init_completion(struct completion *x)
 }
 
 extern void wait_for_completion(struct completion *);
+extern void wait_for_completion_io(struct completion *);
 extern int wait_for_completion_interruptible(struct completion *x);
 extern int wait_for_completion_killable(struct completion *x);
 extern unsigned long wait_for_completion_timeout(struct completion *x,
 						   unsigned long timeout);
+extern unsigned long wait_for_completion_io_timeout(struct completion *x,
+						    unsigned long timeout);
 extern long wait_for_completion_interruptible_timeout(
 	struct completion *x, unsigned long timeout);
 extern long wait_for_completion_killable_timeout(
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1866206..acd0312 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -2,6 +2,7 @@
 #define _LINUX_ELEVATOR_H
 
 #include <linux/percpu.h>
+#include <linux/hashtable.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -96,6 +97,8 @@ struct elevator_type
 	struct list_head list;
 };
 
+#define ELV_HASH_BITS 6
+
 /*
  * each queue has an elevator_queue associated with it
 */
@@ -105,8 +108,8 @@ struct elevator_queue
 	void *elevator_data;
 	struct kobject kobj;
 	struct mutex sysfs_lock;
-	struct hlist_head *hash;
 	unsigned int registered:1;
+	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
 /*
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 05c5e61..9961726 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -6,10 +6,61 @@
 
 #include <linux/blktrace_api.h>
 #include <linux/blkdev.h>
+#include <linux/buffer_head.h>
 #include <linux/tracepoint.h>
 
 #define RWBS_LEN	8
 
+DECLARE_EVENT_CLASS(block_buffer,
+
+	TP_PROTO(struct buffer_head *bh),
+
+	TP_ARGS(bh),
+
+	TP_STRUCT__entry (
+		__field(  dev_t,	dev	)
+		__field(  sector_t,	sector	)
+		__field(  size_t,	size	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bh->b_bdev->bd_dev;
+		__entry->sector		= bh->b_blocknr;
+		__entry->size		= bh->b_size;
+	),
+
+	TP_printk("%d,%d sector=%llu size=%zu",
+		MAJOR(__entry->dev), MINOR(__entry->dev),
+		(unsigned long long)__entry->sector, __entry->size
+	)
+);
+
+/**
+ * block_touch_buffer - mark a buffer accessed
+ * @bh: buffer_head being touched
+ *
+ * Called from touch_buffer().
+ */
+DEFINE_EVENT(block_buffer, block_touch_buffer,
+
+	TP_PROTO(struct buffer_head *bh),
+
+	TP_ARGS(bh)
+);
+
+/**
+ * block_dirty_buffer - mark a buffer dirty
+ * @bh: buffer_head being dirtied
+ *
+ * Called from mark_buffer_dirty().
+ */
+DEFINE_EVENT(block_buffer, block_dirty_buffer,
+
+	TP_PROTO(struct buffer_head *bh),
+
+	TP_ARGS(bh)
+);
+
 DECLARE_EVENT_CLASS(block_rq_with_error,
 
 	TP_PROTO(struct request_queue *q, struct request *rq),
@@ -206,7 +257,6 @@ TRACE_EVENT(block_bio_bounce,
 
 /**
  * block_bio_complete - completed all work on the block operation
- * @q: queue holding the block operation
  * @bio: block operation completed
  * @error: io error value
 *
@@ -215,9 +265,9 @@ TRACE_EVENT(block_bio_bounce,
  */
 TRACE_EVENT(block_bio_complete,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio, int error),
+	TP_PROTO(struct bio *bio, int error),
 
-	TP_ARGS(q, bio, error),
+	TP_ARGS(bio, error),
 
 	TP_STRUCT__entry(
 		__field( dev_t,		dev		)
@@ -228,7 +278,8 @@ TRACE_EVENT(block_bio_complete,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->dev		= bio->bi_bdev ?
+					  bio->bi_bdev->bd_dev : 0;
 		__entry->sector		= bio->bi_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		__entry->error		= error;
@@ -241,11 +292,11 @@ TRACE_EVENT(block_bio_complete,
 		  __entry->nr_sector, __entry->error)
 );
 
-DECLARE_EVENT_CLASS(block_bio,
+DECLARE_EVENT_CLASS(block_bio_merge,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),
 
-	TP_ARGS(q, bio),
+	TP_ARGS(q, rq, bio),
 
 	TP_STRUCT__entry(
 		__field( dev_t,		dev		)
@@ -272,31 +323,33 @@ DECLARE_EVENT_CLASS(block_bio,
 /**
  * block_bio_backmerge - merging block operation to the end of an existing operation
  * @q: queue holding operation
+ * @rq: request bio is being merged into
  * @bio: new block operation to merge
  *
  * Merging block request @bio to the end of an existing block request
  * in queue @q.
  */
-DEFINE_EVENT(block_bio, block_bio_backmerge,
+DEFINE_EVENT(block_bio_merge, block_bio_backmerge,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),
 
-	TP_ARGS(q, bio)
+	TP_ARGS(q, rq, bio)
 );
 
 /**
  * block_bio_frontmerge - merging block operation to the beginning of an existing operation
  * @q: queue holding operation
+ * @rq: request bio is being merged into
  * @bio: new block operation to merge
  *
  * Merging block IO operation @bio to the beginning of an existing block
  * operation in queue @q.
  */
-DEFINE_EVENT(block_bio, block_bio_frontmerge,
+DEFINE_EVENT(block_bio_merge, block_bio_frontmerge,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),
 
-	TP_ARGS(q, bio)
+	TP_ARGS(q, rq, bio)
 );
 
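DECLARE_EVENT_CLASS() factors the entry layout, assignment, and format string out of a family of tracepoints, and each DEFINE_EVENT() stamps out a named event sharing them; block_touch_buffer and block_dirty_buffer above differ only in name. A rough userspace C analogy, modeling the class as one shared formatter behind thin named wrappers (all names here hypothetical):

    #include <stddef.h>
    #include <stdio.h>

    /* fields the block_buffer class records per event */
    struct buffer_event {
        int major, minor;
        unsigned long long sector;
        size_t size;
    };

    /* the "class": one formatter shared by every event in the family */
    static void class_block_buffer(const char *event,
                                   const struct buffer_event *e)
    {
        printf("%s: %d,%d sector=%llu size=%zu\n",
               event, e->major, e->minor, e->sector, e->size);
    }

    /* each DEFINE_EVENT() becomes a thin named wrapper over the class */
    static void trace_block_touch_buffer(const struct buffer_event *e)
    {
        class_block_buffer("block_touch_buffer", e);
    }

    static void trace_block_dirty_buffer(const struct buffer_event *e)
    {
        class_block_buffer("block_dirty_buffer", e);
    }

    int main(void)
    {
        struct buffer_event e = { 8, 0, 12345, 4096 };

        trace_block_touch_buffer(&e);
        trace_block_dirty_buffer(&e);
        return 0;
    }
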
 /**
@@ -306,11 +359,32 @@ DEFINE_EVENT(block_bio, block_bio_frontmerge,
  *
  * About to place the block IO operation @bio into queue @q.
  */
-DEFINE_EVENT(block_bio, block_bio_queue,
+TRACE_EVENT(block_bio_queue,
 
 	TP_PROTO(struct request_queue *q, struct bio *bio),
 
-	TP_ARGS(q, bio)
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__array( char,		rwbs,	RWBS_LEN	)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 DECLARE_EVENT_CLASS(block_get_rq,
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index b453d92..6a16fd2 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -32,6 +32,115 @@
 
 struct wb_writeback_work;
 
+TRACE_EVENT(writeback_dirty_page,
+
+	TP_PROTO(struct page *page, struct address_space *mapping),
+
+	TP_ARGS(page, mapping),
+
+	TP_STRUCT__entry (
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(pgoff_t, index)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32);
+		__entry->ino = mapping ? mapping->host->i_ino : 0;
+		__entry->index = page->index;
+	),
+
+	TP_printk("bdi %s: ino=%lu index=%lu",
+		__entry->name,
+		__entry->ino,
+		__entry->index
+	)
+);
+
+DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags),
+
+	TP_STRUCT__entry (
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(unsigned long, flags)
+	),
+
+	TP_fast_assign(
+		struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+
+		/* may be called for files on pseudo FSes w/ unregistered bdi */
+		strncpy(__entry->name,
+			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+		__entry->ino		= inode->i_ino;
+		__entry->flags		= flags;
+	),
+
+	TP_printk("bdi %s: ino=%lu flags=%s",
+		__entry->name,
+		__entry->ino,
+		show_inode_state(__entry->flags)
+	)
+);
+
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
+DECLARE_EVENT_CLASS(writeback_write_inode_template,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
+
+	TP_ARGS(inode, wbc),
+
+	TP_STRUCT__entry (
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(int, sync_mode)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino		= inode->i_ino;
+		__entry->sync_mode	= wbc->sync_mode;
+	),
+
+	TP_printk("bdi %s: ino=%lu sync_mode=%d",
+		__entry->name,
+		__entry->ino,
+		__entry->sync_mode
+	)
+);
+
+DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
+
+	TP_ARGS(inode, wbc)
+);
+
+DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
+
+	TP_ARGS(inode, wbc)
+);
+
 DECLARE_EVENT_CLASS(writeback_work_class,
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
 	TP_ARGS(bdi, work),
@@ -479,6 +588,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 	)
 );
 
+DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
+	TP_PROTO(struct inode *inode,
+		 struct writeback_control *wbc,
+		 unsigned long nr_to_write),
+	TP_ARGS(inode, wbc, nr_to_write)
+);
+
 DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
 	TP_PROTO(struct inode *inode,
 		 struct writeback_control *wbc,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 12af427..7f12624a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3258,7 +3258,8 @@ void complete_all(struct completion *x)
 EXPORT_SYMBOL(complete_all);
 
 static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+		   long (*action)(long), long timeout, int state)
 {
 	if (!x->done) {
 		DECLARE_WAITQUEUE(wait, current);
@@ -3271,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 			}
 			__set_current_state(state);
 			spin_unlock_irq(&x->wait.lock);
-			timeout = schedule_timeout(timeout);
+			timeout = action(timeout);
 			spin_lock_irq(&x->wait.lock);
 		} while (!x->done && timeout);
 		__remove_wait_queue(&x->wait, &wait);
@@ -3282,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 	return timeout ?: 1;
 }
 
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+		  long (*action)(long), long timeout, int state)
 {
 	might_sleep();
 
 	spin_lock_irq(&x->wait.lock);
-	timeout = do_wait_for_common(x, timeout, state);
+	timeout = do_wait_for_common(x, action, timeout, state);
 	spin_unlock_irq(&x->wait.lock);
 	return timeout;
 }
 
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+	return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+	return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
 /**
  * wait_for_completion: - waits for completion of a task
  * @x:  holds the state of this particular completion
@@ -3329,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
 /**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
  * @x:  holds the state of this particular completion
  *
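
wait_for_completion_io() and its timeout variant behave exactly like the non-_io versions but sleep via io_schedule_timeout(), so the waiter is accounted as blocked on IO. A userspace C sketch of the completion/timeout semantics with pthreads, returning 0 on timeout and nonzero on completion (the IO accounting itself has no userspace analogue, and the timed-out-but-just-completed edge case is simplified here):

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    struct completion {
        pthread_mutex_t lock;
        pthread_cond_t wait;
        int done;
    };

    static void complete(struct completion *x)
    {
        pthread_mutex_lock(&x->lock);
        x->done++;
        pthread_cond_signal(&x->wait);
        pthread_mutex_unlock(&x->lock);
    }

    /* 0 on timeout, 1 on completion, loosely like the kernel variant */
    static int wait_for_completion_timeout(struct completion *x, int secs)
    {
        struct timespec ts;
        int ret = 1;

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += secs;
        pthread_mutex_lock(&x->lock);
        while (!x->done && ret)
            if (pthread_cond_timedwait(&x->wait, &x->lock, &ts))
                ret = 0;  /* ETIMEDOUT */
        if (x->done)
            x->done--;
        pthread_mutex_unlock(&x->lock);
        return ret;
    }

    static void *worker(void *arg)
    {
        complete(arg);
        return NULL;
    }

    int main(void)
    {
        struct completion c = { PTHREAD_MUTEX_INITIALIZER,
                                PTHREAD_COND_INITIALIZER, 0 };
        pthread_t t;

        pthread_create(&t, NULL, worker, &c);
        printf("completed: %d\n", wait_for_completion_timeout(&c, 5));
        pthread_join(&t, NULL);
        return 0;
    }
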
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 71259e2..9e5b8c2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
 				      struct request_queue *q,
 				      struct request *rq)
 {
+	struct blk_trace *bt = q->blk_trace;
+
+	/* if control ever passes through here, it's a request based driver */
+	if (unlikely(bt && !bt->rq_based))
+		bt->rq_based = true;
+
 	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
 }
 
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
 	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
-static void blk_add_trace_bio_complete(void *ignore,
-				       struct request_queue *q, struct bio *bio,
-				       int error)
+static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
 {
+	struct request_queue *q;
+	struct blk_trace *bt;
+
+	if (!bio->bi_bdev)
+		return;
+
+	q = bdev_get_queue(bio->bi_bdev);
+	bt = q->blk_trace;
+
+	/*
+	 * Request based drivers will generate both rq and bio completions.
+	 * Ignore bio ones.
+	 */
+	if (likely(!bt) || bt->rq_based)
+		return;
+
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request_queue *q,
+					struct request *rq,
 					struct bio *bio)
 {
 	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request_queue *q,
+					 struct request *rq,
 					 struct bio *bio)
 {
 	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cdc377c..742c405 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1986,6 +1986,8 @@ int __set_page_dirty_no_writeback(struct page *page)
  */
 void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
+	trace_writeback_dirty_page(page, mapping);
+
 	if (mapping_cap_account_dirty(mapping)) {
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);