diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-14 15:32:19 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-14 15:32:19 -0800 |
commit | e2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch) | |
tree | b97a90170c45211bcc437761653aa8016c34afcd /block | |
parent | abc36be236358162202e86ad88616ff95a755101 (diff) | |
parent | a04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff) | |
download | op-kernel-dev-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.zip op-kernel-dev-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.tar.gz |
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
"This is the main pull request for block storage for 4.15-rc1.
Nothing out of the ordinary in here, and no API changes or anything
like that. Just various new features for drivers, core changes, etc.
In particular, this pull request contains:
- A patch series from Bart, closing the whole on blk/scsi-mq queue
quescing.
- A series from Christoph, building towards hidden gendisks (for
multipath) and ability to move bio chains around.
- NVMe
- Support for native multipath for NVMe (Christoph).
- Userspace notifications for AENs (Keith).
- Command side-effects support (Keith).
- SGL support (Chaitanya Kulkarni)
- FC fixes and improvements (James Smart)
- Lots of fixes and tweaks (Various)
- bcache
- New maintainer (Michael Lyle)
- Writeback control improvements (Michael)
- Various fixes (Coly, Elena, Eric, Liang, et al)
- lightnvm updates, mostly centered around the pblk interface
(Javier, Hans, and Rakesh).
- Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)
- Writeback series that fix the much discussed hundreds of millions
of sync-all units. This goes all the way, as discussed previously
(me).
- Fix for missing wakeup on writeback timer adjustments (Yafang
Shao).
- Fix laptop mode on blk-mq (me).
- {mq,name} tupple lookup for IO schedulers, allowing us to have
alias names. This means you can use 'deadline' on both !mq and on
mq (where it's called mq-deadline). (me).
- blktrace race fix, oopsing on sg load (me).
- blk-mq optimizations (me).
- Obscure waitqueue race fix for kyber (Omar).
- NBD fixes (Josef).
- Disable writeback throttling by default on bfq, like we do on cfq
(Luca Miccio).
- Series from Ming that enable us to treat flush requests on blk-mq
like any other request. This is a really nice cleanup.
- Series from Ming that improves merging on blk-mq with schedulers,
getting us closer to flipping the switch on scsi-mq again.
- BFQ updates (Paolo).
- blk-mq atomic flags memory ordering fixes (Peter Z).
- Loop cgroup support (Shaohua).
- Lots of minor fixes from lots of different folks, both for core and
driver code"
* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
nvme: fix visibility of "uuid" ns attribute
blk-mq: fixup some comment typos and lengths
ide: ide-atapi: fix compile error with defining macro DEBUG
blk-mq: improve tag waiting setup for non-shared tags
brd: remove unused brd_mutex
blk-mq: only run the hardware queue if IO is pending
block: avoid null pointer dereference on null disk
fs: guard_bio_eod() needs to consider partitions
xtensa/simdisk: fix compile error
nvme: expose subsys attribute to sysfs
nvme: create 'slaves' and 'holders' entries for hidden controllers
block: create 'slaves' and 'holders' entries for hidden gendisks
nvme: also expose the namespace identification sysfs files for mpath nodes
nvme: implement multipath access to nvme subsystems
nvme: track shared namespaces
nvme: introduce a nvme_ns_ids structure
nvme: track subsystems
block, nvme: Introduce blk_mq_req_flags_t
block, scsi: Make SCSI quiesce and resume work reliably
block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
...
Diffstat (limited to 'block')
-rw-r--r-- | block/bfq-iosched.c | 225 | ||||
-rw-r--r-- | block/bio-integrity.c | 7 | ||||
-rw-r--r-- | block/bio.c | 40 | ||||
-rw-r--r-- | block/blk-cgroup.c | 9 | ||||
-rw-r--r-- | block/blk-core.c | 274 | ||||
-rw-r--r-- | block/blk-flush.c | 37 | ||||
-rw-r--r-- | block/blk-lib.c | 108 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 3 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 203 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 11 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 7 | ||||
-rw-r--r-- | block/blk-mq.c | 422 | ||||
-rw-r--r-- | block/blk-mq.h | 60 | ||||
-rw-r--r-- | block/blk-settings.c | 2 | ||||
-rw-r--r-- | block/blk-stat.c | 45 | ||||
-rw-r--r-- | block/blk-throttle.c | 12 | ||||
-rw-r--r-- | block/blk-timeout.c | 5 | ||||
-rw-r--r-- | block/blk-wbt.c | 2 | ||||
-rw-r--r-- | block/blk.h | 46 | ||||
-rw-r--r-- | block/bsg.c | 18 | ||||
-rw-r--r-- | block/elevator.c | 67 | ||||
-rw-r--r-- | block/genhd.c | 70 | ||||
-rw-r--r-- | block/ioctl.c | 19 | ||||
-rw-r--r-- | block/kyber-iosched.c | 12 | ||||
-rw-r--r-- | block/mq-deadline.c | 1 | ||||
-rw-r--r-- | block/scsi_ioctl.c | 8 |
26 files changed, 1097 insertions, 616 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a4783da..889a854 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -108,6 +108,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" +#include "blk-wbt.h" #define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -724,6 +725,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -750,10 +789,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr"); - - bfqq->wr_coeff = 1; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + } } /* make sure weight will be updated, however we got here */ @@ -1173,33 +1218,22 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) { - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. - */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); + return jiffies + MAX_JIFFY_OFFSET; +} - return dur; +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; } static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, @@ -1216,7 +1250,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -2016,10 +2062,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged right after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } } static void @@ -2897,24 +2960,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. - */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. @@ -3489,11 +3534,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } } @@ -3685,16 +3726,37 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq)) + if (!hlist_unhashed(&bfqq->burst_list_node)) { + hlist_del_init(&bfqq->burst_list_node); /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). */ - hlist_del_init(&bfqq->burst_list_node); + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; + } kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -4127,7 +4189,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) new_bfqq->allocated++; bfqq->allocated--; new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); /* * If the bic associated with the process * issuing this request still points to bfqq @@ -4139,6 +4200,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq @@ -4424,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, else { bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } @@ -4775,7 +4866,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - + wbt_disable_default(q); return 0; out_free: diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5df3290..23b42e8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); + mempool_destroy(bs->bio_integrity_pool); + mempool_destroy(bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); diff --git a/block/bio.c b/block/bio.c index cc60213..b94a802 100644 --- a/block/bio.c +++ b/block/bio.c @@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator + * @gfp_mask: the GFP_* mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * @@ -1931,11 +1931,8 @@ void bioset_free(struct bio_set *bs) if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); + mempool_destroy(bs->bio_pool); + mempool_destroy(bs->bvec_pool); bioset_integrity_free(bs); bio_put_slab(bs); @@ -2036,37 +2033,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - - if (bio->bi_css) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - get_io_context_active(ioc); - bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, io_cgrp_id); - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_current); - -/** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d3f56ba..4117524 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (i >= BLKCG_MAX_POLS) goto err_unlock; + /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + goto err_unlock; + /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1452,7 +1457,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); @@ -1492,7 +1497,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); diff --git a/block/blk-core.c b/block/blk-core.c index 048be4a..7c54c19 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -333,11 +333,13 @@ EXPORT_SYMBOL(blk_stop_queue); void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); + cancel_work_sync(&q->timeout_work); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; int i; + cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } else { @@ -347,6 +349,37 @@ void blk_sync_queue(struct request_queue *q) EXPORT_SYMBOL(blk_sync_queue); /** + * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * @q: request queue pointer + * + * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not + * set and 1 if the flag was already set. + */ +int blk_set_preempt_only(struct request_queue *q) +{ + unsigned long flags; + int res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_set_preempt_only); + +void blk_clear_preempt_only(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + wake_up_all(&q->mq_freeze_wq); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_clear_preempt_only); + +/** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run * @@ -610,6 +643,9 @@ void blk_set_queue_dying(struct request_queue *q) } spin_unlock_irq(q->queue_lock); } + + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -718,7 +754,7 @@ static void free_request_size(void *element, void *data) int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { - if (unlikely(rl->rq_pool)) + if (unlikely(rl->rq_pool) || q->mq_ops) return 0; rl->q = q; @@ -760,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, bool nowait) +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + */ +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { + const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + while (true) { + bool success = false; int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) + rcu_read_lock_sched(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that sets the PREEMPT_ONLY flag is + * responsible for ensuring that that flag is globally + * visible before the queue is unfrozen. + */ + if (preempt || !blk_queue_preempt_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock_sched(); + + if (success) return 0; - if (nowait) + if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* @@ -781,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait) smp_rmb(); ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || + (atomic_read(&q->mq_freeze_depth) == 0 && + (preempt || !blk_queue_preempt_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -844,6 +904,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, NULL); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); @@ -1154,7 +1215,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLQ_MQ_REQ_* flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1164,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, blk_mq_req_flags_t flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1173,6 +1234,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1255,6 +1318,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; + if (flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { @@ -1333,7 +1398,7 @@ rq_starved: * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags. * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead. @@ -1343,7 +1408,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, blk_mq_req_flags_t flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1355,7 +1420,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, gfp_mask); + rq = __get_request(rl, op, bio, flags); if (!IS_ERR(rq)) return rq; @@ -1364,7 +1429,7 @@ retry: return ERR_PTR(-EAGAIN); } - if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { + if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1391,20 +1456,28 @@ retry: goto retry; } +/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; + int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); + ret = blk_queue_enter(q, flags); + if (ret) + return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, gfp_mask); + rq = get_request(q, op, NULL, flags); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); + blk_queue_exit(q); return rq; } @@ -1415,25 +1488,40 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +/** + * blk_get_request_flags - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, + blk_mq_req_flags_t flags) { struct request *req; + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); + if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } +EXPORT_SYMBOL(blk_get_request_flags); + +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) +{ + return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? + 0 : BLK_MQ_REQ_NOWAIT); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1576,6 +1664,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); + blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1857,8 +1946,10 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request(q, bio->bi_opf, bio, GFP_NOIO); + blk_queue_enter_live(q); + req = get_request(q, bio->bi_opf, bio, 0); if (IS_ERR(req)) { + blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; @@ -2200,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; + blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? + BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { + if (likely(blk_queue_enter(q, flags) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2242,6 +2335,40 @@ out: EXPORT_SYMBOL(generic_make_request); /** + * direct_make_request - hand a buffer directly to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This function behaves like generic_make_request(), but does not protect + * against recursion. Must only be used if the called driver is known + * to not call generic_make_request (or direct_make_request) again from + * its make_request function. (Calling direct_make_request again from + * a workqueue is perfectly fine as that doesn't recurse). + */ +blk_qc_t direct_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + blk_qc_t ret; + + if (!generic_make_request_checks(bio)) + return BLK_QC_T_NONE; + + if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { + if (nowait && !blk_queue_dying(q)) + bio->bi_status = BLK_STS_AGAIN; + else + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + ret = q->make_request_fn(q, bio); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(direct_make_request); + +/** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * @@ -2285,6 +2412,17 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + if (!q->poll_fn || !blk_qc_t_valid(cookie)) + return false; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + return q->poll_fn(q, cookie); +} +EXPORT_SYMBOL_GPL(blk_poll); + /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits @@ -2350,7 +2488,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq); + blk_mq_request_bypass_insert(rq, true); return BLK_STS_OK; } @@ -2464,20 +2602,22 @@ void blk_account_io_done(struct request *req) * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ -static struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) - return NULL; - else - return rq; + switch (rq->q->rpm_status) { + case RPM_RESUMING: + case RPM_SUSPENDING: + return rq->rq_flags & RQF_PM; + case RPM_SUSPENDED: + return false; + } + + return true; } #else -static inline struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - return rq; + return true; } #endif @@ -2517,6 +2657,48 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +static struct request *elv_next_request(struct request_queue *q) +{ + struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + + WARN_ON_ONCE(q->mq_ops); + + while (1) { + list_for_each_entry(rq, &q->queue_head, queuelist) { + if (blk_pm_allow_request(rq)) + return rq; + + if (rq->rq_flags & RQF_SOFTBARRIER) + break; + } + + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (fq->flush_pending_idx != fq->flush_running_idx && + !queue_flush_queueable(q)) { + fq->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_bypass(q)) || + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) + return NULL; + } +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2538,12 +2720,7 @@ struct request *blk_peek_request(struct request_queue *q) lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); - while ((rq = __elv_next_request(q)) != NULL) { - - rq = blk_pm_peek_request(q, rq); - if (!rq) - break; - + while ((rq = elv_next_request(q)) != NULL) { if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver @@ -2695,6 +2872,27 @@ struct request *blk_fetch_request(struct request_queue *q) } EXPORT_SYMBOL(blk_fetch_request); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed diff --git a/block/blk-flush.c b/block/blk-flush.c index 4938bec..f171706 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; + } } running = &fq->flush_queue[fq->flush_running_idx]; @@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) blk_rq_init(q, flush_rq); /* - * Borrow tag from the first request since they can't - * be in flight at the same time. And acquire the tag's - * ownership for flush req. + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. */ if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; flush_rq->mq_ctx = first_rq->mq_ctx; - flush_rq->tag = first_rq->tag; - fq->orig_rq = first_rq; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; + } } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) hctx = blk_mq_map_queue(q, ctx->cpu); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag_hctx(hctx, rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). @@ -463,7 +480,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, true, false, false); + blk_mq_request_bypass_insert(rq, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; diff --git a/block/blk-lib.c b/block/blk-lib.c index 63fb971..2bc544c 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -275,6 +275,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_PAGES); } +static int __blkdev_issue_zero_pages(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + int bi_size = 0; + unsigned int sz; + + if (!q) + return -ENXIO; + + while (nr_sects != 0) { + bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), + gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < sz) + break; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -288,12 +322,6 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * - * Note that this function may fail with -EOPNOTSUPP if the driver signals - * zeroing offload support, but the device fails to process the command (for - * some devices there is no non-destructive way to verify whether this - * operation is actually supported). In this case the caller should call - * retry the call to blkdev_issue_zeroout() and the fallback path will be used. - * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. * @@ -305,9 +333,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned flags) { int ret; - int bi_size = 0; - struct bio *bio = *biop; - unsigned int sz; sector_t bs_mask; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; @@ -317,30 +342,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - goto out; - - ret = 0; - while (nr_sects != 0) { - bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) - break; - } - cond_resched(); - } + return ret; - *biop = bio; -out: - return ret; + return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); } EXPORT_SYMBOL(__blkdev_issue_zeroout); @@ -360,18 +365,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret; - struct bio *bio = NULL; + int ret = 0; + sector_t bs_mask; + struct bio *bio; struct blk_plug plug; + bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + +retry: + bio = NULL; blk_start_plug(&plug); - ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, - &bio, flags); + if (try_write_zeroes) { + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, &bio, flags); + } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, + gfp_mask, &bio); + } else { + /* No zeroing offload support */ + ret = -EOPNOTSUPP; + } if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); + if (ret && try_write_zeroes) { + if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + try_write_zeroes = false; + goto retry; + } + if (!bdev_write_zeroes_sectors(bdev)) { + /* + * Zeroing offload support was indicated, but the + * device reported ILLEGAL REQUEST (for some devices + * there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported). + */ + ret = -EOPNOTSUPP; + } + } return ret; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index de294d7..b56a4f3 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(STACKABLE), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), @@ -75,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME @@ -180,7 +180,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab6943..c117bd8 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,20 +81,103 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } + return blk_mq_run_hw_queue(hctx, true); +} - return false; +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + */ +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + LIST_HEAD(rq_list); + + do { + struct request *rq; + + if (e->type->ops.mq.has_work && + !e->type->ops.mq.has_work(hctx)) + break; + + if (!blk_mq_get_dispatch_budget(hctx)) + break; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + */ +static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + + do { + struct request *rq; + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + if (!blk_mq_get_dispatch_budget(hctx)) + break; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + WRITE_ONCE(hctx->dispatch_from, ctx); +} + +/* return true if hw queue need to be run again */ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool did_work = false; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -122,29 +205,34 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. + * + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - did_work = blk_mq_dispatch_rq_list(q, &rq_list); - } else if (!has_sched_dispatch) { + if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (has_sched_dispatch) + blk_mq_do_dispatch_sched(hctx); + else + blk_mq_do_dispatch_ctx(hctx); + } + } else if (has_sched_dispatch) { + blk_mq_do_dispatch_sched(hctx); + } else if (q->mq_ops->get_budget) { + /* + * If we need to get budget before queuing request, we + * dequeue request one by one from sw queue for avoiding + * to mess up I/O merge when dispatch runs out of resource. + * + * TODO: get more budgets, and dequeue more requests in + * one time. + */ + blk_mq_do_dispatch_ctx(hctx); + } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list); - } - - /* - * We want to dispatch from the scheduler if we had no work left - * on the dispatch list, OR if we did have work but weren't able - * to make progress. - */ - if (!did_work && has_sched_dispatch) { - do { - struct request *rq; - - rq = e->type->ops.mq.dispatch_request(hctx); - if (!rq) - break; - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); + blk_mq_dispatch_rq_list(q, &rq_list, false); } } @@ -260,21 +348,21 @@ void blk_mq_sched_request_inserted(struct request *rq) EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + bool has_sched, struct request *rq) { - if (rq->tag == -1) { - rq->rq_flags |= RQF_SORTED; - return false; + /* dispatch flush rq directly */ + if (rq->rq_flags & RQF_FLUSH_SEQ) { + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; } - /* - * If we already have a real request tag, send directly to - * the dispatch list. - */ - spin_lock(&hctx->lock); - list_add(&rq->queuelist, &hctx->dispatch); - spin_unlock(&hctx->lock); - return true; + if (has_sched) + rq->rq_flags |= RQF_SORTED; + + return false; } /** @@ -339,21 +427,6 @@ done: } } -/* - * Add flush/fua to the queue. If we fail getting a driver tag, then - * punt to the requeue list. Requeue will re-invoke us from a context - * that's safe to block from. - */ -static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool can_block) -{ - if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else - blk_mq_add_to_requeue_list(rq, false, true); -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { @@ -362,12 +435,15 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { - blk_mq_sched_insert_flush(hctx, rq, can_block); - return; + /* flush rq in flush machinery need to be dispatched directly */ + if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { + blk_insert_flush(rq); + goto run; } - if (e && blk_mq_sched_bypass_insert(hctx, rq)) + WARN_ON(e && (rq->tag != -1)); + + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; if (e && e->type->ops.mq.insert_requests) { @@ -393,23 +469,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; - if (e) { - struct request *rq, *next; - - /* - * We bypass requests that already have a driver tag assigned, - * which should only be flushes. Flushes are only ever inserted - * as single requests, so we shouldn't ever hit the - * WARN_ON_ONCE() below (but let's handle it just in case). - */ - list_for_each_entry_safe(rq, next, list, queuelist) { - if (WARN_ON_ONCE(rq->tag != -1)) { - list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, rq); - } - } - } - if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); else diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507..c81b40e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (fn)(void *, struct request *)) { int i, j, ret = 0; - if (WARN_ON_ONCE(!reinit_request)) + if (WARN_ON_ONCE(!fn)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, if (!tags->static_rqs[j]) continue; - ret = reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = fn(data, tags->static_rqs[j]); if (ret) goto out; } @@ -326,7 +325,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, out: return ret; } -EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); +EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index c190165..61deab0 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -45,13 +45,8 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, } enum { - BLK_MQ_TAG_CACHE_MIN = 1, - BLK_MQ_TAG_CACHE_MAX = 64, -}; - -enum { BLK_MQ_TAG_FAIL = -1U, - BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 98a1860..b600463 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -37,6 +37,7 @@ #include "blk-wbt.h" #include "blk-mq-sched.h" +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -60,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) /* * Check if any of the ctx's have pending work in this hardware queue */ -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map) || - !list_empty_careful(&hctx->dispatch) || + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } @@ -125,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - blk_mq_run_hw_queues(q, false); + if (q->mq_ops) + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -255,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); - - /* - * If we are called because the queue has now been marked as - * dying, we need to ensure that processes currently waiting on - * the queue are notified as well. - */ - wake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) @@ -296,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->q = data->q; rq->mq_ctx = data->ctx; rq->cmd_flags = op; + if (data->flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ @@ -336,12 +333,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - struct blk_mq_ctx *local_ctx = NULL; + bool put_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; - if (likely(!data->ctx)) - data->ctx = local_ctx = blk_mq_get_ctx(q); + if (likely(!data->ctx)) { + data->ctx = blk_mq_get_ctx(q); + put_ctx_on_error = true; + } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (op & REQ_NOWAIT) @@ -360,8 +359,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (local_ctx) { - blk_mq_put_ctx(local_ctx); + if (put_ctx_on_error) { + blk_mq_put_ctx(data->ctx); data->ctx = NULL; } blk_queue_exit(q); @@ -384,13 +383,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; int ret; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -410,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx) + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -429,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); - ret = blk_queue_enter(q, true); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -476,8 +475,14 @@ void blk_mq_free_request(struct request *rq) if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) + laptop_io_completion(q->backing_dev_info); + wbt_done(q->rq_wb, &rq->issue_stat); + if (blk_rq_rl(rq)) + blk_put_rl(blk_rq_rl(rq)); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -593,22 +598,32 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); - /* - * Ensure that ->deadline is visible before set the started - * flag and clear the completed flag. - */ - smp_mb__before_atomic(); + WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* * Mark us as started and clear complete. Complete might have been * set if requeue raced with timeout, which then marked it as * complete. So be sure to clear complete again when we start * the request, otherwise we'll ignore the completion event. + * + * Ensure that ->deadline is visible before we set STARTED, such that + * blk_mq_check_expired() is guaranteed to observe our ->deadline when + * it observes STARTED. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + smp_wmb(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Coherence order guarantees these consecutive stores to a + * single variable propagate in the specified order. Thus the + * clear_bit() is ordered _after_ the set bit. See + * blk_mq_check_expired(). + * + * (the bits must be part of the same byte for this to be + * true). + */ clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -634,6 +649,8 @@ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_put_driver_tag(rq); + trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); @@ -690,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, /* * We abuse this flag that is otherwise used by the I/O scheduler to - * request head insertation from the workqueue. + * request head insertion from the workqueue. */ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); @@ -778,11 +795,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; + unsigned long deadline; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; /* + * Ensures that if we see STARTED we must also see our + * up-to-date deadline, see blk_mq_start_request(). + */ + smp_rmb(); + + deadline = READ_ONCE(rq->deadline); + + /* * The rq being checked may have been freed and reallocated * out already here, we avoid this race by checking rq->deadline * and REQ_ATOM_COMPLETE flag together: @@ -795,11 +821,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * and clearing the flag in blk_mq_start_request(), so * this rq won't be timed out too. */ - if (time_after_eq(jiffies, rq->deadline)) { - if (!blk_mark_rq_complete(rq)) + if (time_after_eq(jiffies, deadline)) { + if (!blk_mark_rq_complete(rq)) { + /* + * Again coherence order ensures that consecutive reads + * from the same variable must be in that order. This + * ensures that if we see COMPLETE clear, we must then + * see STARTED set and we'll ignore this timeout. + * + * (There's also the MB implied by the test_and_clear()) + */ blk_mq_rq_timed_out(rq, reserved); - } else if (!data->next_set || time_after(data->next, rq->deadline)) { - data->next = rq->deadline; + } + } else if (!data->next_set || time_after(data->next, deadline)) { + data->next = deadline; data->next_set = 1; } } @@ -880,6 +915,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(&ctx->lock); + if (unlikely(!list_empty(&ctx->rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) @@ -920,109 +994,95 @@ done: return rq->tag != -1; } -static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = -1; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - __blk_mq_put_driver_tag(hctx, rq); -} - -static void blk_mq_put_driver_tag(struct request *rq) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; - if (rq->tag == -1 || rq->internal_tag == -1) - return; + hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); + list_del_init(&wait->entry); + blk_mq_run_hw_queue(hctx, true); + return 1; } /* - * If we fail getting a driver tag because all the driver tags are already - * assigned and on the dispatch list, BUT the first entry does not have a - * tag, then we could deadlock. For that case, move entries with assigned - * driver tags to the front, leaving the set of tagged requests in the - * same order, and the untagged set in the same order. + * Mark us waiting for a tag. For shared tags, this involves hooking us into + * the tag wakeups. For non-shared tags, we can simply mark us nedeing a + * restart. For both caes, take care to check the condition again after + * marking us as waiting. */ -static bool reorder_tags_to_front(struct list_head *list) -{ - struct request *rq, *tmp, *first = NULL; - - list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { - if (rq == first) - break; - if (rq->tag != -1) { - list_move(&rq->queuelist, list); - if (!first) - first = rq; - } - } - - return first != NULL; -} - -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, + struct request *rq) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *this_hctx = *hctx; + bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; + struct sbq_wait_state *ws; + wait_queue_entry_t *wait; + bool ret; - hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + if (!shared_tags) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + } else { + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); - blk_mq_run_hw_queue(hctx, true); - return 1; -} + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) -{ - struct sbq_wait_state *ws; + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + } /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. */ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) - return false; + ret = blk_mq_get_driver_tag(rq, hctx, false); - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + if (!shared_tags) { + /* + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. + */ + return ret; + } else { + if (!ret) { + spin_unlock(&this_hctx->lock); + return false; + } - /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. - */ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); - return true; + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); + return true; + } } -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, + bool got_budget) { struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) return false; + WARN_ON(!list_is_singular(list) && got_budget); + /* * Now process all the entries, sending them to the driver. */ @@ -1033,23 +1093,29 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { - if (!queued && reorder_tags_to_front(list)) - continue; - /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) + if (!blk_mq_mark_tag_wait(&hctx, rq)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); + /* + * For non-shared tags, the RESTART check + * will suffice. + */ + if (hctx->flags & BLK_MQ_F_TAG_SHARED) + no_tag = true; break; + } + } - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. - */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) - break; + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); + break; } list_del_init(&rq->queuelist); @@ -1063,15 +1129,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (list_empty(list)) bd.last = true; else { - struct request *nxt; - nxt = list_first_entry(list, struct request, queuelist); bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); } ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE) { - blk_mq_put_driver_tag_hctx(hctx, rq); + /* + * If an I/O scheduler has been configured and we got a + * driver tag for the next request already, free it + * again. + */ + if (!list_empty(list)) { + nxt = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(nxt); + } list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1093,13 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * that is where we will continue on next queue run. */ if (!list_empty(list)) { - /* - * If an I/O scheduler has been configured and we got a driver - * tag for the next request already, free it again. - */ - rq = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(rq); - spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1109,10 +1174,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1124,8 +1189,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. */ - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -1218,9 +1283,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (blk_mq_hctx_has_pending(hctx)) { + __blk_mq_delay_run_hw_queue(hctx, async, 0); + return true; + } + + return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -1230,8 +1300,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_has_pending(hctx) || - blk_mq_hctx_stopped(hctx)) + if (blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); @@ -1405,7 +1474,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq) +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); @@ -1414,7 +1483,8 @@ void blk_mq_request_bypass_insert(struct request *rq) list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - blk_mq_run_hw_queue(hctx, false); + if (run_queue) + blk_mq_run_hw_queue(hctx, false); } void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, @@ -1501,13 +1571,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); - blk_account_io_start(rq, true); -} + blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); -static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) -{ - return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !blk_queue_nomerges(hctx->queue); + blk_account_io_start(rq, true); } static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, @@ -1552,6 +1618,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; + if (!blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); + goto insert; + } + new_cookie = request_to_qc_t(hctx, rq); /* @@ -1641,13 +1712,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - if (q->elevator) { - blk_mq_sched_insert_request(rq, false, true, true, - true); - } else { - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } + + /* bypass scheduler for flush rq */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; @@ -1990,6 +2058,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; @@ -2229,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, mutex_lock(&set->tag_list_lock); - /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ - if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + /* + * Check to see if we're transitioning to shared (from 1 to 2 queues). + */ + if (!list_empty(&set->tag_list) && + !(set->flags & BLK_MQ_F_TAG_SHARED)) { set->flags |= BLK_MQ_F_TAG_SHARED; /* update existing queue */ blk_mq_update_tag_set_depth(set, true); @@ -2404,6 +2478,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); + if (q->mq_ops->poll) + q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... @@ -2460,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q) /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe - * we should change hctx numa_node according to new topology (this - * involves free and re-allocate memory, worthy doing?) + * we should change hctx numa_node according to the new topology (this + * involves freeing and re-allocating memory, worth doing?) */ - blk_mq_map_swqueue(q); blk_mq_sysfs_register(q); @@ -2552,6 +2627,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->ops->queue_rq) return -EINVAL; + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); @@ -2642,8 +2720,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * queue depth. This is similar to what the old code would do. */ if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, - min(nr, set->queue_depth), + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, @@ -2863,20 +2940,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) return false; } -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_mq_hw_ctx *hctx; - struct blk_plug *plug; struct request *rq; - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); @@ -2894,10 +2965,15 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return __blk_mq_poll(hctx, rq); } -EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { + /* + * See comment in block/blk.h rq_atomic_flags enum + */ + BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != + (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 4933af9..6c7c3ff 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -3,6 +3,7 @@ #define INT_BLK_MQ_H #include "blk-stat.h" +#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -26,16 +27,16 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map @@ -55,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); -void blk_mq_request_bypass_insert(struct request *rq); +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); @@ -109,7 +110,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - unsigned int flags; + blk_mq_req_flags_t flags; unsigned int shallow_depth; /* input & output parameter */ @@ -138,4 +139,53 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(hctx); +} + +static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(hctx); + return true; +} + +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static inline void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e95..48ebe6b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); * Caveat: * The driver that does this *must* be able to deal appropriately * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * kmap_atomic() to get a temporary kernel mapping, or by calling * blk_queue_bounce() to create a buffer in normal memory. **/ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d..3a2f3c9 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -11,8 +11,6 @@ #include "blk-mq.h" #include "blk.h" -#define BLK_RQ_STAT_BATCH 64 - struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; @@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; -} - -static void blk_stat_flush_batch(struct blk_rq_stat *stat) -{ - const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_samples); - - if (!nr_batch) - return; - if (!nr_samples) - stat->mean = div64_s64(stat->batch, nr_batch); - else { - stat->mean = div64_s64((stat->mean * nr_samples) + - stat->batch, - nr_batch + nr_samples); - } - - stat->nr_samples += nr_batch; - stat->nr_batch = stat->batch = 0; + stat->batch = 0; } +/* src is a per-cpu stat, mean isn't initialized */ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - blk_stat_flush_batch(src); - if (!src->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); - if (!dst->nr_samples) - dst->mean = src->mean; - else { - dst->mean = div64_s64((src->mean * src->nr_samples) + - (dst->mean * dst->nr_samples), - dst->nr_samples + src->nr_samples); - } + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + dst->nr_samples += src->nr_samples; } @@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); - - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); - stat->batch += value; - stat->nr_batch++; + stat->nr_samples++; } void blk_stat_add(struct request *rq) @@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - s64 now, value; + u64 now, value; now = __blk_stat_time(ktime_to_ns(ktime_get())); if (now < blk_stat_time(&rq->issue_stat)) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8631763..96ad326 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2113,8 +2113,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) + if (bio->bi_css) { + if (bio->bi_cg_private) + blkg_put(tg_to_blkg(bio->bi_cg_private)); bio->bi_cg_private = tg; + blkg_get(tg_to_blkg(tg)); + } blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); #endif } @@ -2284,8 +2288,10 @@ void blk_throtl_bio_endio(struct bio *bio) start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; finish_time = __blk_stat_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) + if (!start_time || finish_time <= start_time) { + blkg_put(tg_to_blkg(tg)); return; + } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2315,6 +2321,8 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } + + blkg_put(tg_to_blkg(tg)); } #endif diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 17ec83b..764ecf9 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work) struct request *rq, *tmp; int next_set = 0; - if (blk_queue_enter(q, true)) - return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); - blk_queue_exit(q); } /** @@ -211,7 +208,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + WRITE_ONCE(req->deadline, jiffies + req->timeout); /* * Only the non-mq case needs to add the request to a protected list. diff --git a/block/blk-wbt.c b/block/blk-wbt.c index d822530..b252da0 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) } /* - * Disable wbt, if enabled by default. Only called from CFQ. + * Disable wbt, if enabled by default. */ void wbt_disable_default(struct request_queue *q) { diff --git a/block/blk.h b/block/blk.h index 85be8b2..3f14469 100644 --- a/block/blk.h +++ b/block/blk.h @@ -123,8 +123,15 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { + /* + * Keep these two bits first - not because we depend on the + * value of them, but we do depend on them being in the same + * byte of storage to ensure ordering on writes. Keeping them + * first will achieve that nicely. + */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; @@ -149,45 +156,6 @@ static inline void blk_clear_rq_complete(struct request *rq) void blk_insert_flush(struct request *rq); -static inline struct request *__elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. - */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; diff --git a/block/bsg.c b/block/bsg.c index ee1335c..452f94f 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t has_write_perm) + fmode_t mode) { struct scsi_request *req = scsi_req(rq); @@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) * map sg_io_v4 to a request. */ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); if (ret) goto out; @@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) } static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, - fmode_t has_write_perm) + size_t count, ssize_t *bytes_written, fmode_t mode) { struct bsg_command *bc; struct request *rq; @@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) bsg_set_block(bd, file); bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, - file->f_mode & FMODE_WRITE); + ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); *ppos = bytes_written; @@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/elevator.c b/block/elevator.c index 153926a..7bda083 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,12 +83,25 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static struct elevator_type *elevator_find(const char *name) +static bool elevator_match(const struct elevator_type *e, const char *name) +{ + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + +/* + * Return scheduler with name 'name' and with matching 'mq capability + */ +static struct elevator_type *elevator_find(const char *name, bool mq) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name)) + if (elevator_match(e, name) && (mq == e->uses_mq)) return e; } @@ -100,25 +113,25 @@ static void elevator_put(struct elevator_type *e) module_put(e->elevator_owner); } -static struct elevator_type *elevator_get(const char *name, bool try_loading) +static struct elevator_type *elevator_get(struct request_queue *q, + const char *name, bool try_loading) { struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); } if (e && !try_module_get(e->elevator_owner)) e = NULL; spin_unlock(&elv_list_lock); - return e; } @@ -144,8 +157,12 @@ void __init load_default_elevator_module(void) if (!chosen_elevator[0]) return; + /* + * Boot parameter is deprecated, we haven't supported that for MQ. + * Only look for non-mq schedulers from here. + */ spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator); + e = elevator_find(chosen_elevator, false); spin_unlock(&elv_list_lock); if (!e) @@ -202,7 +219,7 @@ int elevator_init(struct request_queue *q, char *name) q->boundary_rq = NULL; if (name) { - e = elevator_get(name, true); + e = elevator_get(q, name, true); if (!e) return -EINVAL; } @@ -214,7 +231,7 @@ int elevator_init(struct request_queue *q, char *name) * allowed from async. */ if (!e && !q->mq_ops && *chosen_elevator) { - e = elevator_get(chosen_elevator, false); + e = elevator_get(q, chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); @@ -229,17 +246,17 @@ int elevator_init(struct request_queue *q, char *name) */ if (q->mq_ops) { if (q->nr_hw_queues == 1) - e = elevator_get("mq-deadline", false); + e = elevator_get(q, "mq-deadline", false); if (!e) return 0; } else - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); if (!e) { printk(KERN_ERR "Default I/O scheduler not found. " \ "Using noop.\n"); - e = elevator_get("noop", false); + e = elevator_get(q, "noop", false); } } @@ -905,7 +922,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, e->uses_mq)) { spin_unlock(&elv_list_lock); if (e->icq_cache) kmem_cache_destroy(e->icq_cache); @@ -915,9 +932,9 @@ int elv_register(struct elevator_type *e) spin_unlock(&elv_list_lock); /* print pretty message */ - if (!strcmp(e->elevator_name, chosen_elevator) || + if (elevator_match(e, chosen_elevator) || (!*chosen_elevator && - !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) + elevator_match(e, CONFIG_DEFAULT_IOSCHED))) def = " (default)"; printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, @@ -1066,25 +1083,15 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(strstrip(elevator_name), true); + e = elevator_get(q, strstrip(elevator_name), true); if (!e) return -EINVAL; - if (q->elevator && - !strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { elevator_put(e); return 0; } - if (!e->uses_mq && q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - if (e->uses_mq && !q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - return elevator_switch(q, e); } @@ -1116,9 +1123,10 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; + bool uses_mq = q->mq_ops != NULL; int len = 0; - if (!blk_queue_stackable(q)) + if (!queue_is_rq_based(q)) return sprintf(name, "none\n"); if (!q->elevator) @@ -1128,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name) && + (__e->uses_mq == uses_mq)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } diff --git a/block/genhd.c b/block/genhd.c index 630c0da..c2223f1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -588,6 +588,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (disk->flags & GENHD_FL_HIDDEN) { + dev_set_uevent_suppress(ddev, 0); + return; + } + /* No minors to use for partitions */ if (!disk_part_scan_enabled(disk)) goto exit; @@ -616,6 +621,11 @@ exit: while ((part = disk_part_iter_next(&piter))) kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); + + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); } /** @@ -630,7 +640,6 @@ exit: */ void device_add_disk(struct device *parent, struct gendisk *disk) { - struct backing_dev_info *bdi; dev_t devt; int retval; @@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) * parameters make sense. */ WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); disk->flags |= GENHD_FL_UP; @@ -648,22 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) WARN_ON(1); return; } - disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); disk_alloc_events(disk); - /* Register BDI before referencing it from bdev */ - bdi = disk->queue->backing_dev_info; - bdi_register_owner(bdi, disk_to_dev(disk)); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + /* Register BDI before referencing it from bdev */ + disk_to_dev(disk)->devt = devt; + bdi_register_owner(disk->queue->backing_dev_info, + disk_to_dev(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } register_disk(parent, disk); blk_register_queue(disk); @@ -673,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) */ WARN_ON_ONCE(!blk_get_queue(disk->queue)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); - disk_add_events(disk); blk_integrity_add(disk); } @@ -705,7 +715,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); if (disk->queue) { /* * Unregister bdi before releasing device numbers (as they can @@ -716,13 +727,15 @@ void del_gendisk(struct gendisk *disk) } else { WARN_ON(1); } - blk_unregister_region(disk_devt(disk), disk->minors); - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; + if (!(disk->flags & GENHD_FL_HIDDEN)) + blk_unregister_region(disk_devt(disk), disk->minors); kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -785,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } + if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { + put_disk(disk); + disk = NULL; + } return disk; } EXPORT_SYMBOL(get_gendisk); @@ -1028,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); +} + static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1065,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); @@ -1089,6 +1116,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, + &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, diff --git a/block/ioctl.c b/block/ioctl.c index 0de02ee..1668506 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, { uint64_t range[2]; uint64_t start, len; + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping = bdev->bd_inode->i_mapping; + if (!(mode & FMODE_WRITE)) return -EBADF; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); + truncate_inode_pages_range(mapping, start, start + len); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, flags); } static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, @@ -437,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, { int ret, n; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); if (!is_unrecognized_ioctl(ret)) return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; set_device_ro(bdev, n); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f58cab8..b4df317 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, /* * Try again in case a token was freed before we got on the wait - * queue. + * queue. The waker may have already removed the entry from the + * wait queue, but list_del_init() is okay with that. */ nr = __sbitmap_queue_get(domain_tokens); + if (nr >= 0) { + unsigned long flags; + + spin_lock_irqsave(&ws->wait.lock, flags); + list_del_init(&wait->entry); + spin_unlock_irqrestore(&ws->wait.lock, flags); + } } return nr; } @@ -641,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) if (!list_empty_careful(&khd->rqs[i])) return true; } - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } #define KYBER_LAT_SHOW_STORE(op) \ diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a1cad43..0179e48 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = { #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", + .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de4..edcfff9 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; @@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; @@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; /* @@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(req->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode); if (err) goto error; |