diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-29 11:51:49 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-29 11:51:49 -0800 |
commit | 0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0 (patch) | |
tree | cefccd67dc1f27bb45830f6b8065dd4a1c05e83b /block/blk-mq.c | |
parent | 9697e9da84299d0d715d515dd2cc48f1eceb277d (diff) | |
parent | 796baeeef85a40b3495a907fb7425086e7010102 (diff) | |
download | op-kernel-dev-0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0.zip op-kernel-dev-0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0.tar.gz |
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"This is the main pull request for block IO related changes for the
4.16 kernel. Nothing major in this pull request, but a good amount of
improvements and fixes all over the map. This contains:
- BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
Paolo.
- Support for SMR zones for deadline and mq-deadline from Damien and
Christoph.
- Set of fixes for bcache by way of Michael Lyle, including fixes
from himself, Kent, Rui, Tang, and Coly.
- Series from Matias for lightnvm with fixes from Hans Holmberg,
Javier, and Matias. Mostly centered around pblk, and the removing
rrpc 1.2 in preparation for supporting 2.0.
- A couple of NVMe pull requests from Christoph. Nothing major in
here, just fixes and cleanups, and support for command tracing from
Johannes.
- Support for blk-throttle for tracking reads and writes separately.
From Joseph Qi. A few cleanups/fixes also for blk-throttle from
Weiping.
- Series from Mike Snitzer that enables dm to register its queue more
logically, something that's alwways been problematic on dm since
it's a stacked device.
- Series from Ming cleaning up some of the bio accessor use, in
preparation for supporting multipage bvecs.
- Various fixes from Ming closing up holes around queue mapping and
quiescing.
- BSD partition fix from Richard Narron, fixing a problem where we
can't mount newer (10/11) FreeBSD partitions.
- Series from Tejun reworking blk-mq timeout handling. The previous
scheme relied on atomic bits, but it had races where we would think
a request had timed out if it to reused at the wrong time.
- null_blk now supports faking timeouts, to enable us to better
exercise and test that functionality separately. From me.
- Kill the separate atomic poll bit in the request struct. After
this, we don't use the atomic bits on blk-mq anymore at all. From
me.
- sgl_alloc/free helpers from Bart.
- Heavily contended tag case scalability improvement from me.
- Various little fixes and cleanups from Arnd, Bart, Corentin,
Douglas, Eryu, Goldwyn, and myself"
* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
block: remove smart1,2.h
nvme: add tracepoint for nvme_complete_rq
nvme: add tracepoint for nvme_setup_cmd
nvme-pci: introduce RECONNECTING state to mark initializing procedure
nvme-rdma: remove redundant boolean for inline_data
nvme: don't free uuid pointer before printing it
nvme-pci: Suspend queues after deleting them
bsg: use pr_debug instead of hand crafted macros
blk-mq-debugfs: don't allow write on attributes with seq_operations set
nvme-pci: Fix queue double allocations
block: Set BIO_TRACE_COMPLETION on new bio during split
blk-throttle: use queue_is_rq_based
block: Remove kblockd_schedule_delayed_work{,_on}()
blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
lib/scatterlist: Fix chaining support in sgl_alloc_order()
blk-throttle: track read and write request individually
block: add bdev_read_only() checks to common helpers
block: fail op_is_write() requests to read-only partitions
blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 667 |
1 files changed, 437 insertions, 230 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d37973..01f271d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && - !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { /* * index[0] counts the specific partition that was asked * for. index[1] counts the ones that are active on the @@ -222,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); else rcu = true; } @@ -272,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - - rq->rq_flags = 0; + req_flags_t rq_flags = 0; if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; } else { if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; + rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } rq->tag = tag; @@ -288,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->hctx->tags->rqs[rq->tag] = rq; } - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->rq_flags = rq_flags; + rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - /* do not touch atomic flags, it needs atomic ops against the timer */ - rq->cpu = -1; + INIT_LIST_HEAD(&rq->queuelist); INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; rq->start_time = jiffies; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; - set_start_time_ns(rq); - rq->io_start_time_ns = 0; -#endif rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -316,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; + rq->__deadline = 0; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -324,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io_data = NULL; rq->next_rq = NULL; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; + set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + data->ctx->rq_dispatched[op_is_sync(op)]++; return rq; } @@ -443,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_queue_exit(q); return ERR_PTR(-EXDEV); } - cpu = cpumask_first(alloc_data.hctx->cpumask); + cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); rq = blk_mq_get_request(q, NULL, op, &alloc_data); @@ -485,8 +485,7 @@ void blk_mq_free_request(struct request *rq) if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -532,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); + if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { @@ -559,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq) put_cpu(); } +static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) + __releases(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + rcu_read_unlock(); + else + srcu_read_unlock(hctx->srcu, srcu_idx); +} + +static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) + __acquires(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + /* shut up gcc false positive */ + *srcu_idx = 0; + rcu_read_lock(); + } else + *srcu_idx = srcu_read_lock(hctx->srcu); +} + +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ + unsigned long flags; + + /* + * blk_mq_rq_aborted_gstate() is used from the completion path and + * can thus be called from irq context. u64_stats_fetch in the + * middle of update on the same CPU leads to lockup. Disable irq + * while updating. + */ + local_irq_save(flags); + u64_stats_update_begin(&rq->aborted_gstate_sync); + rq->aborted_gstate = gstate; + u64_stats_update_end(&rq->aborted_gstate_sync); + local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ + unsigned int start; + u64 aborted_gstate; + + do { + start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); + aborted_gstate = rq->aborted_gstate; + } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + + return aborted_gstate; +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -570,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq) void blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); + int srcu_idx; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + + /* + * If @rq->aborted_gstate equals the current instance, timeout is + * claiming @rq and we lost. This is synchronized through + * hctx_lock(). See blk_mq_timeout_work() for details. + * + * Completion path never blocks and we can directly use RCU here + * instead of hctx_lock() which can be either RCU or SRCU. + * However, that would complicate paths which want to synchronize + * against us. Let stay in sync with the issue path so that + * hctx_lock() covers both issue and completion paths. + */ + hctx_lock(hctx, &srcu_idx); + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) __blk_mq_complete_request(rq); + hctx_unlock(hctx, srcu_idx); } EXPORT_SYMBOL(blk_mq_complete_request); int blk_mq_request_started(struct request *rq) { - return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } EXPORT_SYMBOL_GPL(blk_mq_request_started); @@ -598,34 +666,27 @@ void blk_mq_start_request(struct request *rq) wbt_issue(q->rq_wb, &rq->issue_stat); } - blk_add_timer(rq); - - WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); /* - * Mark us as started and clear complete. Complete might have been - * set if requeue raced with timeout, which then marked it as - * complete. So be sure to clear complete again when we start - * the request, otherwise we'll ignore the completion event. + * Mark @rq in-flight which also advances the generation number, + * and register for timeout. Protect with a seqcount to allow the + * timeout path to read both @rq->gstate and @rq->deadline + * coherently. * - * Ensure that ->deadline is visible before we set STARTED, such that - * blk_mq_check_expired() is guaranteed to observe our ->deadline when - * it observes STARTED. + * This is the only place where a request is marked in-flight. If + * the timeout path reads an in-flight @rq->gstate, the + * @rq->deadline it reads together under @rq->gstate_seq is + * guaranteed to be the matching one. */ - smp_wmb(); - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { - /* - * Coherence order guarantees these consecutive stores to a - * single variable propagate in the specified order. Thus the - * clear_bit() is ordered _after_ the set bit. See - * blk_mq_check_expired(). - * - * (the bits must be part of the same byte for this to be - * true). - */ - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); - } + preempt_disable(); + write_seqcount_begin(&rq->gstate_seq); + + blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); + blk_add_timer(rq); + + write_seqcount_end(&rq->gstate_seq); + preempt_enable(); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -639,13 +700,9 @@ void blk_mq_start_request(struct request *rq) EXPORT_SYMBOL(blk_mq_start_request); /* - * When we reach here because queue is busy, REQ_ATOM_COMPLETE - * flag isn't set yet, so there may be race with timeout handler, - * but given rq->deadline is just set in .queue_rq() under - * this situation, the race won't be possible in reality because - * rq->timeout should be set as big enough to cover the window - * between blk_mq_start_request() called from .queue_rq() and - * clearing REQ_ATOM_STARTED here. + * When we reach here because queue is busy, it's safe to change the state + * to IDLE without checking @rq->aborted_gstate because we should still be + * holding the RCU read lock and thus protected against timeout. */ static void __blk_mq_requeue_request(struct request *rq) { @@ -657,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq) wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); - if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -689,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false, true); + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false, true); + blk_mq_sched_insert_request(rq, false, false, false); } blk_mq_run_hw_queues(q, false); @@ -729,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_delayed_work(&q->requeue_work, 0); + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); @@ -755,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { unsigned long next; unsigned int next_set; + unsigned int nr_expired; }; -void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req, bool reserved) { const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - /* - * We know that complete is set at this point. If STARTED isn't set - * anymore, then the request isn't active and the "timeout" should - * just be ignored. This can happen due to the bitflag ordering. - * Timeout first checks if STARTED is set, and if it is, assumes - * the request is active. But if we race with completion, then - * both flags will get cleared. So check here again, and ignore - * a timeout event with a request that isn't active. - */ - if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) - return; + req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; if (ops->timeout) ret = ops->timeout(req, reserved); @@ -782,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) __blk_mq_complete_request(req); break; case BLK_EH_RESET_TIMER: + /* + * As nothing prevents from completion happening while + * ->aborted_gstate is set, this may lead to ignored + * completions and further spurious timeouts. + */ + blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); - blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: break; @@ -797,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - unsigned long deadline; + unsigned long gstate, deadline; + int start; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - return; + might_sleep(); - /* - * Ensures that if we see STARTED we must also see our - * up-to-date deadline, see blk_mq_start_request(). - */ - smp_rmb(); + if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) + return; - deadline = READ_ONCE(rq->deadline); + /* read coherent snapshots of @rq->state_gen and @rq->deadline */ + while (true) { + start = read_seqcount_begin(&rq->gstate_seq); + gstate = READ_ONCE(rq->gstate); + deadline = blk_rq_deadline(rq); + if (!read_seqcount_retry(&rq->gstate_seq, start)) + break; + cond_resched(); + } - /* - * The rq being checked may have been freed and reallocated - * out already here, we avoid this race by checking rq->deadline - * and REQ_ATOM_COMPLETE flag together: - * - * - if rq->deadline is observed as new value because of - * reusing, the rq won't be timed out because of timing. - * - if rq->deadline is observed as previous value, - * REQ_ATOM_COMPLETE flag won't be cleared in reuse path - * because we put a barrier between setting rq->deadline - * and clearing the flag in blk_mq_start_request(), so - * this rq won't be timed out too. - */ - if (time_after_eq(jiffies, deadline)) { - if (!blk_mark_rq_complete(rq)) { - /* - * Again coherence order ensures that consecutive reads - * from the same variable must be in that order. This - * ensures that if we see COMPLETE clear, we must then - * see STARTED set and we'll ignore this timeout. - * - * (There's also the MB implied by the test_and_clear()) - */ - blk_mq_rq_timed_out(rq, reserved); - } + /* if in-flight && overdue, mark for abortion */ + if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && + time_after_eq(jiffies, deadline)) { + blk_mq_rq_update_aborted_gstate(rq, gstate); + data->nr_expired++; + hctx->nr_expired++; } else if (!data->next_set || time_after(data->next, deadline)) { data->next = deadline; data->next_set = 1; } } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + /* + * We marked @rq->aborted_gstate and waited for RCU. If there were + * completions that we lost to, they would have finished and + * updated @rq->gstate by now; otherwise, the completion path is + * now guaranteed to see @rq->aborted_gstate and yield. If + * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + */ + if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && + READ_ONCE(rq->gstate) == rq->aborted_gstate) + blk_mq_rq_timed_out(rq, reserved); +} + static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = @@ -848,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, + .nr_expired = 0, }; + struct blk_mq_hw_ctx *hctx; int i; /* A deadlock might occur if a request is stuck requiring a @@ -867,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; + /* scan for the expired ones and set their ->aborted_gstate */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); + if (data.nr_expired) { + bool has_rcu = false; + + /* + * Wait till everyone sees ->aborted_gstate. The + * sequential waits for SRCUs aren't ideal. If this ever + * becomes a problem, we can add per-hw_ctx rcu_head and + * wait in parallel. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_expired) + continue; + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + has_rcu = true; + else + synchronize_srcu(hctx->srcu); + + hctx->nr_expired = 0; + } + if (has_rcu) + synchronize_rcu(); + + /* terminate the ones we won */ + blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); + } + if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - struct blk_mq_hw_ctx *hctx; - + /* + * Request timeouts are handled as a forward rolling timer. If + * we end up here it means that no requests are pending and + * also that no request has been pending for a while. Mark + * each hctx as idle. + */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1010,66 +1099,67 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, /* * Mark us waiting for a tag. For shared tags, this involves hooking us into - * the tag wakeups. For non-shared tags, we can simply mark us nedeing a - * restart. For both caes, take care to check the condition again after + * the tag wakeups. For non-shared tags, we can simply mark us needing a + * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, struct request *rq) { struct blk_mq_hw_ctx *this_hctx = *hctx; - bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; struct sbq_wait_state *ws; wait_queue_entry_t *wait; bool ret; - if (!shared_tags) { + if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); - } else { - wait = &this_hctx->dispatch_wait; - if (!list_empty_careful(&wait->entry)) - return false; - spin_lock(&this_hctx->lock); - if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); - return false; - } + /* + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. + * + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. + */ + return blk_mq_get_driver_tag(rq, hctx, false); + } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; } + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq, hctx, false); - - if (!shared_tags) { - /* - * Don't clear RESTART here, someone else could have set it. - * At most this will cost an extra queue run. - */ - return ret; - } else { - if (!ret) { - spin_unlock(&this_hctx->lock); - return false; - } - - /* - * We got a tag, remove ourselves from the wait queue to ensure - * someone else gets the wakeup. - */ - spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); + if (!ret) { spin_unlock(&this_hctx->lock); - return true; + return false; } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); + + return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, @@ -1206,9 +1296,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * We should be running this queue from one of the CPUs that * are mapped to it. + * + * There are at least two related races now between setting + * hctx->next_cpu from blk_mq_hctx_next_cpu() and running + * __blk_mq_run_hw_queue(): + * + * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), + * but later it becomes online, then this warning is harmless + * at all + * + * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), + * but later it becomes offline, then the warning can't be + * triggered, and we depend on blk-mq timeout handler to + * handle dispatched requests to this hctx */ - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)) { + printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", + raw_smp_processor_id(), + cpumask_empty(hctx->cpumask) ? "inactive": "active"); + dump_stack(); + } /* * We can't run the queue inline with ints disabled. Ensure that @@ -1216,17 +1324,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ WARN_ON_ONCE(in_interrupt()); - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); - rcu_read_unlock(); - } else { - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + blk_mq_sched_dispatch_requests(hctx); + hctx_unlock(hctx, srcu_idx); } /* @@ -1237,20 +1339,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { + bool tried = false; + if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { int next_cpu; - - next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); +select_cpu: + next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, + cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(hctx->cpumask); + next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); - hctx->next_cpu = next_cpu; + /* + * No online CPU is found, so have to make sure hctx->next_cpu + * is set correctly for not breaking workqueue. + */ + if (next_cpu >= nr_cpu_ids) + hctx->next_cpu = cpumask_first(hctx->cpumask); + else + hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(hctx->next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; + } return hctx->next_cpu; } @@ -1274,9 +1403,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, - msecs_to_jiffies(msecs)); + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, + msecs_to_jiffies(msecs)); } void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) @@ -1287,7 +1415,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (blk_mq_hctx_has_pending(hctx)) { + int srcu_idx; + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even __blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + hctx_lock(hctx, &srcu_idx); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + hctx_unlock(hctx, srcu_idx); + + if (need_run) { __blk_mq_delay_run_hw_queue(hctx, async, 0); return true; } @@ -1595,9 +1739,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool may_sleep) +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1606,15 +1750,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, }; blk_qc_t new_cookie; blk_status_t ret; + + new_cookie = request_to_qc_t(hctx, rq); + + /* + * For OK queue, we are done. For error, caller may kill it. + * Any other error (busy), just add it to our list as we + * previously would have done. + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + *cookie = new_cookie; + break; + case BLK_STS_RESOURCE: + __blk_mq_requeue_request(rq); + break; + default: + *cookie = BLK_QC_T_NONE; + break; + } + + return ret; +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass_insert) +{ + struct request_queue *q = rq->q; bool run_queue = true; - /* RCU or SRCU read lock is needed before checking quiesced flag */ + /* + * RCU or SRCU read lock is needed before checking quiesced flag. + * + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. + */ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; + bypass_insert = false; goto insert; } - if (q->elevator) + if (q->elevator && !bypass_insert) goto insert; if (!blk_mq_get_driver_tag(rq, NULL, false)) @@ -1625,47 +1806,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - new_cookie = request_to_qc_t(hctx, rq); - - /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done - */ - ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_STS_OK: - *cookie = new_cookie; - return; - case BLK_STS_RESOURCE: - __blk_mq_requeue_request(rq); - goto insert; - default: - *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, ret); - return; - } - + return __blk_mq_issue_directly(hctx, rq, cookie); insert: - blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_sched_insert_request(rq, false, run_queue, false); + return BLK_STS_OK; } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - __blk_mq_try_issue_directly(hctx, rq, cookie, false); - rcu_read_unlock(); - } else { - unsigned int srcu_idx; + blk_status_t ret; + int srcu_idx; - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - __blk_mq_try_issue_directly(hctx, rq, cookie, true); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); + if (ret == BLK_STS_RESOURCE) + blk_mq_sched_insert_request(rq, false, true, false); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + hctx_unlock(hctx, srcu_idx); + + return ret; } static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) @@ -1776,7 +1957,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true, true); + blk_mq_sched_insert_request(rq, false, true, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -1869,6 +2050,22 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); + return 0; +} + int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { @@ -1930,12 +2127,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct request *rq = p; tags->static_rqs[i] = rq; - if (set->ops->init_request) { - if (set->ops->init_request(set, rq, hctx_idx, - node)) { - tags->static_rqs[i] = NULL; - goto fail; - } + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; } p += rq_size; @@ -1994,7 +2188,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, { blk_mq_debugfs_unregister_hctx(hctx); - blk_mq_tag_idle(hctx); + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); @@ -2005,7 +2200,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -2074,13 +2269,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->fq) goto sched_exit_hctx; - if (set->ops->init_request && - set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, - node)) + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->queue_rq_srcu); + init_srcu_struct(hctx->srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2116,16 +2309,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - /* If the cpu isn't present, the cpu is mapped to first hctx */ - if (!cpu_present(i)) - continue; - - hctx = blk_mq_map_queue(q, i); - /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ + hctx = blk_mq_map_queue(q, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = local_memory_node(cpu_to_node(i)); } @@ -2182,7 +2370,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * * If the cpu isn't present, the cpu is mapped to first hctx. */ - for_each_present_cpu(i) { + for_each_possible_cpu(i) { hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && @@ -2236,7 +2424,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Initialize batch roundrobin counts */ - hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu = cpumask_first_and(hctx->cpumask, + cpu_online_mask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } @@ -2369,7 +2558,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) { int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), __alignof__(struct blk_mq_hw_ctx)) != sizeof(struct blk_mq_hw_ctx)); @@ -2386,6 +2575,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; blk_mq_sysfs_unregister(q); + + /* protect against switching io scheduler */ + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int node; @@ -2430,6 +2622,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } } q->nr_hw_queues = i; + mutex_unlock(&q->sysfs_lock); blk_mq_sysfs_register(q); } @@ -2601,9 +2794,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) + if (set->ops->map_queues) { + int cpu; + /* + * transport .map_queues is usually done in the following + * way: + * + * for (queue = 0; queue < set->nr_hw_queues; queue++) { + * mask = get_cpu_mask(queue) + * for_each_cpu(cpu, mask) + * set->mq_map[cpu] = queue; + * } + * + * When we need to remap, the table has to be cleared for + * killing stale mapping since one CPU may not be mapped + * to any hw queue. + */ + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return set->ops->map_queues(set); - else + } else return blk_mq_map_queues(set); } @@ -2712,6 +2923,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return -EINVAL; blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { @@ -2735,6 +2947,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; @@ -2850,7 +3063,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, unsigned int nsecs; ktime_t kt; - if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (rq->rq_flags & RQF_MQ_POLL_SLEPT) return false; /* @@ -2870,7 +3083,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (!nsecs) return false; - set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + rq->rq_flags |= RQF_MQ_POLL_SLEPT; /* * This will be replaced with the stats tracking code, using @@ -2884,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_sleeper(&hs, current); do { - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); hrtimer_start_expires(&hs.timer, mode); @@ -2970,12 +3183,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) static int __init blk_mq_init(void) { - /* - * See comment in block/blk.h rq_atomic_flags enum - */ - BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != - (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; |