From 59f082e464ae0533813d5a1c3149b22563da93d7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 1 Feb 2017 09:53:14 -0800 Subject: blk-mq: allocate blk_mq_tags and requests in correct node blk_mq_tags/requests of specific hardware queue are mostly used in specific cpus, which might not be in the same numa node as disk. For example, a nvme card is in node 0. half hardware queue will be used by node 0, the other node 1. Signed-off-by: Shaohua Li Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9e6b064..d84c66f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,16 +1713,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int reserved_tags) { struct blk_mq_tags *tags; + int node; + + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, - set->numa_node, + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; @@ -1730,7 +1734,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->static_rqs) { kfree(tags->rqs); blk_mq_free_tags(tags); @@ -1750,6 +1754,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; + int node; + + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); @@ -1771,7 +1780,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, this_order--; do { - page = alloc_pages_node(set->numa_node, + page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) @@ -1804,7 +1813,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, rq, hctx_idx, i, - set->numa_node)) { + node)) { tags->static_rqs[i] = NULL; goto fail; } -- cgit v1.1 From 415b806de5576b656f3ff94366589af9a161d0c8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 Feb 2017 10:04:39 -0700 Subject: blk-mq-sched: Allocate sched reserved tags as specified in the original queue tagset Signed-off-by: Sagi Grimberg Modified by me to also check at driver tag allocation time if the original request was reserved, so we can be sure to allocate a properly reserved tag at that point in time, too. Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 3 ++- block/blk-mq-tag.c | 2 +- block/blk-mq-tag.h | 6 ++++++ block/blk-mq.c | 3 +++ 4 files changed, 12 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 98c7b06..46ca965 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -454,7 +454,8 @@ int blk_mq_sched_setup(struct request_queue *q) */ ret = 0; queue_for_each_hw_ctx(q, hctx, i) { - hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, + q->nr_requests, set->reserved_tags); if (!hctx->sched_tags) { ret = -ENOMEM; break; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 54c8436..e48bc2c 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -181,7 +181,7 @@ found_tag: void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { - if (tag >= tags->nr_reserved_tags) { + if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 6349742..5cb51e5 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -85,4 +85,10 @@ static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, hctx->tags->rqs[tag] = rq; } +static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, + unsigned int tag) +{ + return tag < tags->nr_reserved_tags; +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index d84c66f..4df5fb4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -852,6 +852,9 @@ done: return true; } + if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) + data.flags |= BLK_MQ_REQ_RESERVED; + rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { if (blk_mq_tag_busy(data.hctx)) { -- cgit v1.1 From 6d2809d51a5079f01a416d91dd63b0766cb685d0 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 27 Feb 2017 10:28:27 -0800 Subject: blk-mq: make blk_mq_alloc_request_hctx() allocate a scheduler request blk_mq_alloc_request_hctx() allocates a driver request directly, unlike its blk_mq_alloc_request() counterpart. It also crashes because it doesn't update the tags->rqs map. Fix it by making it allocate a scheduler request. Reported-by: Sagi Grimberg Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Tested-by: Sagi Grimberg --- block/blk-mq-sched.c | 11 +++++------ block/blk-mq.c | 33 +++++++++++++++------------------ 2 files changed, 20 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 46ca965..5697b23 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -110,15 +110,14 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, struct blk_mq_alloc_data *data) { struct elevator_queue *e = q->elevator; - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; struct request *rq; blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - - blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); + data->q = q; + if (likely(!data->ctx)) + data->ctx = blk_mq_get_ctx(q); + if (likely(!data->hctx)) + data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (e) { data->flags |= BLK_MQ_REQ_INTERNAL; diff --git a/block/blk-mq.c b/block/blk-mq.c index 4df5fb4..85a7047 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -273,10 +273,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, unsigned int flags, unsigned int hctx_idx) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; + unsigned int cpu; int ret; /* @@ -299,25 +298,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ - hctx = q->queue_hw_ctx[hctx_idx]; - if (!blk_mq_hw_queue_mapped(hctx)) { - ret = -EXDEV; - goto out_queue_exit; + alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { + blk_queue_exit(q); + return ERR_PTR(-EXDEV); } - ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); + cpu = cpumask_first(alloc_data.hctx->cpumask); + alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq) { - ret = -EWOULDBLOCK; - goto out_queue_exit; - } - - return rq; + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); -out_queue_exit: + blk_mq_put_ctx(alloc_data.ctx); blk_queue_exit(q); - return ERR_PTR(ret); + + if (!rq) + return ERR_PTR(-EWOULDBLOCK); + + return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -- cgit v1.1 From 59748398992c9c3e9d600e56cb2a5c0c546fe129 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 27 Feb 2017 09:47:54 -0800 Subject: blk-mq: kill blk_mq_set_alloc_data() Nothing is using it anymore. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Tested-by: Sagi Grimberg --- block/blk-mq.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.h b/block/blk-mq.h index 24b2256..088ced0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -146,16 +146,6 @@ struct blk_mq_alloc_data { struct blk_mq_hw_ctx *hctx; }; -static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, unsigned int flags, - struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) -{ - data->q = q; - data->flags = flags; - data->ctx = ctx; - data->hctx = hctx; -} - static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->flags & BLK_MQ_REQ_INTERNAL) -- cgit v1.1 From 562bef4259776c19cb2423d43af1f99183910a4d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 27 Feb 2017 09:47:55 -0800 Subject: blk-mq: move update of tags->rqs to __blk_mq_alloc_request() No functional difference, it just makes a little more sense to update the tag map where we actually allocate the tag. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Tested-by: Sagi Grimberg --- block/blk-mq-sched.c | 2 -- block/blk-mq.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 5697b23..09af8ff 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -134,8 +134,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, rq = __blk_mq_alloc_request(data, op); } else { rq = __blk_mq_alloc_request(data, op); - if (rq) - data->hctx->tags->rqs[rq->tag] = rq; } if (rq) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 85a7047..94593c6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -234,6 +234,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, } rq->tag = tag; rq->internal_tag = -1; + data->hctx->tags->rqs[rq->tag] = rq; } blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); -- cgit v1.1 From 6bae363ee3057a14eec93440826813603559273a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 Mar 2017 14:22:10 -0500 Subject: blk-mq: Export blk_mq_freeze_queue_wait Drivers can start a freeze, so this provides a way to wait for frozen. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 94593c6..8da2c04 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -75,10 +75,11 @@ void blk_mq_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); -static void blk_mq_freeze_queue_wait(struct request_queue *q) +void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); /* * Guarantee no request is in use, so we can change any data structure of -- cgit v1.1 From f91328c40a559362b6e7b7bfee01ca17fda87592 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 Mar 2017 14:22:11 -0500 Subject: blk-mq: Provide freeze queue timeout A driver may wish to take corrective action if queued requests do not complete within a set time. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8da2c04..a5e66a7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -81,6 +81,15 @@ void blk_mq_freeze_queue_wait(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); +int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout) +{ + return wait_event_timeout(q->mq_freeze_wq, + percpu_ref_is_zero(&q->q_usage_counter), + timeout); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); + /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. -- cgit v1.1 From 7b36a7189fc320f0b783dd51bd1f541db56cfbdd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Mar 2017 13:59:08 -0700 Subject: block: don't call ioc_exit_icq() with the queue lock held for blk-mq For legacy scheduling, we always call ioc_exit_icq() with both the ioc and queue lock held. This poses a problem for blk-mq with scheduling, since the queue lock isn't what we use in the scheduler. And since we don't need the queue lock held for ioc exit there, don't grab it and leave any extra locking up to the blk-mq scheduler. Reported-by: Paolo Valente Tested-by: Paolo Valente Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-ioc.c | 44 +++++++++++++++++++++++++++++++------------- block/blk-sysfs.c | 2 -- block/elevator.c | 2 -- 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b12f9c8..0d7c96c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -36,8 +36,8 @@ static void icq_free_icq_rcu(struct rcu_head *head) } /* - * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for - * mq. + * Exit an icq. Called with ioc locked for blk-mq, and with both ioc + * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { @@ -54,7 +54,10 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } -/* Release an icq. Called with both ioc and q locked. */ +/* + * Release an icq. Called with ioc locked for blk-mq, and with both ioc + * and queue locked for legacy. + */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; @@ -62,7 +65,6 @@ static void ioc_destroy_icq(struct io_cq *icq) struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); - lockdep_assert_held(q->queue_lock); radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); @@ -222,24 +224,40 @@ void exit_io_context(struct task_struct *task) put_io_context_active(ioc); } +static void __ioc_clear_queue(struct list_head *icq_list) +{ + unsigned long flags; + + while (!list_empty(icq_list)) { + struct io_cq *icq = list_entry(icq_list->next, + struct io_cq, q_node); + struct io_context *ioc = icq->ioc; + + spin_lock_irqsave(&ioc->lock, flags); + ioc_destroy_icq(icq); + spin_unlock_irqrestore(&ioc->lock, flags); + } +} + /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * - * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked. + * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { - lockdep_assert_held(q->queue_lock); + LIST_HEAD(icq_list); - while (!list_empty(&q->icq_list)) { - struct io_cq *icq = list_entry(q->icq_list.next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; + spin_lock_irq(q->queue_lock); + list_splice_init(&q->icq_list, &icq_list); - spin_lock(&ioc->lock); - ioc_destroy_icq(icq); - spin_unlock(&ioc->lock); + if (q->mq_ops) { + spin_unlock_irq(q->queue_lock); + __ioc_clear_queue(&icq_list); + } else { + __ioc_clear_queue(&icq_list); + spin_unlock_irq(q->queue_lock); } } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 002af83..c44b321 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -815,9 +815,7 @@ static void blk_release_queue(struct kobject *kobj) blkcg_exit_queue(q); if (q->elevator) { - spin_lock_irq(q->queue_lock); ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); elevator_exit(q->elevator); } diff --git a/block/elevator.c b/block/elevator.c index ac1c9f4..01139f5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -983,9 +983,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (old_registered) elv_unregister_queue(q); - spin_lock_irq(q->queue_lock); ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); } /* allocate, init and register new elevator */ -- cgit v1.1 From 113285b473824922498d07d7f82459507b9792eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Mar 2017 13:26:04 -0700 Subject: blk-mq: ensure that bd->last is always set correctly When drivers are called with a request in blk-mq, blk-mq flags the state such that the driver knows if this is the last request in this call chain or not. The driver can then use that information to defer kicking off IO until bd->last is true. However, with blk-mq and scheduling, we need to allocate a driver tag for a request before it can be issued. If we fail to allocate such a tag, we could end up in the situation where the last request issued did not have bd->last == true set. This can then cause a driver hang. This fixes a hang with virtio-blk, which uses bd->last as a hint on whether to kick the queue or not. Reported-by: Chris Mason Tested-by: Chris Mason Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a5e66a7..e797607 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -876,12 +876,9 @@ done: return false; } -static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) +static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) { - if (rq->tag == -1 || rq->internal_tag == -1) - return; - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); rq->tag = -1; @@ -891,6 +888,26 @@ static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, } } +static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a @@ -1000,7 +1017,19 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) bd.rq = rq; bd.list = dptr; - bd.last = list_empty(list); + + /* + * Flag last if we have no more requests, or if we have more + * but can't assign a driver tag to it. + */ + if (list_empty(list)) + bd.last = true; + else { + struct request *nxt; + + nxt = list_first_entry(list, struct request, queuelist); + bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + } ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -1008,7 +1037,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: - blk_mq_put_driver_tag(hctx, rq); + blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1038,6 +1067,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * that is where we will continue on next queue run. */ if (!list_empty(list)) { + /* + * If we got a driver tag for the next request already, + * free it again. + */ + rq = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(rq); + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); -- cgit v1.1 From 165a5e22fafb127ecb5914e12e8c32a1f0d3f820 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Feb 2017 08:05:56 +0100 Subject: block: Move bdi_unregister() to del_gendisk() Commit 6cd18e711dd8 "block: destroy bdi before blockdev is unregistered." moved bdi unregistration (at that time through bdi_destroy()) from blk_release_queue() to blk_cleanup_queue() because it needs to happen before blk_unregister_region() call in del_gendisk() for MD. SCSI though will free up the device number from sd_remove() called through a maze of callbacks from device_del() in __scsi_remove_device() before blk_cleanup_queue() and thus similar races as described in 6cd18e711dd8 can happen for SCSI as well as reported by Omar [1]. Moving bdi_unregister() to del_gendisk() works for MD and fixes the problem for SCSI since del_gendisk() gets called from sd_remove() before freeing the device number. This also makes device_add_disk() (calling bdi_register_owner()) more symmetric with del_gendisk(). [1] http://marc.info/?l=linux-block&m=148554717109098&w=2 Tested-by: Lekshmi Pillai Acked-by: Tejun Heo Signed-off-by: Jan Kara Tested-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/genhd.c | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b9e857f..1086dac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -578,7 +578,6 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_unregister(q->backing_dev_info); put_disk_devt(q->disk_devt); /* @q is and will stay empty, shutdown and put */ diff --git a/block/genhd.c b/block/genhd.c index 2f444b8..b26a5ea 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -681,6 +681,11 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + /* + * Unregister bdi before releasing device numbers (as they can get + * reused and we'd get clashes in sysfs). + */ + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); -- cgit v1.1