From 59f082e464ae0533813d5a1c3149b22563da93d7 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 1 Feb 2017 09:53:14 -0800
Subject: blk-mq: allocate blk_mq_tags and requests in correct node

blk_mq_tags/requests of specific hardware queue are mostly used in
specific cpus, which might not be in the same numa node as disk. For
example, a nvme card is in node 0. half hardware queue will be used by
node 0, the other node 1.

Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9e6b064..d84c66f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1713,16 +1713,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					unsigned int reserved_tags)
 {
 	struct blk_mq_tags *tags;
+	int node;
+
+	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+	if (node == NUMA_NO_NODE)
+		node = set->numa_node;
 
-	tags = blk_mq_init_tags(nr_tags, reserved_tags,
-				set->numa_node,
+	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
 	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-				 set->numa_node);
+				 node);
 	if (!tags->rqs) {
 		blk_mq_free_tags(tags);
 		return NULL;
@@ -1730,7 +1734,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 
 	tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-				 set->numa_node);
+				 node);
 	if (!tags->static_rqs) {
 		kfree(tags->rqs);
 		blk_mq_free_tags(tags);
@@ -1750,6 +1754,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 {
 	unsigned int i, j, entries_per_page, max_order = 4;
 	size_t rq_size, left;
+	int node;
+
+	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+	if (node == NUMA_NO_NODE)
+		node = set->numa_node;
 
 	INIT_LIST_HEAD(&tags->page_list);
 
@@ -1771,7 +1780,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 			this_order--;
 
 		do {
-			page = alloc_pages_node(set->numa_node,
+			page = alloc_pages_node(node,
 				GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
 				this_order);
 			if (page)
@@ -1804,7 +1813,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 			if (set->ops->init_request) {
 				if (set->ops->init_request(set->driver_data,
 						rq, hctx_idx, i,
-						set->numa_node)) {
+						node)) {
 					tags->static_rqs[i] = NULL;
 					goto fail;
 				}
-- 
cgit v1.1


From 415b806de5576b656f3ff94366589af9a161d0c8 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 27 Feb 2017 10:04:39 -0700
Subject: blk-mq-sched: Allocate sched reserved tags as specified in the
 original queue tagset

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Modified by me to also check at driver tag allocation time if the
original request was reserved, so we can be sure to allocate a
properly reserved tag at that point in time, too.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sched.c | 3 ++-
 block/blk-mq-tag.c   | 2 +-
 block/blk-mq-tag.h   | 6 ++++++
 block/blk-mq.c       | 3 +++
 4 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 98c7b06..46ca965 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -454,7 +454,8 @@ int blk_mq_sched_setup(struct request_queue *q)
 	 */
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
-		hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+		hctx->sched_tags = blk_mq_alloc_rq_map(set, i,
+				q->nr_requests, set->reserved_tags);
 		if (!hctx->sched_tags) {
 			ret = -ENOMEM;
 			break;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 54c8436..e48bc2c 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -181,7 +181,7 @@ found_tag:
 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
 		    struct blk_mq_ctx *ctx, unsigned int tag)
 {
-	if (tag >= tags->nr_reserved_tags) {
+	if (!blk_mq_tag_is_reserved(tags, tag)) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
 		BUG_ON(real_tag >= tags->nr_tags);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 6349742..5cb51e5 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -85,4 +85,10 @@ static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 	hctx->tags->rqs[tag] = rq;
 }
 
+static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
+					  unsigned int tag)
+{
+	return tag < tags->nr_reserved_tags;
+}
+
 #endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d84c66f..4df5fb4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -852,6 +852,9 @@ done:
 		return true;
 	}
 
+	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
+		data.flags |= BLK_MQ_REQ_RESERVED;
+
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
 		if (blk_mq_tag_busy(data.hctx)) {
-- 
cgit v1.1


From 6d2809d51a5079f01a416d91dd63b0766cb685d0 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 27 Feb 2017 10:28:27 -0800
Subject: blk-mq: make blk_mq_alloc_request_hctx() allocate a scheduler request

blk_mq_alloc_request_hctx() allocates a driver request directly, unlike
its blk_mq_alloc_request() counterpart. It also crashes because it
doesn't update the tags->rqs map.

Fix it by making it allocate a scheduler request.

Reported-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
---
 block/blk-mq-sched.c | 11 +++++------
 block/blk-mq.c       | 33 +++++++++++++++------------------
 2 files changed, 20 insertions(+), 24 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 46ca965..5697b23 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -110,15 +110,14 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 					 struct blk_mq_alloc_data *data)
 {
 	struct elevator_queue *e = q->elevator;
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
 	struct request *rq;
 
 	blk_queue_enter_live(q);
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx);
+	data->q = q;
+	if (likely(!data->ctx))
+		data->ctx = blk_mq_get_ctx(q);
+	if (likely(!data->hctx))
+		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
 	if (e) {
 		data->flags |= BLK_MQ_REQ_INTERNAL;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4df5fb4..85a7047 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -273,10 +273,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 		unsigned int flags, unsigned int hctx_idx)
 {
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_alloc_data alloc_data = { .flags = flags };
 	struct request *rq;
-	struct blk_mq_alloc_data alloc_data;
+	unsigned int cpu;
 	int ret;
 
 	/*
@@ -299,25 +298,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 	 * Check if the hardware context is actually mapped to anything.
 	 * If not tell the caller that it should skip this queue.
 	 */
-	hctx = q->queue_hw_ctx[hctx_idx];
-	if (!blk_mq_hw_queue_mapped(hctx)) {
-		ret = -EXDEV;
-		goto out_queue_exit;
+	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
+	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
+		blk_queue_exit(q);
+		return ERR_PTR(-EXDEV);
 	}
-	ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+	cpu = cpumask_first(alloc_data.hctx->cpumask);
+	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
-	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	if (!rq) {
-		ret = -EWOULDBLOCK;
-		goto out_queue_exit;
-	}
-
-	return rq;
+	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-out_queue_exit:
+	blk_mq_put_ctx(alloc_data.ctx);
 	blk_queue_exit(q);
-	return ERR_PTR(ret);
+
+	if (!rq)
+		return ERR_PTR(-EWOULDBLOCK);
+
+	return rq;
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-- 
cgit v1.1


From 59748398992c9c3e9d600e56cb2a5c0c546fe129 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 27 Feb 2017 09:47:54 -0800
Subject: blk-mq: kill blk_mq_set_alloc_data()

Nothing is using it anymore.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
---
 block/blk-mq.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 24b2256..088ced0 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -146,16 +146,6 @@ struct blk_mq_alloc_data {
 	struct blk_mq_hw_ctx *hctx;
 };
 
-static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
-		struct request_queue *q, unsigned int flags,
-		struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
-{
-	data->q = q;
-	data->flags = flags;
-	data->ctx = ctx;
-	data->hctx = hctx;
-}
-
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
 	if (data->flags & BLK_MQ_REQ_INTERNAL)
-- 
cgit v1.1


From 562bef4259776c19cb2423d43af1f99183910a4d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 27 Feb 2017 09:47:55 -0800
Subject: blk-mq: move update of tags->rqs to __blk_mq_alloc_request()

No functional difference, it just makes a little more sense to update
the tag map where we actually allocate the tag.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
---
 block/blk-mq-sched.c | 2 --
 block/blk-mq.c       | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 5697b23..09af8ff 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -134,8 +134,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 			rq = __blk_mq_alloc_request(data, op);
 	} else {
 		rq = __blk_mq_alloc_request(data, op);
-		if (rq)
-			data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
 	if (rq) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 85a7047..94593c6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -234,6 +234,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 			}
 			rq->tag = tag;
 			rq->internal_tag = -1;
+			data->hctx->tags->rqs[rq->tag] = rq;
 		}
 
 		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
-- 
cgit v1.1


From 6bae363ee3057a14eec93440826813603559273a Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 1 Mar 2017 14:22:10 -0500
Subject: blk-mq: Export blk_mq_freeze_queue_wait

Drivers can start a freeze, so this provides a way to wait for frozen.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 94593c6..8da2c04 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -75,10 +75,11 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
-static void blk_mq_freeze_queue_wait(struct request_queue *q)
+void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 }
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
-- 
cgit v1.1


From f91328c40a559362b6e7b7bfee01ca17fda87592 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 1 Mar 2017 14:22:11 -0500
Subject: blk-mq: Provide freeze queue timeout

A driver may wish to take corrective action if queued requests do not
complete within a set time.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8da2c04..a5e66a7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,6 +81,15 @@ void blk_mq_freeze_queue_wait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 
+int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
+				     unsigned long timeout)
+{
+	return wait_event_timeout(q->mq_freeze_wq,
+					percpu_ref_is_zero(&q->q_usage_counter),
+					timeout);
+}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
+
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
-- 
cgit v1.1


From 7b36a7189fc320f0b783dd51bd1f541db56cfbdd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 2 Mar 2017 13:59:08 -0700
Subject: block: don't call ioc_exit_icq() with the queue lock held for blk-mq

For legacy scheduling, we always call ioc_exit_icq() with both the
ioc and queue lock held. This poses a problem for blk-mq with
scheduling, since the queue lock isn't what we use in the scheduler.
And since we don't need the queue lock held for ioc exit there,
don't grab it and leave any extra locking up to the blk-mq scheduler.

Reported-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Paolo Valente <paolo.valente@linaro.org>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-ioc.c   | 44 +++++++++++++++++++++++++++++++-------------
 block/blk-sysfs.c |  2 --
 block/elevator.c  |  2 --
 3 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b12f9c8..0d7c96c 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -36,8 +36,8 @@ static void icq_free_icq_rcu(struct rcu_head *head)
 }
 
 /*
- * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for
- * mq.
+ * Exit an icq. Called with ioc locked for blk-mq, and with both ioc
+ * and queue locked for legacy.
  */
 static void ioc_exit_icq(struct io_cq *icq)
 {
@@ -54,7 +54,10 @@ static void ioc_exit_icq(struct io_cq *icq)
 	icq->flags |= ICQ_EXITED;
 }
 
-/* Release an icq.  Called with both ioc and q locked. */
+/*
+ * Release an icq. Called with ioc locked for blk-mq, and with both ioc
+ * and queue locked for legacy.
+ */
 static void ioc_destroy_icq(struct io_cq *icq)
 {
 	struct io_context *ioc = icq->ioc;
@@ -62,7 +65,6 @@ static void ioc_destroy_icq(struct io_cq *icq)
 	struct elevator_type *et = q->elevator->type;
 
 	lockdep_assert_held(&ioc->lock);
-	lockdep_assert_held(q->queue_lock);
 
 	radix_tree_delete(&ioc->icq_tree, icq->q->id);
 	hlist_del_init(&icq->ioc_node);
@@ -222,24 +224,40 @@ void exit_io_context(struct task_struct *task)
 	put_io_context_active(ioc);
 }
 
+static void __ioc_clear_queue(struct list_head *icq_list)
+{
+	unsigned long flags;
+
+	while (!list_empty(icq_list)) {
+		struct io_cq *icq = list_entry(icq_list->next,
+					       struct io_cq, q_node);
+		struct io_context *ioc = icq->ioc;
+
+		spin_lock_irqsave(&ioc->lock, flags);
+		ioc_destroy_icq(icq);
+		spin_unlock_irqrestore(&ioc->lock, flags);
+	}
+}
+
 /**
  * ioc_clear_queue - break any ioc association with the specified queue
  * @q: request_queue being cleared
  *
- * Walk @q->icq_list and exit all io_cq's.  Must be called with @q locked.
+ * Walk @q->icq_list and exit all io_cq's.
  */
 void ioc_clear_queue(struct request_queue *q)
 {
-	lockdep_assert_held(q->queue_lock);
+	LIST_HEAD(icq_list);
 
-	while (!list_empty(&q->icq_list)) {
-		struct io_cq *icq = list_entry(q->icq_list.next,
-					       struct io_cq, q_node);
-		struct io_context *ioc = icq->ioc;
+	spin_lock_irq(q->queue_lock);
+	list_splice_init(&q->icq_list, &icq_list);
 
-		spin_lock(&ioc->lock);
-		ioc_destroy_icq(icq);
-		spin_unlock(&ioc->lock);
+	if (q->mq_ops) {
+		spin_unlock_irq(q->queue_lock);
+		__ioc_clear_queue(&icq_list);
+	} else {
+		__ioc_clear_queue(&icq_list);
+		spin_unlock_irq(q->queue_lock);
 	}
 }
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 002af83..c44b321 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -815,9 +815,7 @@ static void blk_release_queue(struct kobject *kobj)
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
-		spin_lock_irq(q->queue_lock);
 		ioc_clear_queue(q);
-		spin_unlock_irq(q->queue_lock);
 		elevator_exit(q->elevator);
 	}
 
diff --git a/block/elevator.c b/block/elevator.c
index ac1c9f4..01139f5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -983,9 +983,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 		if (old_registered)
 			elv_unregister_queue(q);
 
-		spin_lock_irq(q->queue_lock);
 		ioc_clear_queue(q);
-		spin_unlock_irq(q->queue_lock);
 	}
 
 	/* allocate, init and register new elevator */
-- 
cgit v1.1


From 113285b473824922498d07d7f82459507b9792eb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 2 Mar 2017 13:26:04 -0700
Subject: blk-mq: ensure that bd->last is always set correctly

When drivers are called with a request in blk-mq, blk-mq flags the
state such that the driver knows if this is the last request in
this call chain or not. The driver can then use that information
to defer kicking off IO until bd->last is true. However, with blk-mq
and scheduling, we need to allocate a driver tag for a request before
it can be issued. If we fail to allocate such a tag, we could end up
in the situation where the last request issued did not have
bd->last == true set. This can then cause a driver hang.

This fixes a hang with virtio-blk, which uses bd->last as a hint
on whether to kick the queue or not.

Reported-by: Chris Mason <clm@fb.com>
Tested-by: Chris Mason <clm@fb.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a5e66a7..e797607 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -876,12 +876,9 @@ done:
 	return false;
 }
 
-static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-				  struct request *rq)
+static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+				    struct request *rq)
 {
-	if (rq->tag == -1 || rq->internal_tag == -1)
-		return;
-
 	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
 	rq->tag = -1;
 
@@ -891,6 +888,26 @@ static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
+				       struct request *rq)
+{
+	if (rq->tag == -1 || rq->internal_tag == -1)
+		return;
+
+	__blk_mq_put_driver_tag(hctx, rq);
+}
+
+static void blk_mq_put_driver_tag(struct request *rq)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	if (rq->tag == -1 || rq->internal_tag == -1)
+		return;
+
+	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+	__blk_mq_put_driver_tag(hctx, rq);
+}
+
 /*
  * If we fail getting a driver tag because all the driver tags are already
  * assigned and on the dispatch list, BUT the first entry does not have a
@@ -1000,7 +1017,19 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
 		bd.rq = rq;
 		bd.list = dptr;
-		bd.last = list_empty(list);
+
+		/*
+		 * Flag last if we have no more requests, or if we have more
+		 * but can't assign a driver tag to it.
+		 */
+		if (list_empty(list))
+			bd.last = true;
+		else {
+			struct request *nxt;
+
+			nxt = list_first_entry(list, struct request, queuelist);
+			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
@@ -1008,7 +1037,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 			queued++;
 			break;
 		case BLK_MQ_RQ_QUEUE_BUSY:
-			blk_mq_put_driver_tag(hctx, rq);
+			blk_mq_put_driver_tag_hctx(hctx, rq);
 			list_add(&rq->queuelist, list);
 			__blk_mq_requeue_request(rq);
 			break;
@@ -1038,6 +1067,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 	 * that is where we will continue on next queue run.
 	 */
 	if (!list_empty(list)) {
+		/*
+		 * If we got a driver tag for the next request already,
+		 * free it again.
+		 */
+		rq = list_first_entry(list, struct request, queuelist);
+		blk_mq_put_driver_tag(rq);
+
 		spin_lock(&hctx->lock);
 		list_splice_init(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
-- 
cgit v1.1


From 165a5e22fafb127ecb5914e12e8c32a1f0d3f820 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Feb 2017 08:05:56 +0100
Subject: block: Move bdi_unregister() to del_gendisk()

Commit 6cd18e711dd8 "block: destroy bdi before blockdev is
unregistered." moved bdi unregistration (at that time through
bdi_destroy()) from blk_release_queue() to blk_cleanup_queue() because
it needs to happen before blk_unregister_region() call in del_gendisk()
for MD. SCSI though will free up the device number from sd_remove()
called through a maze of callbacks from device_del() in
__scsi_remove_device() before blk_cleanup_queue() and thus similar races
as described in 6cd18e711dd8 can happen for SCSI as well as reported by
Omar [1].

Moving bdi_unregister() to del_gendisk() works for MD and fixes the
problem for SCSI since del_gendisk() gets called from sd_remove() before
freeing the device number.

This also makes device_add_disk() (calling bdi_register_owner()) more
symmetric with del_gendisk().

[1] http://marc.info/?l=linux-block&m=148554717109098&w=2

Tested-by: Lekshmi Pillai <lekshmicpillai@in.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Tested-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 1 -
 block/genhd.c    | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index b9e857f..1086dac 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -578,7 +578,6 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	bdi_unregister(q->backing_dev_info);
 	put_disk_devt(q->disk_devt);
 
 	/* @q is and will stay empty, shutdown and put */
diff --git a/block/genhd.c b/block/genhd.c
index 2f444b8..b26a5ea 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -681,6 +681,11 @@ void del_gendisk(struct gendisk *disk)
 	disk->flags &= ~GENHD_FL_UP;
 
 	sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
+	/*
+	 * Unregister bdi before releasing device numbers (as they can get
+	 * reused and we'd get clashes in sysfs).
+	 */
+	bdi_unregister(disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
 	blk_unregister_region(disk_devt(disk), disk->minors);
 
-- 
cgit v1.1