From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Fri, 15 Aug 2014 23:16:32 +0800
Subject: blk-mq: fix WARNING "percpu_ref_kill() called more than once!"

Before doing queue release, the queue has been freezed already
by blk_cleanup_queue(), so needn't to freeze queue for deleting
tag set.

This patch fixes the WARNING of "percpu_ref_kill() called more than once!"
which is triggered during unloading block driver.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5189cb1..ac8a041 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 
-	blk_mq_freeze_queue(q);
-
 	mutex_lock(&set->tag_list_lock);
 	list_del_init(&q->tag_set_list);
 	blk_mq_update_tag_set_depth(set);
 	mutex_unlock(&set->tag_list_lock);
-
-	blk_mq_unfreeze_queue(q);
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
-- 
cgit v1.1


From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 15 Aug 2014 12:44:08 -0600
Subject: blk-mq: don't allow merges if turned off for the queue

blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time,
to determine whether it should merge IO or not. However, this could
also be disabled by the admin, if merging is switched off through
sysfs. So check the general queue state as well before attempting
to merge IO.

Reported-by: Rob Elliott <Elliott@hp.com>
Tested-by: Rob Elliott <Elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ac8a041..c9e89a8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 		blk_account_io_start(rq, 1);
 }
 
+static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
+{
+	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+		!blk_queue_nomerges(hctx->queue);
+}
+
 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 					 struct blk_mq_ctx *ctx,
 					 struct request *rq, struct bio *bio)
 {
-	struct request_queue *q = hctx->queue;
-
-	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
+	if (!hctx_allow_merges(hctx)) {
 		blk_mq_bio_to_request(rq, bio);
 		spin_lock(&ctx->lock);
 insert_rq:
@@ -1082,6 +1086,8 @@ insert_rq:
 		spin_unlock(&ctx->lock);
 		return false;
 	} else {
+		struct request_queue *q = hctx->queue;
+
 		spin_lock(&ctx->lock);
 		if (!blk_mq_attempt_merge(q, ctx, bio)) {
 			blk_mq_bio_to_request(rq, bio);
-- 
cgit v1.1


From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 15 Aug 2014 13:19:15 -0600
Subject: blk-mq: correct a few wrong/bad comments

Just grammar or spelling errors, nothing major.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c9e89a8..a0565bb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		hctx->tags = set->tags[i];
 
 		/*
-		 * Allocate space for all possible cpus to avoid allocation in
+		 * Allocate space for all possible cpus to avoid allocation at
 		 * runtime
 		 */
 		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
@@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		/*
-		 * If not software queues are mapped to this hardware queue,
-		 * disable it and free the request entries
+		 * If no software queues are mapped to this hardware queue,
+		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
 			struct blk_mq_tag_set *set = q->tag_set;
-- 
cgit v1.1


From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 16 Aug 2014 08:02:24 -0400
Subject: blk-mq: blk_mq_freeze_queue() should allow nesting

While converting to percpu_ref for freezing, add703fda981 ("blk-mq:
use percpu_ref for mq usage count") incorrectly made
blk_mq_freeze_queue() misbehave when freezing is nested due to
percpu_ref_kill() being invoked on an already killed ref.

Fix it by making blk_mq_freeze_queue() kill and kick the queue only
for the outermost freeze attempt.  All the nested ones can simply wait
for the ref to reach zero.

While at it, remove unnecessary @wake initialization from
blk_mq_unfreeze_queue().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a0565bb..7950f8d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
  */
 void blk_mq_freeze_queue(struct request_queue *q)
 {
+	bool freeze;
+
 	spin_lock_irq(q->queue_lock);
-	q->mq_freeze_depth++;
+	freeze = !q->mq_freeze_depth++;
 	spin_unlock_irq(q->queue_lock);
 
-	percpu_ref_kill(&q->mq_usage_counter);
-	blk_mq_run_queues(q, false);
+	if (freeze) {
+		percpu_ref_kill(&q->mq_usage_counter);
+		blk_mq_run_queues(q, false);
+	}
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake = false;
+	bool wake;
 
 	spin_lock_irq(q->queue_lock);
 	wake = !--q->mq_freeze_depth;
-- 
cgit v1.1


From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Fri, 22 Aug 2014 15:53:39 -0400
Subject: scsi-mq: fix requests that use a separate CDB buffer

This patch fixes code such as the following with scsi-mq enabled:

    rq = blk_get_request(...);
    blk_rq_set_block_pc(rq);

    rq->cmd = my_cmd_buffer; /* separate CDB buffer */

    blk_execute_rq_nowait(...);

Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for
large CDBs only).  Without this patch, scsi_mq_prep_fn() will set
rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7950f8d..4aac826 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	/* tag was already set */
 	rq->errors = 0;
 
+	rq->cmd = rq->__cmd;
+
 	rq->extra_len = 0;
 	rq->sense_len = 0;
 	rq->resid_len = 0;
-- 
cgit v1.1


From 5676e7b6db02b80eafc2e3ad316d5f2fee817ecb Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Tue, 2 Sep 2014 11:38:44 -0500
Subject: blk-mq: cleanup after blk_mq_init_rq_map failures

In blk-mq.c blk_mq_alloc_tag_set, if:
	set->tags = kmalloc_node()
succeeds, but one of the blk_mq_init_rq_map() calls fails,
	goto out_unwind;
needs to free set->tags so the caller is not obligated
to do so.  None of the current callers (null_blk,
virtio_blk, virtio_blk, or the forthcoming scsi-mq)
do so.

set->tags needs to be set to NULL after doing so,
so other tag cleanup logic doesn't try to free
a stale pointer later.  Also set it to NULL
in blk_mq_free_tag_set.

Tested with error injection on the forthcoming
scsi-mq + hpsa combination.

Signed-off-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4aac826..f9b85e8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1982,6 +1982,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 out_unwind:
 	while (--i >= 0)
 		blk_mq_free_rq_map(set, set->tags[i], i);
+	kfree(set->tags);
+	set->tags = NULL;
 out:
 	return -ENOMEM;
 }
@@ -1997,6 +1999,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	}
 
 	kfree(set->tags);
+	set->tags = NULL;
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
 
-- 
cgit v1.1


From a516440542afcb9647f88d12c35640baf02d07ea Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 10 Sep 2014 09:02:03 -0600
Subject: blk-mq: scale depth and rq map appropriate if low on memory

If we are running in a kdump environment, resources are scarce.
For some SCSI setups with a huge set of shared tags, we run out
of memory allocating what the drivers is asking for. So implement
a scale back logic to reduce the tag depth for those cases, allowing
the driver to successfully load.

We should extend this to detect low memory situations, and implement
a sane fallback for those (1 queue, 64 tags, or something like that).

Tested-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 19 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f9b85e8..383ea0c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1321,6 +1321,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 				continue;
 			set->ops->exit_request(set->driver_data, tags->rqs[i],
 						hctx_idx, i);
+			tags->rqs[i] = NULL;
 		}
 	}
 
@@ -1354,8 +1355,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 
 	INIT_LIST_HEAD(&tags->page_list);
 
-	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
-					GFP_KERNEL, set->numa_node);
+	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+				 set->numa_node);
 	if (!tags->rqs) {
 		blk_mq_free_tags(tags);
 		return NULL;
@@ -1379,8 +1381,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			this_order--;
 
 		do {
-			page = alloc_pages_node(set->numa_node, GFP_KERNEL,
-						this_order);
+			page = alloc_pages_node(set->numa_node,
+				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+				this_order);
 			if (page)
 				break;
 			if (!this_order--)
@@ -1404,8 +1407,10 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			if (set->ops->init_request) {
 				if (set->ops->init_request(set->driver_data,
 						tags->rqs[i], hctx_idx, i,
-						set->numa_node))
+						set->numa_node)) {
+					tags->rqs[i] = NULL;
 					goto fail;
+				}
 			}
 
 			p += rq_size;
@@ -1416,7 +1421,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 	return tags;
 
 fail:
-	pr_warn("%s: failed to allocate requests\n", __func__);
 	blk_mq_free_rq_map(set, tags, hctx_idx);
 	return NULL;
 }
@@ -1936,6 +1940,61 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		set->tags[i] = blk_mq_init_rq_map(set, i);
+		if (!set->tags[i])
+			goto out_unwind;
+	}
+
+	return 0;
+
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+
+	set->tags = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Allocate the request maps associated with this tag_set. Note that this
+ * may reduce the depth asked for, if memory is tight. set->queue_depth
+ * will be updated to reflect the allocated depth.
+ */
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+{
+	unsigned int depth;
+	int err;
+
+	depth = set->queue_depth;
+	do {
+		err = __blk_mq_alloc_rq_maps(set);
+		if (!err)
+			break;
+
+		set->queue_depth >>= 1;
+		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
+			err = -ENOMEM;
+			break;
+		}
+	} while (set->queue_depth);
+
+	if (!set->queue_depth || err) {
+		pr_err("blk-mq: failed to allocate request map\n");
+		return -ENOMEM;
+	}
+
+	if (depth != set->queue_depth)
+		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
+						depth, set->queue_depth);
+
+	return 0;
+}
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -1944,8 +2003,6 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-	int i;
-
 	if (!set->nr_hw_queues)
 		return -EINVAL;
 	if (!set->queue_depth)
@@ -1966,25 +2023,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 				 sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
-		goto out;
+		return -ENOMEM;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		set->tags[i] = blk_mq_init_rq_map(set, i);
-		if (!set->tags[i])
-			goto out_unwind;
-	}
+	if (blk_mq_alloc_rq_maps(set))
+		goto enomem;
 
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
-
-out_unwind:
-	while (--i >= 0)
-		blk_mq_free_rq_map(set, set->tags[i], i);
+enomem:
 	kfree(set->tags);
 	set->tags = NULL;
-out:
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
-- 
cgit v1.1


From 538b75341835e3c2041ff066408de10d24fdc830 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 16 Sep 2014 10:37:37 -0600
Subject: blk-mq: request deadline must be visible before marking rq as started

When we start the request, we set the deadline and flip the bits
marking the request as started and non-complete. However, it's
important that the deadline store is ordered before flipping the
bits, otherwise we could have a small window where the request is
marked started but with an invalid deadline. This can confuse the
timeout handling.

Suggested-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 383ea0c..a13c40c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -393,6 +393,12 @@ static void blk_mq_start_request(struct request *rq, bool last)
 	blk_add_timer(rq);
 
 	/*
+	 * Ensure that ->deadline is visible before set the started
+	 * flag and clear the completed flag.
+	 */
+	smp_mb__before_atomic();
+
+	/*
 	 * Mark us as started and clear complete. Complete might have been
 	 * set if requeue raced with timeout, which then marked it as
 	 * complete. So be sure to clear complete again when we start
-- 
cgit v1.1


From 683d0e126232d898a481daa3a4ca032c2b1a9660 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 18 Sep 2014 11:04:31 +0200
Subject: blk-mq: Avoid race condition with uninitialized requests

This patch should fix the bug reported in
https://lkml.org/lkml/2014/9/11/249.

We have to initialize at least the atomic_flags and the cmd_flags when
allocating storage for the requests.

Otherwise blk_mq_timeout_check() might dereference uninitialized
pointers when racing with the creation of a request.

Also move the reset of cmd_flags for the initializing code to the point
where a request is freed. So we will never end up with pending flush
request indicators that might trigger dereferences of invalid pointers
in blk_mq_timeout_check().

Cc: stable@vger.kernel.org
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Reported-by: Paulo De Rezende Pinatti <ppinatti@linux.vnet.ibm.com>
Tested-by: Paulo De Rezende Pinatti <ppinatti@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a13c40c..1583ed2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -203,7 +203,6 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = data->hctx->tags->rqs[tag];
 
-		rq->cmd_flags = 0;
 		if (blk_mq_tag_busy(data->hctx)) {
 			rq->cmd_flags = REQ_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
@@ -258,6 +257,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
 	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
+	rq->cmd_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
@@ -1410,6 +1410,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
 			tags->rqs[i] = p;
+			tags->rqs[i]->atomic_flags = 0;
+			tags->rqs[i]->cmd_flags = 0;
 			if (set->ops->init_request) {
 				if (set->ops->init_request(set->driver_data,
 						tags->rqs[i], hctx_idx, i,
-- 
cgit v1.1


From a57a178a490345c7236b0077b3de005754389ed6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Sep 2014 14:44:07 -0700
Subject: blk-mq: avoid infinite recursion with the FUA flag

We should not insert requests into the flush state machine from
blk_mq_insert_request.  All incoming flush requests come through
blk_{m,s}q_make_request and are handled there, while blk_execute_rq_nowait
should only be called for BLOCK_PC requests.  All other callers
deal with requests that already went through the flush statemchine
and shouldn't be reinserted into it.

Reported-by: Robert Elliott  <Elliott@hp.com>
Debugged-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1583ed2..a7d70a1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -963,14 +963,9 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
-	    !(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
-		blk_insert_flush(rq);
-	} else {
-		spin_lock(&ctx->lock);
-		__blk_mq_insert_request(hctx, rq, at_head);
-		spin_unlock(&ctx->lock);
-	}
+	spin_lock(&ctx->lock);
+	__blk_mq_insert_request(hctx, rq, at_head);
+	spin_unlock(&ctx->lock);
 
 	if (run_queue)
 		blk_mq_run_hw_queue(hctx, async);
-- 
cgit v1.1


From 6b55e1f2d0a5e462e52678278ab749468f1db81c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 19 Sep 2014 08:04:53 -0600
Subject: blk-mq: fix potential oops on out-of-memory in
 __blk_mq_alloc_rq_maps()

__blk_mq_alloc_rq_maps() can be invoked multiple times, if we scale
back the queue depth if we are low on memory. So don't clear
set->tags when we fail, this is handled directly in
the parent function, blk_mq_alloc_tag_set().

Reported-by: Robert Elliott  <Elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7d70a1..e83d306 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1959,7 +1959,6 @@ out_unwind:
 	while (--i >= 0)
 		blk_mq_free_rq_map(set, set->tags[i], i);
 
-	set->tags = NULL;
 	return -ENOMEM;
 }
 
-- 
cgit v1.1


From 8b95741569eabc5eb17da71d1d3668cdb0bef86c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 19 Sep 2014 13:10:29 -0600
Subject: blk-mq: use blk_mq_start_hw_queues() when running requeue work

When requests are retried due to hw or sw resource shortages,
we often stop the associated hardware queue. So ensure that we
restart the queues when running the requeue work, otherwise the
queue run will be a no-op.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e83d306..c88e608 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -479,7 +479,11 @@ static void blk_mq_requeue_work(struct work_struct *work)
 		blk_mq_insert_request(rq, false, false, false);
 	}
 
-	blk_mq_run_queues(q, false);
+	/*
+	 * Use the start variant of queue running here, so that running
+	 * the requeue work will kick stopped queues.
+	 */
+	blk_mq_start_hw_queues(q);
 }
 
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
-- 
cgit v1.1


From 0a30288da1aec914e158c2d7a3482a85f632750f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 23 Sep 2014 15:24:32 -0400
Subject: blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during
 probe

blk-mq uses percpu_ref for its usage counter which tracks the number
of in-flight commands and used to synchronously drain the queue on
freeze.  percpu_ref shutdown takes measureable wallclock time as it
involves a sched RCU grace period.  This means that draining a blk-mq
takes measureable wallclock time.  One would think that this shouldn't
matter as queue shutdown should be a rare event which takes place
asynchronously w.r.t. userland.

Unfortunately, SCSI probing involves synchronously setting up and then
tearing down a lot of request_queues back-to-back for non-existent
LUNs.  This means that SCSI probing may take more than ten seconds
when scsi-mq is used.

This will be properly fixed by implementing a mechanism to keep
q->mq_usage_counter in atomic mode till genhd registration; however,
that involves rather big updates to percpu_ref which is difficult to
apply late in the devel cycle (v3.17-rc6 at the moment).  As a
stop-gap measure till the proper fix can be implemented in the next
cycle, this patch introduces __percpu_ref_kill_expedited() and makes
blk_mq_freeze_queue() use it.  This is heavy-handed but should work
for testing the experimental SCSI blk-mq implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Christoph Hellwig <hch@infradead.org>
Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de
Fixes: add703fda981 ("blk-mq: use percpu_ref for mq usage count")
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Tested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c88e608..df8e1e0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -119,7 +119,16 @@ void blk_mq_freeze_queue(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	if (freeze) {
-		percpu_ref_kill(&q->mq_usage_counter);
+		/*
+		 * XXX: Temporary kludge to work around SCSI blk-mq stall.
+		 * SCSI synchronously creates and destroys many queues
+		 * back-to-back during probe leading to lengthy stalls.
+		 * This will be fixed by keeping ->mq_usage_counter in
+		 * atomic mode until genhd registration, but, for now,
+		 * let's work around using expedited synchronization.
+		 */
+		__percpu_ref_kill_expedited(&q->mq_usage_counter);
+
 		blk_mq_run_queues(q, false);
 	}
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
-- 
cgit v1.1