From 2940474af79744411da0cb63b041ad52c57bc443 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jun 2014 13:49:23 +0200
Subject: block: remove elv_abort_queue and blk_abort_flushes

elv_abort_queue has no callers, and blk_abort_flushes is only called by
elv_abort_queue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c | 38 --------------------------------------
 block/blk.h       |  1 -
 block/elevator.c  | 20 --------------------
 3 files changed, 59 deletions(-)

(limited to 'block')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8ffee4b..3cb5e9e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -422,44 +422,6 @@ void blk_insert_flush(struct request *rq)
 }
 
 /**
- * blk_abort_flushes - @q is being aborted, abort flush requests
- * @q: request_queue being aborted
- *
- * To be called from elv_abort_queue().  @q is being aborted.  Prepare all
- * FLUSH/FUA requests for abortion.
- *
- * CONTEXT:
- * spin_lock_irq(q->queue_lock)
- */
-void blk_abort_flushes(struct request_queue *q)
-{
-	struct request *rq, *n;
-	int i;
-
-	/*
-	 * Requests in flight for data are already owned by the dispatch
-	 * queue or the device driver.  Just restore for normal completion.
-	 */
-	list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
-		list_del_init(&rq->flush.list);
-		blk_flush_restore_request(rq);
-	}
-
-	/*
-	 * We need to give away requests on flush queues.  Restore for
-	 * normal completion and put them on the dispatch queue.
-	 */
-	for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
-		list_for_each_entry_safe(rq, n, &q->flush_queue[i],
-					 flush.list) {
-			list_del_init(&rq->flush.list);
-			blk_flush_restore_request(rq);
-			list_add_tail(&rq->queuelist, &q->queue_head);
-		}
-	}
-}
-
-/**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
  * @gfp_mask:	memory allocation flags (for bio_alloc)
diff --git a/block/blk.h b/block/blk.h
index 45385e9..6748c4f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -84,7 +84,6 @@ static inline void blk_clear_rq_complete(struct request *rq)
 #define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
 
 void blk_insert_flush(struct request *rq);
-void blk_abort_flushes(struct request_queue *q);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
diff --git a/block/elevator.c b/block/elevator.c
index f35eddd..34bded1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -729,26 +729,6 @@ int elv_may_queue(struct request_queue *q, int rw)
 	return ELV_MQUEUE_MAY;
 }
 
-void elv_abort_queue(struct request_queue *q)
-{
-	struct request *rq;
-
-	blk_abort_flushes(q);
-
-	while (!list_empty(&q->queue_head)) {
-		rq = list_entry_rq(q->queue_head.next);
-		rq->cmd_flags |= REQ_QUIET;
-		trace_block_rq_abort(q, rq);
-		/*
-		 * Mark this request as started so we don't trigger
-		 * any debug logic in the end I/O path.
-		 */
-		blk_start_request(rq);
-		__blk_end_request_all(rq, -EIO);
-	}
-}
-EXPORT_SYMBOL(elv_abort_queue);
-
 void elv_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
-- 
cgit v1.1


From 28747fcd2211d0fccbe3d3c91a1067c3744db908 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Wed, 11 Jun 2014 23:43:54 +0200
Subject: block: remove WQ_POWER_EFFICIENT from kblockd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

blk-mq issues async requests through kblockd. To issue a work request on
a specific CPU, kblockd_schedule_delayed_work_on is used. However, the
specific CPU choice may not be honored, if the power_efficient option
for workqueues is set. blk-mq requires that we have strict per-cpu
scheduling, so it wont work properly if kblockd is marked
POWER_EFFICIENT and power_efficient is set.

Remove the kblockd WQ_POWER_EFFICIENT flag to prevent this behavior.
This essentially reverts part of commit 695588f9454b, which added
the WQ_POWER_EFFICIENT marker to kblockd.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 9aca8c7..55f91b8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3311,8 +3311,7 @@ int __init blk_dev_init(void)
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
-					    WQ_MEM_RECLAIM | WQ_HIGHPRI |
-					    WQ_POWER_EFFICIENT, 0);
+					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
 
-- 
cgit v1.1


From 8f5280f4ee75333ca12bde99ef6280ff65a8af43 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 13 Jun 2014 19:43:04 +0200
Subject: blk-mq: properly drain stopped queues

If we need to drain a queue we need to run all queues, even if they
are marked stopped to make sure the driver has a chance to error out
on all queued requests.

This fixes surprise removal with scsi-mq.

Reported-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Bart Van Assche <bvanassche@acm.org>

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e11f5f8..fd8b485 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -120,7 +120,7 @@ static void __blk_mq_drain_queue(struct request_queue *q)
 
 		if (count == 0)
 			break;
-		blk_mq_run_queues(q, false);
+		blk_mq_start_hw_queues(q);
 		msleep(10);
 	}
 }
-- 
cgit v1.1


From 95ed068165d8edf6a81f9a73df0bb05c38602c3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 13 Jun 2014 19:43:35 +0200
Subject: blk-mq: merge blk_mq_drain_queue and __blk_mq_drain_queue

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fd8b485..0ef2dc7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -109,7 +109,7 @@ static void blk_mq_queue_exit(struct request_queue *q)
 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 }
 
-static void __blk_mq_drain_queue(struct request_queue *q)
+void blk_mq_drain_queue(struct request_queue *q)
 {
 	while (true) {
 		s64 count;
@@ -139,12 +139,7 @@ static void blk_mq_freeze_queue(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	if (drain)
-		__blk_mq_drain_queue(q);
-}
-
-void blk_mq_drain_queue(struct request_queue *q)
-{
-	__blk_mq_drain_queue(q);
+		blk_mq_drain_queue(q);
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
-- 
cgit v1.1


From 8537b12034cf1fd3fab3da2c859d71f76846fae9 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Tue, 17 Jun 2014 22:12:35 -0700
Subject: blk-mq: bitmap tag: fix races on shared ::wake_index fields

Fix racy updates of shared blk_mq_bitmap_tags::wake_index
and blk_mq_hw_ctx::wake_index fields.

Cc: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 32 +++++++++++++++++++++-----------
 block/blk-mq-tag.h |  2 +-
 2 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 1aab39f..6deb130 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -43,9 +43,16 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 	return bt_has_free_tags(&tags->bitmap_tags);
 }
 
-static inline void bt_index_inc(unsigned int *index)
+static inline int bt_index_inc(int index)
 {
-	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+	return (index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+static inline void bt_index_atomic_inc(atomic_t *index)
+{
+	int old = atomic_read(index);
+	int new = bt_index_inc(old);
+	atomic_cmpxchg(index, old, new);
 }
 
 /*
@@ -69,14 +76,14 @@ static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
 	int i, wake_index;
 
 	bt = &tags->bitmap_tags;
-	wake_index = bt->wake_index;
+	wake_index = atomic_read(&bt->wake_index);
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {
 		struct bt_wait_state *bs = &bt->bs[wake_index];
 
 		if (waitqueue_active(&bs->wait))
 			wake_up(&bs->wait);
 
-		bt_index_inc(&wake_index);
+		wake_index = bt_index_inc(wake_index);
 	}
 }
 
@@ -212,12 +219,14 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
 					 struct blk_mq_hw_ctx *hctx)
 {
 	struct bt_wait_state *bs;
+	int wait_index;
 
 	if (!hctx)
 		return &bt->bs[0];
 
-	bs = &bt->bs[hctx->wait_index];
-	bt_index_inc(&hctx->wait_index);
+	wait_index = atomic_read(&hctx->wait_index);
+	bs = &bt->bs[wait_index];
+	bt_index_atomic_inc(&hctx->wait_index);
 	return bs;
 }
 
@@ -313,18 +322,19 @@ static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
 {
 	int i, wake_index;
 
-	wake_index = bt->wake_index;
+	wake_index = atomic_read(&bt->wake_index);
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {
 		struct bt_wait_state *bs = &bt->bs[wake_index];
 
 		if (waitqueue_active(&bs->wait)) {
-			if (wake_index != bt->wake_index)
-				bt->wake_index = wake_index;
+			int o = atomic_read(&bt->wake_index);
+			if (wake_index != o)
+				atomic_cmpxchg(&bt->wake_index, o, wake_index);
 
 			return bs;
 		}
 
-		bt_index_inc(&wake_index);
+		wake_index = bt_index_inc(wake_index);
 	}
 
 	return NULL;
@@ -344,7 +354,7 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 	bs = bt_wake_ptr(bt);
 	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
 		atomic_set(&bs->wait_cnt, bt->wake_cnt);
-		bt_index_inc(&bt->wake_index);
+		bt_index_atomic_inc(&bt->wake_index);
 		wake_up(&bs->wait);
 	}
 }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 98696a6..6206ed1 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -24,7 +24,7 @@ struct blk_mq_bitmap_tags {
 	unsigned int map_nr;
 	struct blk_align_bitmap *map;
 
-	unsigned int wake_index;
+	atomic_t wake_index;
 	struct bt_wait_state *bs;
 };
 
-- 
cgit v1.1


From 2971c35f35886b87af54675313a2afef937c1b0c Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Thu, 12 Jun 2014 17:05:37 +0200
Subject: blk-mq: bitmap tag: fix race on blk_mq_bitmap_tags::wake_cnt

This piece of code in bt_clear_tag() function is racy:

	bs = bt_wake_ptr(bt);
	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
		atomic_set(&bs->wait_cnt, bt->wake_cnt);
 		wake_up(&bs->wait);
	}

Since nothing prevents bt_wake_ptr() from returning the very
same 'bs' address on multiple CPUs, the following scenario is
possible:

    CPU1                                CPU2
    ----                                ----

0.  bs = bt_wake_ptr(bt);               bs = bt_wake_ptr(bt);
1.  atomic_dec_and_test(&bs->wait_cnt)
2.                                      atomic_dec_and_test(&bs->wait_cnt)
3.  atomic_set(&bs->wait_cnt, bt->wake_cnt);

If the decrement in [1] yields zero then for some amount of time
the decrement in [2] results in a negative/overflow value, which
is not expected. The follow-up assignment in [3] overwrites the
invalid value with the batch value (and likely prevents the issue
from being severe) which is still incorrect and should be a lesser.

Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6deb130..08fc671 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -344,6 +344,7 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 {
 	const int index = TAG_TO_INDEX(bt, tag);
 	struct bt_wait_state *bs;
+	int wait_cnt;
 
 	/*
 	 * The unlock memory barrier need to order access to req in free
@@ -352,10 +353,19 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
 
 	bs = bt_wake_ptr(bt);
-	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
-		atomic_set(&bs->wait_cnt, bt->wake_cnt);
+	if (!bs)
+		return;
+
+	wait_cnt = atomic_dec_return(&bs->wait_cnt);
+	if (wait_cnt == 0) {
+wake:
+		atomic_add(bt->wake_cnt, &bs->wait_cnt);
 		bt_index_atomic_inc(&bt->wake_index);
 		wake_up(&bs->wait);
+	} else if (wait_cnt < 0) {
+		wait_cnt = atomic_inc_return(&bs->wait_cnt);
+		if (!wait_cnt)
+			goto wake;
 	}
 }
 
-- 
cgit v1.1


From 86fb5c56cfa26de5e91c9a50e2767a695dff366e Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Tue, 17 Jun 2014 22:37:23 +0200
Subject: blk-mq: bitmap tag: fix races in bt_get() function

This update fixes few issues in bt_get() function:

- list_empty(&wait.task_list) check is not protected;

- was_empty check is always true which results in *every* thread
  entering the loop resets bt_wait_state::wait_cnt counter rather
  than every bt->wake_cnt'th thread;

- 'bt_wait_state::wait_cnt' counter update is redundant, since
  it also gets reset in bt_clear_tag() function;

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 08fc671..c1b9242 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -248,18 +248,12 @@ static int bt_get(struct blk_mq_alloc_data *data,
 
 	bs = bt_wait_ptr(bt, hctx);
 	do {
-		bool was_empty;
-
-		was_empty = list_empty(&wait.task_list);
 		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __bt_get(hctx, bt, last_tag);
 		if (tag != -1)
 			break;
 
-		if (was_empty)
-			atomic_set(&bs->wait_cnt, bt->wake_cnt);
-
 		blk_mq_put_ctx(data->ctx);
 
 		io_schedule();
@@ -519,10 +513,13 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < BT_WAIT_QUEUES; i++)
+	bt_update_count(bt, depth);
+
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
 		init_waitqueue_head(&bt->bs[i].wait);
+		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
+	}
 
-	bt_update_count(bt, depth);
 	return 0;
 }
 
-- 
cgit v1.1