summary refs log tree commit diff stats
path: root/block
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-10-07 14:42:05 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-10-07 14:42:05 -0700
commit 513a4befae06c4469abfb836e8f71977de58c636 (patch)
tree 18cc7d0b01a7fd2352de734e99a4ca5c29ad5fac /block
parent 87840a2b7e048018d18d60bdac5c09224de85370 (diff)
parent 997198ba1ed691c09457120576c27dbd953d0557 (diff)
download op-kernel-dev-513a4befae06c4469abfb836e8f71977de58c636.zip
op-kernel-dev-513a4befae06c4469abfb836e8f71977de58c636.tar.gz
Merge branch 'for-4.9/block' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe: "This is the main pull request for block layer changes in 4.9. As mentioned at the last merge window, I've changed things up and now do just one branch for core block layer changes, and driver changes. This avoids dependencies between the two branches. Outside of this main pull request, there are two topical branches coming as well. This pull request contains: - A set of fixes, and a conversion to blk-mq, of nbd. From Josef. - Set of fixes and updates for lightnvm from Matias, Simon, and Arnd. Followup dependency fix from Geert. - General fixes from Bart, Baoyou, Guoqing, and Linus W. - CFQ async write starvation fix from Glauber. - Add support for delayed kick of the requeue list, from Mike. - Pull out the scalable bitmap code from blk-mq-tag.c and make it generally available under the name of sbitmap. Only blk-mq-tag uses it for now, but the blk-mq scheduling bits will use it as well. From Omar. - bdev thaw error propagation from Pierre. - Improve the blk polling statistics, and allow the user to clear them. From Stephen. - Set of minor cleanups from Christoph in block/blk-mq. - Set of cleanups and optimizations from me for block/blk-mq. - Various nvme/nvmet/nvmeof fixes from the various folks" * 'for-4.9/block' of git://git.kernel.dk/linux-block: (54 commits) fs/block_dev.c: return the right error in thaw_bdev() nvme: Pass pointers, not dma addresses, to nvme_get/set_features() nvme/scsi: Remove power management support nvmet: Make dsm number of ranges zero based nvmet: Use direct IO for writes admin-cmd: Added smart-log command support. 
nvme-fabrics: Add host_traddr options field to host infrastructure nvme-fabrics: revise host transport option descriptions nvme-fabrics: rework nvmf_get_address() for variable options nbd: use BLK_MQ_F_BLOCKING blkcg: Annotate blkg_hint correctly cfq: fix starvation of asynchronous writes blk-mq: add flag for drivers wanting blocking ->queue_rq() blk-mq: remove non-blocking pass in blk_mq_map_request blk-mq: get rid of manual run of queue with __blk_mq_run_hw_queue() block: export bio_free_pages to other modules lightnvm: propagate device_add() error code lightnvm: expose device geometry through sysfs lightnvm: control life of nvm_dev in driver blk-mq: register device instead of disk ...
Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig 1
-rw-r--r-- block/bio.c 5
-rw-r--r-- block/blk-core.c 16
-rw-r--r-- block/blk-mq-sysfs.c 40
-rw-r--r-- block/blk-mq-tag.c 503
-rw-r--r-- block/blk-mq-tag.h 42
-rw-r--r-- block/blk-mq.c 183
-rw-r--r-- block/blk-mq.h 11
-rw-r--r-- block/blk-sysfs.c 4
-rw-r--r-- block/cfq-iosched.c 13
10 files changed, 244 insertions, 574 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 161491d..5136ad4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select SBITMAP
help
Provide block layer support for the kernel.
diff --git a/block/bio.c b/block/bio.c
index aa73540..db85c57 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
return 0;
}
-static void bio_free_pages(struct bio *bio)
+void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
@@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i)
__free_page(bvec->bv_page);
}
+EXPORT_SYMBOL(bio_free_pages);
/**
* bio_uncopy_user - finish previously mapped bio
@@ -1274,7 +1275,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
nr_pages += end - start;
/*
- * buffer must be aligned to at least hardsector size for now
+ * buffer must be aligned to at least logical block size for now
*/
if (uaddr & queue_dma_alignment(q))
return ERR_PTR(-EINVAL);
diff --git a/block/blk-core.c b/block/blk-core.c
index 36c7ac3..14d7c07 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- cancel_delayed_work_sync(&hctx->run_work);
+ cancel_work_sync(&hctx->run_work);
cancel_delayed_work_sync(&hctx->delay_work);
}
} else {
@@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work)
}
EXPORT_SYMBOL(kblockd_schedule_work);
+int kblockd_schedule_work_on(int cpu, struct work_struct *work)
+{
+ return queue_work_on(cpu, kblockd_workqueue, work);
+}
+EXPORT_SYMBOL(kblockd_schedule_work_on);
+
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
@@ -3301,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
{
struct blk_plug *plug;
long state;
+ unsigned int queue_num;
+ struct blk_mq_hw_ctx *hctx;
if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return false;
+ queue_num = blk_qc_t_to_queue_num(cookie);
+ hctx = q->queue_hw_ctx[queue_num];
+ hctx->poll_considered++;
+
plug = current->plug;
if (plug)
blk_flush_plug_list(plug, false);
state = current->state;
while (!need_resched()) {
- unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
- struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
int ret;
hctx->poll_invoked++;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index fe822aa..01fb455 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -176,7 +176,17 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
{
- return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success);
+ return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n",
+ hctx->poll_considered, hctx->poll_invoked,
+ hctx->poll_success);
+}
+
+static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t size)
+{
+ hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
+
+ return size;
}
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
@@ -198,12 +208,14 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
- for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
- unsigned long d = 1U << (i - 1);
+ for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
+ unsigned int d = 1U << (i - 1);
- page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
+ page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]);
}
+ page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1),
+ hctx->dispatched[i]);
return page - start_page;
}
@@ -301,8 +313,9 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.show = blk_mq_hw_sysfs_cpus_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
- .attr = {.name = "io_poll", .mode = S_IRUGO },
+ .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO },
.show = blk_mq_hw_sysfs_poll_show,
+ .store = blk_mq_hw_sysfs_poll_store,
};
static struct attribute *default_hw_ctx_attrs[] = {
@@ -380,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
-static void __blk_mq_unregister_disk(struct gendisk *disk)
+static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
- struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i, j;
@@ -400,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk)
kobject_del(&q->mq_kobj);
kobject_put(&q->mq_kobj);
- kobject_put(&disk_to_dev(disk)->kobj);
+ kobject_put(&dev->kobj);
q->mq_sysfs_init_done = false;
}
-void blk_mq_unregister_disk(struct gendisk *disk)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
blk_mq_disable_hotplug();
- __blk_mq_unregister_disk(disk);
+ __blk_mq_unregister_dev(dev, q);
blk_mq_enable_hotplug();
}
@@ -430,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int blk_mq_register_disk(struct gendisk *disk)
+int blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
- struct device *dev = disk_to_dev(disk);
- struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
int ret, i;
@@ -454,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk)
}
if (ret)
- __blk_mq_unregister_disk(disk);
+ __blk_mq_unregister_dev(dev, q);
else
q->mq_sysfs_init_done = true;
out:
@@ -462,7 +472,7 @@ out:
return ret;
}
-EXPORT_SYMBOL_GPL(blk_mq_register_disk);
+EXPORT_SYMBOL_GPL(blk_mq_register_dev);
void blk_mq_sysfs_unregister(struct request_queue *q)
{
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 729bac3..cef618f 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,58 +1,24 @@
/*
- * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
- * over multiple cachelines to avoid ping-pong between multiple submitters
- * or submitter and completer. Uses rolling wakeups to avoid falling of
- * the scaling cliff when we run out of tags and have to start putting
- * submitters to sleep.
- *
- * Uses active queue tracking to support fairer distribution of tags
- * between multiple submitters when a shared tag map is used.
+ * Tag allocation using scalable bitmaps. Uses active queue tracking to support
+ * fairer distribution of tags between multiple submitters when a shared tag map
+ * is used.
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/random.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
-static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
-{
- int i;
-
- for (i = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
- int ret;
-
- ret = find_first_zero_bit(&bm->word, bm->depth);
- if (ret < bm->depth)
- return true;
- }
-
- return false;
-}
-
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
if (!tags)
return true;
- return bt_has_free_tags(&tags->bitmap_tags);
-}
-
-static inline int bt_index_inc(int index)
-{
- return (index + 1) & (BT_WAIT_QUEUES - 1);
-}
-
-static inline void bt_index_atomic_inc(atomic_t *index)
-{
- int old = atomic_read(index);
- int new = bt_index_inc(old);
- atomic_cmpxchg(index, old, new);
+ return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
}
/*
@@ -72,29 +38,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
- struct blk_mq_bitmap_tags *bt;
- int i, wake_index;
-
- /*
- * Make sure all changes prior to this are visible from other CPUs.
- */
- smp_mb();
- bt = &tags->bitmap_tags;
- wake_index = atomic_read(&bt->wake_index);
- for (i = 0; i < BT_WAIT_QUEUES; i++) {
- struct bt_wait_state *bs = &bt->bs[wake_index];
-
- if (waitqueue_active(&bs->wait))
- wake_up(&bs->wait);
-
- wake_index = bt_index_inc(wake_index);
- }
-
- if (include_reserve) {
- bt = &tags->breserved_tags;
- if (waitqueue_active(&bt->bs[0].wait))
- wake_up(&bt->bs[0].wait);
- }
+ sbitmap_queue_wake_all(&tags->bitmap_tags);
+ if (include_reserve)
+ sbitmap_queue_wake_all(&tags->breserved_tags);
}
/*
@@ -118,7 +64,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_bitmap_tags *bt)
+ struct sbitmap_queue *bt)
{
unsigned int depth, users;
@@ -130,7 +76,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/*
* Don't try dividing an ant
*/
- if (bt->depth == 1)
+ if (bt->sb.depth == 1)
return true;
users = atomic_read(&hctx->tags->active_queues);
@@ -140,142 +86,36 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/*
* Allow at least some tags
*/
- depth = max((bt->depth + users - 1) / users, 4U);
+ depth = max((bt->sb.depth + users - 1) / users, 4U);
return atomic_read(&hctx->nr_active) < depth;
}
-static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
- bool nowrap)
-{
- int tag, org_last_tag = last_tag;
-
- while (1) {
- tag = find_next_zero_bit(&bm->word, bm->depth, last_tag);
- if (unlikely(tag >= bm->depth)) {
- /*
- * We started with an offset, and we didn't reset the
- * offset to 0 in a failure case, so start from 0 to
- * exhaust the map.
- */
- if (org_last_tag && last_tag && !nowrap) {
- last_tag = org_last_tag = 0;
- continue;
- }
- return -1;
- }
-
- if (!test_and_set_bit(tag, &bm->word))
- break;
-
- last_tag = tag + 1;
- if (last_tag >= bm->depth - 1)
- last_tag = 0;
- }
-
- return tag;
-}
-
-#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
-
-/*
- * Straight forward bitmap tag implementation, where each bit is a tag
- * (cleared == free, and set == busy). The small twist is using per-cpu
- * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
- * contexts. This enables us to drastically limit the space searched,
- * without dirtying an extra shared cacheline like we would if we stored
- * the cache value inside the shared blk_mq_bitmap_tags structure. On top
- * of that, each word of tags is in a separate cacheline. This means that
- * multiple users will tend to stick to different cachelines, at least
- * until the map is exhausted.
- */
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
- unsigned int *tag_cache, struct blk_mq_tags *tags)
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
{
- unsigned int last_tag, org_last_tag;
- int index, i, tag;
-
if (!hctx_may_queue(hctx, bt))
return -1;
-
- last_tag = org_last_tag = *tag_cache;
- index = TAG_TO_INDEX(bt, last_tag);
-
- for (i = 0; i < bt->map_nr; i++) {
- tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
- BT_ALLOC_RR(tags));
- if (tag != -1) {
- tag += (index << bt->bits_per_word);
- goto done;
- }
-
- /*
- * Jump to next index, and reset the last tag to be the
- * first tag of that index
- */
- index++;
- last_tag = (index << bt->bits_per_word);
-
- if (index >= bt->map_nr) {
- index = 0;
- last_tag = 0;
- }
- }
-
- *tag_cache = 0;
- return -1;
-
- /*
- * Only update the cache from the allocation path, if we ended
- * up using the specific cached tag.
- */
-done:
- if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
- last_tag = tag + 1;
- if (last_tag >= bt->depth - 1)
- last_tag = 0;
-
- *tag_cache = last_tag;
- }
-
- return tag;
+ return __sbitmap_queue_get(bt);
}
-static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
- struct blk_mq_hw_ctx *hctx)
+static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
+ struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
{
- struct bt_wait_state *bs;
- int wait_index;
-
- if (!hctx)
- return &bt->bs[0];
-
- wait_index = atomic_read(&hctx->wait_index);
- bs = &bt->bs[wait_index];
- bt_index_atomic_inc(&hctx->wait_index);
- return bs;
-}
-
-static int bt_get(struct blk_mq_alloc_data *data,
- struct blk_mq_bitmap_tags *bt,
- struct blk_mq_hw_ctx *hctx,
- unsigned int *last_tag, struct blk_mq_tags *tags)
-{
- struct bt_wait_state *bs;
+ struct sbq_wait_state *ws;
DEFINE_WAIT(wait);
int tag;
- tag = __bt_get(hctx, bt, last_tag, tags);
+ tag = __bt_get(hctx, bt);
if (tag != -1)
return tag;
if (data->flags & BLK_MQ_REQ_NOWAIT)
return -1;
- bs = bt_wait_ptr(bt, hctx);
+ ws = bt_wait_ptr(bt, hctx);
do {
- prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
- tag = __bt_get(hctx, bt, last_tag, tags);
+ tag = __bt_get(hctx, bt);
if (tag != -1)
break;
@@ -292,7 +132,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
- tag = __bt_get(hctx, bt, last_tag, tags);
+ tag = __bt_get(hctx, bt);
if (tag != -1)
break;
@@ -306,15 +146,14 @@ static int bt_get(struct blk_mq_alloc_data *data,
if (data->flags & BLK_MQ_REQ_RESERVED) {
bt = &data->hctx->tags->breserved_tags;
} else {
- last_tag = &data->ctx->last_tag;
hctx = data->hctx;
bt = &hctx->tags->bitmap_tags;
}
- finish_wait(&bs->wait, &wait);
- bs = bt_wait_ptr(bt, hctx);
+ finish_wait(&ws->wait, &wait);
+ ws = bt_wait_ptr(bt, hctx);
} while (1);
- finish_wait(&bs->wait, &wait);
+ finish_wait(&ws->wait, &wait);
return tag;
}
@@ -323,7 +162,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
int tag;
tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
- &data->ctx->last_tag, data->hctx->tags);
+ data->hctx->tags);
if (tag >= 0)
return tag + data->hctx->tags->nr_reserved_tags;
@@ -332,15 +171,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
{
- int tag, zero = 0;
+ int tag;
if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
- tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
- data->hctx->tags);
+ tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
+ data->hctx->tags);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
@@ -354,55 +193,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
return __blk_mq_get_tag(data);
}
-static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
-{
- int i, wake_index;
-
- wake_index = atomic_read(&bt->wake_index);
- for (i = 0; i < BT_WAIT_QUEUES; i++) {
- struct bt_wait_state *bs = &bt->bs[wake_index];
-
- if (waitqueue_active(&bs->wait)) {
- int o = atomic_read(&bt->wake_index);
- if (wake_index != o)
- atomic_cmpxchg(&bt->wake_index, o, wake_index);
-
- return bs;
- }
-
- wake_index = bt_index_inc(wake_index);
- }
-
- return NULL;
-}
-
-static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
-{
- const int index = TAG_TO_INDEX(bt, tag);
- struct bt_wait_state *bs;
- int wait_cnt;
-
- clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
-
- /* Ensure that the wait list checks occur after clear_bit(). */
- smp_mb();
-
- bs = bt_wake_ptr(bt);
- if (!bs)
- return;
-
- wait_cnt = atomic_dec_return(&bs->wait_cnt);
- if (unlikely(wait_cnt < 0))
- wait_cnt = atomic_inc_return(&bs->wait_cnt);
- if (wait_cnt == 0) {
- atomic_add(bt->wake_cnt, &bs->wait_cnt);
- bt_index_atomic_inc(&bt->wake_index);
- wake_up(&bs->wait);
- }
-}
-
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
- unsigned int *last_tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ unsigned int tag)
{
struct blk_mq_tags *tags = hctx->tags;
@@ -410,67 +202,92 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
- bt_clear_tag(&tags->bitmap_tags, real_tag);
- if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
- *last_tag = real_tag;
+ sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
- bt_clear_tag(&tags->breserved_tags, tag);
+ sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
}
}
-static void bt_for_each(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_bitmap_tags *bt, unsigned int off,
- busy_iter_fn *fn, void *data, bool reserved)
+struct bt_iter_data {
+ struct blk_mq_hw_ctx *hctx;
+ busy_iter_fn *fn;
+ void *data;
+ bool reserved;
+};
+
+static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
+ struct bt_iter_data *iter_data = data;
+ struct blk_mq_hw_ctx *hctx = iter_data->hctx;
+ struct blk_mq_tags *tags = hctx->tags;
+ bool reserved = iter_data->reserved;
struct request *rq;
- int bit, i;
- for (i = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
+ if (!reserved)
+ bitnr += tags->nr_reserved_tags;
+ rq = tags->rqs[bitnr];
- for (bit = find_first_bit(&bm->word, bm->depth);
- bit < bm->depth;
- bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
- rq = hctx->tags->rqs[off + bit];
- if (rq->q == hctx->queue)
- fn(hctx, rq, data, reserved);
- }
+ if (rq->q == hctx->queue)
+ iter_data->fn(hctx, rq, iter_data->data, reserved);
+ return true;
+}
- off += (1 << bt->bits_per_word);
- }
+static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
+ busy_iter_fn *fn, void *data, bool reserved)
+{
+ struct bt_iter_data iter_data = {
+ .hctx = hctx,
+ .fn = fn,
+ .data = data,
+ .reserved = reserved,
+ };
+
+ sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}
-static void bt_tags_for_each(struct blk_mq_tags *tags,
- struct blk_mq_bitmap_tags *bt, unsigned int off,
- busy_tag_iter_fn *fn, void *data, bool reserved)
+struct bt_tags_iter_data {
+ struct blk_mq_tags *tags;
+ busy_tag_iter_fn *fn;
+ void *data;
+ bool reserved;
+};
+
+static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
+ struct bt_tags_iter_data *iter_data = data;
+ struct blk_mq_tags *tags = iter_data->tags;
+ bool reserved = iter_data->reserved;
struct request *rq;
- int bit, i;
- if (!tags->rqs)
- return;
- for (i = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
-
- for (bit = find_first_bit(&bm->word, bm->depth);
- bit < bm->depth;
- bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
- rq = tags->rqs[off + bit];
- fn(rq, data, reserved);
- }
+ if (!reserved)
+ bitnr += tags->nr_reserved_tags;
+ rq = tags->rqs[bitnr];
- off += (1 << bt->bits_per_word);
- }
+ iter_data->fn(rq, iter_data->data, reserved);
+ return true;
+}
+
+static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
+ busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+ struct bt_tags_iter_data iter_data = {
+ .tags = tags,
+ .fn = fn,
+ .data = data,
+ .reserved = reserved,
+ };
+
+ if (tags->rqs)
+ sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}
static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv)
{
if (tags->nr_reserved_tags)
- bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
- bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
- false);
+ bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
+ bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
}
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
@@ -529,124 +346,40 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
continue;
if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
- false);
+ bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
+ bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
}
-static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
+static unsigned int bt_unused_tags(const struct sbitmap_queue *bt)
{
- unsigned int i, used;
-
- for (i = 0, used = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
-
- used += bitmap_weight(&bm->word, bm->depth);
- }
-
- return bt->depth - used;
+ return bt->sb.depth - sbitmap_weight(&bt->sb);
}
-static void bt_update_count(struct blk_mq_bitmap_tags *bt,
- unsigned int depth)
+static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
+ bool round_robin, int node)
{
- unsigned int tags_per_word = 1U << bt->bits_per_word;
- unsigned int map_depth = depth;
-
- if (depth) {
- int i;
-
- for (i = 0; i < bt->map_nr; i++) {
- bt->map[i].depth = min(map_depth, tags_per_word);
- map_depth -= bt->map[i].depth;
- }
- }
-
- bt->wake_cnt = BT_WAIT_BATCH;
- if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
- bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
-
- bt->depth = depth;
-}
-
-static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
- int node, bool reserved)
-{
- int i;
-
- bt->bits_per_word = ilog2(BITS_PER_LONG);
-
- /*
- * Depth can be zero for reserved tags, that's not a failure
- * condition.
- */
- if (depth) {
- unsigned int nr, tags_per_word;
-
- tags_per_word = (1 << bt->bits_per_word);
-
- /*
- * If the tag space is small, shrink the number of tags
- * per word so we spread over a few cachelines, at least.
- * If less than 4 tags, just forget about it, it's not
- * going to work optimally anyway.
- */
- if (depth >= 4) {
- while (tags_per_word * 4 > depth) {
- bt->bits_per_word--;
- tags_per_word = (1 << bt->bits_per_word);
- }
- }
-
- nr = ALIGN(depth, tags_per_word) / tags_per_word;
- bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
- GFP_KERNEL, node);
- if (!bt->map)
- return -ENOMEM;
-
- bt->map_nr = nr;
- }
-
- bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
- if (!bt->bs) {
- kfree(bt->map);
- bt->map = NULL;
- return -ENOMEM;
- }
-
- bt_update_count(bt, depth);
-
- for (i = 0; i < BT_WAIT_QUEUES; i++) {
- init_waitqueue_head(&bt->bs[i].wait);
- atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
- }
-
- return 0;
-}
-
-static void bt_free(struct blk_mq_bitmap_tags *bt)
-{
- kfree(bt->map);
- kfree(bt->bs);
+ return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
+ node);
}
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+ bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- tags->alloc_policy = alloc_policy;
-
- if (bt_alloc(&tags->bitmap_tags, depth, node, false))
- goto enomem;
- if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
- goto enomem;
+ if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
+ goto free_tags;
+ if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
+ node))
+ goto free_bitmap_tags;
return tags;
-enomem:
- bt_free(&tags->bitmap_tags);
+free_bitmap_tags:
+ sbitmap_queue_free(&tags->bitmap_tags);
+free_tags:
kfree(tags);
return NULL;
}
@@ -679,19 +412,12 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
- bt_free(&tags->bitmap_tags);
- bt_free(&tags->breserved_tags);
+ sbitmap_queue_free(&tags->bitmap_tags);
+ sbitmap_queue_free(&tags->breserved_tags);
free_cpumask_var(tags->cpumask);
kfree(tags);
}
-void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
-{
- unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
-
- *tag = prandom_u32() % depth;
-}
-
int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
{
tdepth -= tags->nr_reserved_tags;
@@ -702,7 +428,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
* Don't need (or can't) update reserved tags here, they remain
* static and should never need resizing.
*/
- bt_update_count(&tags->bitmap_tags, tdepth);
+ sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+
blk_mq_tag_wakeup_all(tags, false);
return 0;
}
@@ -746,7 +473,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
"bits_per_word=%u\n",
tags->nr_tags, tags->nr_reserved_tags,
- tags->bitmap_tags.bits_per_word);
+ 1U << tags->bitmap_tags.sb.shift);
free = bt_unused_tags(&tags->bitmap_tags);
res = bt_unused_tags(&tags->breserved_tags);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d468a79..09f4cc0 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -3,31 +3,6 @@
#include "blk-mq.h"
-enum {
- BT_WAIT_QUEUES = 8,
- BT_WAIT_BATCH = 8,
-};
-
-struct bt_wait_state {
- atomic_t wait_cnt;
- wait_queue_head_t wait;
-} ____cacheline_aligned_in_smp;
-
-#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
-#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
-
-struct blk_mq_bitmap_tags {
- unsigned int depth;
- unsigned int wake_cnt;
- unsigned int bits_per_word;
-
- unsigned int map_nr;
- struct blk_align_bitmap *map;
-
- atomic_t wake_index;
- struct bt_wait_state *bs;
-};
-
/*
* Tag address space map.
*/
@@ -37,13 +12,12 @@ struct blk_mq_tags {
atomic_t active_queues;
- struct blk_mq_bitmap_tags bitmap_tags;
- struct blk_mq_bitmap_tags breserved_tags;
+ struct sbitmap_queue bitmap_tags;
+ struct sbitmap_queue breserved_tags;
struct request **rqs;
struct list_head page_list;
- int alloc_policy;
cpumask_var_t cpumask;
};
@@ -52,15 +26,23 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ unsigned int tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
-extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
+ struct blk_mq_hw_ctx *hctx)
+{
+ if (!hctx)
+ return &bt->ws[0];
+ return sbq_wait_ptr(bt, &hctx->wait_index);
+}
+
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
BLK_MQ_TAG_CACHE_MAX = 64,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c207fa9..dc5f47f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -22,6 +22,7 @@
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
+#include <linux/prefetch.h>
#include <trace/events/block.h>
@@ -33,49 +34,28 @@
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
-
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
- unsigned int i;
-
- for (i = 0; i < hctx->ctx_map.size; i++)
- if (hctx->ctx_map.map[i].word)
- return true;
-
- return false;
-}
-
-static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx)
-{
- return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+ return sbitmap_any_bit_set(&hctx->ctx_map);
}
-#define CTX_TO_BIT(hctx, ctx) \
- ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
-
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
- struct blk_align_bitmap *bm = get_bm(hctx, ctx);
-
- if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
- set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+ if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
+ sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
- struct blk_align_bitmap *bm = get_bm(hctx, ctx);
-
- clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+ sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}
void blk_mq_freeze_queue_start(struct request_queue *q)
@@ -246,19 +226,9 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-
rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
- if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
- __blk_mq_run_hw_queue(hctx);
- blk_mq_put_ctx(ctx);
-
- ctx = blk_mq_get_ctx(q);
- hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
- ctx = alloc_data.ctx;
- }
blk_mq_put_ctx(ctx);
+
if (!rq) {
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
@@ -333,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- blk_mq_put_tag(hctx, tag, &ctx->last_tag);
+ blk_mq_put_tag(hctx, ctx, tag);
blk_queue_exit(q);
}
@@ -513,7 +483,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request);
static void blk_mq_requeue_work(struct work_struct *work)
{
struct request_queue *q =
- container_of(work, struct request_queue, requeue_work);
+ container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
struct request *rq, *next;
unsigned long flags;
@@ -568,16 +538,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
void blk_mq_cancel_requeue_work(struct request_queue *q)
{
- cancel_work_sync(&q->requeue_work);
+ cancel_delayed_work_sync(&q->requeue_work);
}
EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
void blk_mq_kick_requeue_list(struct request_queue *q)
{
- kblockd_schedule_work(&q->requeue_work);
+ kblockd_schedule_delayed_work(&q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
+void blk_mq_delay_kick_requeue_list(struct request_queue *q,
+ unsigned long msecs)
+{
+ kblockd_schedule_delayed_work(&q->requeue_work,
+ msecs_to_jiffies(msecs));
+}
+EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
+
void blk_mq_abort_requeue_list(struct request_queue *q)
{
unsigned long flags;
@@ -600,8 +578,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
- if (tag < tags->nr_tags)
+ if (tag < tags->nr_tags) {
+ prefetch(tags->rqs[tag]);
return tags->rqs[tag];
+ }
return NULL;
}
@@ -756,38 +736,44 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
return false;
}
+struct flush_busy_ctx_data {
+ struct blk_mq_hw_ctx *hctx;
+ struct list_head *list;
+};
+
+static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
+{
+ struct flush_busy_ctx_data *flush_data = data;
+ struct blk_mq_hw_ctx *hctx = flush_data->hctx;
+ struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+ sbitmap_clear_bit(sb, bitnr);
+ spin_lock(&ctx->lock);
+ list_splice_tail_init(&ctx->rq_list, flush_data->list);
+ spin_unlock(&ctx->lock);
+ return true;
+}
+
/*
* Process software queues that have been marked busy, splicing them
* to the for-dispatch
*/
static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
- struct blk_mq_ctx *ctx;
- int i;
-
- for (i = 0; i < hctx->ctx_map.size; i++) {
- struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
- unsigned int off, bit;
-
- if (!bm->word)
- continue;
+ struct flush_busy_ctx_data data = {
+ .hctx = hctx,
+ .list = list,
+ };
- bit = 0;
- off = i * hctx->ctx_map.bits_per_word;
- do {
- bit = find_next_bit(&bm->word, bm->depth, bit);
- if (bit >= bm->depth)
- break;
+ sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
+}
- ctx = hctx->ctxs[bit + off];
- clear_bit(bit, &bm->word);
- spin_lock(&ctx->lock);
- list_splice_tail_init(&ctx->rq_list, list);
- spin_unlock(&ctx->lock);
+static inline unsigned int queued_to_index(unsigned int queued)
+{
+ if (!queued)
+ return 0;
- bit++;
- } while (1);
- }
+ return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
/*
@@ -878,10 +864,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
dptr = &driver_list;
}
- if (!queued)
- hctx->dispatched[0]++;
- else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
- hctx->dispatched[ilog2(queued) + 1]++;
+ hctx->dispatched[queued_to_index(queued)]++;
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
@@ -937,7 +920,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
!blk_mq_hw_queue_mapped(hctx)))
return;
- if (!async) {
+ if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
@@ -948,8 +931,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
put_cpu();
}
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work, 0);
+ kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
}
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
@@ -970,7 +952,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- cancel_delayed_work(&hctx->run_work);
+ cancel_work(&hctx->run_work);
cancel_delayed_work(&hctx->delay_work);
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
@@ -1023,7 +1005,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
{
struct blk_mq_hw_ctx *hctx;
- hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
+ hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
__blk_mq_run_hw_queue(hctx);
}
@@ -1240,20 +1222,8 @@ static struct request *blk_mq_map_request(struct request_queue *q,
op_flags |= REQ_SYNC;
trace_block_getrq(q, bio, op);
- blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
- if (unlikely(!rq)) {
- __blk_mq_run_hw_queue(hctx);
- blk_mq_put_ctx(ctx);
- trace_block_sleeprq(q, bio, op);
-
- ctx = blk_mq_get_ctx(q);
- hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
- ctx = alloc_data.ctx;
- hctx = alloc_data.hctx;
- }
hctx->queued++;
data->hctx = hctx;
@@ -1606,32 +1576,6 @@ fail:
return NULL;
}
-static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
-{
- kfree(bitmap->map);
-}
-
-static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
-{
- unsigned int bpw = 8, total, num_maps, i;
-
- bitmap->bits_per_word = bpw;
-
- num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
- bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
- GFP_KERNEL, node);
- if (!bitmap->map)
- return -ENOMEM;
-
- total = nr_cpu_ids;
- for (i = 0; i < num_maps; i++) {
- bitmap->map[i].depth = min(total, bitmap->bits_per_word);
- total -= bitmap->map[i].depth;
- }
-
- return 0;
-}
-
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
@@ -1697,7 +1641,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
blk_free_flush_queue(hctx->fq);
- blk_mq_free_bitmap(&hctx->ctx_map);
+ sbitmap_free(&hctx->ctx_map);
}
static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -1734,7 +1678,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (node == NUMA_NO_NODE)
node = hctx->numa_node = set->numa_node;
- INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+ INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
@@ -1757,7 +1701,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (!hctx->ctxs)
goto unregister_cpu_notifier;
- if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+ if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
+ node))
goto free_ctxs;
hctx->nr_ctx = 0;
@@ -1784,7 +1729,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
free_bitmap:
- blk_mq_free_bitmap(&hctx->ctx_map);
+ sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
unregister_cpu_notifier:
@@ -1860,8 +1805,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
mutex_unlock(&q->sysfs_lock);
queue_for_each_hw_ctx(q, hctx, i) {
- struct blk_mq_ctxmap *map = &hctx->ctx_map;
-
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
@@ -1887,7 +1830,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
* This is more accurate and more efficient than looping
* over all possibly mapped software queues.
*/
- map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word);
+ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
* Initialize batch roundrobin counts
@@ -2094,7 +2037,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->sg_reserved_size = INT_MAX;
- INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
+ INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9087b11..9b15d2e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -12,8 +12,6 @@ struct blk_mq_ctx {
unsigned int cpu;
unsigned int index_hw;
- unsigned int last_tag ____cacheline_aligned_in_smp;
-
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
@@ -63,15 +61,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
-/*
- * Basic implementation of sparser bitmap, allowing the user to spread
- * the bits over more cachelines.
- */
-struct blk_align_bitmap {
- unsigned long word;
- unsigned long depth;
-} ____cacheline_aligned_in_smp;
-
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f87a7e7..9cc8d7c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->kobj, KOBJ_ADD);
if (q->mq_ops)
- blk_mq_register_disk(disk);
+ blk_mq_register_dev(dev, q);
if (!q->request_fn)
return 0;
@@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
if (q->mq_ops)
- blk_mq_unregister_disk(disk);
+ blk_mq_unregister_dev(disk_to_dev(disk), q);
if (q->request_fn)
elv_unregister_queue(q);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index cc2f6db..5e24d88 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
if (ktime_get_ns() < rq->fifo_time)
rq = NULL;
- cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
return rq;
}
@@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
unsigned int max_dispatch;
+ if (cfq_cfqq_must_dispatch(cfqq))
+ return true;
+
/*
* Drain async requests before we start sync IO
*/
@@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
+ rq = cfq_check_fifo(cfqq);
+ if (rq)
+ cfq_mark_cfqq_must_dispatch(cfqq);
+
if (!cfq_may_dispatch(cfqd, cfqq))
return false;
/*
* follow expired path, else get first next available
*/
- rq = cfq_check_fifo(cfqq);
if (!rq)
rq = cfqq->next_rq;
+ else
+ cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
/*
* insert request into driver dispatch list
@@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
* if the new request is sync, but the currently running queue is
* not, let the sync request have priority.
*/
- if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
+ if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
return true;
/*
OpenPOWER on IntegriCloud