diff options
56 files changed, 2088 insertions, 986 deletions
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl index 4f67683..bcdfdb9 100644 --- a/Documentation/DocBook/filesystems.tmpl +++ b/Documentation/DocBook/filesystems.tmpl @@ -62,7 +62,7 @@ !Efs/mpage.c !Efs/namei.c !Efs/buffer.c -!Efs/bio.c +!Eblock/bio.c !Efs/seq_file.c !Efs/filesystems.c !Efs/fs-writeback.c diff --git a/block/Makefile b/block/Makefile index 20645e8..a2ce6ac 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,13 +2,15 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + partitions/ +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o diff --git a/fs/bio-integrity.c b/block/bio-integrity.c index 1c2ce0c..9e24106 100644 --- a/fs/bio-integrity.c +++ b/block/bio-integrity.c @@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) if (!bs->bio_integrity_pool) return -1; - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + bs->bvec_integrity_pool = biovec_create_pool(pool_size); if (!bs->bvec_integrity_pool) { mempool_destroy(bs->bio_integrity_pool); return -1; @@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error) /** * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have @@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, bio->bi_private = bmd; } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { if (iov_count > UIO_MAXIOV) @@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (offset) nr_pages++; - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); + bmd = bio_alloc_map_data(iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); @@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; @@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - bs->bvec_pool = biovec_create_pool(bs, pool_size); + bs->bvec_pool = biovec_create_pool(pool_size); if (!bs->bvec_pool) goto bad; diff --git a/block/blk-core.c b/block/blk-core.c index a0e3096..40d6548 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg) printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); @@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->delayed_work); + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->run_work); + cancel_delayed_work_sync(&hctx->delay_work); + } } else { cancel_delayed_work_sync(&q->delay_work); } @@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - if (percpu_counter_init(&q->mq_usage_counter, 0)) - goto fail_q; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_c; + goto fail_q; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -637,8 +636,6 @@ fail_bdi: bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); -fail_c: - percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) __freed_request(rl, sync ^ 1); } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_ASYNC); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; +} + /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. @@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask); + return blk_mq_alloc_request(q, rw, gfp_mask, false); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page, rq->__data_len = rq->resid_len = len; rq->nr_phys_segments = 1; - rq->buffer = bio_data(bio); } EXPORT_SYMBOL_GPL(blk_add_request_payload); @@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bio->bi_next = req->bio; req->bio = bio; - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); @@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count) @@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool ret = false; struct list_head *plug_list; - if (blk_queue_nomerges(q)) - goto out; - plug = current->plug; if (!plug) goto out; @@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (blk_attempt_plug_merge(q, bio, &request_count)) + if (!blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); @@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) } req->__data_len -= total_bytes; - req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (req->cmd_type == REQ_TYPE_FS) @@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); /* * queue lock must be held */ -static void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, int error) { if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error) __blk_put_request(req->q, req); } } +EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request @@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_WRITE; - if (bio_has_data(bio)) { + if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); - rq->buffer = bio_data(bio); - } + rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; @@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { @@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) + * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. @@ -2904,20 +2937,25 @@ free_and_out: } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); -int kblockd_schedule_delayed_work(struct request_queue *q, - struct delayed_work *dwork, unsigned long delay) +int kblockd_schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work(kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_schedule_delayed_work); -#define PLUG_MAGIC 0x91827364 +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct @@ -2937,7 +2975,6 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; - plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); @@ -3034,8 +3071,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) LIST_HEAD(list); unsigned int depth; - BUG_ON(plug->magic != PLUG_MAGIC); - flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) diff --git a/block/blk-flush.c b/block/blk-flush.c index 43e6b47..ff87c66 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_run(struct work_struct *work) -{ - struct request *rq; - - rq = container_of(work, struct request, mq_flush_work); - - memset(&rq->csd, 0, sizeof(rq->csd)); - blk_mq_insert_request(rq, false, true, false); -} - static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - INIT_WORK(&rq->mq_flush_work, mq_flush_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_work); + struct request_queue *q = rq->q; + + blk_mq_add_to_requeue_list(rq, add_front); + blk_mq_kick_requeue_list(q); return false; } else { if (add_front) @@ -231,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error) struct request *rq, *n; unsigned long flags = 0; - if (q->mq_ops) + if (q->mq_ops) { spin_lock_irqsave(&q->mq_flush_lock, flags); + q->flush_rq->cmd_flags = 0; + } running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); @@ -306,23 +300,9 @@ static bool blk_kick_flush(struct request_queue *q) */ q->flush_pending_idx ^= 1; - if (q->mq_ops) { - struct blk_mq_ctx *ctx = first_rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - - blk_mq_rq_init(hctx, q->flush_rq); - q->flush_rq->mq_ctx = ctx; - - /* - * Reuse the tag value from the fist waiting request, - * with blk-mq the tag is generated during request - * allocation and drivers can rely on it being inside - * the range they asked for. - */ - q->flush_rq->tag = first_rq->tag; - } else { - blk_rq_init(q, q->flush_rq); - } + blk_rq_init(q, q->flush_rq); + if (q->mq_ops) + blk_mq_clone_flush_request(q->flush_rq, first_rq); q->flush_rq->cmd_type = REQ_TYPE_FS; q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index c11d24e..d828b44 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete); * iopoll handler will not be invoked again before blk_iopoll_sched_prep() * is called. **/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) +void blk_iopoll_complete(struct blk_iopoll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iopoll); + __blk_iopoll_complete(iop); local_irq_restore(flags); } EXPORT_SYMBOL(blk_iopoll_complete); diff --git a/block/blk-lib.c b/block/blk-lib.c index 97a733c..8411be3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same); * Generate and issue number of bios with zerofiled pages. */ -int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; diff --git a/block/blk-map.c b/block/blk-map.c index f7b22bc..f890d43 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - rq->buffer = NULL; return 0; unmap_rq: blk_rq_unmap_user(bio); @@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, } blk_queue_bounce(q, &rq->bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index 6c583f9..b3bf0df 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { struct bio_vec bv, bvprv = { NULL }; - int cluster, high, highprv = 1; + int cluster, high, highprv = 1, no_sg_merge; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; + no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); + high = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { /* + * If SG merging is disabled, each bio vector is + * a segment + */ + if (no_sg_merge) + goto new_segment; + + /* * the trick here is making sure that a high page is - * never considered part of another segment, since that - * might change with the bounce page. + * never considered part of another segment, since + * that might change with the bounce page. */ high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); if (!high && !highprv && cluster) { @@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq) void blk_recount_segments(struct request_queue *q, struct bio *bio) { - struct bio *nxt = bio->bi_next; + if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags)) + bio->bi_phys_segments = bio->bi_vcnt; + else { + struct bio *nxt = bio->bi_next; + + bio->bi_next = NULL; + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); + bio->bi_next = nxt; + } - bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); - bio->bi_next = nxt; bio->bi_flags |= (1 << BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index 136ef86..bb3ed48 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -1,3 +1,8 @@ +/* + * CPU notifier helper code for blk-mq + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, { unsigned int cpu = (unsigned long) hcpu; struct blk_mq_cpu_notifier *notify; + int ret = NOTIFY_OK; raw_spin_lock(&blk_mq_cpu_notify_lock); - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) - notify->notify(notify->data, action, cpu); + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { + ret = notify->notify(notify->data, action, cpu); + if (ret != NOTIFY_OK) + break; + } raw_spin_unlock(&blk_mq_cpu_notify_lock); - return NOTIFY_OK; + return ret; } void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) @@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) } void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data) { notifier->notify = fn; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 09792132..1065d7c 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -1,3 +1,8 @@ +/* + * CPU <-> hardware queue mapping helpers + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/module.h> @@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) return 0; } -unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) { unsigned int *map; /* If cpus are offline, map them to first hctx */ map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, - reg->numa_node); + set->numa_node); if (!map) return NULL; - if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) return map; kfree(map); return NULL; } + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. + */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == mq_map[i]) + return cpu_to_node(i); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index b0ba264..ed52178 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, return ret; } -static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t len) +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { - struct blk_mq_ctx *ctx; - unsigned long ret; - unsigned int i; - - if (kstrtoul(page, 10, &ret)) { - pr_err("blk-mq-sysfs: invalid input '%s'\n", page); - return -EINVAL; - } - - spin_lock(&hctx->lock); - if (ret) - hctx->flags |= BLK_MQ_F_SHOULD_IPI; - else - hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; - spin_unlock(&hctx->lock); - - hctx_for_each_ctx(hctx, ctx, i) - ctx->ipi_redirect = !!ret; - - return len; + return blk_mq_tag_sysfs_show(hctx->tags, page); } -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) { - return blk_mq_tag_sysfs_show(hctx->tags, page); + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { - unsigned int i, queue_num, first = 1; + unsigned int i, first = 1; ssize_t ret = 0; blk_mq_disable_hotplug(); - for_each_online_cpu(i) { - queue_num = hctx->queue->mq_map[i]; - if (queue_num != hctx->queue_num) - continue; - + for_each_cpu(i, hctx->cpumask) { if (first) ret += sprintf(ret + page, "%u", i); else @@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_dispatched_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { + .attr = {.name = "active", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_active_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { - .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, - .show = blk_mq_hw_sysfs_ipi_show, - .store = blk_mq_hw_sysfs_ipi_store, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { .attr = {.name = "tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_tags_show, @@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_ipi.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, + &blk_mq_hw_sysfs_active.attr, NULL, }; @@ -363,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = { .release = blk_mq_sysfs_release, }; +static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + int i; + + if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + return; + + hctx_for_each_ctx(hctx, ctx, i) + kobject_del(&ctx->kobj); + + kobject_del(&hctx->kobj); +} + +static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + int i, ret; + + if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + return 0; + + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); + if (ret) + return ret; + + hctx_for_each_ctx(hctx, ctx, i) { + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); + if (ret) + break; + } + + return ret; +} + void blk_mq_unregister_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -371,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk) int i, j; queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - kobject_del(&ctx->kobj); + blk_mq_unregister_hctx(hctx); + + hctx_for_each_ctx(hctx, ctx, j) kobject_put(&ctx->kobj); - } - kobject_del(&hctx->kobj); + kobject_put(&hctx->kobj); } @@ -386,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&disk_to_dev(disk)->kobj); } +static void blk_mq_sysfs_init(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + kobject_init(&q->mq_kobj, &blk_mq_ktype); + + queue_for_each_hw_ctx(q, hctx, i) { + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); + + hctx_for_each_ctx(hctx, ctx, j) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + } +} + int blk_mq_register_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - int ret, i, j; + int ret, i; - kobject_init(&q->mq_kobj, &blk_mq_ktype); + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) @@ -403,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk) kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); + hctx->flags |= BLK_MQ_F_SYSFS_UP; + ret = blk_mq_register_hctx(hctx); if (ret) break; - - if (!hctx->nr_ctx) - continue; - - hctx_for_each_ctx(hctx, ctx, j) { - kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); - ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); - if (ret) - break; - } } if (ret) { @@ -426,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk) return 0; } + +void blk_mq_sysfs_unregister(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); +} + +int blk_mq_sysfs_register(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i, ret = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_register_hctx(hctx); + if (ret) + break; + } + + return ret; +} diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 83ae96c..d90c4ae 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,78 +1,345 @@ +/* + * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread + * over multiple cachelines to avoid ping-pong between multiple submitters + * or submitter and completer. Uses rolling wakeups to avoid falling of + * the scaling cliff when we run out of tags and have to start putting + * submitters to sleep. + * + * Uses active queue tracking to support fairer distribution of tags + * between multiple submitters when a shared tag map is used. + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/percpu_ida.h> +#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) +{ + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int ret; + + ret = find_first_zero_bit(&bm->word, bm->depth); + if (ret < bm->depth) + return true; + } + + return false; +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ + if (!tags) + return true; + + return bt_has_free_tags(&tags->bitmap_tags); +} + +static inline void bt_index_inc(unsigned int *index) +{ + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); +} + /* - * Per tagged queue (tag address space) map + * If a previously inactive queue goes active, bump the active user count. */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - unsigned int nr_batch_move; - unsigned int nr_max_cache; +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); - struct percpu_ida free_tags; - struct percpu_ida reserved_tags; -}; + return true; +} -void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +/* + * Wakeup all potentially sleeping on normal (non-reserved) tags + */ +static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) { - int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); - blk_mq_put_tag(tags, tag); + struct blk_mq_bitmap_tags *bt; + int i, wake_index; + + bt = &tags->bitmap_tags; + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) + wake_up(&bs->wait); + + bt_index_inc(&wake_index); + } } -bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. + */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + blk_mq_tag_wakeup_all(tags); +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; +} + +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { - return !tags || - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; + int tag, org_last_tag, end; + + org_last_tag = last_tag; + end = bm->depth; + do { +restart: + tag = find_next_zero_bit(&bm->word, end, last_tag); + if (unlikely(tag >= end)) { + /* + * We started with an offset, start from 0 to + * exhaust the map. + */ + if (org_last_tag && last_tag) { + end = last_tag; + last_tag = 0; + goto restart; + } + return -1; + } + last_tag = tag + 1; + } while (test_and_set_bit_lock(tag, &bm->word)); + + return tag; } -static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, + unsigned int *tag_cache) { + unsigned int last_tag, org_last_tag; + int index, i, tag; + + if (!hctx_may_queue(hctx, bt)) + return -1; + + last_tag = org_last_tag = *tag_cache; + index = TAG_TO_INDEX(bt, last_tag); + + for (i = 0; i < bt->map_nr; i++) { + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + if (tag != -1) { + tag += (index << bt->bits_per_word); + goto done; + } + + last_tag = 0; + if (++index >= bt->map_nr) + index = 0; + } + + *tag_cache = 0; + return -1; + + /* + * Only update the cache from the allocation path, if we ended + * up using the specific cached tag. + */ +done: + if (tag == org_last_tag) { + last_tag = tag + 1; + if (last_tag >= bt->depth - 1) + last_tag = 0; + + *tag_cache = last_tag; + } + + return tag; +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx) +{ + struct bt_wait_state *bs; + + if (!hctx) + return &bt->bs[0]; + + bs = &bt->bs[hctx->wait_index]; + bt_index_inc(&hctx->wait_index); + return bs; +} + +static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) +{ + struct bt_wait_state *bs; + DEFINE_WAIT(wait); int tag; - tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - return tag + tags->nr_reserved_tags; + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + return tag; + + if (!(gfp & __GFP_WAIT)) + return -1; + + bs = bt_wait_ptr(bt, hctx); + do { + bool was_empty; + + was_empty = list_empty(&wait.task_list); + prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + + if (was_empty) + atomic_set(&bs->wait_cnt, bt->wake_cnt); + + io_schedule(); + } while (1); + + finish_wait(&bs->wait, &wait); + return tag; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) +{ + int tag; + + tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp); + if (tag >= 0) + return tag + tags->nr_reserved_tags; + + return BLK_MQ_TAG_FAIL; } static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, gfp_t gfp) { - int tag; + int tag, zero = 0; if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); + tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp); if (tag < 0) return BLK_MQ_TAG_FAIL; + return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, + gfp_t gfp, bool reserved) { if (!reserved) - return __blk_mq_get_tag(tags, gfp); + return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); - return __blk_mq_get_reserved_tag(tags, gfp); + return __blk_mq_get_reserved_tag(hctx->tags, gfp); +} + +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +{ + int i, wake_index; + + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) { + if (wake_index != bt->wake_index) + bt->wake_index = wake_index; + + return bs; + } + + bt_index_inc(&wake_index); + } + + return NULL; +} + +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +{ + const int index = TAG_TO_INDEX(bt, tag); + struct bt_wait_state *bs; + + /* + * The unlock memory barrier need to order access to req in free + * path and clearing tag bit + */ + clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + bs = bt_wake_ptr(bt); + if (bs && atomic_dec_and_test(&bs->wait_cnt)) { + atomic_set(&bs->wait_cnt, bt->wake_cnt); + bt_index_inc(&bt->wake_index); + wake_up(&bs->wait); + } } static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) { BUG_ON(tag >= tags->nr_tags); - percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); + bt_clear_tag(&tags->bitmap_tags, tag); } static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, @@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, { BUG_ON(tag >= tags->nr_reserved_tags); - percpu_ida_free(&tags->reserved_tags, tag); + bt_clear_tag(&tags->breserved_tags, tag); } -void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, + unsigned int *last_tag) { - if (tag >= tags->nr_reserved_tags) - __blk_mq_put_tag(tags, tag); - else + struct blk_mq_tags *tags = hctx->tags; + + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; + + __blk_mq_put_tag(tags, real_tag); + *last_tag = real_tag; + } else __blk_mq_put_reserved_tag(tags, tag); } -static int __blk_mq_tag_iter(unsigned id, void *data) +static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, + unsigned long *free_map, unsigned int off) { - unsigned long *tag_map = data; - __set_bit(id, tag_map); - return 0; + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int bit = 0; + + do { + bit = find_next_zero_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + __set_bit(bit + off, free_map); + bit++; + } while (1); + + off += (1 << bt->bits_per_word); + } } void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, @@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, if (!tag_map) return; - percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); if (tags->nr_reserved_tags) - percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, - tag_map); + bt_for_each_free(&tags->breserved_tags, tag_map, 0); fn(data, tag_map); kfree(tag_map); } +EXPORT_SYMBOL(blk_mq_tag_busy_iter); + +static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +{ + unsigned int i, used; + + for (i = 0, used = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + used += bitmap_weight(&bm->word, bm->depth); + } + + return bt->depth - used; +} + +static void bt_update_count(struct blk_mq_bitmap_tags *bt, + unsigned int depth) +{ + unsigned int tags_per_word = 1U << bt->bits_per_word; + unsigned int map_depth = depth; + + if (depth) { + int i; + + for (i = 0; i < bt->map_nr; i++) { + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= bt->map[i].depth; + } + } + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / 4) + bt->wake_cnt = max(1U, depth / 4); + + bt->depth = depth; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, + int node, bool reserved) +{ + int i; + + bt->bits_per_word = ilog2(BITS_PER_LONG); + + /* + * Depth can be zero for reserved tags, that's not a failure + * condition. + */ + if (depth) { + unsigned int nr, tags_per_word; + + tags_per_word = (1 << bt->bits_per_word); + + /* + * If the tag space is small, shrink the number of tags + * per word so we spread over a few cachelines, at least. + * If less than 4 tags, just forget about it, it's not + * going to work optimally anyway. + */ + if (depth >= 4) { + while (tags_per_word * 4 > depth) { + bt->bits_per_word--; + tags_per_word = (1 << bt->bits_per_word); + } + } + + nr = ALIGN(depth, tags_per_word) / tags_per_word; + bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bt->map) + return -ENOMEM; + + bt->map_nr = nr; + } + + bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); + if (!bt->bs) { + kfree(bt->map); + return -ENOMEM; + } + + for (i = 0; i < BT_WAIT_QUEUES; i++) + init_waitqueue_head(&bt->bs[i].wait); + + bt_update_count(bt, depth); + return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ + kfree(bt->map); + kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) + goto enomem; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) + goto enomem; + + return tags; +enomem: + bt_free(&tags->bitmap_tags); + kfree(tags); + return NULL; +} struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node) { - unsigned int nr_tags, nr_cache; struct blk_mq_tags *tags; - int ret; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); @@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - nr_tags = total_tags - reserved_tags; - nr_cache = nr_tags / num_possible_cpus(); - - if (nr_cache < BLK_MQ_TAG_CACHE_MIN) - nr_cache = BLK_MQ_TAG_CACHE_MIN; - else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) - nr_cache = BLK_MQ_TAG_CACHE_MAX; - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - tags->nr_max_cache = nr_cache; - tags->nr_batch_move = max(1u, nr_cache / 2); - ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - - tags->nr_reserved_tags, - tags->nr_max_cache, - tags->nr_batch_move); - if (ret) - goto err_free_tags; + return blk_mq_init_bitmap_tags(tags, node); +} - if (reserved_tags) { - /* - * With max_cahe and batch set to 1, the allocator fallbacks to - * no cached. It's fine reserved tags allocation is slow. - */ - ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, - 1, 1); - if (ret) - goto err_reserved_tags; - } +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ + bt_free(&tags->bitmap_tags); + bt_free(&tags->breserved_tags); + kfree(tags); +} - return tags; +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; -err_reserved_tags: - percpu_ida_destroy(&tags->free_tags); -err_free_tags: - kfree(tags); - return NULL; + *tag = prandom_u32() % depth; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { - percpu_ida_destroy(&tags->free_tags); - percpu_ida_destroy(&tags->reserved_tags); - kfree(tags); + tdepth -= tags->nr_reserved_tags; + if (tdepth > tags->nr_tags) + return -EINVAL; + + /* + * Don't need (or can't) update reserved tags here, they remain + * static and should never need resizing. + */ + bt_update_count(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags); + return 0; } ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - unsigned int cpu; + unsigned int free, res; if (!tags) return 0; - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," - " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->nr_batch_move, tags->nr_max_cache); + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " + "bits_per_word=%u\n", + tags->nr_tags, tags->nr_reserved_tags, + tags->bitmap_tags.bits_per_word); - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), - percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + free = bt_unused_tags(&tags->bitmap_tags); + res = bt_unused_tags(&tags->breserved_tags); - for_each_possible_cpu(cpu) { - page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, - percpu_ida_free_tags(&tags->free_tags, cpu)); - } + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); return page - orig_page; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 947ba2c..c959de5 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -1,17 +1,59 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -struct blk_mq_tags; +#include "blk-mq.h" + +enum { + BT_WAIT_QUEUES = 8, + BT_WAIT_BATCH = 8, +}; + +struct bt_wait_state { + atomic_t wait_cnt; + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) + +struct blk_mq_bitmap_tags { + unsigned int depth; + unsigned int wake_cnt; + unsigned int bits_per_word; + + unsigned int map_nr; + struct blk_align_bitmap *map; + + unsigned int wake_index; + struct bt_wait_state *bs; +}; + +/* + * Tag address space map. + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct blk_mq_bitmap_tags bitmap_tags; + struct blk_mq_bitmap_tags breserved_tags; + + struct request **rqs; + struct list_head page_list; +}; + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); -extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); enum { BLK_MQ_TAG_CACHE_MIN = 1, @@ -24,4 +66,23 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 1d2a9bd..0f5879c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1,3 +1,9 @@ +/* + * Block multiqueue core code + * + * Copyright (C) 2013-2014 Jens Axboe + * Copyright (C) 2013-2014 Christoph Hellwig + */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> @@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { unsigned int i; - for (i = 0; i < hctx->nr_ctx_map; i++) - if (hctx->ctx_map[i]) + for (i = 0; i < hctx->ctx_map.map_size; i++) + if (hctx->ctx_map.map[i].word) return true; return false; } +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx) \ + ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!test_bit(ctx->index_hw, hctx->ctx_map)) - set_bit(ctx->index_hw, hctx->ctx_map); + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) + set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) { - struct request *rq; - unsigned int tag; + struct blk_align_bitmap *bm = get_bm(hctx, ctx); - tag = blk_mq_get_tag(hctx->tags, gfp, reserved); - if (tag != BLK_MQ_TAG_FAIL) { - rq = hctx->rqs[tag]; - rq->tag = tag; - - return rq; - } - - return NULL; + clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } static int blk_mq_queue_enter(struct request_queue *q) @@ -186,78 +194,95 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; + INIT_LIST_HEAD(&rq->queuelist); + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = q; rq->mq_ctx = ctx; - rq->cmd_flags = rw_flags; - rq->start_time = jiffies; + rq->cmd_flags |= rw_flags; + /* do not touch atomic flags, it needs atomic ops against the timer */ + rq->cpu = -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->rq_disk = NULL; + rq->part = NULL; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + rq->special = NULL; + /* tag was already set */ + rq->errors = 0; + + rq->extra_len = 0; + rq->sense_len = 0; + rq->resid_len = 0; + rq->sense = NULL; + + INIT_LIST_HEAD(&rq->timeout_list); + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->next_rq = NULL; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; } -static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, - int rw, gfp_t gfp, - bool reserved) +static struct request * +__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved) { struct request *rq; + unsigned int tag; - do { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); + if (tag != BLK_MQ_TAG_FAIL) { + rq = hctx->tags->rqs[tag]; - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); - if (rq) { - blk_mq_rq_ctx_init(q, ctx, rq, rw); - break; + rq->cmd_flags = 0; + if (blk_mq_tag_busy(hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); } - blk_mq_put_ctx(ctx); - if (!(gfp & __GFP_WAIT)) - break; - - __blk_mq_run_hw_queue(hctx); - blk_mq_wait_for_tags(hctx->tags); - } while (1); + rq->tag = tag; + blk_mq_rq_ctx_init(q, ctx, rq, rw); + return rq; + } - return rq; + return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, + bool reserved) { + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); - return rq; -} - -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, - gfp_t gfp) -{ - struct request *rq; + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (blk_mq_queue_enter(q)) - return NULL; + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, + reserved); + if (!rq && (gfp & __GFP_WAIT)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); - rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved); + } + blk_mq_put_ctx(ctx); return rq; } -EXPORT_SYMBOL(blk_mq_alloc_reserved_request); - -/* - * Re-init and set pdu, if we have it - */ -void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - blk_rq_init(hctx->queue, rq); - - if (hctx->cmd_size) - rq->special = blk_mq_rq_to_pdu(rq); -} +EXPORT_SYMBOL(blk_mq_alloc_request); static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) @@ -265,9 +290,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - blk_mq_rq_init(hctx, rq); - blk_mq_put_tag(hctx->tags, tag); + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + blk_mq_put_tag(hctx, tag, &ctx->last_tag); blk_mq_queue_exit(q); } @@ -283,20 +310,47 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) +/* + * Clone all relevant state from a request that has been put on hold in + * the flush state machine into the preallocated flush request that hangs + * off the request queue. + * + * For a driver the flush request should be invisible, that's why we are + * impersonating the original request here. + */ +void blk_mq_clone_flush_request(struct request *flush_rq, + struct request *orig_rq) { - if (blk_update_request(rq, error, blk_rq_bytes(rq))) - return true; + struct blk_mq_hw_ctx *hctx = + orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); + + flush_rq->mq_ctx = orig_rq->mq_ctx; + flush_rq->tag = orig_rq->tag; + memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), + hctx->cmd_size); +} +inline void __blk_mq_end_io(struct request *rq, int error) +{ blk_account_io_done(rq); - if (rq->end_io) + if (rq->end_io) { rq->end_io(rq, error); - else + } else { + if (unlikely(blk_bidi_rq(rq))) + blk_mq_free_request(rq->next_rq); blk_mq_free_request(rq); - return false; + } +} +EXPORT_SYMBOL(__blk_mq_end_io); + +void blk_mq_end_io(struct request *rq, int error) +{ + if (blk_update_request(rq, error, blk_rq_bytes(rq))) + BUG(); + __blk_mq_end_io(rq, error); } -EXPORT_SYMBOL(blk_mq_end_io_partial); +EXPORT_SYMBOL(blk_mq_end_io); static void __blk_mq_complete_request_remote(void *data) { @@ -305,18 +359,22 @@ static void __blk_mq_complete_request_remote(void *data) rq->q->softirq_done_fn(rq); } -void __blk_mq_complete_request(struct request *rq) +static void blk_mq_ipi_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + bool shared = false; int cpu; - if (!ctx->ipi_redirect) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); return; } cpu = get_cpu(); - if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + shared = cpus_share_cache(cpu, ctx->cpu); + + if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; @@ -327,6 +385,16 @@ void __blk_mq_complete_request(struct request *rq) put_cpu(); } +void __blk_mq_complete_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (!q->softirq_done_fn) + blk_mq_end_io(rq, rq->errors); + else + blk_mq_ipi_complete_request(rq); +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -337,7 +405,9 @@ void __blk_mq_complete_request(struct request *rq) **/ void blk_mq_complete_request(struct request *rq) { - if (unlikely(blk_should_fake_timeout(rq->q))) + struct request_queue *q = rq->q; + + if (unlikely(blk_should_fake_timeout(q))) return; if (!blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); @@ -350,13 +420,31 @@ static void blk_mq_start_request(struct request *rq, bool last) trace_block_rq_issue(q, rq); + rq->resid_len = blk_rq_bytes(rq); + if (unlikely(blk_bidi_rq(rq))) + rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + /* * Just mark start time and set the started bit. Due to memory * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. + * REQ_ATOMIC_STARTED is seen. Use the default queue timeout, + * unless one has been set in the request. + */ + if (!rq->timeout) + rq->deadline = jiffies + q->rq_timeout; + else + rq->deadline = jiffies + rq->timeout; + + /* + * Mark us as started and clear complete. Complete might have been + * set if requeue raced with timeout, which then marked it as + * complete. So be sure to clear complete again when we start + * the request, otherwise we'll ignore the completion event. */ - rq->deadline = jiffies + q->rq_timeout; - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -378,7 +466,7 @@ static void blk_mq_start_request(struct request *rq, bool last) rq->cmd_flags |= REQ_END; } -static void blk_mq_requeue_request(struct request *rq) +static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -391,6 +479,86 @@ static void blk_mq_requeue_request(struct request *rq) rq->nr_phys_segments--; } +void blk_mq_requeue_request(struct request *rq) +{ + __blk_mq_requeue_request(rq); + blk_clear_rq_complete(rq); + + BUG_ON(blk_queued_rq(rq)); + blk_mq_add_to_requeue_list(rq, true); +} +EXPORT_SYMBOL(blk_mq_requeue_request); + +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work); + LIST_HEAD(rq_list); + struct request *rq, *next; + unsigned long flags; + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + continue; + + rq->cmd_flags &= ~REQ_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, false, false, false); + } + + blk_mq_run_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertation from the workqueue. + */ + BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->cmd_flags |= REQ_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + +struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct request_queue *q = hctx->queue; + + if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) && + q->flush_rq->tag == tag) + return q->flush_rq; + + return hctx->tags->rqs[tag]; +} +EXPORT_SYMBOL(blk_mq_tag_to_rq); + struct blk_mq_timeout_data { struct blk_mq_hw_ctx *hctx; unsigned long *next; @@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) do { struct request *rq; - tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); - if (tag >= hctx->queue_depth) + tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); + if (tag >= hctx->tags->nr_tags) break; - rq = hctx->rqs[tag++]; - + rq = blk_mq_tag_to_rq(hctx, tag++); + if (rq->q != hctx->queue) + continue; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) continue; @@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); } +static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +{ + struct request_queue *q = rq->q; + + /* + * We know that complete is set at this point. If STARTED isn't set + * anymore, then the request isn't active and the "timeout" should + * just be ignored. This can happen due to the bitflag ordering. + * Timeout first checks if STARTED is set, and if it is, assumes + * the request is active. But if we race with completion, then + * we both flags will get cleared. So check here again, and ignore + * a timeout event with a request that isn't active. + */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return BLK_EH_NOT_HANDLED; + + if (!q->mq_ops->timeout) + return BLK_EH_RESET_TIMER; + + return q->mq_ops->timeout(rq); +} + static void blk_mq_rq_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data) unsigned long next = 0; int i, next_set = 0; - queue_for_each_hw_ctx(q, hctx, i) + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!hctx->nr_ctx || !hctx->tags) + continue; + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + } - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); + if (next_set) { + next = blk_rq_timeout(round_jiffies_up(next)); + mod_timer(&q->timeout, next); + } else { + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_tag_idle(hctx); + } } /* @@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } -void blk_mq_add_timer(struct request *rq) +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - __blk_add_timer(rq, NULL); + struct blk_mq_ctx *ctx; + int i; + + for (i = 0; i < hctx->ctx_map.map_size; i++) { + struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; + unsigned int off, bit; + + if (!bm->word) + continue; + + bit = 0; + off = i * hctx->ctx_map.bits_per_word; + do { + bit = find_next_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + ctx = hctx->ctxs[bit + off]; + clear_bit(bit, &bm->word); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, list); + spin_unlock(&ctx->lock); + + bit++; + } while (1); + } } /* @@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct blk_mq_ctx *ctx; struct request *rq; LIST_HEAD(rq_list); - int bit, queued; + int queued; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; @@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * Touch any software queue that has pending entries. */ - for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { - clear_bit(bit, hctx->ctx_map); - ctx = hctx->ctxs[bit]; - BUG_ON(bit != ctx->index_hw); - - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, &rq_list); - spin_unlock(&ctx->lock); - } + flush_busy_ctxs(hctx, &rq_list); /* * If we have previous entries on our dispatch list, grab them @@ -544,13 +770,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } /* - * Delete and return all entries from our dispatch list - */ - queued = 0; - - /* * Now process all the entries, sending them to the driver. */ + queued = 0; while (!list_empty(&rq_list)) { int ret; @@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) queued++; continue; case BLK_MQ_RQ_QUEUE_BUSY: - /* - * FIXME: we should have a mechanism to stop the queue - * like blk_stop_queue, otherwise we will waste cpu - * time - */ list_add(&rq->queuelist, &rq_list); - blk_mq_requeue_request(rq); + __blk_mq_requeue_request(rq); break; default: pr_err("blk-mq: bad return on queue: %d\n", ret); @@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = hctx->next_cpu; + + if (--hctx->next_cpu_batch <= 0) { + int next_cpu; + + next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(hctx->cpumask); + + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + + return cpu; +} + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; - if (!async) + if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) __blk_mq_run_hw_queue(hctx); + else if (hctx->queue->nr_hw_queues == 1) + kblockd_schedule_delayed_work(&hctx->run_work, 0); else { - struct request_queue *q = hctx->queue; + unsigned int cpu; - kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); + cpu = blk_mq_hctx_next_cpu(hctx); + kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); } } @@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; + preempt_disable(); blk_mq_run_hw_queue(hctx, async); + preempt_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->delayed_work); + cancel_delayed_work(&hctx->run_work); + cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); @@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + + preempt_disable(); __blk_mq_run_hw_queue(hctx); + preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); -void blk_mq_start_stopped_hw_queues(struct request_queue *q) +void blk_mq_start_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queues); + + +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; @@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, true); + preempt_disable(); + blk_mq_run_hw_queue(hctx, async); + preempt_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -static void blk_mq_work_fn(struct work_struct *work) +static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + __blk_mq_run_hw_queue(hctx); } +static void blk_mq_delay_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); + + if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) + __blk_mq_run_hw_queue(hctx); +} + +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + unsigned long tmo = msecs_to_jiffies(msecs); + + if (hctx->queue->nr_hw_queues == 1) + kblockd_schedule_delayed_work(&hctx->delay_work, tmo); + else { + unsigned int cpu; + + cpu = blk_mq_hctx_next_cpu(hctx); + kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); + } +} +EXPORT_SYMBOL(blk_mq_delay_queue); + static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); + blk_mq_hctx_mark_pending(hctx, ctx); /* * We do this early, to ensure we are on the right CPU. */ - blk_mq_add_timer(rq); + blk_add_timer(rq); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, spin_unlock(&ctx->lock); } - blk_mq_put_ctx(current_ctx); - if (run_queue) blk_mq_run_hw_queue(hctx, async); + + blk_mq_put_ctx(current_ctx); } static void blk_mq_insert_requests(struct request_queue *q, @@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q, } spin_unlock(&ctx->lock); - blk_mq_put_ctx(current_ctx); - blk_mq_run_hw_queue(hctx, from_schedule); + blk_mq_put_ctx(current_ctx); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -823,24 +1112,169 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - blk_account_io_start(rq, 1); + + if (blk_do_io_stat(rq)) { + rq->start_time = jiffies; + blk_account_io_start(rq, 1); + } } -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + return false; + } else { + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + blk_mq_bio_to_request(rq, bio); + goto insert_rq; + } + + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); + return true; + } +} + +struct blk_map_ctx { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; +}; + +static struct request *blk_mq_map_request(struct request_queue *q, + struct bio *bio, + struct blk_map_ctx *data) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; + struct request *rq; + int rw = bio_data_dir(bio); + + if (unlikely(blk_mq_queue_enter(q))) { + bio_endio(bio, -EIO); + return NULL; + } + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + if (rw_is_sync(bio->bi_rw)) + rw |= REQ_SYNC; + + trace_block_getrq(q, bio, rw); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false); + if (unlikely(!rq)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + trace_block_sleeprq(q, bio, rw); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, + __GFP_WAIT|GFP_ATOMIC, false); + } + + hctx->queued++; + data->hctx = hctx; + data->ctx = ctx; + return rq; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. + */ +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - int rw = bio_data_dir(bio); + struct blk_map_ctx data; struct request *rq; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } + + if (is_sync) { + int ret; + + blk_mq_bio_to_request(rq, bio); + blk_mq_start_request(rq, true); + blk_add_timer(rq); + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(data.hctx, rq); + if (ret == BLK_MQ_RQ_QUEUE_OK) + goto done; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_io(rq, rq->errors); + goto done; + } + } + } + + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } +done: + blk_mq_put_ctx(data.ctx); +} + +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); unsigned int use_plug, request_count = 0; + struct blk_map_ctx data; + struct request *rq; /* * If we have multiple hardware queues, just go directly to * one of those for sync IO. */ - use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); + use_plug = !is_flush_fua && !is_sync; blk_queue_bounce(q, &bio); @@ -849,37 +1283,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) + if (use_plug && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; - if (blk_mq_queue_enter(q)) { - bio_endio(bio, -EIO); - return; - } - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - if (is_sync) - rw |= REQ_SYNC; - trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); - if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); - else { - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); - ctx = rq->mq_ctx; - hctx = q->mq_ops->map_queue(q, ctx->cpu); - } - - hctx->queued++; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); - blk_mq_put_ctx(ctx); blk_insert_flush(rq); goto run_queue; } @@ -901,31 +1312,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(data.ctx); return; } } - spin_lock(&ctx->lock); - - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - blk_mq_attempt_merge(q, ctx, bio)) - __blk_mq_free_request(hctx, ctx, rq); - else { - blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq, false); + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } - spin_unlock(&ctx->lock); - blk_mq_put_ctx(ctx); - - /* - * For a SYNC request, send it to the hardware immediately. For an - * ASYNC request, just ensure that we run it later on. The latter - * allows for merging opportunities and more efficient dispatching. - */ -run_queue: - blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); + blk_mq_put_ctx(data.ctx); } /* @@ -937,32 +1340,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) } EXPORT_SYMBOL(blk_mq_map_queue); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, - unsigned int hctx_index) +static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) { - return kmalloc_node(sizeof(struct blk_mq_hw_ctx), - GFP_KERNEL | __GFP_ZERO, reg->numa_node); + struct page *page; + + if (tags->rqs && set->ops->exit_request) { + int i; + + for (i = 0; i < tags->nr_tags; i++) { + if (!tags->rqs[i]) + continue; + set->ops->exit_request(set->driver_data, tags->rqs[i], + hctx_idx, i); + } + } + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + __free_pages(page, page->private); + } + + kfree(tags->rqs); + + blk_mq_free_tags(tags); } -EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); -void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, - unsigned int hctx_index) +static size_t order_to_size(unsigned int order) { - kfree(hctx); + return (size_t)PAGE_SIZE << order; } -EXPORT_SYMBOL(blk_mq_free_single_hw_queue); -static void blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + struct blk_mq_tags *tags; + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, + set->numa_node); + if (!tags) + return NULL; + + INIT_LIST_HEAD(&tags->page_list); + + tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *), + GFP_KERNEL, set->numa_node); + if (!tags->rqs) { + blk_mq_free_tags(tags); + return NULL; + } + + /* + * rq_size is the size of the request plus driver payload, rounded + * to the cacheline size + */ + rq_size = round_up(sizeof(struct request) + set->cmd_size, + cache_line_size()); + left = rq_size * set->queue_depth; + + for (i = 0; i < set->queue_depth; ) { + int this_order = max_order; + struct page *page; + int to_do; + void *p; + + while (left < order_to_size(this_order - 1) && this_order) + this_order--; + + do { + page = alloc_pages_node(set->numa_node, GFP_KERNEL, + this_order); + if (page) + break; + if (!this_order--) + break; + if (order_to_size(this_order) < rq_size) + break; + } while (1); + + if (!page) + goto fail; + + page->private = this_order; + list_add_tail(&page->lru, &tags->page_list); + + p = page_address(page); + entries_per_page = order_to_size(this_order) / rq_size; + to_do = min(entries_per_page, set->queue_depth - i); + left -= to_do * rq_size; + for (j = 0; j < to_do; j++) { + tags->rqs[i] = p; + if (set->ops->init_request) { + if (set->ops->init_request(set->driver_data, + tags->rqs[i], hctx_idx, i, + set->numa_node)) + goto fail; + } + + p += rq_size; + i++; + } + } + + return tags; + +fail: + pr_warn("%s: failed to allocate requests\n", __func__); + blk_mq_free_rq_map(set, tags, hctx_idx); + return NULL; +} + +static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) +{ + kfree(bitmap->map); +} + +static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) +{ + unsigned int bpw = 8, total, num_maps, i; + + bitmap->bits_per_word = bpw; + + num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; + bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bitmap->map) + return -ENOMEM; + + bitmap->map_size = num_maps; + + total = nr_cpu_ids; + for (i = 0; i < num_maps; i++) { + bitmap->map[i].depth = min(total, bitmap->bits_per_word); + total -= bitmap->map[i].depth; + } + + return 0; +} + +static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) { - struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return; - /* * Move ctx entries to new CPU, if this one is going away. */ @@ -971,12 +1495,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_list)) { list_splice_init(&ctx->rq_list, &tmp); - clear_bit(ctx->index_hw, hctx->ctx_map); + blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return; + return NOTIFY_OK; ctx = blk_mq_get_ctx(q); spin_lock(&ctx->lock); @@ -993,210 +1517,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - blk_mq_put_ctx(ctx); blk_mq_run_hw_queue(hctx, true); + blk_mq_put_ctx(ctx); + return NOTIFY_OK; } -static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, - int (*init)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) { - unsigned int i; - int ret = 0; - - for (i = 0; i < hctx->queue_depth; i++) { - struct request *rq = hctx->rqs[i]; - - ret = init(data, hctx, rq, i); - if (ret) - break; - } - - return ret; -} + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; -int blk_mq_init_commands(struct request_queue *q, - int (*init)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int i; - int ret = 0; + if (set->tags[hctx->queue_num]) + return NOTIFY_OK; - queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_init_hw_commands(hctx, init, data); - if (ret) - break; - } + set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); + if (!set->tags[hctx->queue_num]) + return NOTIFY_STOP; - return ret; + hctx->tags = set->tags[hctx->queue_num]; + return NOTIFY_OK; } -EXPORT_SYMBOL(blk_mq_init_commands); -static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, - void (*free)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static int blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) { - unsigned int i; + struct blk_mq_hw_ctx *hctx = data; - for (i = 0; i < hctx->queue_depth; i++) { - struct request *rq = hctx->rqs[i]; + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + return blk_mq_hctx_cpu_offline(hctx, cpu); + else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + return blk_mq_hctx_cpu_online(hctx, cpu); - free(data, hctx, rq, i); - } + return NOTIFY_OK; } -void blk_mq_free_commands(struct request_queue *q, - void (*free)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static void blk_mq_exit_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned int i; - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_free_hw_commands(hctx, free, data); -} -EXPORT_SYMBOL(blk_mq_free_commands); + queue_for_each_hw_ctx(q, hctx, i) { + if (i == nr_queue) + break; -static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) -{ - struct page *page; + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, i); - while (!list_empty(&hctx->page_list)) { - page = list_first_entry(&hctx->page_list, struct page, lru); - list_del_init(&page->lru); - __free_pages(page, page->private); + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); } - kfree(hctx->rqs); - - if (hctx->tags) - blk_mq_free_tags(hctx->tags); -} - -static size_t order_to_size(unsigned int order) -{ - size_t ret = PAGE_SIZE; - - while (order--) - ret *= 2; - - return ret; } -static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, - unsigned int reserved_tags, int node) +static void blk_mq_free_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) { - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; - - INIT_LIST_HEAD(&hctx->page_list); - - hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), - GFP_KERNEL, node); - if (!hctx->rqs) - return -ENOMEM; - - /* - * rq_size is the size of the request plus driver payload, rounded - * to the cacheline size - */ - rq_size = round_up(sizeof(struct request) + hctx->cmd_size, - cache_line_size()); - left = rq_size * hctx->queue_depth; - - for (i = 0; i < hctx->queue_depth;) { - int this_order = max_order; - struct page *page; - int to_do; - void *p; - - while (left < order_to_size(this_order - 1) && this_order) - this_order--; - - do { - page = alloc_pages_node(node, GFP_KERNEL, this_order); - if (page) - break; - if (!this_order--) - break; - if (order_to_size(this_order) < rq_size) - break; - } while (1); - - if (!page) - break; - - page->private = this_order; - list_add_tail(&page->lru, &hctx->page_list); - - p = page_address(page); - entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, hctx->queue_depth - i); - left -= to_do * rq_size; - for (j = 0; j < to_do; j++) { - hctx->rqs[i] = p; - blk_mq_rq_init(hctx, hctx->rqs[i]); - p += rq_size; - i++; - } - } - - if (i < (reserved_tags + BLK_MQ_TAG_MIN)) - goto err_rq_map; - else if (i != hctx->queue_depth) { - hctx->queue_depth = i; - pr_warn("%s: queue depth set to %u because of low memory\n", - __func__, i); - } + struct blk_mq_hw_ctx *hctx; + unsigned int i; - hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); - if (!hctx->tags) { -err_rq_map: - blk_mq_free_rq_map(hctx); - return -ENOMEM; + queue_for_each_hw_ctx(q, hctx, i) { + free_cpumask_var(hctx->cpumask); + kfree(hctx); } - - return 0; } static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_reg *reg, void *driver_data) + struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx *hctx; - unsigned int i, j; + unsigned int i; /* * Initialize hardware queues */ queue_for_each_hw_ctx(q, hctx, i) { - unsigned int num_maps; int node; node = hctx->numa_node; if (node == NUMA_NO_NODE) - node = hctx->numa_node = reg->numa_node; + node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = i; - hctx->flags = reg->flags; - hctx->queue_depth = reg->queue_depth; - hctx->cmd_size = reg->cmd_size; + hctx->flags = set->flags; + hctx->cmd_size = set->cmd_size; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); blk_mq_register_cpu_notifier(&hctx->cpu_notifier); - if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) - break; + hctx->tags = set->tags[i]; /* * Allocate space for all possible cpus to avoid allocation in @@ -1207,17 +1624,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q, if (!hctx->ctxs) break; - num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; - hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), - GFP_KERNEL, node); - if (!hctx->ctx_map) + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) break; - hctx->nr_ctx_map = num_maps; hctx->nr_ctx = 0; - if (reg->ops->init_hctx && - reg->ops->init_hctx(hctx, driver_data, i)) + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, i)) break; } @@ -1227,17 +1640,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, /* * Init failed */ - queue_for_each_hw_ctx(q, hctx, j) { - if (i == j) - break; - - if (reg->ops->exit_hctx) - reg->ops->exit_hctx(hctx, j); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - blk_mq_free_rq_map(hctx); - kfree(hctx->ctxs); - } + blk_mq_exit_hw_queues(q, set, i); return 1; } @@ -1258,12 +1661,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, __ctx->queue = q; /* If the cpu isn't online, the cpu is mapped to first hctx */ - hctx = q->mq_ops->map_queue(q, i); - hctx->nr_ctx++; - if (!cpu_online(i)) continue; + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); + hctx->nr_ctx++; + /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device @@ -1280,6 +1684,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; queue_for_each_hw_ctx(q, hctx, i) { + cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; } @@ -1288,115 +1693,208 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ + if (!cpu_online(i)) + continue; + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } + + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are mapped to this hardware queue, + * disable it and free the request entries + */ + if (!hctx->nr_ctx) { + struct blk_mq_tag_set *set = q->tag_set; + + if (set->tags[i]) { + blk_mq_free_rq_map(set, set->tags[i], i); + set->tags[i] = NULL; + hctx->tags = NULL; + } + continue; + } + + /* + * Initialize batch roundrobin counts + */ + hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } } -struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, - void *driver_data) +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request_queue *q; + bool shared; int i; - if (!reg->nr_hw_queues || - !reg->ops->queue_rq || !reg->ops->map_queue || - !reg->ops->alloc_hctx || !reg->ops->free_hctx) - return ERR_PTR(-EINVAL); + if (set->tag_list.next == set->tag_list.prev) + shared = false; + else + shared = true; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_freeze_queue(q); - if (!reg->queue_depth) - reg->queue_depth = BLK_MQ_MAX_DEPTH; - else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { - pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); - reg->queue_depth = BLK_MQ_MAX_DEPTH; + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } + blk_mq_unfreeze_queue(q); } +} - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) - return ERR_PTR(-EINVAL); +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + blk_mq_freeze_queue(q); + + mutex_lock(&set->tag_list_lock); + list_del_init(&q->tag_set_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); + + blk_mq_unfreeze_queue(q); +} + +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + q->tag_set = set; + + mutex_lock(&set->tag_list_lock); + list_add_tail(&q->tag_set_list, &set->tag_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); +} + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx **hctxs; + struct blk_mq_ctx *ctx; + struct request_queue *q; + unsigned int *map; + int i; ctx = alloc_percpu(struct blk_mq_ctx); if (!ctx) return ERR_PTR(-ENOMEM); - hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - reg->numa_node); + hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, + set->numa_node); if (!hctxs) goto err_percpu; - for (i = 0; i < reg->nr_hw_queues; i++) { - hctxs[i] = reg->ops->alloc_hctx(reg, i); + map = blk_mq_make_queue_map(set); + if (!map) + goto err_map; + + for (i = 0; i < set->nr_hw_queues; i++) { + int node = blk_mq_hw_queue_to_node(map, i); + + hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), + GFP_KERNEL, node); if (!hctxs[i]) goto err_hctxs; - hctxs[i]->numa_node = NUMA_NO_NODE; + if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) + goto err_hctxs; + + atomic_set(&hctxs[i]->nr_active, 0); + hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; } - q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); + q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!q) goto err_hctxs; - q->mq_map = blk_mq_make_queue_map(reg); - if (!q->mq_map) + if (percpu_counter_init(&q->mq_usage_counter, 0)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, 30000); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = reg->nr_hw_queues; + q->nr_hw_queues = set->nr_hw_queues; + q->mq_map = map; q->queue_ctx = ctx; q->queue_hw_ctx = hctxs; - q->mq_ops = reg->ops; + q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + if (!(set->flags & BLK_MQ_F_SG_MERGE)) + q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; + q->sg_reserved_size = INT_MAX; - blk_queue_make_request(q, blk_mq_make_request); - blk_queue_rq_timed_out(q, reg->ops->timeout); - if (reg->timeout) - blk_queue_rq_timeout(q, reg->timeout); + INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->requeue_list); + spin_lock_init(&q->requeue_lock); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); + if (set->timeout) + blk_queue_rq_timeout(q, set->timeout); + + /* + * Do this after blk_queue_make_request() overrides it... + */ + q->nr_requests = set->queue_depth; - if (reg->ops->complete) - blk_queue_softirq_done(q, reg->ops->complete); + if (set->ops->complete) + blk_queue_softirq_done(q, set->ops->complete); blk_mq_init_flush(q); - blk_mq_init_cpu_queues(q, reg->nr_hw_queues); + blk_mq_init_cpu_queues(q, set->nr_hw_queues); - q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, - cache_line_size()), GFP_KERNEL); + q->flush_rq = kzalloc(round_up(sizeof(struct request) + + set->cmd_size, cache_line_size()), + GFP_KERNEL); if (!q->flush_rq) goto err_hw; - if (blk_mq_init_hw_queues(q, reg, driver_data)) + if (blk_mq_init_hw_queues(q, set)) goto err_flush_rq; - blk_mq_map_swqueue(q); - mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); mutex_unlock(&all_q_mutex); + blk_mq_add_queue_tag_set(set, q); + + blk_mq_map_swqueue(q); + return q; err_flush_rq: kfree(q->flush_rq); err_hw: - kfree(q->mq_map); -err_map: blk_cleanup_queue(q); err_hctxs: - for (i = 0; i < reg->nr_hw_queues; i++) { + kfree(map); + for (i = 0; i < set->nr_hw_queues; i++) { if (!hctxs[i]) break; - reg->ops->free_hctx(hctxs[i], i); + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); } +err_map: kfree(hctxs); err_percpu: free_percpu(ctx); @@ -1406,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue); void blk_mq_free_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - int i; + struct blk_mq_tag_set *set = q->tag_set; - queue_for_each_hw_ctx(q, hctx, i) { - kfree(hctx->ctx_map); - kfree(hctx->ctxs); - blk_mq_free_rq_map(hctx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (q->mq_ops->exit_hctx) - q->mq_ops->exit_hctx(hctx, i); - q->mq_ops->free_hctx(hctx, i); - } + blk_mq_del_queue_tag_set(q); + + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + blk_mq_free_hw_queues(q, set); + + percpu_counter_destroy(&q->mq_usage_counter); free_percpu(q->queue_ctx); kfree(q->queue_hw_ctx); @@ -1437,6 +1931,8 @@ static void blk_mq_queue_reinit(struct request_queue *q) { blk_mq_freeze_queue(q); + blk_mq_sysfs_unregister(q); + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); /* @@ -1447,6 +1943,8 @@ static void blk_mq_queue_reinit(struct request_queue *q) blk_mq_map_swqueue(q); + blk_mq_sysfs_register(q); + blk_mq_unfreeze_queue(q); } @@ -1456,10 +1954,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, struct request_queue *q; /* - * Before new mapping is established, hotadded cpu might already start - * handling requests. This doesn't break anything as we map offline - * CPUs to first hardware queue. We will re-init queue below to get - * optimal settings. + * Before new mappings are established, hotadded cpu might already + * start handling requests. This doesn't break anything as we map + * offline CPUs to first hardware queue. We will re-init the queue + * below to get optimal settings. */ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) @@ -1472,6 +1970,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) +{ + int i; + + if (!set->nr_hw_queues) + return -EINVAL; + if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH) + return -EINVAL; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) + return -EINVAL; + + if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) + return -EINVAL; + + + set->tags = kmalloc_node(set->nr_hw_queues * + sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!set->tags) + goto out; + + for (i = 0; i < set->nr_hw_queues; i++) { + set->tags[i] = blk_mq_init_rq_map(set, i); + if (!set->tags[i]) + goto out_unwind; + } + + mutex_init(&set->tag_list_lock); + INIT_LIST_HEAD(&set->tag_list); + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_rq_map(set, set->tags[i], i); +out: + return -ENOMEM; +} +EXPORT_SYMBOL(blk_mq_alloc_tag_set); + +void blk_mq_free_tag_set(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + if (set->tags[i]) + blk_mq_free_rq_map(set, set->tags[i], i); + } + + kfree(set->tags); +} +EXPORT_SYMBOL(blk_mq_free_tag_set); + +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i, ret; + + if (!set || nr > set->queue_depth) + return -EINVAL; + + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_tag_update_depth(hctx->tags, nr); + if (ret) + break; + } + + if (!ret) + q->nr_requests = nr; + + return ret; +} + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index ebbe6ba..de7b3bb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +struct blk_mq_tag_set; + struct blk_mq_ctx { struct { spinlock_t lock; @@ -9,7 +11,8 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int ipi_redirect; + + unsigned int last_tag ____cacheline_aligned_in_smp; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -20,21 +23,23 @@ struct blk_mq_ctx { struct request_queue *queue; struct kobject kobj; -}; +} ____cacheline_aligned_in_smp; void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); +void blk_mq_clone_flush_request(struct request *flush_rq, + struct request *orig_rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); /* * CPU hotplug helpers */ struct blk_mq_cpu_notifier; void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data); void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); @@ -45,10 +50,23 @@ void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -struct blk_mq_reg; -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); -void blk_mq_add_timer(struct request *rq); +/* + * sysfs helpers + */ +extern int blk_mq_sysfs_register(struct request_queue *q); +extern void blk_mq_sysfs_unregister(struct request_queue *q); + +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. + */ +struct blk_align_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f87..23321fb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl; unsigned long nr; - int ret; + int ret, err; - if (!q->request_fn) + if (!q->request_fn && !q->mq_ops) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); - - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } + if (q->request_fn) + err = blk_update_nr_requests(q, nr); + else + err = blk_mq_update_nr_requests(q, nr); + + if (err) + return err; - spin_unlock_irq(q->queue_lock); return ret; } @@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - percpu_counter_destroy(&q->mq_usage_counter); - if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745c..9353b46 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) - return 0; + return false; return 1; } @@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (wait) *wait = 0; - return 1; + return true; } /* @@ -1258,7 +1258,7 @@ out_unlock: * of throtl_data->service_queue. Those bio's are ready and issued by this * function. */ -void blk_throtl_dispatch_work_fn(struct work_struct *work) +static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index d96f7061..95a0959 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req) __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - if (q->mq_ops) - blk_mq_add_timer(req); - else - blk_add_timer(req); - + blk_add_timer(req); blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: @@ -170,7 +166,26 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -void __blk_add_timer(struct request *req, struct list_head *timeout_list) +unsigned long blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + */ +void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; @@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - if (timeout_list) - list_add_tail(&req->timeout_list, timeout_list); + if (!q->mq_ops) + list_add_tail(&req->timeout_list, &req->q->timeout_list); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = round_jiffies_up(req->deadline); + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); if (!timer_pending(&q->timeout) || - time_before(expiry, q->timeout.expires)) - mod_timer(&q->timeout, expiry); + time_before(expiry, q->timeout.expires)) { + unsigned long diff = q->timeout.expires - expiry; -} + /* + * Due to added timer slack to group timers, the timer + * will often be a little in front of what we asked for. + * So apply some tolerance here too, otherwise we keep + * modifying the timer because expires for value X + * will be X + something. + */ + if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) + mod_timer(&q->timeout, expiry); + } -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. - */ -void blk_add_timer(struct request *req) -{ - __blk_add_timer(req, &req->q->timeout_list); } - diff --git a/block/blk.h b/block/blk.h index 1d880f1..45385e9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,6 +9,9 @@ /* Number of requests a "batching" process may submit */ #define BLK_BATCH_REQ 32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; @@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error, void blk_rq_timed_out_timer(unsigned long data); void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set); -void __blk_add_timer(struct request *req, struct list_head *timeout_list); +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); bool bio_attempt_front_merge(struct request_queue *q, struct request *req, @@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +extern int blk_update_nr_requests(struct request_queue *, unsigned int); + /* * Contribute to IO statistics IFF: * diff --git a/mm/bounce.c b/block/bounce.c index 523918b..523918b 100644 --- a/mm/bounce.c +++ b/block/bounce.c diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e0985f1..22dffeb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { if (cfqd->busy_queues) { cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); + kblockd_schedule_work(&cfqd->unplug_work); } } @@ -4460,7 +4460,7 @@ out_free: static ssize_t cfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t diff --git a/fs/ioprio.c b/block/ioprio.c index e50170c..e50170c 100644 --- a/fs/ioprio.c +++ b/block/ioprio.c diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 2648797..9c28a5b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) if (capable(CAP_SYS_RAWIO)) return 0; - /* if there's no filter set, assume we're filtering everything out */ - if (!filter) - return -EPERM; - /* Anybody who can open the device can do a read-safe command */ if (test_bit(cmd[0], filter->read_ok)) return 0; diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 748dea4..758da22 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1406,7 +1406,7 @@ next_segment: track = block / (floppy->dtype->sects * floppy->type->sect_mult); sector = block % (floppy->dtype->sects * floppy->type->sect_mult); - data = rq->buffer + 512 * cnt; + data = bio_data(rq->bio) + 512 * cnt; #ifdef DEBUG printk("access to track %d, sector %d, with buffer at " "0x%08lx\n", track, sector, data); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index cfa64bd..2104b1b 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1484,7 +1484,7 @@ repeat: ReqCnt = 0; ReqCmd = rq_data_dir(fd_request); ReqBlock = blk_rq_pos(fd_request); - ReqBuffer = fd_request->buffer; + ReqBuffer = bio_data(fd_request->bio); setup_req_params( drive ); do_fd_action( drive ); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index fa9bb74..dc3a41c 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2351,7 +2351,7 @@ static void rw_interrupt(void) } if (CT(COMMAND) != FD_READ || - raw_cmd->kernel_data == current_req->buffer) { + raw_cmd->kernel_data == bio_data(current_req->bio)) { /* transfer directly from buffer */ cont->done(1); } else if (CT(COMMAND) == FD_READ) { @@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void) raw_cmd->flags &= ~FD_RAW_WRITE; raw_cmd->flags |= FD_RAW_READ; COMMAND = FM_MODE(_floppy, FD_READ); - } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) { + } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) { unsigned long dma_limit; int direct, indirect; @@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void) */ max_size = buffer_chain_size(); dma_limit = (MAX_DMA_ADDRESS - - ((unsigned long)current_req->buffer)) >> 9; + ((unsigned long)bio_data(current_req->bio))) >> 9; if ((unsigned long)max_size > dma_limit) max_size = dma_limit; /* 64 kb boundaries */ - if (CROSS_64KB(current_req->buffer, max_size << 9)) + if (CROSS_64KB(bio_data(current_req->bio), max_size << 9)) max_size = (K_64 - - ((unsigned long)current_req->buffer) % + ((unsigned long)bio_data(current_req->bio)) % K_64) >> 9; direct = transfer_size(ssize, max_sector, max_size) - fsector_t; /* @@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void) (DP->read_track & (1 << DRS->probed_format)))))) { max_size = blk_rq_sectors(current_req); } else { - raw_cmd->kernel_data = current_req->buffer; + raw_cmd->kernel_data = bio_data(current_req->bio); raw_cmd->length = current_count_sectors << 9; if (raw_cmd->length == 0) { DPRINT("%s: zero dma transfer attempted\n", __func__); @@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void) raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; raw_cmd->length <<= 9; if ((raw_cmd->length < current_count_sectors << 9) || - (raw_cmd->kernel_data != current_req->buffer && + (raw_cmd->kernel_data != bio_data(current_req->bio) && CT(COMMAND) == FD_WRITE && (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || aligned_sector_t < buffer_min)) || @@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void) raw_cmd->length <= 0 || current_count_sectors <= 0) { DPRINT("fractionary current count b=%lx s=%lx\n", raw_cmd->length, current_count_sectors); - if (raw_cmd->kernel_data != current_req->buffer) + if (raw_cmd->kernel_data != bio_data(current_req->bio)) pr_info("addr=%d, length=%ld\n", (int)((raw_cmd->kernel_data - floppy_track_buffer) >> 9), @@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void) return 0; } - if (raw_cmd->kernel_data != current_req->buffer) { + if (raw_cmd->kernel_data != bio_data(current_req->bio)) { if (raw_cmd->kernel_data < floppy_track_buffer || current_count_sectors < 0 || raw_cmd->length < 0 || diff --git a/drivers/block/hd.c b/drivers/block/hd.c index bf397bf..8a290c0 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -464,11 +464,11 @@ static void read_intr(void) ok_to_read: req = hd_req; - insw(HD_DATA, req->buffer, 256); + insw(HD_DATA, bio_data(req->bio), 256); #ifdef DEBUG printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", req->rq_disk->disk_name, blk_rq_pos(req) + 1, - blk_rq_sectors(req) - 1, req->buffer+512); + blk_rq_sectors(req) - 1, bio_data(req->bio)+512); #endif if (hd_end_request(0, 512)) { SET_HANDLER(&read_intr); @@ -505,7 +505,7 @@ static void write_intr(void) ok_to_write: if (hd_end_request(0, 512)) { SET_HANDLER(&write_intr); - outsw(HD_DATA, req->buffer, 256); + outsw(HD_DATA, bio_data(req->bio), 256); return; } @@ -624,7 +624,7 @@ repeat: printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", req->rq_disk->disk_name, req_data_dir(req) == READ ? "read" : "writ", - cyl, head, sec, nsect, req->buffer); + cyl, head, sec, nsect, bio_data(req->bio)); #endif if (req->cmd_type == REQ_TYPE_FS) { switch (rq_data_dir(req)) { @@ -643,7 +643,7 @@ repeat: bad_rw_intr(); goto repeat; } - outsw(HD_DATA, req->buffer, 256); + outsw(HD_DATA, bio_data(req->bio), 256); break; default: printk("unknown hd-command\n"); diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index eb59b12..e352cac 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host, static void mg_read_one(struct mg_host *host, struct request *req) { - u16 *buff = (u16 *)req->buffer; + u16 *buff = (u16 *)bio_data(req->bio); u32 i; for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) @@ -496,7 +496,7 @@ static void mg_read(struct request *req) mg_bad_rw_intr(host); MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), req->buffer); + blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio)); do { if (mg_wait(host, ATA_DRQ, @@ -514,7 +514,7 @@ static void mg_read(struct request *req) static void mg_write_one(struct mg_host *host, struct request *req) { - u16 *buff = (u16 *)req->buffer; + u16 *buff = (u16 *)bio_data(req->bio); u32 i; for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) @@ -534,7 +534,7 @@ static void mg_write(struct request *req) } MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - rem, blk_rq_pos(req), req->buffer); + rem, blk_rq_pos(req), bio_data(req->bio)); if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { @@ -585,7 +585,7 @@ ok_to_read: mg_read_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); + blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio)); /* send read confirm */ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); @@ -624,7 +624,7 @@ ok_to_write: /* write 1 sector and set handler if remains */ mg_write_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req), req->buffer); + blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio)); host->mg_do_intr = mg_write_intr; mod_timer(&host->timer, jiffies + 3 * HZ); } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 091b9ea..b40af63 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -32,6 +32,7 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; + struct blk_mq_tag_set tag_set; struct hrtimer timer; unsigned int queue_depth; spinlock_t lock; @@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_softirq_done_fn(struct request *rq) { - end_cmd(rq->special); + end_cmd(blk_mq_rq_to_pdu(rq)); } static inline void null_handle_cmd(struct nullb_cmd *cmd) @@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q) static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct nullb_cmd *cmd = rq->special; + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->rq = rq; cmd->nq = hctx->driver_data; @@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) return BLK_MQ_RQ_QUEUE_OK; } -static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index) -{ - int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes); - int tip = (reg->nr_hw_queues % nr_online_nodes); - int node = 0, i, n; - - /* - * Split submit queues evenly wrt to the number of nodes. If uneven, - * fill the first buckets with one extra, until the rest is filled with - * no extra. - */ - for (i = 0, n = 1; i < hctx_index; i++, n++) { - if (n % b_size == 0) { - n = 0; - node++; - - tip--; - if (!tip) - b_size = reg->nr_hw_queues / nr_online_nodes; - } - } - - /* - * A node might not be online, therefore map the relative node id to the - * real node id. - */ - for_each_online_node(n) { - if (!node) - break; - node--; - } - - return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n); -} - -static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) -{ - kfree(hctx); -} - static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { BUG_ON(!nullb); @@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = { .complete = null_softirq_done_fn, }; -static struct blk_mq_reg null_mq_reg = { - .ops = &null_mq_ops, - .queue_depth = 64, - .cmd_size = sizeof(struct nullb_cmd), - .flags = BLK_MQ_F_SHOULD_MERGE, -}; - static void null_del_dev(struct nullb *nullb) { list_del_init(&nullb->list); del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); put_disk(nullb->disk); kfree(nullb); } @@ -506,7 +462,7 @@ static int null_add_dev(void) nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); if (!nullb) - return -ENOMEM; + goto out; spin_lock_init(&nullb->lock); @@ -514,49 +470,44 @@ static int null_add_dev(void) submit_queues = nr_online_nodes; if (setup_queues(nullb)) - goto err; + goto out_free_nullb; if (queue_mode == NULL_Q_MQ) { - null_mq_reg.numa_node = home_node; - null_mq_reg.queue_depth = hw_queue_depth; - null_mq_reg.nr_hw_queues = submit_queues; - - if (use_per_node_hctx) { - null_mq_reg.ops->alloc_hctx = null_alloc_hctx; - null_mq_reg.ops->free_hctx = null_free_hctx; - } else { - null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; - null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; - } - - nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); + nullb->tag_set.ops = &null_mq_ops; + nullb->tag_set.nr_hw_queues = submit_queues; + nullb->tag_set.queue_depth = hw_queue_depth; + nullb->tag_set.numa_node = home_node; + nullb->tag_set.cmd_size = sizeof(struct nullb_cmd); + nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + nullb->tag_set.driver_data = nullb; + + if (blk_mq_alloc_tag_set(&nullb->tag_set)) + goto out_cleanup_queues; + + nullb->q = blk_mq_init_queue(&nullb->tag_set); + if (!nullb->q) + goto out_cleanup_tags; } else if (queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); + if (!nullb->q) + goto out_cleanup_queues; blk_queue_make_request(nullb->q, null_queue_bio); init_driver_queues(nullb); } else { nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); + if (!nullb->q) + goto out_cleanup_queues; blk_queue_prep_rq(nullb->q, null_rq_prep_fn); - if (nullb->q) - blk_queue_softirq_done(nullb->q, null_softirq_done_fn); + blk_queue_softirq_done(nullb->q, null_softirq_done_fn); init_driver_queues(nullb); } - if (!nullb->q) - goto queue_fail; - nullb->q->queuedata = nullb; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { -queue_fail: - blk_cleanup_queue(nullb->q); - cleanup_queues(nullb); -err: - kfree(nullb); - return -ENOMEM; - } + if (!disk) + goto out_cleanup_blk_queue; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -579,6 +530,18 @@ err: sprintf(disk->disk_name, "nullb%d", nullb->index); add_disk(disk); return 0; + +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); +out: + return -ENOMEM; } static int __init null_init(void) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index e76bdc0..719cb1b 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q) pcd_current = cd; pcd_sector = blk_rq_pos(pcd_req); pcd_count = blk_rq_cur_sectors(pcd_req); - pcd_buf = pcd_req->buffer; + pcd_buf = bio_data(pcd_req->bio); pcd_busy = 1; ps_set_intr(do_pcd_read, NULL, 0, nice); return; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 19ad8f0..fea7e76 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -454,7 +454,7 @@ static enum action do_pd_io_start(void) if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) return Fail; pd_run = blk_rq_sectors(pd_req); - pd_buf = pd_req->buffer; + pd_buf = bio_data(pd_req->bio); pd_retries = 0; if (pd_cmd == READ) return do_pd_read_start(); @@ -485,7 +485,7 @@ static int pd_next_buf(void) spin_lock_irqsave(&pd_lock, saved_flags); __blk_end_request_cur(pd_req, 0); pd_count = blk_rq_cur_sectors(pd_req); - pd_buf = pd_req->buffer; + pd_buf = bio_data(pd_req->bio); spin_unlock_irqrestore(&pd_lock, saved_flags); return 0; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f5c86d5..9a15fd3 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -795,7 +795,7 @@ repeat: } pf_cmd = rq_data_dir(pf_req); - pf_buf = pf_req->buffer; + pf_buf = bio_data(pf_req->bio); pf_retries = 0; pf_busy = 1; @@ -827,7 +827,7 @@ static int pf_next_buf(void) if (!pf_req) return 1; pf_count = blk_rq_cur_sectors(pf_req); - pf_buf = pf_req->buffer; + pf_buf = bio_data(pf_req->bio); } return 0; } diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a69dd93..c48d9084 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, req = skreq->req; blk_add_request_payload(req, page, len); - req->buffer = buf; } static void skd_request_fn_not_online(struct request_queue *q); @@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q) break; } skreq->discard_page = 1; + req->completion_data = page; skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { @@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev, (skreq->discard_page == 1)) { pr_debug("%s:%s:%d, free the page!", skdev->name, __func__, __LINE__); - free_page((unsigned long)req->buffer); - req->buffer = NULL; + __free_page(req->completion_data); } if (unlikely(error)) { diff --git a/drivers/block/swim.c b/drivers/block/swim.c index b02d53a..6b44bbe 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q) case READ: err = floppy_read_sectors(fs, blk_rq_pos(req), blk_rq_cur_sectors(req), - req->buffer); + bio_data(req->bio)); break; } done: diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c74f7b5..523ee8f 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs) swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", req->rq_disk->disk_name, req->cmd, (long)blk_rq_pos(req), blk_rq_sectors(req), - req->buffer); + bio_data(req->bio)); swim3_dbg(" errors=%d current_nr_sectors=%u\n", req->errors, blk_rq_cur_sectors(req)); #endif @@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs) /* Set up 3 dma commands: write preamble, data, postamble */ init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); ++cp; - init_dma(cp, OUTPUT_MORE, req->buffer, 512); + init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512); ++cp; init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); } else { - init_dma(cp, INPUT_LAST, req->buffer, n * 512); + init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512); } ++cp; out_le16(&cp->command, DBDMA_STOP); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index cb9b1f8..c8f286e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -30,6 +30,9 @@ struct virtio_blk /* The disk structure for the kernel. */ struct gendisk *disk; + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + /* Process context for config space updates */ struct work_struct config_work; @@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq, static inline void virtblk_request_done(struct request *req) { - struct virtblk_req *vbr = req->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { @@ -147,14 +150,14 @@ static void virtblk_done(struct virtqueue *vq) /* In case queue is stopped waiting for more buffers. */ if (req_done) - blk_mq_start_stopped_hw_queues(vblk->disk->queue); + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); spin_unlock_irqrestore(&vblk->vq_lock, flags); } static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) { struct virtio_blk *vblk = hctx->queue->queuedata; - struct virtblk_req *vbr = req->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; const bool last = (req->cmd_flags & REQ_END) != 0; @@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw = __ATTR(cache_type, S_IRUGO|S_IWUSR, virtblk_cache_type_show, virtblk_cache_type_store); -static struct blk_mq_ops virtio_mq_ops = { - .queue_rq = virtio_queue_rq, - .map_queue = blk_mq_map_queue, - .alloc_hctx = blk_mq_alloc_single_hw_queue, - .free_hctx = blk_mq_free_single_hw_queue, - .complete = virtblk_request_done, -}; - -static struct blk_mq_reg virtio_mq_reg = { - .ops = &virtio_mq_ops, - .nr_hw_queues = 1, - .queue_depth = 0, /* Set in virtblk_probe */ - .numa_node = NUMA_NO_NODE, - .flags = BLK_MQ_F_SHOULD_MERGE, -}; -module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444); - -static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx, - struct request *rq, unsigned int nr) +static int virtblk_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) { struct virtio_blk *vblk = data; - struct virtblk_req *vbr = rq->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); sg_init_table(vbr->sg, vblk->sg_elems); return 0; } +static struct blk_mq_ops virtio_mq_ops = { + .queue_rq = virtio_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = virtblk_request_done, + .init_request = virtblk_init_request, +}; + +static unsigned int virtblk_queue_depth; +module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev) } /* Default queue sizing is to fill the ring. */ - if (!virtio_mq_reg.queue_depth) { - virtio_mq_reg.queue_depth = vblk->vq->num_free; + if (!virtblk_queue_depth) { + virtblk_queue_depth = vblk->vq->num_free; /* ... but without indirect descs, we use 2 descs per req */ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - virtio_mq_reg.queue_depth /= 2; + virtblk_queue_depth /= 2; } - virtio_mq_reg.cmd_size = + + memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.nr_hw_queues = 1; + vblk->tag_set.queue_depth = virtblk_queue_depth; + vblk->tag_set.numa_node = NUMA_NO_NODE; + vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; + vblk->tag_set.driver_data = vblk; - q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); + err = blk_mq_alloc_tag_set(&vblk->tag_set); + if (err) + goto out_put_disk; + + q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); if (!q) { err = -ENOMEM; - goto out_put_disk; + goto out_free_tags; } - blk_mq_init_commands(q, virtblk_init_vbr, vblk); - q->queuedata = vblk; virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); @@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev) out_del_disk: del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); +out_free_tags: + blk_mq_free_tag_set(&vblk->tag_set); out_put_disk: put_disk(vblk->disk); out_free_vq: @@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev) del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); + blk_mq_free_tag_set(&vblk->tag_set); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); @@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev) vblk->config_enable = true; ret = init_vq(vdev->priv); if (!ret) - blk_mq_start_stopped_hw_queues(vblk->disk->queue); + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); return ret; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index efe1b47..283a30e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq) } pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) buffer:%p [%s]\n", + "(%u/%u) [%s]\n", req, req->cmd, (unsigned long)blk_rq_pos(req), blk_rq_cur_sectors(req), blk_rq_sectors(req), - req->buffer, rq_data_dir(req) ? "write" : "read"); + rq_data_dir(req) ? "write" : "read"); if (blkif_queue_request(req)) { blk_requeue_request(rq, req); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 1393b88..ab3ea62 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace) rq_data_dir(req)); ace->req = req; - ace->data_ptr = req->buffer; + ace->data_ptr = bio_data(req->bio); ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); @@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace) * blk_rq_sectors(ace->req), * blk_rq_cur_sectors(ace->req)); */ - ace->data_ptr = ace->req->buffer; + ace->data_ptr = bio_data(ace->req->bio); ace->data_count = blk_rq_cur_sectors(ace->req) * 16; ace_fsm_yieldirq(ace); break; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 27de5046..968f9e5 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q) while (len) { unsigned long addr = start & Z2RAM_CHUNKMASK; unsigned long size = Z2RAM_CHUNKSIZE - addr; + void *buffer = bio_data(req->bio); + if (len < size) size = len; addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; if (rq_data_dir(req) == READ) - memcpy(req->buffer, (char *)addr, size); + memcpy(buffer, (char *)addr, size); else - memcpy((char *)addr, req->buffer, size); + memcpy((char *)addr, buffer, size); start += size; len -= size; } diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 51e75ad..584bc31 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work) spin_unlock(&gdrom_lock); block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; block_cnt = blk_rq_sectors(req)/GD_TO_BLK; - __raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG); + __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG); __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); __raw_writel(1, GDROM_DMA_DIRECTION_REG); __raw_writel(1, GDROM_DMA_ENABLE_REG); diff --git a/drivers/char/random.c b/drivers/char/random.c index 102c50d..06cea7f 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk) add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); } +EXPORT_SYMBOL_GPL(add_disk_randomness); #endif /********************************************************************* diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 16f69be..ee88038 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq, ledtrig_ide_activity(); - pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", + pr_debug("%s: %sing: block=%llu, sectors=%u\n", drive->name, rq_data_dir(rq) == READ ? "read" : "writ", - (unsigned long long)block, blk_rq_sectors(rq), - (unsigned long)rq->buffer); + (unsigned long long)block, blk_rq_sectors(rq)); if (hwif->rw_disk) hwif->rw_disk(drive, rq); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 455e649..6a71bc7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq, clone->cmd = rq->cmd; clone->cmd_len = rq->cmd_len; clone->sense = rq->sense; - clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 0b2ccb6..4dbfaee 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, block = blk_rq_pos(req) << 9 >> tr->blkshift; nsect = blk_rq_cur_bytes(req) >> tr->blkshift; - - buf = req->buffer; + buf = bio_data(req->bio); if (req->cmd_type != REQ_TYPE_FS) return -EIO; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 8d659e6..20a667c 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req) * flash access anyway. */ mutex_lock(&dev->dev_mutex); - ret = ubiblock_read(dev, req->buffer, sec, len); + ret = ubiblock_read(dev, bio_data(req->bio), sec, len); mutex_unlock(&dev->dev_mutex); return ret; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 4ccb5d8..a40ee1e 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q) goto end; } - jsfd_read(req->buffer, jdp->dbase + offset, len); + jsfd_read(bio_data(req->bio), jdp->dbase + offset, len); err = 0; end: if (!__blk_end_request_cur(req, err)) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9db097a..a0c95ca 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -140,7 +140,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy) cmd->result = 0; spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, cmd->request); - kblockd_schedule_work(q, &device->requeue_work); + kblockd_schedule_work(&device->requeue_work); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -1019,8 +1019,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, return BLKPREP_DEFER; } - req->buffer = NULL; - /* * Next, walk the list, and fill in the addresses and sizes of * each segment. @@ -1158,7 +1156,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) BUG_ON(blk_rq_bytes(req)); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); - req->buffer = NULL; } cmd->cmd_len = req->cmd_len; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index efcbcd1..96af195 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) goto out; } + rq->completion_data = page; blk_add_request_payload(rq, page, len); ret = scsi_setup_blk_pc_cmnd(sdp, rq); - rq->buffer = page_address(page); rq->__data_len = nr_bytes; out: - if (ret != BLKPREP_OK) { + if (ret != BLKPREP_OK) __free_page(page); - rq->buffer = NULL; - } return ret; } @@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq) { struct scsi_cmnd *SCpnt = rq->special; - if (rq->cmd_flags & REQ_DISCARD) { - free_page((unsigned long)rq->buffer); - rq->buffer = NULL; - } + if (rq->cmd_flags & REQ_DISCARD) + __free_page(rq->completion_data); + if (SCpnt->cmnd != rq->cmd) { mempool_free(SCpnt->cmnd, sd_cdb_pool); SCpnt->cmnd = NULL; diff --git a/fs/Makefile b/fs/Makefile index f9cb987..4030cbf 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o diff --git a/include/linux/bio.h b/include/linux/bio.h index bba5508..5a64576 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); -extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); +extern mempool_t *biovec_create_pool(int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0120451..c151288 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,7 +8,13 @@ struct blk_mq_tags; struct blk_mq_cpu_notifier { struct list_head list; void *data; - void (*notify)(void *data, unsigned long action, unsigned int cpu); + int (*notify)(void *data, unsigned long action, unsigned int cpu); +}; + +struct blk_mq_ctxmap { + unsigned int map_size; + unsigned int bits_per_word; + struct blk_align_bitmap *map; }; struct blk_mq_hw_ctx { @@ -18,7 +24,11 @@ struct blk_mq_hw_ctx { } ____cacheline_aligned_in_smp; unsigned long state; /* BLK_MQ_S_* flags */ - struct delayed_work delayed_work; + struct delayed_work run_work; + struct delayed_work delay_work; + cpumask_var_t cpumask; + int next_cpu; + int next_cpu_batch; unsigned long flags; /* BLK_MQ_F_* flags */ @@ -27,13 +37,13 @@ struct blk_mq_hw_ctx { void *driver_data; + struct blk_mq_ctxmap ctx_map; + unsigned int nr_ctx; struct blk_mq_ctx **ctxs; - unsigned int nr_ctx_map; - unsigned long *ctx_map; - struct request **rqs; - struct list_head page_list; + unsigned int wait_index; + struct blk_mq_tags *tags; unsigned long queued; @@ -41,31 +51,40 @@ struct blk_mq_hw_ctx { #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; - unsigned int queue_depth; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ + atomic_t nr_active; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; }; -struct blk_mq_reg { +struct blk_mq_tag_set { struct blk_mq_ops *ops; unsigned int nr_hw_queues; - unsigned int queue_depth; + unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; unsigned int cmd_size; /* per-request extra data */ int numa_node; unsigned int timeout; unsigned int flags; /* BLK_MQ_F_* */ + void *driver_data; + + struct blk_mq_tags **tags; + + struct mutex tag_list_lock; + struct list_head tag_list; }; typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); -typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int); -typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); +typedef int (init_request_fn)(void *, struct request *, unsigned int, + unsigned int, unsigned int); +typedef void (exit_request_fn)(void *, struct request *, unsigned int, + unsigned int); struct blk_mq_ops { /* @@ -86,18 +105,20 @@ struct blk_mq_ops { softirq_done_fn *complete; /* - * Override for hctx allocations (should probably go) - */ - alloc_hctx_fn *alloc_hctx; - free_hctx_fn *free_hctx; - - /* * Called when the block layer side of a hardware queue has been * set up, allowing the driver to allocate/init matching structures. * Ditto for exit/teardown. */ init_hctx_fn *init_hctx; exit_hctx_fn *exit_hctx; + + /* + * Called for every command allocated by the block layer to allow + * the driver to set up driver specific data. + * Ditto for exit/teardown. + */ + init_request_fn *init_request; + exit_request_fn *exit_request; }; enum { @@ -107,18 +128,24 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_SHOULD_SORT = 1 << 1, - BLK_MQ_F_SHOULD_IPI = 1 << 2, + BLK_MQ_F_TAG_SHARED = 1 << 2, + BLK_MQ_F_SG_MERGE = 1 << 3, + BLK_MQ_F_SYSFS_UP = 1 << 4, BLK_MQ_S_STOPPED = 0, + BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_MAX_DEPTH = 2048, + + BLK_MQ_CPU_WORK_BATCH = 8, }; -struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_register_disk(struct gendisk *); void blk_mq_unregister_disk(struct gendisk *); -int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); -void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); + +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); +void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); @@ -126,28 +153,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); -struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + gfp_t gfp, bool reserved); +struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); -void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); +struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); -bool blk_mq_end_io_partial(struct request *rq, int error, - unsigned int nr_bytes); -static inline void blk_mq_end_io(struct request *rq, int error) -{ - bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq)); - BUG_ON(!done); -} +void blk_mq_end_io(struct request *rq, int error); +void __blk_mq_end_io(struct request *rq, int error); +void blk_mq_requeue_request(struct request *rq); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); +void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); -void blk_mq_start_stopped_hw_queues(struct request_queue *q); +void blk_mq_start_hw_queues(struct request_queue *q); +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); /* * Driver command data is immediately after the request. So subtract request @@ -162,12 +189,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) return (void *) rq + sizeof(*rq); } -static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag) -{ - return hctx->rqs[tag]; -} - #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index aa0eaa2..d8e4cea 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -190,6 +190,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_END, /* last of chain of requests */ __REQ_HASHED, /* on IO scheduler merge hash */ + __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ }; @@ -243,5 +244,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_END (1ULL << __REQ_END) #define REQ_HASHED (1ULL << __REQ_HASHED) +#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0d84981..695b9fd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -90,15 +90,15 @@ enum rq_cmd_type_bits { #define BLK_MAX_CDB 16 /* - * try to put the fields that are referenced together in the same cacheline. - * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() - * as well! + * Try to put the fields that are referenced together in the same cacheline. + * + * If you modify this structure, make sure to update blk_rq_init() and + * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct list_head queuelist; union { struct call_single_data csd; - struct work_struct mq_flush_work; unsigned long fifo_time; }; @@ -178,7 +178,6 @@ struct request { unsigned short ioprio; void *special; /* opaque pointer available for LLD use */ - char *buffer; /* kaddr of the current segment if available */ int tag; int errors; @@ -463,6 +462,10 @@ struct request_queue { struct request *flush_rq; spinlock_t mq_flush_lock; + struct list_head requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; + struct mutex sysfs_lock; int bypass_depth; @@ -481,6 +484,9 @@ struct request_queue { wait_queue_head_t mq_freeze_wq; struct percpu_counter mq_usage_counter; struct list_head all_q_node; + + struct blk_mq_tag_set *tag_set; + struct list_head tag_set_list; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ @@ -504,6 +510,7 @@ struct request_queue { #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ +#define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -937,6 +944,7 @@ extern struct request *blk_fetch_request(struct request_queue *q); */ extern bool blk_update_request(struct request *rq, int error, unsigned int nr_bytes); +extern void blk_finish_request(struct request *rq, int error); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); @@ -1053,7 +1061,6 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} * schedule() where blk_schedule_flush_plug() is called. */ struct blk_plug { - unsigned long magic; /* detect uninitialized use-cases */ struct list_head list; /* requests */ struct list_head mq_list; /* blk-mq requests */ struct list_head cb_list; /* md requires an unplug callback */ @@ -1102,7 +1109,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) /* * tag stuff */ -#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) +#define blk_rq_tagged(rq) \ + ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED)) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); @@ -1370,8 +1378,9 @@ static inline void put_dev_sector(Sector p) } struct work_struct; -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); -int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); +int kblockd_schedule_work(struct work_struct *work); +int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* diff --git a/mm/Makefile b/mm/Makefile index b484452..0173940 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,7 +30,6 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o |