diff options
49 files changed, 3885 insertions, 364 deletions
diff --git a/block/Makefile b/block/Makefile index 671a83d..20645e8 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,8 +5,9 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ - partition-generic.o partitions/ + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ + genhd.o scsi_ioctl.o partition-generic.o partitions/ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o diff --git a/block/blk-core.c b/block/blk-core.c index 0a00e4e..8bdd012 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> @@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL; /* * For queue allocation @@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_do_io_stat(rq)) - return; - - cpu = part_stat_lock(); - - if (!new_io) { - part = rq->part; - part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!hd_struct_try_get(part)) { - /* - * The partition is already being removed, - * the request will be accounted on the disk only - * - * We take a reference on disk->part0 although that - * partition will never be deleted, so we can treat - * it as any other partition. - */ - part = &rq->rq_disk->part0; - hd_struct_get(part); - } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); - rq->part = part; - } - - part_stat_unlock(); -} - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; - rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); + (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), @@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; + if (percpu_counter_init(&q->mq_usage_counter, 0)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_q; + goto fail_c; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + init_waitqueue_head(&q->mq_freeze_wq); + if (blkcg_init_queue(q)) - goto fail_id; + goto fail_bdi; return q; +fail_bdi: + bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_c: + percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, q->sg_reserved_size = INT_MAX; + /* Protect q->elevator from elevator_change */ + mutex_lock(&q->sysfs_lock); + /* init elevator */ - if (elevator_init(q, NULL)) + if (elevator_init(q, NULL)) { + mutex_unlock(&q->sysfs_lock); return NULL; + } + + mutex_unlock(&q->sysfs_lock); + return q; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1109,7 +1090,8 @@ retry: goto retry; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, int rw, + gfp_t gfp_mask) { struct request *rq; @@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) return rq; } + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + if (q->mq_ops) + return blk_mq_alloc_request(q, rw, gfp_mask, false); + else + return blk_old_get_request(q, rw, gfp_mask); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { - drive_stat_acct(rq, 1); + blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } @@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; - if (unlikely(--req->ref_count)) - return; blk_pm_put_request(req); @@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { - unsigned long flags; struct request_queue *q = req->q; - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q->mq_ops) + blk_mq_free_request(req); + else { + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + } } EXPORT_SYMBOL(blk_put_request); @@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); -static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } -static bool bio_attempt_front_merge(struct request_queue *q, - struct request *req, struct bio *bio) +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } /** - * attempt_plug_merge - try to merge with %current's plugged list + * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests @@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q, * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. */ -static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; + struct list_head *plug_list; + + if (blk_queue_nomerges(q)) + goto out; plug = current->plug; if (!plug) goto out; *request_count = 0; - list_for_each_entry_reverse(rq, &plug->list, queuelist) { + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; if (rq->q == q) @@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(q, bio, &request_count)) + if (blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1560,7 +1562,7 @@ get_rq: } } list_add_tail(&req->queuelist, &plug->list); - drive_stat_acct(req, 1); + blk_account_io_start(req, true); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2014,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); @@ -2028,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a @@ -2076,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q, } #endif +void blk_account_io_start(struct request *rq, bool new_io) +{ + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_do_io_stat(rq)) + return; + + cpu = part_stat_lock(); + + if (!new_io) { + part = rq->part; + part_stat_inc(cpu, part, merges[rw]); + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. + */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } + part_round_stats(cpu, part); + part_inc_in_flight(part, rw); + rq->part = part; + } + + part_stat_unlock(); +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2227,6 +2265,7 @@ void blk_start_request(struct request *req) if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -2451,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error) if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); if (req->end_io) @@ -2873,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* @@ -2970,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); + if (list_empty(&plug->list)) return; diff --git a/block/blk-exec.c b/block/blk-exec.c index ae4f27d..c3edf9d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/sched/sysctl.h> #include "blk.h" @@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error) struct completion *waiting = rq->end_io_data; rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); /* * complete last, if this is a stack request the process (and thus @@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + + if (q->mq_ops) { + blk_mq_insert_request(q, rq, true); + return; + } + /* * need to check this before __blk_run_queue(), because rq can * be freed before that returns. @@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, int err = 0; unsigned long hang_check; - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - if (!rq->sense) { memset(sense, 0, sizeof(sense)); rq->sense = sense; diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827..331e627 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,8 +69,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> +#include <linux/blk-mq.h> #include "blk.h" +#include "blk-mq.h" /* FLUSH/FUA sequences */ enum { @@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; + + blk_clear_rq_complete(rq); +} + +static void mq_flush_data_run(struct work_struct *work) +{ + struct request *rq; + + rq = container_of(work, struct request, mq_flush_data); + + memset(&rq->csd, 0, sizeof(rq->csd)); + blk_mq_run_request(rq, true, false); +} + +static void blk_mq_flush_data_insert(struct request *rq) +{ + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_data); } /** @@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. @@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, { struct request_queue *q = rq->q; struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; - bool queued = false; + bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; @@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - list_add(&rq->queuelist, &q->queue_head); - queued = true; + if (q->mq_ops) + blk_mq_flush_data_insert(rq); + else { + list_add(&rq->queuelist, &q->queue_head); + queued = true; + } break; case REQ_FSEQ_DONE: @@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); - __blk_end_request_all(rq, error); + if (q->mq_ops) + blk_mq_end_io(rq, error); + else + __blk_end_request_all(rq, error); break; default: BUG(); } - return blk_kick_flush(q) | queued; + kicked = blk_kick_flush(q); + /* blk_mq_run_flush will run queue */ + if (q->mq_ops) + return queued; + return kicked | queued; } static void flush_end_io(struct request *flush_rq, int error) { struct request_queue *q = flush_rq->q; - struct list_head *running = &q->flush_queue[q->flush_running_idx]; + struct list_head *running; bool queued = false; struct request *rq, *n; + unsigned long flags = 0; + if (q->mq_ops) { + blk_mq_free_request(flush_rq); + spin_lock_irqsave(&q->mq_flush_lock, flags); + } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); /* account completion of the flush request */ q->flush_running_idx ^= 1; - elv_completed_request(q, flush_rq); + + if (!q->mq_ops) + elv_completed_request(q, flush_rq); /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, flush.list) { @@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) - blk_run_queue_async(q); + if (queued || q->flush_queue_delayed) { + if (!q->mq_ops) + blk_run_queue_async(q); + else + /* + * This can be optimized to only run queues with requests + * queued if necessary. + */ + blk_mq_run_queues(q, true); + } q->flush_queue_delayed = 0; + if (q->mq_ops) + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + +static void mq_flush_work(struct work_struct *work) +{ + struct request_queue *q; + struct request *rq; + + q = container_of(work, struct request_queue, mq_flush_work); + + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, + __GFP_WAIT|GFP_ATOMIC, true); + rq->cmd_type = REQ_TYPE_FS; + rq->end_io = flush_end_io; + + blk_mq_run_request(rq, true, false); +} + +/* + * We can't directly use q->flush_rq, because it doesn't have tag and is not in + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, + * so offload the work to workqueue. + * + * Note: we assume a flush request finished in any hardware queue will flush + * the whole disk cache. + */ +static void mq_run_flush(struct request_queue *q) +{ + kblockd_schedule_work(q, &q->mq_flush_work); } /** @@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error) * Please read the comment at the top of this file for more info. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. @@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q) * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ + q->flush_pending_idx ^= 1; + if (q->mq_ops) { + mq_run_flush(q); + return true; + } + blk_rq_init(q, &q->flush_rq); q->flush_rq.cmd_type = REQ_TYPE_FS; q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; q->flush_rq.rq_disk = first_rq->rq_disk; q->flush_rq.end_io = flush_end_io; - q->flush_pending_idx ^= 1; list_add_tail(&q->flush_rq.queuelist, &q->queue_head); return true; } @@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error) blk_run_queue_async(q); } +static void mq_flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + unsigned long flags; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + spin_lock_irqsave(&q->mq_flush_lock, flags); + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + blk_mq_run_hw_queue(hctx, true); + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + /** * blk_insert_flush - insert a new FLUSH/FUA request * @rq: request to insert * * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock) in !mq case */ void blk_insert_flush(struct request *rq) { @@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq) * complete the request. */ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + if (q->mq_ops) + blk_mq_end_io(rq, 0); + else + __blk_end_bidi_request(rq, 0, 0, 0); return; } @@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - list_add_tail(&rq->queuelist, &q->queue_head); + if (q->mq_ops) { + blk_mq_run_request(rq, false, true); + } else + list_add_tail(&rq->queuelist, &q->queue_head); return; } @@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq) INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + if (q->mq_ops) { + rq->end_io = mq_flush_data_end_io; + + spin_lock_irq(&q->mq_flush_lock); + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&q->mq_flush_lock); + return; + } rq->end_io = flush_data_end_io; blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); @@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +void blk_mq_init_flush(struct request_queue *q) +{ + spin_lock_init(&q->mq_flush_lock); + INIT_WORK(&q->mq_flush_work, mq_flush_work); +} diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 4b8d9b54..1855bf5 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop) unsigned long flags; local_irq_save(flags); - list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); + list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_restore(flags); } @@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete); static void blk_iopoll_softirq(struct softirq_action *h) { - struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = blk_iopoll_budget; unsigned long start_time = jiffies; @@ -201,7 +201,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - &__get_cpu_var(blk_cpu_iopoll)); + this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); } diff --git a/block/blk-lib.c b/block/blk-lib.c index d6f50d5..9b5b561 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - sector_t max_discard_sectors; - sector_t granularity, alignment; + unsigned int max_discard_sectors, granularity; + int alignment; struct bio_batch bb; struct bio *bio; int ret = 0; @@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); - alignment = bdev_discard_alignment(bdev) >> 9; - alignment = sector_div(alignment, granularity); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; /* * Ensure that max_discard_sectors is of the proper * granularity, so that requests stay aligned after a split. */ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - sector_div(max_discard_sectors, granularity); - max_discard_sectors *= granularity; + max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) { /* Avoid infinite loop below. Being cautious never hurts. */ return -EOPNOTSUPP; diff --git a/block/blk-merge.c b/block/blk-merge.c index 5f24482..1ffc589 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } +/* + * blk-mq uses req->special to carry normal driver per-request payload, it + * does not indicate a prepared command that we cannot merge with. + */ +static bool req_no_special_merge(struct request *req) +{ + struct request_queue *q = req->q; + + return !q->mq_ops && req->special; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, * First check if the either of the requests are re-queued * requests. Can't merge them if they are. */ - if (req->special || next->special) + if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; /* @@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk - || next->special) + || req_no_special_merge(next)) return 0; if (req->cmd_flags & REQ_WRITE_SAME && @@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c new file mode 100644 index 0000000..f8ea39d --- /dev/null +++ b/block/blk-mq-cpu.c @@ -0,0 +1,93 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" + +static LIST_HEAD(blk_mq_cpu_notify_list); +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); + +static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long) hcpu; + struct blk_mq_cpu_notifier *notify; + + spin_lock(&blk_mq_cpu_notify_lock); + + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) + notify->notify(notify->data, action, cpu); + + spin_unlock(&blk_mq_cpu_notify_lock); + return NOTIFY_OK; +} + +static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, + unsigned int cpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + /* + * If the CPU goes away, ensure that we run any pending + * completions. + */ + struct llist_node *node; + struct request *rq; + + local_irq_disable(); + + node = llist_del_all(&per_cpu(ipi_lists, cpu)); + while (node) { + struct llist_node *next = node->next; + + rq = llist_entry(node, struct request, ll_list); + __blk_mq_end_io(rq, rq->errors); + node = next; + } + + local_irq_enable(); + } +} + +static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { + .notifier_call = blk_mq_main_cpu_notify, +}; + +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + BUG_ON(!notifier->notify); + + spin_lock(&blk_mq_cpu_notify_lock); + list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + spin_lock(&blk_mq_cpu_notify_lock); + list_del(¬ifier->list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + void (*fn)(void *, unsigned long, unsigned int), + void *data) +{ + notifier->notify = fn; + notifier->data = data; +} + +static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { + .notify = blk_mq_cpu_notify, +}; + +void __init blk_mq_cpu_init(void) +{ + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); + blk_mq_register_cpu_notifier(&cpu_notifier); +} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 0000000..f872127 --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,108 @@ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" + +static void show_map(unsigned int *map, unsigned int nr) +{ + int i; + + pr_info("blk-mq: CPU -> queue map\n"); + for_each_online_cpu(i) + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); +} + +static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, + const int cpu) +{ + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_thread_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + + return cpu; +} + +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +{ + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; + cpumask_var_t cpus; + + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) + return 1; + + cpumask_clear(cpus); + nr_cpus = nr_uniq_cpus = 0; + for_each_online_cpu(i) { + nr_cpus++; + first_sibling = get_first_sibling(i); + if (!cpumask_test_cpu(first_sibling, cpus)) + nr_uniq_cpus++; + cpumask_set_cpu(i, cpus); + } + + queue = 0; + for_each_possible_cpu(i) { + if (!cpu_online(i)) { + map[i] = 0; + continue; + } + + /* + * Easy case - we have equal or more hardware queues. Or + * there are no thread siblings to take into account. Do + * 1:1 if enough, or sequential mapping if less. + */ + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); + queue++; + continue; + } + + /* + * Less then nr_cpus queues, and we have some number of + * threads per cores. Map sibling threads to the same + * queue. + */ + first_sibling = get_first_sibling(i); + if (first_sibling == i) { + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, + queue); + queue++; + } else + map[i] = map[first_sibling]; + } + + show_map(map, nr_cpus); + free_cpumask_var(cpus); + return 0; +} + +unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +{ + unsigned int *map; + + /* If cpus are offline, map them to first hctx */ + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + reg->numa_node); + if (!map) + return NULL; + + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + return map; + + kfree(map); + return NULL; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 0000000..ba6cf8e --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,384 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ +} + +struct blk_mq_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_ctx *, char *); + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(ctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->store) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->store(ctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(hctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t length) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->store) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->store(hctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], + ctx->rq_dispatched[0]); +} + +static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu\n", ctx->rq_merged); +} + +static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], + ctx->rq_completed[0]); +} + +static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) +{ + char *start_page = page; + struct request *rq; + + page += sprintf(page, "%s:\n", msg); + + list_for_each_entry(rq, list, queuelist) + page += sprintf(page, "\t%p\n", rq); + + return page - start_page; +} + +static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) +{ + ssize_t ret; + + spin_lock(&ctx->lock); + ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); + spin_unlock(&ctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + return sprintf(page, "%lu\n", hctx->queued); +} + +static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "%lu\n", hctx->run); +} + +static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + char *start_page = page; + int i; + + page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { + unsigned long d = 1U << (i - 1); + + page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + } + + return page - start_page; +} + +static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + ssize_t ret; + + spin_lock(&hctx->lock); + ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); + spin_unlock(&hctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + ssize_t ret; + + spin_lock(&hctx->lock); + ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); + spin_unlock(&hctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t len) +{ + struct blk_mq_ctx *ctx; + unsigned long ret; + unsigned int i; + + if (kstrtoul(page, 10, &ret)) { + pr_err("blk-mq-sysfs: invalid input '%s'\n", page); + return -EINVAL; + } + + spin_lock(&hctx->lock); + if (ret) + hctx->flags |= BLK_MQ_F_SHOULD_IPI; + else + hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; + spin_unlock(&hctx->lock); + + hctx_for_each_ctx(hctx, ctx, i) + ctx->ipi_redirect = !!ret; + + return len; +} + +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return blk_mq_tag_sysfs_show(hctx->tags, page); +} + +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { + .attr = {.name = "dispatched", .mode = S_IRUGO }, + .show = blk_mq_sysfs_dispatched_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { + .attr = {.name = "merged", .mode = S_IRUGO }, + .show = blk_mq_sysfs_merged_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { + .attr = {.name = "completed", .mode = S_IRUGO }, + .show = blk_mq_sysfs_completed_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { + .attr = {.name = "rq_list", .mode = S_IRUGO }, + .show = blk_mq_sysfs_rq_list_show, +}; + +static struct attribute *default_ctx_attrs[] = { + &blk_mq_sysfs_dispatched.attr, + &blk_mq_sysfs_merged.attr, + &blk_mq_sysfs_completed.attr, + &blk_mq_sysfs_rq_list.attr, + NULL, +}; + +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { + .attr = {.name = "queued", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_queued_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { + .attr = {.name = "run", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_run_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { + .attr = {.name = "dispatched", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_dispatched_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { + .attr = {.name = "pending", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_rq_list_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { + .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, + .show = blk_mq_hw_sysfs_ipi_show, + .store = blk_mq_hw_sysfs_ipi_store, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { + .attr = {.name = "tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_tags_show, +}; + +static struct attribute *default_hw_ctx_attrs[] = { + &blk_mq_hw_sysfs_queued.attr, + &blk_mq_hw_sysfs_run.attr, + &blk_mq_hw_sysfs_dispatched.attr, + &blk_mq_hw_sysfs_pending.attr, + &blk_mq_hw_sysfs_ipi.attr, + &blk_mq_hw_sysfs_tags.attr, + NULL, +}; + +static const struct sysfs_ops blk_mq_sysfs_ops = { + .show = blk_mq_sysfs_show, + .store = blk_mq_sysfs_store, +}; + +static const struct sysfs_ops blk_mq_hw_sysfs_ops = { + .show = blk_mq_hw_sysfs_show, + .store = blk_mq_hw_sysfs_store, +}; + +static struct kobj_type blk_mq_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_ctx_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .default_attrs = default_ctx_attrs, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_hw_ktype = { + .sysfs_ops = &blk_mq_hw_sysfs_ops, + .default_attrs = default_hw_ctx_attrs, + .release = blk_mq_sysfs_release, +}; + +void blk_mq_unregister_disk(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); + kobject_del(&q->mq_kobj); + + kobject_put(&disk_to_dev(disk)->kobj); +} + +int blk_mq_register_disk(struct gendisk *disk) +{ + struct device *dev = disk_to_dev(disk); + struct request_queue *q = disk->queue; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int ret, i, j; + + kobject_init(&q->mq_kobj, &blk_mq_ktype); + + ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + if (ret < 0) + return ret; + + kobject_uevent(&q->mq_kobj, KOBJ_ADD); + + queue_for_each_hw_ctx(q, hctx, i) { + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); + if (ret) + break; + + if (!hctx->nr_ctx) + continue; + + hctx_for_each_ctx(hctx, ctx, j) { + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); + if (ret) + break; + } + } + + if (ret) { + blk_mq_unregister_disk(disk); + return ret; + } + + return 0; +} diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c new file mode 100644 index 0000000..d64a02f --- /dev/null +++ b/block/blk-mq-tag.c @@ -0,0 +1,204 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu_ida.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +/* + * Per tagged queue (tag address space) map + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + unsigned int nr_batch_move; + unsigned int nr_max_cache; + + struct percpu_ida free_tags; + struct percpu_ida reserved_tags; +}; + +void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +{ + int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); + blk_mq_put_tag(tags, tag); +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ + return !tags || + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +{ + int tag; + + tag = percpu_ida_alloc(&tags->free_tags, gfp); + if (tag < 0) + return BLK_MQ_TAG_FAIL; + return tag + tags->nr_reserved_tags; +} + +static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, + gfp_t gfp) +{ + int tag; + + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + + tag = percpu_ida_alloc(&tags->reserved_tags, gfp); + if (tag < 0) + return BLK_MQ_TAG_FAIL; + return tag; +} + +unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +{ + if (!reserved) + return __blk_mq_get_tag(tags, gfp); + + return __blk_mq_get_reserved_tag(tags, gfp); +} + +static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +{ + BUG_ON(tag >= tags->nr_tags); + + percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); +} + +static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, + unsigned int tag) +{ + BUG_ON(tag >= tags->nr_reserved_tags); + + percpu_ida_free(&tags->reserved_tags, tag); +} + +void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +{ + if (tag >= tags->nr_reserved_tags) + __blk_mq_put_tag(tags, tag); + else + __blk_mq_put_reserved_tag(tags, tag); +} + +static int __blk_mq_tag_iter(unsigned id, void *data) +{ + unsigned long *tag_map = data; + __set_bit(id, tag_map); + return 0; +} + +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, + void (*fn)(void *, unsigned long *), void *data) +{ + unsigned long *tag_map; + size_t map_size; + + map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; + tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); + if (!tag_map) + return; + + percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + if (tags->nr_reserved_tags) + percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, + tag_map); + + fn(data, tag_map); + kfree(tag_map); +} + +struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, + unsigned int reserved_tags, int node) +{ + unsigned int nr_tags, nr_cache; + struct blk_mq_tags *tags; + int ret; + + if (total_tags > BLK_MQ_TAG_MAX) { + pr_err("blk-mq: tag depth too large\n"); + return NULL; + } + + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); + if (!tags) + return NULL; + + nr_tags = total_tags - reserved_tags; + nr_cache = nr_tags / num_possible_cpus(); + + if (nr_cache < BLK_MQ_TAG_CACHE_MIN) + nr_cache = BLK_MQ_TAG_CACHE_MIN; + else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) + nr_cache = BLK_MQ_TAG_CACHE_MAX; + + tags->nr_tags = total_tags; + tags->nr_reserved_tags = reserved_tags; + tags->nr_max_cache = nr_cache; + tags->nr_batch_move = max(1u, nr_cache / 2); + + ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - + tags->nr_reserved_tags, + tags->nr_max_cache, + tags->nr_batch_move); + if (ret) + goto err_free_tags; + + if (reserved_tags) { + /* + * With max_cahe and batch set to 1, the allocator fallbacks to + * no cached. It's fine reserved tags allocation is slow. + */ + ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, + 1, 1); + if (ret) + goto err_reserved_tags; + } + + return tags; + +err_reserved_tags: + percpu_ida_destroy(&tags->free_tags); +err_free_tags: + kfree(tags); + return NULL; +} + +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ + percpu_ida_destroy(&tags->free_tags); + percpu_ida_destroy(&tags->reserved_tags); + kfree(tags); +} + +ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) +{ + char *orig_page = page; + int cpu; + + if (!tags) + return 0; + + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," + " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, + tags->nr_batch_move, tags->nr_max_cache); + + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), + percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + + for_each_possible_cpu(cpu) { + page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, + percpu_ida_free_tags(&tags->free_tags, cpu)); + } + + return page - orig_page; +} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h new file mode 100644 index 0000000..947ba2c --- /dev/null +++ b/block/blk-mq-tag.h @@ -0,0 +1,27 @@ +#ifndef INT_BLK_MQ_TAG_H +#define INT_BLK_MQ_TAG_H + +struct blk_mq_tags; + +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); + +extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); +extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); +extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); +extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); +extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); + +enum { + BLK_MQ_TAG_CACHE_MIN = 1, + BLK_MQ_TAG_CACHE_MAX = 64, +}; + +enum { + BLK_MQ_TAG_FAIL = -1U, + BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, +}; + +#endif diff --git a/block/blk-mq.c b/block/blk-mq.c new file mode 100644 index 0000000..88d4e86 --- /dev/null +++ b/block/blk-mq.c @@ -0,0 +1,1500 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> +#include <linux/llist.h> +#include <linux/list_sort.h> +#include <linux/cpu.h> +#include <linux/cache.h> +#include <linux/sched/sysctl.h> +#include <linux/delay.h> + +#include <trace/events/block.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static DEFINE_MUTEX(all_q_mutex); +static LIST_HEAD(all_q_list); + +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); + +DEFINE_PER_CPU(struct llist_head, ipi_lists); + +static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, + unsigned int cpu) +{ + return per_cpu_ptr(q->queue_ctx, cpu); +} + +/* + * This assumes per-cpu software queueing queues. They could be per-node + * as well, for instance. For now this is hardcoded as-is. Note that we don't + * care about preemption, since we know the ctx's are persistent. This does + * mean that we can't rely on ctx always matching the currently running CPU. + */ +static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) +{ + return __blk_mq_get_ctx(q, get_cpu()); +} + +static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) +{ + put_cpu(); +} + +/* + * Check if any of the ctx's have pending work in this hardware queue + */ +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + for (i = 0; i < hctx->nr_ctx_map; i++) + if (hctx->ctx_map[i]) + return true; + + return false; +} + +/* + * Mark this ctx as having pending work in this hardware queue + */ +static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + if (!test_bit(ctx->index_hw, hctx->ctx_map)) + set_bit(ctx->index_hw, hctx->ctx_map); +} + +static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, + bool reserved) +{ + struct request *rq; + unsigned int tag; + + tag = blk_mq_get_tag(hctx->tags, gfp, reserved); + if (tag != BLK_MQ_TAG_FAIL) { + rq = hctx->rqs[tag]; + rq->tag = tag; + + return rq; + } + + return NULL; +} + +static int blk_mq_queue_enter(struct request_queue *q) +{ + int ret; + + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); + smp_wmb(); + /* we have problems to freeze the queue if it's initializing */ + if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) + return 0; + + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); + + spin_lock_irq(q->queue_lock); + ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, + !blk_queue_bypass(q), *q->queue_lock); + /* inc usage with lock hold to avoid freeze_queue runs here */ + if (!ret) + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); + spin_unlock_irq(q->queue_lock); + + return ret; +} + +static void blk_mq_queue_exit(struct request_queue *q) +{ + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); +} + +/* + * Guarantee no request is in use, so we can change any data structure of + * the queue afterward. + */ +static void blk_mq_freeze_queue(struct request_queue *q) +{ + bool drain; + + spin_lock_irq(q->queue_lock); + drain = !q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + spin_unlock_irq(q->queue_lock); + + if (!drain) + return; + + while (true) { + s64 count; + + spin_lock_irq(q->queue_lock); + count = percpu_counter_sum(&q->mq_usage_counter); + spin_unlock_irq(q->queue_lock); + + if (count == 0) + break; + blk_mq_run_queues(q, false); + msleep(10); + } +} + +static void blk_mq_unfreeze_queue(struct request_queue *q) +{ + bool wake = false; + + spin_lock_irq(q->queue_lock); + if (!--q->bypass_depth) { + queue_flag_clear(QUEUE_FLAG_BYPASS, q); + wake = true; + } + WARN_ON_ONCE(q->bypass_depth < 0); + spin_unlock_irq(q->queue_lock); + if (wake) + wake_up_all(&q->mq_freeze_wq); +} + +bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) +{ + return blk_mq_has_free_tags(hctx->tags); +} +EXPORT_SYMBOL(blk_mq_can_queue); + +static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq, + unsigned int rw_flags) +{ + rq->mq_ctx = ctx; + rq->cmd_flags = rw_flags; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; +} + +static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, + gfp_t gfp, bool reserved) +{ + return blk_mq_alloc_rq(hctx, gfp, reserved); +} + +static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, + int rw, gfp_t gfp, + bool reserved) +{ + struct request *rq; + + do { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); + if (rq) { + blk_mq_rq_ctx_init(ctx, rq, rw); + break; + } else if (!(gfp & __GFP_WAIT)) + break; + + blk_mq_put_ctx(ctx); + __blk_mq_run_hw_queue(hctx); + blk_mq_wait_for_tags(hctx->tags); + } while (1); + + return rq; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + gfp_t gfp, bool reserved) +{ + struct request *rq; + + if (blk_mq_queue_enter(q)) + return NULL; + + rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); + blk_mq_put_ctx(rq->mq_ctx); + return rq; +} + +struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, + gfp_t gfp) +{ + struct request *rq; + + if (blk_mq_queue_enter(q)) + return NULL; + + rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); + blk_mq_put_ctx(rq->mq_ctx); + return rq; +} +EXPORT_SYMBOL(blk_mq_alloc_reserved_request); + +/* + * Re-init and set pdu, if we have it + */ +static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + blk_rq_init(hctx->queue, rq); + + if (hctx->cmd_size) + rq->special = blk_mq_rq_to_pdu(rq); +} + +static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, struct request *rq) +{ + const int tag = rq->tag; + struct request_queue *q = rq->q; + + blk_mq_rq_init(hctx, rq); + blk_mq_put_tag(hctx->tags, tag); + + blk_mq_queue_exit(q); +} + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + ctx->rq_completed[rq_is_sync(rq)]++; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + __blk_mq_free_request(hctx, ctx, rq); +} + +static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) +{ + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); + + /* don't actually finish bio if it's part of flush sequence */ + if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); +} + +void blk_mq_complete_request(struct request *rq, int error) +{ + struct bio *bio = rq->bio; + unsigned int bytes = 0; + + trace_block_rq_complete(rq->q, rq); + + while (bio) { + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + bytes += bio->bi_size; + blk_mq_bio_endio(rq, bio, error); + bio = next; + } + + blk_account_io_completion(rq, bytes); + + if (rq->end_io) + rq->end_io(rq, error); + else + blk_mq_free_request(rq); + + blk_account_io_done(rq); +} + +void __blk_mq_end_io(struct request *rq, int error) +{ + if (!blk_mark_rq_complete(rq)) + blk_mq_complete_request(rq, error); +} + +#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) + +/* + * Called with interrupts disabled. + */ +static void ipi_end_io(void *data) +{ + struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id()); + struct llist_node *entry, *next; + struct request *rq; + + entry = llist_del_all(list); + + while (entry) { + next = entry->next; + rq = llist_entry(entry, struct request, ll_list); + __blk_mq_end_io(rq, rq->errors); + entry = next; + } +} + +static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, + struct request *rq, const int error) +{ + struct call_single_data *data = &rq->csd; + + rq->errors = error; + rq->ll_list.next = NULL; + + /* + * If the list is non-empty, an existing IPI must already + * be "in flight". If that is the case, we need not schedule + * a new one. + */ + if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) { + data->func = ipi_end_io; + data->flags = 0; + __smp_call_function_single(ctx->cpu, data, 0); + } + + return true; +} +#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, + struct request *rq, const int error) +{ + return false; +} +#endif + +/* + * End IO on this request on a multiqueue enabled driver. We'll either do + * it directly inline, or punt to a local IPI handler on the matching + * remote CPU. + */ +void blk_mq_end_io(struct request *rq, int error) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + int cpu; + + if (!ctx->ipi_redirect) + return __blk_mq_end_io(rq, error); + + cpu = get_cpu(); + + if (cpu == ctx->cpu || !cpu_online(ctx->cpu) || + !ipi_remote_cpu(ctx, cpu, rq, error)) + __blk_mq_end_io(rq, error); + + put_cpu(); +} +EXPORT_SYMBOL(blk_mq_end_io); + +static void blk_mq_start_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_issue(q, rq); + + /* + * Just mark start time and set the started bit. Due to memory + * ordering, we know we'll see the correct deadline as long as + * REQ_ATOMIC_STARTED is seen. + */ + rq->deadline = jiffies + q->rq_timeout; + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +} + +static void blk_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_requeue(q, rq); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +} + +struct blk_mq_timeout_data { + struct blk_mq_hw_ctx *hctx; + unsigned long *next; + unsigned int *next_set; +}; + +static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) +{ + struct blk_mq_timeout_data *data = __data; + struct blk_mq_hw_ctx *hctx = data->hctx; + unsigned int tag; + + /* It may not be in flight yet (this is where + * the REQ_ATOMIC_STARTED flag comes in). The requests are + * statically allocated, so we know it's always safe to access the + * memory associated with a bit offset into ->rqs[]. + */ + tag = 0; + do { + struct request *rq; + + tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); + if (tag >= hctx->queue_depth) + break; + + rq = hctx->rqs[tag++]; + + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + continue; + + blk_rq_check_expired(rq, data->next, data->next_set); + } while (1); +} + +static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, + unsigned long *next, + unsigned int *next_set) +{ + struct blk_mq_timeout_data data = { + .hctx = hctx, + .next = next, + .next_set = next_set, + }; + + /* + * Ask the tagging code to iterate busy requests, so we can + * check them for timeout. + */ + blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); +} + +static void blk_mq_rq_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *) data; + struct blk_mq_hw_ctx *hctx; + unsigned long next = 0; + int i, next_set = 0; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + + if (next_set) + mod_timer(&q->timeout, round_jiffies_up(next)); +} + +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. + */ +static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_ctx *ctx, struct bio *bio) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { + int el_ret; + + if (!checked--) + break; + + if (!blk_rq_merge_ok(rq, bio)) + continue; + + el_ret = blk_try_merge(rq, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + if (bio_attempt_back_merge(q, rq, bio)) { + ctx->rq_merged++; + return true; + } + break; + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + if (bio_attempt_front_merge(q, rq, bio)) { + ctx->rq_merged++; + return true; + } + break; + } + } + + return false; +} + +void blk_mq_add_timer(struct request *rq) +{ + __blk_add_timer(rq, NULL); +} + +/* + * Run this hardware queue, pulling any software queues mapped to it in. + * Note that this function currently has various problems around ordering + * of IO. In particular, we'd like FIFO behaviour on handling existing + * items on the hctx->dispatch list. Ignore that for now. + */ +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + struct request *rq; + LIST_HEAD(rq_list); + int bit, queued; + + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) + return; + + hctx->run++; + + /* + * Touch any software queue that has pending entries. + */ + for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { + clear_bit(bit, hctx->ctx_map); + ctx = hctx->ctxs[bit]; + BUG_ON(bit != ctx->index_hw); + + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, &rq_list); + spin_unlock(&ctx->lock); + } + + /* + * If we have previous entries on our dispatch list, grab them + * and stuff them at the front for more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Delete and return all entries from our dispatch list + */ + queued = 0; + + /* + * Now process all the entries, sending them to the driver. + */ + while (!list_empty(&rq_list)) { + int ret; + + rq = list_first_entry(&rq_list, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_start_request(rq); + + /* + * Last request in the series. Flag it as such, this + * enables drivers to know when IO should be kicked off, + * if they don't do it on a per-request basis. + * + * Note: the flag isn't the only condition drivers + * should do kick off. If drive is busy, the last + * request might not have the bit set. + */ + if (list_empty(&rq_list)) + rq->cmd_flags |= REQ_END; + + ret = q->mq_ops->queue_rq(hctx, rq); + switch (ret) { + case BLK_MQ_RQ_QUEUE_OK: + queued++; + continue; + case BLK_MQ_RQ_QUEUE_BUSY: + /* + * FIXME: we should have a mechanism to stop the queue + * like blk_stop_queue, otherwise we will waste cpu + * time + */ + list_add(&rq->queuelist, &rq_list); + blk_mq_requeue_request(rq); + break; + default: + pr_err("blk-mq: bad return on queue: %d\n", ret); + rq->errors = -EIO; + case BLK_MQ_RQ_QUEUE_ERROR: + blk_mq_end_io(rq, rq->errors); + break; + } + + if (ret == BLK_MQ_RQ_QUEUE_BUSY) + break; + } + + if (!queued) + hctx->dispatched[0]++; + else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) + hctx->dispatched[ilog2(queued) + 1]++; + + /* + * Any items that need requeuing? Stuff them into hctx->dispatch, + * that is where we will continue on next queue run. + */ + if (!list_empty(&rq_list)) { + spin_lock(&hctx->lock); + list_splice(&rq_list, &hctx->dispatch); + spin_unlock(&hctx->lock); + } +} + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) + return; + + if (!async) + __blk_mq_run_hw_queue(hctx); + else { + struct request_queue *q = hctx->queue; + + kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); + } +} + +void blk_mq_run_queues(struct request_queue *q, bool async) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if ((!blk_mq_hctx_has_pending(hctx) && + list_empty_careful(&hctx->dispatch)) || + test_bit(BLK_MQ_S_STOPPED, &hctx->flags)) + continue; + + blk_mq_run_hw_queue(hctx, async); + } +} +EXPORT_SYMBOL(blk_mq_run_queues); + +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + cancel_delayed_work(&hctx->delayed_work); + set_bit(BLK_MQ_S_STOPPED, &hctx->state); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queue); + +void blk_mq_stop_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_stop_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queues); + +void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + __blk_mq_run_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queue); + +void blk_mq_start_stopped_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + continue; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + blk_mq_run_hw_queue(hctx, true); + } +} +EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); + +static void blk_mq_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); + __blk_mq_run_hw_queue(hctx); +} + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + + list_add_tail(&rq->queuelist, &ctx->rq_list); + blk_mq_hctx_mark_pending(hctx, ctx); + + /* + * We do this early, to ensure we are on the right CPU. + */ + blk_mq_add_timer(rq); +} + +void blk_mq_insert_request(struct request_queue *q, struct request *rq, + bool run_queue) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx, *current_ctx; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + blk_insert_flush(rq); + } else { + current_ctx = blk_mq_get_ctx(q); + + if (!cpu_online(ctx->cpu)) { + ctx = current_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq->mq_ctx = ctx; + } + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq); + spin_unlock(&ctx->lock); + + blk_mq_put_ctx(current_ctx); + } + + if (run_queue) + __blk_mq_run_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_insert_request); + +/* + * This is a special version of blk_mq_insert_request to bypass FLUSH request + * check. Should only be used internally. + */ +void blk_mq_run_request(struct request *rq, bool run_queue, bool async) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx, *current_ctx; + + current_ctx = blk_mq_get_ctx(q); + + ctx = rq->mq_ctx; + if (!cpu_online(ctx->cpu)) { + ctx = current_ctx; + rq->mq_ctx = ctx; + } + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* ctx->cpu might be offline */ + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq); + spin_unlock(&ctx->lock); + + blk_mq_put_ctx(current_ctx); + + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +static void blk_mq_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, + int depth, + bool from_schedule) + +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *current_ctx; + + trace_block_unplug(q, depth, !from_schedule); + + current_ctx = blk_mq_get_ctx(q); + + if (!cpu_online(ctx->cpu)) + ctx = current_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * preemption doesn't flush plug list, so it's possible ctx->cpu is + * offline now + */ + spin_lock(&ctx->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + rq->mq_ctx = ctx; + __blk_mq_insert_request(hctx, rq); + } + spin_unlock(&ctx->lock); + + blk_mq_put_ctx(current_ctx); + + blk_mq_run_hw_queue(hctx, from_schedule); +} + +static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return !(rqa->mq_ctx < rqb->mq_ctx || + (rqa->mq_ctx == rqb->mq_ctx && + blk_rq_pos(rqa) < blk_rq_pos(rqb))); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_ctx *this_ctx; + struct request_queue *this_q; + struct request *rq; + LIST_HEAD(list); + LIST_HEAD(ctx_list); + unsigned int depth; + + list_splice_init(&plug->mq_list, &list); + + list_sort(NULL, &list, plug_ctx_cmp); + + this_q = NULL; + this_ctx = NULL; + depth = 0; + + while (!list_empty(&list)) { + rq = list_entry_rq(list.next); + list_del_init(&rq->queuelist); + BUG_ON(!rq->q); + if (rq->mq_ctx != this_ctx) { + if (this_ctx) { + blk_mq_insert_requests(this_q, this_ctx, + &ctx_list, depth, + from_schedule); + } + + this_ctx = rq->mq_ctx; + this_q = rq->q; + depth = 0; + } + + depth++; + list_add_tail(&rq->queuelist, &ctx_list); + } + + /* + * If 'this_ctx' is set, we know we have entries to complete + * on 'ctx_list'. Do those. + */ + if (this_ctx) { + blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, + from_schedule); + } +} + +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) +{ + init_request_from_bio(rq, bio); + blk_account_io_start(rq, 1); +} + +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + int rw = bio_data_dir(bio); + struct request *rq; + unsigned int use_plug, request_count = 0; + + /* + * If we have multiple hardware queues, just go directly to + * one of those for sync IO. + */ + use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); + + blk_queue_bounce(q, &bio); + + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) + return; + + if (blk_mq_queue_enter(q)) { + bio_endio(bio, -EIO); + return; + } + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + trace_block_getrq(q, bio, rw); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); + if (likely(rq)) + blk_mq_rq_ctx_init(ctx, rq, rw); + else { + blk_mq_put_ctx(ctx); + trace_block_sleeprq(q, bio, rw); + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, + false); + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + } + + hctx->queued++; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_mq_put_ctx(ctx); + blk_insert_flush(rq); + goto run_queue; + } + + /* + * A task plug currently exists. Since this is completely lockless, + * utilize that to temporarily store requests until the task is + * either done or scheduled away. + */ + if (use_plug) { + struct blk_plug *plug = current->plug; + + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); + } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(ctx); + return; + } + } + + spin_lock(&ctx->lock); + + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + blk_mq_attempt_merge(q, ctx, bio)) + __blk_mq_free_request(hctx, ctx, rq); + else { + blk_mq_bio_to_request(rq, bio); + __blk_mq_insert_request(hctx, rq); + } + + spin_unlock(&ctx->lock); + blk_mq_put_ctx(ctx); + + /* + * For a SYNC request, send it to the hardware immediately. For an + * ASYNC request, just ensure that we run it later on. The latter + * allows for merging opportunities and more efficient dispatching. + */ +run_queue: + blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); +} + +/* + * Default mapping to a software queue, since we use one per CPU. + */ +struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) +{ + return q->queue_hw_ctx[q->mq_map[cpu]]; +} +EXPORT_SYMBOL(blk_mq_map_queue); + +struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, + unsigned int hctx_index) +{ + return kmalloc_node(sizeof(struct blk_mq_hw_ctx), + GFP_KERNEL | __GFP_ZERO, reg->numa_node); +} +EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); + +void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, + unsigned int hctx_index) +{ + kfree(hctx); +} +EXPORT_SYMBOL(blk_mq_free_single_hw_queue); + +static void blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) +{ + struct blk_mq_hw_ctx *hctx = data; + struct blk_mq_ctx *ctx; + LIST_HEAD(tmp); + + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) + return; + + /* + * Move ctx entries to new CPU, if this one is going away. + */ + ctx = __blk_mq_get_ctx(hctx->queue, cpu); + + spin_lock(&ctx->lock); + if (!list_empty(&ctx->rq_list)) { + list_splice_init(&ctx->rq_list, &tmp); + clear_bit(ctx->index_hw, hctx->ctx_map); + } + spin_unlock(&ctx->lock); + + if (list_empty(&tmp)) + return; + + ctx = blk_mq_get_ctx(hctx->queue); + spin_lock(&ctx->lock); + + while (!list_empty(&tmp)) { + struct request *rq; + + rq = list_first_entry(&tmp, struct request, queuelist); + rq->mq_ctx = ctx; + list_move_tail(&rq->queuelist, &ctx->rq_list); + } + + blk_mq_hctx_mark_pending(hctx, ctx); + + spin_unlock(&ctx->lock); + blk_mq_put_ctx(ctx); +} + +static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, + void (*init)(void *, struct blk_mq_hw_ctx *, + struct request *, unsigned int), + void *data) +{ + unsigned int i; + + for (i = 0; i < hctx->queue_depth; i++) { + struct request *rq = hctx->rqs[i]; + + init(data, hctx, rq, i); + } +} + +void blk_mq_init_commands(struct request_queue *q, + void (*init)(void *, struct blk_mq_hw_ctx *, + struct request *, unsigned int), + void *data) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_init_hw_commands(hctx, init, data); +} +EXPORT_SYMBOL(blk_mq_init_commands); + +static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) +{ + struct page *page; + + while (!list_empty(&hctx->page_list)) { + page = list_first_entry(&hctx->page_list, struct page, list); + list_del_init(&page->list); + __free_pages(page, page->private); + } + + kfree(hctx->rqs); + + if (hctx->tags) + blk_mq_free_tags(hctx->tags); +} + +static size_t order_to_size(unsigned int order) +{ + size_t ret = PAGE_SIZE; + + while (order--) + ret *= 2; + + return ret; +} + +static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, + unsigned int reserved_tags, int node) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + INIT_LIST_HEAD(&hctx->page_list); + + hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), + GFP_KERNEL, node); + if (!hctx->rqs) + return -ENOMEM; + + /* + * rq_size is the size of the request plus driver payload, rounded + * to the cacheline size + */ + rq_size = round_up(sizeof(struct request) + hctx->cmd_size, + cache_line_size()); + left = rq_size * hctx->queue_depth; + + for (i = 0; i < hctx->queue_depth;) { + int this_order = max_order; + struct page *page; + int to_do; + void *p; + + while (left < order_to_size(this_order - 1) && this_order) + this_order--; + + do { + page = alloc_pages_node(node, GFP_KERNEL, this_order); + if (page) + break; + if (!this_order--) + break; + if (order_to_size(this_order) < rq_size) + break; + } while (1); + + if (!page) + break; + + page->private = this_order; + list_add_tail(&page->list, &hctx->page_list); + + p = page_address(page); + entries_per_page = order_to_size(this_order) / rq_size; + to_do = min(entries_per_page, hctx->queue_depth - i); + left -= to_do * rq_size; + for (j = 0; j < to_do; j++) { + hctx->rqs[i] = p; + blk_mq_rq_init(hctx, hctx->rqs[i]); + p += rq_size; + i++; + } + } + + if (i < (reserved_tags + BLK_MQ_TAG_MIN)) + goto err_rq_map; + else if (i != hctx->queue_depth) { + hctx->queue_depth = i; + pr_warn("%s: queue depth set to %u because of low memory\n", + __func__, i); + } + + hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); + if (!hctx->tags) { +err_rq_map: + blk_mq_free_rq_map(hctx); + return -ENOMEM; + } + + return 0; +} + +static int blk_mq_init_hw_queues(struct request_queue *q, + struct blk_mq_reg *reg, void *driver_data) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i, j; + + /* + * Initialize hardware queues + */ + queue_for_each_hw_ctx(q, hctx, i) { + unsigned int num_maps; + int node; + + node = hctx->numa_node; + if (node == NUMA_NO_NODE) + node = hctx->numa_node = reg->numa_node; + + INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->queue_num = i; + hctx->flags = reg->flags; + hctx->queue_depth = reg->queue_depth; + hctx->cmd_size = reg->cmd_size; + + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, + blk_mq_hctx_notify, hctx); + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + + if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) + break; + + /* + * Allocate space for all possible cpus to avoid allocation in + * runtime + */ + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + GFP_KERNEL, node); + if (!hctx->ctxs) + break; + + num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; + hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), + GFP_KERNEL, node); + if (!hctx->ctx_map) + break; + + hctx->nr_ctx_map = num_maps; + hctx->nr_ctx = 0; + + if (reg->ops->init_hctx && + reg->ops->init_hctx(hctx, driver_data, i)) + break; + } + + if (i == q->nr_hw_queues) + return 0; + + /* + * Init failed + */ + queue_for_each_hw_ctx(q, hctx, j) { + if (i == j) + break; + + if (reg->ops->exit_hctx) + reg->ops->exit_hctx(hctx, j); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_mq_free_rq_map(hctx); + kfree(hctx->ctxs); + } + + return 1; +} + +static void blk_mq_init_cpu_queues(struct request_queue *q, + unsigned int nr_hw_queues) +{ + unsigned int i; + + for_each_possible_cpu(i) { + struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); + struct blk_mq_hw_ctx *hctx; + + memset(__ctx, 0, sizeof(*__ctx)); + __ctx->cpu = i; + spin_lock_init(&__ctx->lock); + INIT_LIST_HEAD(&__ctx->rq_list); + __ctx->queue = q; + + /* If the cpu isn't online, the cpu is mapped to first hctx */ + hctx = q->mq_ops->map_queue(q, i); + hctx->nr_ctx++; + + if (!cpu_online(i)) + continue; + + /* + * Set local node, IFF we have more than one hw queue. If + * not, we remain on the home node of the device + */ + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = cpu_to_node(i); + } +} + +static void blk_mq_map_swqueue(struct request_queue *q) +{ + unsigned int i; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->nr_ctx = 0; + } + + /* + * Map software to hardware queues + */ + queue_for_each_ctx(q, ctx, i) { + /* If the cpu isn't online, the cpu is mapped to first hctx */ + hctx = q->mq_ops->map_queue(q, i); + ctx->index_hw = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; + } +} + +struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, + void *driver_data) +{ + struct blk_mq_hw_ctx **hctxs; + struct blk_mq_ctx *ctx; + struct request_queue *q; + int i; + + if (!reg->nr_hw_queues || + !reg->ops->queue_rq || !reg->ops->map_queue || + !reg->ops->alloc_hctx || !reg->ops->free_hctx) + return ERR_PTR(-EINVAL); + + if (!reg->queue_depth) + reg->queue_depth = BLK_MQ_MAX_DEPTH; + else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); + reg->queue_depth = BLK_MQ_MAX_DEPTH; + } + + /* + * Set aside a tag for flush requests. It will only be used while + * another flush request is in progress but outside the driver. + * + * TODO: only allocate if flushes are supported + */ + reg->queue_depth++; + reg->reserved_tags++; + + if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) + return ERR_PTR(-EINVAL); + + ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctx) + return ERR_PTR(-ENOMEM); + + hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, + reg->numa_node); + + if (!hctxs) + goto err_percpu; + + for (i = 0; i < reg->nr_hw_queues; i++) { + hctxs[i] = reg->ops->alloc_hctx(reg, i); + if (!hctxs[i]) + goto err_hctxs; + + hctxs[i]->numa_node = NUMA_NO_NODE; + hctxs[i]->queue_num = i; + } + + q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); + if (!q) + goto err_hctxs; + + q->mq_map = blk_mq_make_queue_map(reg); + if (!q->mq_map) + goto err_map; + + setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + blk_queue_rq_timeout(q, 30000); + + q->nr_queues = nr_cpu_ids; + q->nr_hw_queues = reg->nr_hw_queues; + + q->queue_ctx = ctx; + q->queue_hw_ctx = hctxs; + + q->mq_ops = reg->ops; + + blk_queue_make_request(q, blk_mq_make_request); + blk_queue_rq_timed_out(q, reg->ops->timeout); + if (reg->timeout) + blk_queue_rq_timeout(q, reg->timeout); + + blk_mq_init_flush(q); + blk_mq_init_cpu_queues(q, reg->nr_hw_queues); + + if (blk_mq_init_hw_queues(q, reg, driver_data)) + goto err_hw; + + blk_mq_map_swqueue(q); + + mutex_lock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); + mutex_unlock(&all_q_mutex); + + return q; +err_hw: + kfree(q->mq_map); +err_map: + blk_cleanup_queue(q); +err_hctxs: + for (i = 0; i < reg->nr_hw_queues; i++) { + if (!hctxs[i]) + break; + reg->ops->free_hctx(hctxs[i], i); + } + kfree(hctxs); +err_percpu: + free_percpu(ctx); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(blk_mq_init_queue); + +void blk_mq_free_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->delayed_work); + kfree(hctx->ctx_map); + kfree(hctx->ctxs); + blk_mq_free_rq_map(hctx); + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + if (q->mq_ops->exit_hctx) + q->mq_ops->exit_hctx(hctx, i); + q->mq_ops->free_hctx(hctx, i); + } + + free_percpu(q->queue_ctx); + kfree(q->queue_hw_ctx); + kfree(q->mq_map); + + q->queue_ctx = NULL; + q->queue_hw_ctx = NULL; + q->mq_map = NULL; + + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); +} +EXPORT_SYMBOL(blk_mq_free_queue); + +/* Basically redo blk_mq_init_queue with queue frozen */ +static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) +{ + blk_mq_freeze_queue(q); + + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + + /* + * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe + * we should change hctx numa_node according to new topology (this + * involves free and re-allocate memory, worthy doing?) + */ + + blk_mq_map_swqueue(q); + + blk_mq_unfreeze_queue(q); +} + +static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + struct request_queue *q; + + /* + * Before new mapping is established, hotadded cpu might already start + * handling requests. This doesn't break anything as we map offline + * CPUs to first hardware queue. We will re-init queue below to get + * optimal settings. + */ + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && + action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + return NOTIFY_OK; + + mutex_lock(&all_q_mutex); + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_queue_reinit(q); + mutex_unlock(&all_q_mutex); + return NOTIFY_OK; +} + +static int __init blk_mq_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + init_llist_head(&per_cpu(ipi_lists, i)); + + blk_mq_cpu_init(); + + /* Must be called after percpu_counter_hotcpu_callback() */ + hotcpu_notifier(blk_mq_queue_reinit_notify, -10); + + return 0; +} +subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h new file mode 100644 index 0000000..52bf1f9 --- /dev/null +++ b/block/blk-mq.h @@ -0,0 +1,52 @@ +#ifndef INT_BLK_MQ_H +#define INT_BLK_MQ_H + +struct blk_mq_ctx { + struct { + spinlock_t lock; + struct list_head rq_list; + } ____cacheline_aligned_in_smp; + + unsigned int cpu; + unsigned int index_hw; + unsigned int ipi_redirect; + + /* incremented at dispatch time */ + unsigned long rq_dispatched[2]; + unsigned long rq_merged; + + /* incremented at completion time */ + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + + struct request_queue *queue; + struct kobject kobj; +}; + +void __blk_mq_end_io(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq, int error); +void blk_mq_run_request(struct request *rq, bool run_queue, bool async); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_init_flush(struct request_queue *q); + +/* + * CPU hotplug helpers + */ +struct blk_mq_cpu_notifier; +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + void (*fn)(void *, unsigned long, unsigned int), + void *data); +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_cpu_init(void); +DECLARE_PER_CPU(struct llist_head, ipi_lists); + +/* + * CPU -> queue mappings + */ +struct blk_mq_reg; +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); + +void blk_mq_add_timer(struct request *rq); + +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 026c151..05e8267 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -144,6 +144,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; + lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ec9e606..ce4b8bf 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -23,7 +23,7 @@ static void blk_done_softirq(struct softirq_action *h) struct list_head *cpu_list, local_list; local_irq_disable(); - cpu_list = &__get_cpu_var(blk_cpu_done); + cpu_list = this_cpu_ptr(&blk_cpu_done); list_replace_init(cpu_list, &local_list); local_irq_enable(); @@ -44,7 +44,7 @@ static void trigger_softirq(void *data) struct list_head *list; local_irq_save(flags); - list = &__get_cpu_var(blk_cpu_done); + list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->csd.list, list); if (list->next == &rq->csd.list) @@ -90,7 +90,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_done, cpu), - &__get_cpu_var(blk_cpu_done)); + this_cpu_ptr(&blk_cpu_done)); raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_enable(); } @@ -135,7 +135,7 @@ void __blk_complete_request(struct request *req) if (ccpu == cpu || shared) { struct list_head *list; do_local: - list = &__get_cpu_var(blk_cpu_done); + list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&req->csd.list, list); /* diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3aa5b19..4f8c4d9 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -7,6 +7,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> +#include <linux/blk-mq.h> #include "blk.h" #include "blk-cgroup.h" @@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); + percpu_counter_destroy(&q->mq_usage_counter); + + if (q->mq_ops) + blk_mq_free_queue(q); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); @@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk) * bypass from queue allocation. */ blk_queue_bypass_end(q); + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); ret = blk_trace_init_sysfs(dev); if (ret) @@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); + if (q->mq_ops) + blk_mq_register_disk(disk); + if (!q->request_fn) return 0; @@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + if (q->mq_ops) + blk_mq_unregister_disk(disk); + if (q->request_fn) elv_unregister_queue(q); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 65f1035..bba81c9 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,6 +7,7 @@ #include <linux/fault-inject.h> #include "blk.h" +#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -31,7 +32,7 @@ static int __init fail_io_timeout_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); @@ -88,11 +89,19 @@ static void blk_rq_timed_out(struct request *req) ret = q->rq_timed_out_fn(req); switch (ret) { case BLK_EH_HANDLED: - __blk_complete_request(req); + /* Can we use req->errors here? */ + if (q->mq_ops) + blk_mq_complete_request(req, req->errors); + else + __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: + if (q->mq_ops) + blk_mq_add_timer(req); + else + blk_add_timer(req); + blk_clear_rq_complete(req); - blk_add_timer(req); break; case BLK_EH_NOT_HANDLED: /* @@ -108,6 +117,23 @@ static void blk_rq_timed_out(struct request *req) } } +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, + unsigned int *next_set) +{ + if (time_after_eq(jiffies, rq->deadline)) { + list_del_init(&rq->timeout_list); + + /* + * Check if we raced with end io completion + */ + if (!blk_mark_rq_complete(rq)) + blk_rq_timed_out(rq); + } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { + *next_timeout = rq->deadline; + *next_set = 1; + } +} + void blk_rq_timed_out_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -117,21 +143,8 @@ void blk_rq_timed_out_timer(unsigned long data) spin_lock_irqsave(q->queue_lock, flags); - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { - if (time_after_eq(jiffies, rq->deadline)) { - list_del_init(&rq->timeout_list); - - /* - * Check if we raced with end io completion - */ - if (blk_mark_rq_complete(rq)) - continue; - blk_rq_timed_out(rq); - } else if (!next_set || time_after(next, rq->deadline)) { - next = rq->deadline; - next_set = 1; - } - } + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) + blk_rq_check_expired(rq, &next, &next_set); if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); @@ -157,15 +170,7 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. - */ -void blk_add_timer(struct request *req) +void __blk_add_timer(struct request *req, struct list_head *timeout_list) { struct request_queue *q = req->q; unsigned long expiry; @@ -174,7 +179,6 @@ void blk_add_timer(struct request *req) return; BUG_ON(!list_empty(&req->timeout_list)); - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); /* * Some LLDs, like scsi, peek at the timeout to prevent a @@ -184,7 +188,8 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - list_add_tail(&req->timeout_list, &q->timeout_list); + if (timeout_list) + list_add_tail(&req->timeout_list, timeout_list); /* * If the timer isn't already pending or this timeout is earlier @@ -196,5 +201,19 @@ void blk_add_timer(struct request *req) if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) mod_timer(&q->timeout, expiry); + +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + */ +void blk_add_timer(struct request *req) +{ + __blk_add_timer(req, &req->q->timeout_list); } diff --git a/block/blk.h b/block/blk.h index e837b8f..c90e1d8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -10,6 +10,7 @@ #define BLK_BATCH_REQ 32 extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); void blk_rq_timed_out_timer(unsigned long data); +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, + unsigned int *next_set); +void __blk_add_timer(struct request *req, struct list_head *timeout_list); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); + +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio); +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio); +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count); + +void blk_account_io_start(struct request *req, bool new_io); +void blk_account_io_completion(struct request *req, unsigned int bytes); +void blk_account_io_done(struct request *req); + /* * Internal atomic flags for request handling */ enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, + REQ_ATOM_STARTED, }; /* diff --git a/block/elevator.c b/block/elevator.c index 2bcbd8c..b7ff286 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name) struct elevator_type *e = NULL; int err; + /* + * q->sysfs_lock must be held to provide mutual exclusion between + * elevator_switch() and here. + */ + lockdep_assert_held(&q->sysfs_lock); + if (unlikely(q->elevator)) return 0; @@ -959,7 +965,7 @@ fail_init: /* * Switch this queue to the given IO scheduler. */ -int elevator_change(struct request_queue *q, const char *name) +static int __elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; @@ -981,6 +987,18 @@ int elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, e); } + +int elevator_change(struct request_queue *q, const char *name) +{ + int ret; + + /* Protect q->elevator from elevator_init() */ + mutex_lock(&q->sysfs_lock); + ret = __elevator_change(q, name); + mutex_unlock(&q->sysfs_lock); + + return ret; +} EXPORT_SYMBOL(elevator_change); ssize_t elv_iosched_store(struct request_queue *q, const char *name, @@ -991,7 +1009,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!q->elevator) return count; - ret = elevator_change(q, name); + ret = __elevator_change(q, name); if (!ret) return count; diff --git a/block/ioctl.c b/block/ioctl.c index a31d91d9b..7d5c3b2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -64,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); - return IS_ERR(part) ? PTR_ERR(part) : 0; + return PTR_ERR_OR_ZERO(part); case BLKPG_DEL_PARTITION: part = disk_get_part(disk, partno); if (!part) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a5ffcc9..625e3e4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -286,7 +286,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr, fmode_t mode) { unsigned long start_time; - int writing = 0, ret = 0; + ssize_t ret = 0; + int writing = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -321,37 +322,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, } if (hdr->iovec_count) { - const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *sg_iov; struct iovec *iov; - int i; - sg_iov = kmalloc(size, GFP_KERNEL); - if (!sg_iov) { - ret = -ENOMEM; + ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, + 0, NULL, &iov); + if (ret < 0) goto out; - } - - if (copy_from_user(sg_iov, hdr->dxferp, size)) { - kfree(sg_iov); - ret = -EFAULT; - goto out; - } - /* - * Sum up the vecs, making sure they don't overflow - */ - iov = (struct iovec *) sg_iov; - iov_data_len = 0; - for (i = 0; i < hdr->iovec_count; i++) { - if (iov_data_len + iov[i].iov_len < iov_data_len) { - kfree(sg_iov); - ret = -EINVAL; - goto out; - } - iov_data_len += iov[i].iov_len; - } + iov_data_len = ret; + ret = 0; /* SG_IO howto says that the shorter of the two wins */ if (hdr->dxfer_len < iov_data_len) { @@ -361,9 +341,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, + ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, + hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(sg_iov); + kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index e67fa16..5902bd0 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -15,6 +15,9 @@ menuconfig BLK_DEV if BLK_DEV +config BLK_DEV_NULL_BLK + tristate "Null test block driver" + config BLK_DEV_FD tristate "Normal floppy disk support" depends on ARCH_MAY_HAVE_PC_FDC diff --git a/drivers/block/Makefile b/drivers/block/Makefile index ca07399..03b3b4a 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o nvme-y := nvme-core.o nvme-scsi.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 9bf4371..d91f1a5 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -545,7 +545,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) mutex_lock(&brd_devices_mutex); brd = brd_init_one(MINOR(dev) >> part_shift); - kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM); + kobj = brd ? get_disk(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); *part = 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 04ceb7e..000abe2 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2886,9 +2886,9 @@ static void do_fd_request(struct request_queue *q) return; if (WARN(atomic_read(&usage_count) == 0, - "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%x\n", + "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n", current_req, (long)blk_rq_pos(current_req), current_req->cmd_type, - current_req->cmd_flags)) + (unsigned long long) current_req->cmd_flags)) return; if (test_and_set_bit(0, &fdc_busy)) { diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 40e7155..dbdb88a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1633,7 +1633,7 @@ static int loop_add(struct loop_device **l, int i) err = -ENOMEM; lo->lo_queue = blk_alloc_queue(GFP_KERNEL); if (!lo->lo_queue) - goto out_free_dev; + goto out_free_idr; disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) @@ -1678,6 +1678,8 @@ static int loop_add(struct loop_device **l, int i) out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_free_idr: + idr_remove(&loop_index_idr, i); out_free_dev: kfree(lo); out: @@ -1741,7 +1743,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) if (err < 0) err = loop_add(&lo, MINOR(dev) >> part_shift); if (err < 0) - kobj = ERR_PTR(err); + kobj = NULL; else kobj = get_disk(lo->lo_disk); mutex_unlock(&loop_index_mutex); diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c new file mode 100644 index 0000000..b5d8423 --- /dev/null +++ b/drivers/block/null_blk.c @@ -0,0 +1,635 @@ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/blk-mq.h> +#include <linux/hrtimer.h> + +struct nullb_cmd { + struct list_head list; + struct llist_node ll_list; + struct call_single_data csd; + struct request *rq; + struct bio *bio; + unsigned int tag; + struct nullb_queue *nq; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + + struct nullb_cmd *cmds; +}; + +struct nullb { + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct hrtimer timer; + unsigned int queue_depth; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; +}; + +static LIST_HEAD(nullb_list); +static struct mutex lock; +static int null_major; +static int nullb_indexes; + +struct completion_queue { + struct llist_head list; + struct hrtimer timer; +}; + +/* + * These are per-cpu for now, they will need to be configured by the + * complete_queues parameter and appropriately mapped. + */ +static DEFINE_PER_CPU(struct completion_queue, completion_queues); + +enum { + NULL_IRQ_NONE = 0, + NULL_IRQ_SOFTIRQ = 1, + NULL_IRQ_TIMER = 2, + + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + +static int submit_queues = 1; +module_param(submit_queues, int, S_IRUGO); +MODULE_PARM_DESC(submit_queues, "Number of submission queues"); + +static int home_node = NUMA_NO_NODE; +module_param(home_node, int, S_IRUGO); +MODULE_PARM_DESC(home_node, "Home node for the device"); + +static int queue_mode = NULL_Q_MQ; +module_param(queue_mode, int, S_IRUGO); +MODULE_PARM_DESC(use_mq, "Use blk-mq interface (0=bio,1=rq,2=multiqueue)"); + +static int gb = 250; +module_param(gb, int, S_IRUGO); +MODULE_PARM_DESC(gb, "Size in GB"); + +static int bs = 512; +module_param(bs, int, S_IRUGO); +MODULE_PARM_DESC(bs, "Block size (in bytes)"); + +static int nr_devices = 2; +module_param(nr_devices, int, S_IRUGO); +MODULE_PARM_DESC(nr_devices, "Number of devices to register"); + +static int irqmode = NULL_IRQ_SOFTIRQ; +module_param(irqmode, int, S_IRUGO); +MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); + +static int completion_nsec = 10000; +module_param(completion_nsec, int, S_IRUGO); +MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); + +static int hw_queue_depth = 64; +module_param(hw_queue_depth, int, S_IRUGO); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); + +static bool use_per_node_hctx = true; +module_param(use_per_node_hctx, bool, S_IRUGO); +MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: true"); + +static void put_tag(struct nullb_queue *nq, unsigned int tag) +{ + clear_bit_unlock(tag, nq->tag_map); + + if (waitqueue_active(&nq->wait)) + wake_up(&nq->wait); +} + +static unsigned int get_tag(struct nullb_queue *nq) +{ + unsigned int tag; + + do { + tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); + if (tag >= nq->queue_depth) + return -1U; + } while (test_and_set_bit_lock(tag, nq->tag_map)); + + return tag; +} + +static void free_cmd(struct nullb_cmd *cmd) +{ + put_tag(cmd->nq, cmd->tag); +} + +static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + unsigned int tag; + + tag = get_tag(nq); + if (tag != -1U) { + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->nq = nq; + return cmd; + } + + return NULL; +} + +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +{ + struct nullb_cmd *cmd; + DEFINE_WAIT(wait); + + cmd = __alloc_cmd(nq); + if (cmd || !can_wait) + return cmd; + + do { + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + cmd = __alloc_cmd(nq); + if (cmd) + break; + + io_schedule(); + } while (1); + + finish_wait(&nq->wait, &wait); + return cmd; +} + +static void end_cmd(struct nullb_cmd *cmd) +{ + if (cmd->rq) { + if (queue_mode == NULL_Q_MQ) + blk_mq_end_io(cmd->rq, 0); + else { + INIT_LIST_HEAD(&cmd->rq->queuelist); + blk_end_request_all(cmd->rq, 0); + } + } else if (cmd->bio) + bio_endio(cmd->bio, 0); + + if (queue_mode != NULL_Q_MQ) + free_cmd(cmd); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +{ + struct completion_queue *cq; + struct llist_node *entry; + struct nullb_cmd *cmd; + + cq = &per_cpu(completion_queues, smp_processor_id()); + + while ((entry = llist_del_all(&cq->list)) != NULL) { + do { + cmd = container_of(entry, struct nullb_cmd, ll_list); + end_cmd(cmd); + entry = entry->next; + } while (entry); + } + + return HRTIMER_NORESTART; +} + +static void null_cmd_end_timer(struct nullb_cmd *cmd) +{ + struct completion_queue *cq = &per_cpu(completion_queues, get_cpu()); + + cmd->ll_list.next = NULL; + if (llist_add(&cmd->ll_list, &cq->list)) { + ktime_t kt = ktime_set(0, completion_nsec); + + hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL); + } + + put_cpu(); +} + +static void null_softirq_done_fn(struct request *rq) +{ + blk_end_request_all(rq, 0); +} + +#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) + +static void null_ipi_cmd_end_io(void *data) +{ + struct completion_queue *cq; + struct llist_node *entry, *next; + struct nullb_cmd *cmd; + + cq = &per_cpu(completion_queues, smp_processor_id()); + + entry = llist_del_all(&cq->list); + + while (entry) { + next = entry->next; + cmd = llist_entry(entry, struct nullb_cmd, ll_list); + end_cmd(cmd); + entry = next; + } +} + +static void null_cmd_end_ipi(struct nullb_cmd *cmd) +{ + struct call_single_data *data = &cmd->csd; + int cpu = get_cpu(); + struct completion_queue *cq = &per_cpu(completion_queues, cpu); + + cmd->ll_list.next = NULL; + + if (llist_add(&cmd->ll_list, &cq->list)) { + data->func = null_ipi_cmd_end_io; + data->flags = 0; + __smp_call_function_single(cpu, data, 0); + } + + put_cpu(); +} + +#endif /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ + +static inline void null_handle_cmd(struct nullb_cmd *cmd) +{ + /* Complete IO by inline, softirq or timer */ + switch (irqmode) { + case NULL_IRQ_NONE: + end_cmd(cmd); + break; + case NULL_IRQ_SOFTIRQ: +#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) + null_cmd_end_ipi(cmd); +#else + end_cmd(cmd); +#endif + break; + case NULL_IRQ_TIMER: + null_cmd_end_timer(cmd); + break; + } +} + +static struct nullb_queue *nullb_to_queue(struct nullb *nullb) +{ + int index = 0; + + if (nullb->nr_queues != 1) + index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); + + return &nullb->queues[index]; +} + +static void null_queue_bio(struct request_queue *q, struct bio *bio) +{ + struct nullb *nullb = q->queuedata; + struct nullb_queue *nq = nullb_to_queue(nullb); + struct nullb_cmd *cmd; + + cmd = alloc_cmd(nq, 1); + cmd->bio = bio; + + null_handle_cmd(cmd); +} + +static int null_rq_prep_fn(struct request_queue *q, struct request *req) +{ + struct nullb *nullb = q->queuedata; + struct nullb_queue *nq = nullb_to_queue(nullb); + struct nullb_cmd *cmd; + + cmd = alloc_cmd(nq, 0); + if (cmd) { + cmd->rq = req; + req->special = cmd; + return BLKPREP_OK; + } + + return BLKPREP_DEFER; +} + +static void null_request_fn(struct request_queue *q) +{ + struct request *rq; + + while ((rq = blk_fetch_request(q)) != NULL) { + struct nullb_cmd *cmd = rq->special; + + spin_unlock_irq(q->queue_lock); + null_handle_cmd(cmd); + spin_lock_irq(q->queue_lock); + } +} + +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct nullb_cmd *cmd = rq->special; + + cmd->rq = rq; + cmd->nq = hctx->driver_data; + + null_handle_cmd(cmd); + return BLK_MQ_RQ_QUEUE_OK; +} + +static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index) +{ + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, + hctx_index); +} + +static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) +{ + kfree(hctx); +} + +static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct nullb *nullb = data; + struct nullb_queue *nq = &nullb->queues[index]; + + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; + nullb->nr_queues++; + hctx->driver_data = nq; + + return 0; +} + +static struct blk_mq_ops null_mq_ops = { + .queue_rq = null_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = null_init_hctx, +}; + +static struct blk_mq_reg null_mq_reg = { + .ops = &null_mq_ops, + .queue_depth = 64, + .cmd_size = sizeof(struct nullb_cmd), + .flags = BLK_MQ_F_SHOULD_MERGE, +}; + +static void null_del_dev(struct nullb *nullb) +{ + list_del_init(&nullb->list); + + del_gendisk(nullb->disk); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_queue(nullb->q); + else + blk_cleanup_queue(nullb->q); + put_disk(nullb->disk); + kfree(nullb); +} + +static int null_open(struct block_device *bdev, fmode_t mode) +{ + return 0; +} + +static void null_release(struct gendisk *disk, fmode_t mode) +{ +} + +static const struct block_device_operations null_fops = { + .owner = THIS_MODULE, + .open = null_open, + .release = null_release, +}; + +static int setup_commands(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + int i, tag_size; + + nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL); + if (!nq->cmds) + return 1; + + tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; + nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL); + if (!nq->tag_map) { + kfree(nq->cmds); + return 1; + } + + for (i = 0; i < nq->queue_depth; i++) { + cmd = &nq->cmds[i]; + INIT_LIST_HEAD(&cmd->list); + cmd->ll_list.next = NULL; + cmd->tag = -1U; + } + + return 0; +} + +static void cleanup_queue(struct nullb_queue *nq) +{ + kfree(nq->tag_map); + kfree(nq->cmds); +} + +static void cleanup_queues(struct nullb *nullb) +{ + int i; + + for (i = 0; i < nullb->nr_queues; i++) + cleanup_queue(&nullb->queues[i]); + + kfree(nullb->queues); +} + +static int setup_queues(struct nullb *nullb) +{ + struct nullb_queue *nq; + int i; + + nullb->queues = kzalloc(submit_queues * sizeof(*nq), GFP_KERNEL); + if (!nullb->queues) + return 1; + + nullb->nr_queues = 0; + nullb->queue_depth = hw_queue_depth; + + if (queue_mode == NULL_Q_MQ) + return 0; + + for (i = 0; i < submit_queues; i++) { + nq = &nullb->queues[i]; + init_waitqueue_head(&nq->wait); + nq->queue_depth = hw_queue_depth; + if (setup_commands(nq)) + break; + nullb->nr_queues++; + } + + if (i == submit_queues) + return 0; + + cleanup_queues(nullb); + return 1; +} + +static int null_add_dev(void) +{ + struct gendisk *disk; + struct nullb *nullb; + sector_t size; + + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); + if (!nullb) + return -ENOMEM; + + spin_lock_init(&nullb->lock); + + if (setup_queues(nullb)) + goto err; + + if (queue_mode == NULL_Q_MQ) { + null_mq_reg.numa_node = home_node; + null_mq_reg.queue_depth = hw_queue_depth; + + if (use_per_node_hctx) { + null_mq_reg.ops->alloc_hctx = null_alloc_hctx; + null_mq_reg.ops->free_hctx = null_free_hctx; + + null_mq_reg.nr_hw_queues = nr_online_nodes; + } else { + null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; + null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; + + null_mq_reg.nr_hw_queues = submit_queues; + } + + nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); + } else if (queue_mode == NULL_Q_BIO) { + nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); + blk_queue_make_request(nullb->q, null_queue_bio); + } else { + nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); + blk_queue_prep_rq(nullb->q, null_rq_prep_fn); + if (nullb->q) + blk_queue_softirq_done(nullb->q, null_softirq_done_fn); + } + + if (!nullb->q) + goto queue_fail; + + nullb->q->queuedata = nullb; + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) { +queue_fail: + if (queue_mode == NULL_Q_MQ) + blk_mq_free_queue(nullb->q); + else + blk_cleanup_queue(nullb->q); + cleanup_queues(nullb); +err: + kfree(nullb); + return -ENOMEM; + } + + mutex_lock(&lock); + list_add_tail(&nullb->list, &nullb_list); + nullb->index = nullb_indexes++; + mutex_unlock(&lock); + + blk_queue_logical_block_size(nullb->q, bs); + blk_queue_physical_block_size(nullb->q, bs); + + size = gb * 1024 * 1024 * 1024ULL; + sector_div(size, bs); + set_capacity(disk, size); + + disk->flags |= GENHD_FL_EXT_DEVT; + disk->major = null_major; + disk->first_minor = nullb->index; + disk->fops = &null_fops; + disk->private_data = nullb; + disk->queue = nullb->q; + sprintf(disk->disk_name, "nullb%d", nullb->index); + add_disk(disk); + return 0; +} + +static int __init null_init(void) +{ + unsigned int i; + +#if !defined(CONFIG_SMP) || !defined(CONFIG_USE_GENERIC_SMP_HELPERS) + if (irqmode == NULL_IRQ_SOFTIRQ) { + pr_warn("null_blk: softirq completions not available.\n"); + pr_warn("null_blk: using direct completions.\n"); + irqmode = NULL_IRQ_NONE; + } +#endif + + if (submit_queues > nr_cpu_ids) + submit_queues = nr_cpu_ids; + else if (!submit_queues) + submit_queues = 1; + + mutex_init(&lock); + + /* Initialize a separate list for each CPU for issuing softirqs */ + for_each_possible_cpu(i) { + struct completion_queue *cq = &per_cpu(completion_queues, i); + + init_llist_head(&cq->list); + + if (irqmode != NULL_IRQ_TIMER) + continue; + + hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cq->timer.function = null_cmd_timer_expired; + } + + null_major = register_blkdev(0, "nullb"); + if (null_major < 0) + return null_major; + + for (i = 0; i < nr_devices; i++) { + if (null_add_dev()) { + unregister_blkdev(null_major, "nullb"); + return -EINVAL; + } + } + + pr_info("null: module loaded\n"); + return 0; +} + +static void __exit null_exit(void) +{ + struct nullb *nullb; + + unregister_blkdev(null_major, "nullb"); + + mutex_lock(&lock); + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + null_del_dev(nullb); + } + mutex_unlock(&lock); +} + +module_init(null_init); +module_exit(null_exit); + +MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a4660bb..8d53ed2 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1336,57 +1336,6 @@ static int blkfront_probe(struct xenbus_device *dev, return 0; } -/* - * This is a clone of md_trim_bio, used to split a bio into smaller ones - */ -static void trim_bio(struct bio *bio, int offset, int size) -{ - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. - * This requires adjusting bi_sector, bi_size, and bi_io_vec - */ - int i; - struct bio_vec *bvec; - int sofar = 0; - - size <<= 9; - if (offset == 0 && size == bio->bi_size) - return; - - bio->bi_sector += offset; - bio->bi_size = size; - offset <<= 9; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - while (bio->bi_idx < bio->bi_vcnt && - bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { - /* remove this whole bio_vec */ - offset -= bio->bi_io_vec[bio->bi_idx].bv_len; - bio->bi_idx++; - } - if (bio->bi_idx < bio->bi_vcnt) { - bio->bi_io_vec[bio->bi_idx].bv_offset += offset; - bio->bi_io_vec[bio->bi_idx].bv_len -= offset; - } - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } -} - static void split_bio_end(struct bio *bio, int error) { struct split_bio *split_bio = bio->bi_private; @@ -1522,7 +1471,7 @@ static int blkif_recover(struct blkfront_info *info) (unsigned int)(bio->bi_size >> 9) - offset); cloned_bio = bio_clone(bio, GFP_NOIO); BUG_ON(cloned_bio == NULL); - trim_bio(cloned_bio, offset, size); + bio_trim(cloned_bio, offset, size); cloned_bio->bi_private = split_bio; cloned_bio->bi_end_io = split_bio_end; submit_bio(cloned_bio->bi_rw, cloned_bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index 2445fec..8766eab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -183,46 +183,6 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, } EXPORT_SYMBOL_GPL(bio_clone_mddev); -void md_trim_bio(struct bio *bio, int offset, int size) -{ - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. - * This requires adjusting bi_sector, bi_size, and bi_io_vec - */ - int i; - struct bio_vec *bvec; - int sofar = 0; - - size <<= 9; - if (offset == 0 && size == bio->bi_size) - return; - - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - bio_advance(bio, offset << 9); - - bio->bi_size = size; - - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } -} -EXPORT_SYMBOL_GPL(md_trim_bio); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat diff --git a/drivers/md/md.h b/drivers/md/md.h index b0051f2..2f5cc8a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -617,7 +617,6 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern void md_trim_bio(struct bio *bio, int offset, int size); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); static inline int mddev_check_plugged(struct mddev *mddev) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index aacf6bf..af6681b1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1097,8 +1097,8 @@ read_again: r1_bio->read_disk = rdisk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, - max_sectors); + bio_trim(read_bio, r1_bio->sector - bio->bi_sector, + max_sectors); r1_bio->bios[rdisk] = read_bio; @@ -1266,7 +1266,7 @@ read_again: continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors); if (first_clone) { /* do behind I/O ? @@ -2126,7 +2126,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio->bi_sector = r1_bio->sector; wbio->bi_size = r1_bio->sectors << 9; - md_trim_bio(wbio, sector - r1_bio->sector, sectors); + bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_sector += rdev->data_offset; wbio->bi_bdev = rdev->bdev; if (submit_bio_wait(WRITE, wbio) == 0) @@ -2241,7 +2241,7 @@ read_more: } r1_bio->read_disk = disk; bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); - md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; printk_ratelimited(KERN_ERR diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 73dc8a3..7c3508a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1302,8 +1302,8 @@ read_again: slot = r10_bio->read_slot; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, - max_sectors); + bio_trim(read_bio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1510,8 +1510,8 @@ retry_write: if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); + bio_trim(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[i].bio = mbio; mbio->bi_sector = (r10_bio->devs[i].addr+ @@ -1553,8 +1553,8 @@ retry_write: rdev = conf->mirrors[d].rdev; } mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); + bio_trim(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[i].repl_bio = mbio; mbio->bi_sector = (r10_bio->devs[i].addr + @@ -2614,7 +2614,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(wbio, sector - bio->bi_sector, sectors); + bio_trim(wbio, sector - bio->bi_sector, sectors); wbio->bi_sector = (r10_bio->devs[i].addr+ choose_data_offset(r10_bio, rdev) + (sector - r10_bio->sector)); @@ -2687,9 +2687,7 @@ read_more: (unsigned long long)r10_bio->sector); bio = bio_clone_mddev(r10_bio->master_bio, GFP_NOIO, mddev); - md_trim_bio(bio, - r10_bio->sector - bio->bi_sector, - max_sectors); + bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; bio->bi_sector = r10_bio->devs[slot].addr diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5693f6d7..7fe4faa 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1002,7 +1002,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) SCpnt->cmnd[0] = READ_6; SCpnt->sc_data_direction = DMA_FROM_DEVICE; } else { - scmd_printk(KERN_ERR, SCpnt, "Unknown command %x\n", rq->cmd_flags); + scmd_printk(KERN_ERR, SCpnt, "Unknown command %llx\n", (unsigned long long) rq->cmd_flags); goto out; } @@ -1805,6 +1805,52 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) EXPORT_SYMBOL(bio_split); /** + * bio_trim - trim a bio + * @bio: bio to trim + * @offset: number of sectors to trim from the front of @bio + * @size: size we want to trim @bio to, in sectors + */ +void bio_trim(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + * This requires adjusting bi_sector, bi_size, and bi_io_vec + */ + int i; + struct bio_vec *bvec; + int sofar = 0; + + size <<= 9; + if (offset == 0 && size == bio->bi_size) + return; + + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + bio_advance(bio, offset << 9); + + bio->bi_size = size; + + /* avoid any complications with bi_idx being non-zero*/ + if (bio->bi_idx) { + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, + (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); + bio->bi_vcnt -= bio->bi_idx; + bio->bi_idx = 0; + } + /* Make sure vcnt and last bv are not too big */ + bio_for_each_segment(bvec, bio, i) { + if (sofar + bvec->bv_len > size) + bvec->bv_len = size - sofar; + if (bvec->bv_len == 0) { + bio->bi_vcnt = i; + break; + } + sofar += bvec->bv_len; + } +} +EXPORT_SYMBOL_GPL(bio_trim); + +/** * bio_sector_offset - Find hardware sector offset in bio * @bio: bio to inspect * @index: bio_vec index diff --git a/fs/char_dev.c b/fs/char_dev.c index 94b5f60..f77f770 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -576,7 +576,8 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - bdi_init(&directly_mappable_cdev_bdi); + if (bdi_init(&directly_mappable_cdev_bdi)) + panic("Failed to init directly mappable cdev bdi"); } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index dcb8216..53d35c5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -799,7 +799,7 @@ void fscache_enqueue_object(struct fscache_object *object) */ bool fscache_object_sleep_till_congested(signed long *timeoutp) { - wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); DEFINE_WAIT(wait); if (fscache_object_congested()) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5f66d51..2481900 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,7 +109,7 @@ struct backing_dev_info { #endif }; -int bdi_init(struct backing_dev_info *bdi); +int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); __printf(3, 4) @@ -117,7 +117,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); -int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); +int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); diff --git a/include/linux/bio.h b/include/linux/bio.h index ec48bac..060ff69 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -218,6 +218,7 @@ struct bio_pair { }; extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); +extern void bio_trim(struct bio *bio, int offset, int size); extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); @@ -419,6 +420,8 @@ static inline void bio_list_init(struct bio_list *bl) bl->head = bl->tail = NULL; } +#define BIO_EMPTY_LIST { NULL, NULL } + #define bio_list_for_each(bio, bl) \ for (bio = (bl)->head; bio; bio = bio->bi_next) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h new file mode 100644 index 0000000..ab0e9b2 --- /dev/null +++ b/include/linux/blk-mq.h @@ -0,0 +1,183 @@ +#ifndef BLK_MQ_H +#define BLK_MQ_H + +#include <linux/blkdev.h> + +struct blk_mq_tags; + +struct blk_mq_cpu_notifier { + struct list_head list; + void *data; + void (*notify)(void *data, unsigned long action, unsigned int cpu); +}; + +struct blk_mq_hw_ctx { + struct { + spinlock_t lock; + struct list_head dispatch; + } ____cacheline_aligned_in_smp; + + unsigned long state; /* BLK_MQ_S_* flags */ + struct delayed_work delayed_work; + + unsigned long flags; /* BLK_MQ_F_* flags */ + + struct request_queue *queue; + unsigned int queue_num; + + void *driver_data; + + unsigned int nr_ctx; + struct blk_mq_ctx **ctxs; + unsigned int nr_ctx_map; + unsigned long *ctx_map; + + struct request **rqs; + struct list_head page_list; + struct blk_mq_tags *tags; + + unsigned long queued; + unsigned long run; +#define BLK_MQ_MAX_DISPATCH_ORDER 10 + unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; + + unsigned int queue_depth; + unsigned int numa_node; + unsigned int cmd_size; /* per-request extra data */ + + struct blk_mq_cpu_notifier cpu_notifier; + struct kobject kobj; +}; + +struct blk_mq_reg { + struct blk_mq_ops *ops; + unsigned int nr_hw_queues; + unsigned int queue_depth; + unsigned int reserved_tags; + unsigned int cmd_size; /* per-request extra data */ + int numa_node; + unsigned int timeout; + unsigned int flags; /* BLK_MQ_F_* */ +}; + +typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); +typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); +typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int); +typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); +typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); +typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); + +struct blk_mq_ops { + /* + * Queue request + */ + queue_rq_fn *queue_rq; + + /* + * Map to specific hardware queue + */ + map_queue_fn *map_queue; + + /* + * Called on request timeout + */ + rq_timed_out_fn *timeout; + + /* + * Override for hctx allocations (should probably go) + */ + alloc_hctx_fn *alloc_hctx; + free_hctx_fn *free_hctx; + + /* + * Called when the block layer side of a hardware queue has been + * set up, allowing the driver to allocate/init matching structures. + * Ditto for exit/teardown. + */ + init_hctx_fn *init_hctx; + exit_hctx_fn *exit_hctx; +}; + +enum { + BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ + BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ + BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ + + BLK_MQ_F_SHOULD_MERGE = 1 << 0, + BLK_MQ_F_SHOULD_SORT = 1 << 1, + BLK_MQ_F_SHOULD_IPI = 1 << 2, + + BLK_MQ_S_STOPPED = 1 << 0, + + BLK_MQ_MAX_DEPTH = 2048, +}; + +struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); +void blk_mq_free_queue(struct request_queue *); +int blk_mq_register_disk(struct gendisk *); +void blk_mq_unregister_disk(struct gendisk *); +void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); + +void blk_mq_insert_request(struct request_queue *, struct request *, bool); +void blk_mq_run_queues(struct request_queue *q, bool async); +void blk_mq_free_request(struct request *rq); +bool blk_mq_can_queue(struct blk_mq_hw_ctx *); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); +struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); +struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); + +struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); +struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); +void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); + +void blk_mq_end_io(struct request *rq, int error); + +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); +void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); +void blk_mq_stop_hw_queues(struct request_queue *q); +void blk_mq_start_stopped_hw_queues(struct request_queue *q); + +/* + * Driver command data is immediately after the request. So subtract request + * size to get back to the original request. + */ +static inline struct request *blk_mq_rq_from_pdu(void *pdu) +{ + return pdu - sizeof(struct request); +} +static inline void *blk_mq_rq_to_pdu(struct request *rq) +{ + return (void *) rq + sizeof(*rq); +} + +static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, + unsigned int tag) +{ + return hctx->rqs[tag]; +} + +#define queue_for_each_hw_ctx(q, hctx, i) \ + for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \ + (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i]) + +#define queue_for_each_ctx(q, ctx, i) \ + for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \ + (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i))) + +#define hctx_for_each_ctx(hctx, ctx, i) \ + for ((i) = 0, ctx = (hctx)->ctxs[0]; \ + (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)]) + +#define blk_ctx_sum(q, sum) \ +({ \ + struct blk_mq_ctx *__x; \ + unsigned int __ret = 0, __i; \ + \ + queue_for_each_ctx((q), __x, __i) \ + __ret += sum; \ + __ret; \ +}) + +#endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fa1abeb..238ef0e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -178,19 +178,20 @@ enum rq_flag_bits { __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_KERNEL, /* direct IO to kernel pages */ __REQ_PM, /* runtime pm request */ + __REQ_END, /* last of chain of requests */ __REQ_NR_BITS, /* stops here */ }; -#define REQ_WRITE (1 << __REQ_WRITE) -#define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) -#define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) -#define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) -#define REQ_SYNC (1 << __REQ_SYNC) -#define REQ_META (1 << __REQ_META) -#define REQ_PRIO (1 << __REQ_PRIO) -#define REQ_DISCARD (1 << __REQ_DISCARD) -#define REQ_WRITE_SAME (1 << __REQ_WRITE_SAME) -#define REQ_NOIDLE (1 << __REQ_NOIDLE) +#define REQ_WRITE (1ULL << __REQ_WRITE) +#define REQ_FAILFAST_DEV (1ULL << __REQ_FAILFAST_DEV) +#define REQ_FAILFAST_TRANSPORT (1ULL << __REQ_FAILFAST_TRANSPORT) +#define REQ_FAILFAST_DRIVER (1ULL << __REQ_FAILFAST_DRIVER) +#define REQ_SYNC (1ULL << __REQ_SYNC) +#define REQ_META (1ULL << __REQ_META) +#define REQ_PRIO (1ULL << __REQ_PRIO) +#define REQ_DISCARD (1ULL << __REQ_DISCARD) +#define REQ_WRITE_SAME (1ULL << __REQ_WRITE_SAME) +#define REQ_NOIDLE (1ULL << __REQ_NOIDLE) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) @@ -206,28 +207,29 @@ enum rq_flag_bits { #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) -#define REQ_RAHEAD (1 << __REQ_RAHEAD) -#define REQ_THROTTLED (1 << __REQ_THROTTLED) - -#define REQ_SORTED (1 << __REQ_SORTED) -#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) -#define REQ_FUA (1 << __REQ_FUA) -#define REQ_NOMERGE (1 << __REQ_NOMERGE) -#define REQ_STARTED (1 << __REQ_STARTED) -#define REQ_DONTPREP (1 << __REQ_DONTPREP) -#define REQ_QUEUED (1 << __REQ_QUEUED) -#define REQ_ELVPRIV (1 << __REQ_ELVPRIV) -#define REQ_FAILED (1 << __REQ_FAILED) -#define REQ_QUIET (1 << __REQ_QUIET) -#define REQ_PREEMPT (1 << __REQ_PREEMPT) -#define REQ_ALLOCED (1 << __REQ_ALLOCED) -#define REQ_COPY_USER (1 << __REQ_COPY_USER) -#define REQ_FLUSH (1 << __REQ_FLUSH) -#define REQ_FLUSH_SEQ (1 << __REQ_FLUSH_SEQ) -#define REQ_IO_STAT (1 << __REQ_IO_STAT) -#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) -#define REQ_SECURE (1 << __REQ_SECURE) -#define REQ_KERNEL (1 << __REQ_KERNEL) -#define REQ_PM (1 << __REQ_PM) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) +#define REQ_THROTTLED (1ULL << __REQ_THROTTLED) + +#define REQ_SORTED (1ULL << __REQ_SORTED) +#define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) +#define REQ_STARTED (1ULL << __REQ_STARTED) +#define REQ_DONTPREP (1ULL << __REQ_DONTPREP) +#define REQ_QUEUED (1ULL << __REQ_QUEUED) +#define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV) +#define REQ_FAILED (1ULL << __REQ_FAILED) +#define REQ_QUIET (1ULL << __REQ_QUIET) +#define REQ_PREEMPT (1ULL << __REQ_PREEMPT) +#define REQ_ALLOCED (1ULL << __REQ_ALLOCED) +#define REQ_COPY_USER (1ULL << __REQ_COPY_USER) +#define REQ_FLUSH (1ULL << __REQ_FLUSH) +#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) +#define REQ_IO_STAT (1ULL << __REQ_IO_STAT) +#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) +#define REQ_SECURE (1ULL << __REQ_SECURE) +#define REQ_KERNEL (1ULL << __REQ_KERNEL) +#define REQ_PM (1ULL << __REQ_PM) +#define REQ_END (1ULL << __REQ_END) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0e6f765..f26ec20f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -8,6 +8,7 @@ #include <linux/major.h> #include <linux/genhd.h> #include <linux/list.h> +#include <linux/llist.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/pagemap.h> @@ -94,12 +95,19 @@ enum rq_cmd_type_bits { * as well! */ struct request { - struct list_head queuelist; - struct call_single_data csd; + union { + struct list_head queuelist; + struct llist_node ll_list; + }; + union { + struct call_single_data csd; + struct work_struct mq_flush_data; + }; struct request_queue *q; + struct blk_mq_ctx *mq_ctx; - unsigned int cmd_flags; + u64 cmd_flags; enum rq_cmd_type_bits cmd_type; unsigned long atomic_flags; @@ -160,8 +168,6 @@ struct request { unsigned short ioprio; - int ref_count; - void *special; /* opaque pointer available for LLD use */ char *buffer; /* kaddr of the current segment if available */ @@ -215,6 +221,8 @@ struct request_pm_state #include <linux/elevator.h> +struct blk_queue_ctx; + typedef void (request_fn_proc) (struct request_queue *q); typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); @@ -313,6 +321,18 @@ struct request_queue { dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; + struct blk_mq_ops *mq_ops; + + unsigned int *mq_map; + + /* sw queues */ + struct blk_mq_ctx *queue_ctx; + unsigned int nr_queues; + + /* hw dispatch queues */ + struct blk_mq_hw_ctx **queue_hw_ctx; + unsigned int nr_hw_queues; + /* * Dispatch queue sorting */ @@ -361,6 +381,11 @@ struct request_queue { */ struct kobject kobj; + /* + * mq queue kobject + */ + struct kobject mq_kobj; + #ifdef CONFIG_PM_RUNTIME struct device *dev; int rpm_status; @@ -425,7 +450,13 @@ struct request_queue { unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; - struct request flush_rq; + union { + struct request flush_rq; + struct { + spinlock_t mq_flush_lock; + struct work_struct mq_flush_work; + }; + }; struct mutex sysfs_lock; @@ -437,14 +468,14 @@ struct request_queue { struct bsg_class_device bsg_dev; #endif -#ifdef CONFIG_BLK_CGROUP - struct list_head all_q_node; -#endif #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; #endif struct rcu_head rcu_head; + wait_queue_head_t mq_freeze_wq; + struct percpu_counter mq_usage_counter; + struct list_head all_q_node; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ @@ -467,6 +498,7 @@ struct request_queue { #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ +#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -539,6 +571,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) #define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) +#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) @@ -570,7 +603,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) -#define rq_data_dir(rq) ((rq)->cmd_flags & 1) +#define rq_data_dir(rq) (((rq)->cmd_flags & 1) != 0) static inline unsigned int blk_queue_cluster(struct request_queue *q) { @@ -1013,6 +1046,7 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} struct blk_plug { unsigned long magic; /* detect uninitialized use-cases */ struct list_head list; /* requests */ + struct list_head mq_list; /* blk-mq requests */ struct list_head cb_list; /* md requires an unplug callback */ }; #define BLK_MAX_REQUEST_COUNT 16 @@ -1050,7 +1084,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; - return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list)); + return plug && + (!list_empty(&plug->list) || + !list_empty(&plug->mq_list) || + !list_empty(&plug->cb_list)); } /* @@ -1325,6 +1362,7 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 7c2e030..afc1343 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -5,6 +5,7 @@ #include <linux/relay.h> #include <linux/compat.h> #include <uapi/linux/blktrace_api.h> +#include <linux/list.h> #if defined(CONFIG_BLK_DEV_IO_TRACE) @@ -23,6 +24,7 @@ struct blk_trace { struct dentry *dir; struct dentry *dropped_file; struct dentry *msg_file; + struct list_head running_list; atomic_t dropped; }; @@ -87,7 +89,7 @@ static inline int blk_trace_init_sysfs(struct device *dev) #ifdef CONFIG_COMPAT struct compat_blk_user_trace_setup { - char name[32]; + char name[BLKTRACE_BDEV_SIZE]; u16 act_mask; u32 buf_size; u32 buf_nr; diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h index 0b23edb..1900bd0 100644 --- a/include/linux/percpu_ida.h +++ b/include/linux/percpu_ida.h @@ -16,6 +16,8 @@ struct percpu_ida { * percpu_ida_init() */ unsigned nr_tags; + unsigned percpu_max_size; + unsigned percpu_batch_size; struct percpu_ida_cpu __percpu *tag_cpu; @@ -51,10 +53,29 @@ struct percpu_ida { } ____cacheline_aligned_in_smp; }; +/* + * Number of tags we move between the percpu freelist and the global freelist at + * a time + */ +#define IDA_DEFAULT_PCPU_BATCH_MOVE 32U +/* Max size of percpu freelist, */ +#define IDA_DEFAULT_PCPU_SIZE ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2) + int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp); void percpu_ida_free(struct percpu_ida *pool, unsigned tag); void percpu_ida_destroy(struct percpu_ida *pool); -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags); +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, + unsigned long max_size, unsigned long batch_size); +static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) +{ + return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE, + IDA_DEFAULT_PCPU_BATCH_MOVE); +} + +typedef int (*percpu_ida_cb)(unsigned, void *); +int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, + void *data); +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu); #endif /* __PERCPU_IDA_H__ */ diff --git a/kernel/smp.c b/kernel/smp.c index f5768b0..4611610 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -18,6 +18,7 @@ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, + CSD_FLAG_WAIT = 0x02, }; struct call_function_data { @@ -124,7 +125,7 @@ static void csd_lock(struct call_single_data *csd) static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); + WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: @@ -146,6 +147,9 @@ void generic_exec_single(int cpu, struct call_single_data *csd, int wait) unsigned long flags; int ipi; + if (wait) + csd->flags |= CSD_FLAG_WAIT; + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&csd->list, &dst->list); @@ -340,6 +344,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, } put_cpu(); } +EXPORT_SYMBOL_GPL(__smp_call_function_single); /** * smp_call_function_many(): Run a function on a set of other CPUs. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560..f785aef 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> +#include <linux/list.h> #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -107,10 +111,18 @@ record_it: * Send out a notify for this process, if we haven't done so since a trace * started */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk) { + unsigned long flags; + struct blk_trace *bt; + tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, goto record_it; } + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; - memcpy(&buts.name, &cbuts.name, 32); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; - if (copy_to_user(arg, &buts.name, 32)) { + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } @@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); + spin_lock_irq(&running_trace_lock); + list_del(&bt->running_list); + spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 93c5d5e..7473ee3 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -60,14 +60,15 @@ static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; + unsigned long flags; - raw_spin_lock(&fbc->lock); + raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); @@ -78,9 +79,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (count >= batch || count <= -batch) { - raw_spin_lock(&fbc->lock); + unsigned long flags; + raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); __this_cpu_write(*fbc->counters, 0); } else { __this_cpu_write(*fbc->counters, count); @@ -97,14 +99,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; + unsigned long flags; - raw_spin_lock(&fbc->lock); + raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index bab1ba2..b0698ea 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -30,15 +30,6 @@ #include <linux/spinlock.h> #include <linux/percpu_ida.h> -/* - * Number of tags we move between the percpu freelist and the global freelist at - * a time - */ -#define IDA_PCPU_BATCH_MOVE 32U - -/* Max size of percpu freelist, */ -#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2) - struct percpu_ida_cpu { /* * Even though this is percpu, we need a lock for tag stealing by remote @@ -78,7 +69,7 @@ static inline void steal_tags(struct percpu_ida *pool, struct percpu_ida_cpu *remote; for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags); - cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2; + cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2; cpus_have_tags--) { cpu = cpumask_next(cpu, &pool->cpus_have_tags); @@ -123,7 +114,7 @@ static inline void alloc_global_tags(struct percpu_ida *pool, { move_tags(tags->freelist, &tags->nr_free, pool->freelist, &pool->nr_free, - min(pool->nr_free, IDA_PCPU_BATCH_MOVE)); + min(pool->nr_free, pool->percpu_batch_size)); } static inline unsigned alloc_local_tag(struct percpu_ida *pool, @@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) wake_up(&pool->wait); } - if (nr_free == IDA_PCPU_SIZE) { + if (nr_free == pool->percpu_max_size) { spin_lock(&pool->lock); /* * Global lock held and irqs disabled, don't need percpu * lock */ - if (tags->nr_free == IDA_PCPU_SIZE) { + if (tags->nr_free == pool->percpu_max_size) { move_tags(pool->freelist, &pool->nr_free, tags->freelist, &tags->nr_free, - IDA_PCPU_BATCH_MOVE); + pool->percpu_batch_size); wake_up(&pool->wait); } @@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy); * Allocation is percpu, but sharding is limited by nr_tags - for best * performance, the workload should not span more cpus than nr_tags / 128. */ -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, + unsigned long max_size, unsigned long batch_size) { unsigned i, cpu, order; @@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) init_waitqueue_head(&pool->wait); spin_lock_init(&pool->lock); pool->nr_tags = nr_tags; + pool->percpu_max_size = max_size; + pool->percpu_batch_size = batch_size; /* Guard against overflow */ if (nr_tags > (unsigned) INT_MAX + 1) { @@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) pool->nr_free = nr_tags; pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) + - IDA_PCPU_SIZE * sizeof(unsigned), + pool->percpu_max_size * sizeof(unsigned), sizeof(unsigned)); if (!pool->tag_cpu) goto err; @@ -332,4 +326,65 @@ err: percpu_ida_destroy(pool); return -ENOMEM; } -EXPORT_SYMBOL_GPL(percpu_ida_init); +EXPORT_SYMBOL_GPL(__percpu_ida_init); + +/** + * percpu_ida_for_each_free - iterate free ids of a pool + * @pool: pool to iterate + * @fn: interate callback function + * @data: parameter for @fn + * + * Note, this doesn't guarantee to iterate all free ids restrictly. Some free + * ids might be missed, some might be iterated duplicated, and some might + * be iterated and not free soon. + */ +int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, + void *data) +{ + unsigned long flags; + struct percpu_ida_cpu *remote; + unsigned cpu, i, err = 0; + + local_irq_save(flags); + for_each_possible_cpu(cpu) { + remote = per_cpu_ptr(pool->tag_cpu, cpu); + spin_lock(&remote->lock); + for (i = 0; i < remote->nr_free; i++) { + err = fn(remote->freelist[i], data); + if (err) + break; + } + spin_unlock(&remote->lock); + if (err) + goto out; + } + + spin_lock(&pool->lock); + for (i = 0; i < pool->nr_free; i++) { + err = fn(pool->freelist[i], data); + if (err) + break; + } + spin_unlock(&pool->lock); +out: + local_irq_restore(flags); + return err; +} +EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); + +/** + * percpu_ida_free_tags - return free tags number of a specific cpu or global pool + * @pool: pool related + * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids + * + * Note: this just returns a snapshot of free tags number. + */ +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu) +{ + struct percpu_ida_cpu *remote; + if (cpu == nr_cpu_ids) + return pool->nr_free; + remote = per_cpu_ptr(pool->tag_cpu, cpu); + return remote->nr_free; +} +EXPORT_SYMBOL_GPL(percpu_ida_free_tags); @@ -934,7 +934,8 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - bdi_init(swapper_spaces[0].backing_dev_info); + if (bdi_init(swapper_spaces[0].backing_dev_info)) + panic("Failed to init swap bdi"); for (i = 0; i < MAX_SWAPFILES; i++) { spin_lock_init(&swapper_spaces[i].tree_lock); INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); |