diff options
-rw-r--r-- | block/blk-core.c | 20 | ||||
-rw-r--r-- | block/blk-exec.c | 2 | ||||
-rw-r--r-- | block/blk-flush.c | 101 | ||||
-rw-r--r-- | block/blk-lib.c | 8 | ||||
-rw-r--r-- | block/blk-merge.c | 91 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 2 | ||||
-rw-r--r-- | block/blk-mq.c | 143 | ||||
-rw-r--r-- | block/blk-mq.h | 4 | ||||
-rw-r--r-- | block/blk-sysfs.c | 2 | ||||
-rw-r--r-- | block/blk-timeout.c | 2 | ||||
-rw-r--r-- | block/blk.h | 2 | ||||
-rw-r--r-- | drivers/block/null_blk.c | 97 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 7 | ||||
-rw-r--r-- | drivers/block/xen-blkback/blkback.c | 66 | ||||
-rw-r--r-- | drivers/block/xen-blkback/common.h | 5 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 14 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 11 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 4 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 4 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 2 | ||||
-rw-r--r-- | fs/bio-integrity.c | 13 | ||||
-rw-r--r-- | fs/bio.c | 15 | ||||
-rw-r--r-- | include/linux/bio.h | 12 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 9 | ||||
-rw-r--r-- | include/linux/blkdev.h | 11 | ||||
-rw-r--r-- | include/xen/interface/io/blkif.h | 34 | ||||
-rw-r--r-- | lib/percpu_ida.c | 7 |
29 files changed, 397 insertions, 304 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index c00e0bd..853f927 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; + uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!uninit_q->flush_rq) + goto out_cleanup_queue; + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) - blk_cleanup_queue(uninit_q); - + goto out_free_flush_rq; return q; + +out_free_flush_rq: + kfree(uninit_q->flush_rq); +out_cleanup_queue: + blk_cleanup_queue(uninit_q); + return NULL; } EXPORT_SYMBOL(blk_init_queue_node); @@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, gfp_mask); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1278,6 +1287,11 @@ void __blk_put_request(struct request_queue *q, struct request *req) if (unlikely(!q)) return; + if (q->mq_ops) { + blk_mq_free_request(req); + return; + } + blk_pm_put_request(req); elv_completed_request(q, req); diff --git a/block/blk-exec.c b/block/blk-exec.c index bbfc072..c68613b 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be resued after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(q, rq, true); + blk_mq_insert_request(q, rq, at_head, true); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index 9288aaf..66e2b69 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_data_run(struct work_struct *work) +static void mq_flush_run(struct work_struct *work) { struct request *rq; - rq = container_of(work, struct request, mq_flush_data); + rq = container_of(work, struct request, mq_flush_work); memset(&rq->csd, 0, sizeof(rq->csd)); blk_mq_run_request(rq, true, false); } -static void blk_mq_flush_data_insert(struct request *rq) +static bool blk_flush_queue_rq(struct request *rq) { - INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_data); + if (rq->q->mq_ops) { + INIT_WORK(&rq->mq_flush_work, mq_flush_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_work); + return false; + } else { + list_add_tail(&rq->queuelist, &rq->q->queue_head); + return true; + } } /** @@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - if (q->mq_ops) - blk_mq_flush_data_insert(rq); - else { - list_add(&rq->queuelist, &q->queue_head); - queued = true; - } + queued = blk_flush_queue_rq(rq); break; case REQ_FSEQ_DONE: @@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, } kicked = blk_kick_flush(q); - /* blk_mq_run_flush will run queue */ - if (q->mq_ops) - return queued; return kicked | queued; } @@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error) struct request *rq, *n; unsigned long flags = 0; - if (q->mq_ops) { - blk_mq_free_request(flush_rq); + if (q->mq_ops) spin_lock_irqsave(&q->mq_flush_lock, flags); - } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); @@ -263,49 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error) * kblockd. */ if (queued || q->flush_queue_delayed) { - if (!q->mq_ops) - blk_run_queue_async(q); - else - /* - * This can be optimized to only run queues with requests - * queued if necessary. - */ - blk_mq_run_queues(q, true); + WARN_ON(q->mq_ops); + blk_run_queue_async(q); } q->flush_queue_delayed = 0; if (q->mq_ops) spin_unlock_irqrestore(&q->mq_flush_lock, flags); } -static void mq_flush_work(struct work_struct *work) -{ - struct request_queue *q; - struct request *rq; - - q = container_of(work, struct request_queue, mq_flush_work); - - /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ - rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, true); - rq->cmd_type = REQ_TYPE_FS; - rq->end_io = flush_end_io; - - blk_mq_run_request(rq, true, false); -} - -/* - * We can't directly use q->flush_rq, because it doesn't have tag and is not in - * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, - * so offload the work to workqueue. - * - * Note: we assume a flush request finished in any hardware queue will flush - * the whole disk cache. - */ -static void mq_run_flush(struct request_queue *q) -{ - kblockd_schedule_work(q, &q->mq_flush_work); -} - /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked @@ -340,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q) * different from running_idx, which means flush is in flight. */ q->flush_pending_idx ^= 1; + if (q->mq_ops) { - mq_run_flush(q); - return true; + struct blk_mq_ctx *ctx = first_rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + + blk_mq_rq_init(hctx, q->flush_rq); + q->flush_rq->mq_ctx = ctx; + + /* + * Reuse the tag value from the fist waiting request, + * with blk-mq the tag is generated during request + * allocation and drivers can rely on it being inside + * the range they asked for. + */ + q->flush_rq->tag = first_rq->tag; + } else { + blk_rq_init(q, q->flush_rq); } - blk_rq_init(q, &q->flush_rq); - q->flush_rq.cmd_type = REQ_TYPE_FS; - q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; - q->flush_rq.rq_disk = first_rq->rq_disk; - q->flush_rq.end_io = flush_end_io; + q->flush_rq->cmd_type = REQ_TYPE_FS; + q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq->rq_disk = first_rq->rq_disk; + q->flush_rq->end_io = flush_end_io; - list_add_tail(&q->flush_rq.queuelist, &q->queue_head); - return true; + return blk_flush_queue_rq(q->flush_rq); } static void flush_data_end_io(struct request *rq, int error) @@ -558,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush); void blk_mq_init_flush(struct request_queue *q) { spin_lock_init(&q->mq_flush_lock); - INIT_WORK(&q->mq_flush_work, mq_flush_work); } diff --git a/block/blk-lib.c b/block/blk-lib.c index 2da76c9..97a733c 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -119,6 +119,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, atomic_inc(&bb.done); submit_bio(type, bio); + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); } blk_finish_plug(&plug); diff --git a/block/blk-merge.c b/block/blk-merge.c index 8f8adaa..6c583f9 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,6 +21,16 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; + /* + * This should probably be returning 0, but blk_add_request_payload() + * (Christoph!!!!) + */ + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + fbio = bio; cluster = blk_queue_cluster(q); seg_size = 0; @@ -161,30 +171,60 @@ new_segment: *bvprv = *bvec; } -/* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries - */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist, + struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; - struct req_iterator iter; - struct scatterlist *sg; + struct bvec_iter iter; int nsegs, cluster; nsegs = 0; cluster = blk_queue_cluster(q); - /* - * for each bio in rq - */ - sg = NULL; - rq_for_each_segment(bvec, rq, iter) { - __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, - &nsegs, &cluster); - } /* segments in rq */ + if (bio->bi_rw & REQ_DISCARD) { + /* + * This is a hack - drivers should be neither modifying the + * biovec, nor relying on bi_vcnt - but because of + * blk_add_request_payload(), a discard bio may or may not have + * a payload we need to set up here (thank you Christoph) and + * bi_vcnt is really the only way of telling if we need to. + */ + + if (bio->bi_vcnt) + goto single_segment; + + return 0; + } + + if (bio->bi_rw & REQ_WRITE_SAME) { +single_segment: + *sg = sglist; + bvec = bio_iovec(bio); + sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); + return 1; + } + + for_each_bio(bio) + bio_for_each_segment(bvec, bio, iter) + __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, + &nsegs, &cluster); + return nsegs; +} + +/* + * map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries + */ +int blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist) +{ + struct scatterlist *sg = NULL; + int nsegs = 0; + + if (rq->bio) + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); if (unlikely(rq->cmd_flags & REQ_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { @@ -230,20 +270,13 @@ EXPORT_SYMBOL(blk_rq_map_sg); int blk_bio_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist) { - struct bio_vec bvec, bvprv = { NULL }; - struct scatterlist *sg; - int nsegs, cluster; - struct bvec_iter iter; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - sg = NULL; - bio_for_each_segment(bvec, bio, iter) { - __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, - &nsegs, &cluster); - } /* segments in bio */ + struct scatterlist *sg = NULL; + int nsegs; + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); + bio->bi_next = next; if (sg) sg_mark_end(sg); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5d70edc..83ae96c 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -184,7 +184,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - int cpu; + unsigned int cpu; if (!tags) return 0; diff --git a/block/blk-mq.c b/block/blk-mq.c index 57039fc..1fa9dd1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -226,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, return rq; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) { struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); + rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); if (rq) blk_mq_put_ctx(rq->mq_ctx); return rq; @@ -258,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request); /* * Re-init and set pdu, if we have it */ -static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_rq_init(hctx->queue, rq); @@ -305,7 +304,7 @@ static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) bio_endio(bio, error); } -void blk_mq_complete_request(struct request *rq, int error) +void blk_mq_end_io(struct request *rq, int error) { struct bio *bio = rq->bio; unsigned int bytes = 0; @@ -330,48 +329,55 @@ void blk_mq_complete_request(struct request *rq, int error) else blk_mq_free_request(rq); } +EXPORT_SYMBOL(blk_mq_end_io); -void __blk_mq_end_io(struct request *rq, int error) -{ - if (!blk_mark_rq_complete(rq)) - blk_mq_complete_request(rq, error); -} - -static void blk_mq_end_io_remote(void *data) +static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; - __blk_mq_end_io(rq, rq->errors); + rq->q->softirq_done_fn(rq); } -/* - * End IO on this request on a multiqueue enabled driver. We'll either do - * it directly inline, or punt to a local IPI handler on the matching - * remote CPU. - */ -void blk_mq_end_io(struct request *rq, int error) +void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; int cpu; - if (!ctx->ipi_redirect) - return __blk_mq_end_io(rq, error); + if (!ctx->ipi_redirect) { + rq->q->softirq_done_fn(rq); + return; + } cpu = get_cpu(); if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { - rq->errors = error; - rq->csd.func = blk_mq_end_io_remote; + rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; __smp_call_function_single(ctx->cpu, &rq->csd, 0); } else { - __blk_mq_end_io(rq, error); + rq->q->softirq_done_fn(rq); } put_cpu(); } -EXPORT_SYMBOL(blk_mq_end_io); -static void blk_mq_start_request(struct request *rq) +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions. + * The actual completion happens out-of-order, through a IPI handler. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (unlikely(blk_should_fake_timeout(rq->q))) + return; + if (!blk_mark_rq_complete(rq)) + __blk_mq_complete_request(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); + +static void blk_mq_start_request(struct request *rq, bool last) { struct request_queue *q = rq->q; @@ -384,6 +390,25 @@ static void blk_mq_start_request(struct request *rq) */ rq->deadline = jiffies + q->rq_timeout; set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + + if (q->dma_drain_size && blk_rq_bytes(rq)) { + /* + * Make sure space for the drain appears. We know we can do + * this because max_hw_segments has been adjusted to be one + * fewer than the device can handle. + */ + rq->nr_phys_segments++; + } + + /* + * Flag the last request in the series so that drivers know when IO + * should be kicked off, if they don't do it on a per-request basis. + * + * Note: the flag isn't the only condition drivers should do kick off. + * If drive is busy, the last request might not have the bit set. + */ + if (last) + rq->cmd_flags |= REQ_END; } static void blk_mq_requeue_request(struct request *rq) @@ -392,6 +417,11 @@ static void blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + + rq->cmd_flags &= ~REQ_END; + + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; } struct blk_mq_timeout_data { @@ -559,19 +589,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq); - /* - * Last request in the series. Flag it as such, this - * enables drivers to know when IO should be kicked off, - * if they don't do it on a per-request basis. - * - * Note: the flag isn't the only condition drivers - * should do kick off. If drive is busy, the last - * request might not have the bit set. - */ - if (list_empty(&rq_list)) - rq->cmd_flags |= REQ_END; + blk_mq_start_request(rq, list_empty(&rq_list)); ret = q->mq_ops->queue_rq(hctx, rq); switch (ret) { @@ -589,8 +608,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) break; default: pr_err("blk-mq: bad return on queue: %d\n", ret); - rq->errors = -EIO; case BLK_MQ_RQ_QUEUE_ERROR: + rq->errors = -EIO; blk_mq_end_io(rq, rq->errors); break; } @@ -693,13 +712,16 @@ static void blk_mq_work_fn(struct work_struct *work) } static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq) + struct request *rq, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; trace_block_rq_insert(hctx->queue, rq); - list_add_tail(&rq->queuelist, &ctx->rq_list); + if (at_head) + list_add(&rq->queuelist, &ctx->rq_list); + else + list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); /* @@ -709,7 +731,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, } void blk_mq_insert_request(struct request_queue *q, struct request *rq, - bool run_queue) + bool at_head, bool run_queue) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx, *current_ctx; @@ -728,7 +750,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq, rq->mq_ctx = ctx; } spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -760,7 +782,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async) /* ctx->cpu might be offline */ spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -798,7 +820,7 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); @@ -888,6 +910,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) return; @@ -950,7 +977,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) __blk_mq_free_request(hctx, ctx, rq); else { blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); @@ -1309,15 +1336,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, reg->queue_depth = BLK_MQ_MAX_DEPTH; } - /* - * Set aside a tag for flush requests. It will only be used while - * another flush request is in progress but outside the driver. - * - * TODO: only allocate if flushes are supported - */ - reg->queue_depth++; - reg->reserved_tags++; - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) return ERR_PTR(-EINVAL); @@ -1360,17 +1378,27 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, q->mq_ops = reg->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + q->sg_reserved_size = INT_MAX; + blk_queue_make_request(q, blk_mq_make_request); blk_queue_rq_timed_out(q, reg->ops->timeout); if (reg->timeout) blk_queue_rq_timeout(q, reg->timeout); + if (reg->ops->complete) + blk_queue_softirq_done(q, reg->ops->complete); + blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); - if (blk_mq_init_hw_queues(q, reg, driver_data)) + q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, + cache_line_size()), GFP_KERNEL); + if (!q->flush_rq) goto err_hw; + if (blk_mq_init_hw_queues(q, reg, driver_data)) + goto err_flush_rq; + blk_mq_map_swqueue(q); mutex_lock(&all_q_mutex); @@ -1378,6 +1406,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, mutex_unlock(&all_q_mutex); return q; + +err_flush_rq: + kfree(q->flush_rq); err_hw: kfree(q->mq_map); err_map: diff --git a/block/blk-mq.h b/block/blk-mq.h index 5c39179..ed0035c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -22,13 +22,13 @@ struct blk_mq_ctx { struct kobject kobj; }; -void __blk_mq_end_io(struct request *rq, int error); -void blk_mq_complete_request(struct request *rq, int error); +void __blk_mq_complete_request(struct request *rq); void blk_mq_run_request(struct request *rq, bool run_queue, bool async); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); /* * CPU hotplug helpers diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8095c4a..7500f87 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj) if (q->mq_ops) blk_mq_free_queue(q); + kfree(q->flush_rq); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index bba81c9..d96f7061 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -91,7 +91,7 @@ static void blk_rq_timed_out(struct request *req) case BLK_EH_HANDLED: /* Can we use req->errors here? */ if (q->mq_ops) - blk_mq_complete_request(req, req->errors); + __blk_mq_complete_request(req); else __blk_complete_request(req); break; diff --git a/block/blk.h b/block/blk.h index c90e1d8..d23b415 100644 --- a/block/blk.h +++ b/block/blk.h @@ -113,7 +113,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) q->flush_queue_delayed = 1; return NULL; } - if (unlikely(blk_queue_dying(q)) || + if (unlikely(blk_queue_bypass(q)) || !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) return NULL; } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 3107282..091b9ea 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -60,7 +60,9 @@ enum { NULL_IRQ_NONE = 0, NULL_IRQ_SOFTIRQ = 1, NULL_IRQ_TIMER = 2, +}; +enum { NULL_Q_BIO = 0, NULL_Q_RQ = 1, NULL_Q_MQ = 2, @@ -172,18 +174,20 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) static void end_cmd(struct nullb_cmd *cmd) { - if (cmd->rq) { - if (queue_mode == NULL_Q_MQ) - blk_mq_end_io(cmd->rq, 0); - else { - INIT_LIST_HEAD(&cmd->rq->queuelist); - blk_end_request_all(cmd->rq, 0); - } - } else if (cmd->bio) + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_end_io(cmd->rq, 0); + return; + case NULL_Q_RQ: + INIT_LIST_HEAD(&cmd->rq->queuelist); + blk_end_request_all(cmd->rq, 0); + break; + case NULL_Q_BIO: bio_endio(cmd->bio, 0); + break; + } - if (queue_mode != NULL_Q_MQ) - free_cmd(cmd); + free_cmd(cmd); } static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) @@ -195,6 +199,7 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) cq = &per_cpu(completion_queues, smp_processor_id()); while ((entry = llist_del_all(&cq->list)) != NULL) { + entry = llist_reverse_order(entry); do { cmd = container_of(entry, struct nullb_cmd, ll_list); end_cmd(cmd); @@ -221,61 +226,31 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_softirq_done_fn(struct request *rq) { - blk_end_request_all(rq, 0); -} - -#ifdef CONFIG_SMP - -static void null_ipi_cmd_end_io(void *data) -{ - struct completion_queue *cq; - struct llist_node *entry, *next; - struct nullb_cmd *cmd; - - cq = &per_cpu(completion_queues, smp_processor_id()); - - entry = llist_del_all(&cq->list); - - while (entry) { - next = entry->next; - cmd = llist_entry(entry, struct nullb_cmd, ll_list); - end_cmd(cmd); - entry = next; - } -} - -static void null_cmd_end_ipi(struct nullb_cmd *cmd) -{ - struct call_single_data *data = &cmd->csd; - int cpu = get_cpu(); - struct completion_queue *cq = &per_cpu(completion_queues, cpu); - - cmd->ll_list.next = NULL; - - if (llist_add(&cmd->ll_list, &cq->list)) { - data->func = null_ipi_cmd_end_io; - data->flags = 0; - __smp_call_function_single(cpu, data, 0); - } - - put_cpu(); + end_cmd(rq->special); } -#endif /* CONFIG_SMP */ - static inline void null_handle_cmd(struct nullb_cmd *cmd) { /* Complete IO by inline, softirq or timer */ switch (irqmode) { - case NULL_IRQ_NONE: - end_cmd(cmd); - break; case NULL_IRQ_SOFTIRQ: -#ifdef CONFIG_SMP - null_cmd_end_ipi(cmd); -#else + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_complete_request(cmd->rq); + break; + case NULL_Q_RQ: + blk_complete_request(cmd->rq); + break; + case NULL_Q_BIO: + /* + * XXX: no proper submitting cpu information available. + */ + end_cmd(cmd); + break; + } + break; + case NULL_IRQ_NONE: end_cmd(cmd); -#endif break; case NULL_IRQ_TIMER: null_cmd_end_timer(cmd); @@ -411,6 +386,7 @@ static struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = null_init_hctx, + .complete = null_softirq_done_fn, }; static struct blk_mq_reg null_mq_reg = { @@ -609,13 +585,6 @@ static int __init null_init(void) { unsigned int i; -#if !defined(CONFIG_SMP) - if (irqmode == NULL_IRQ_SOFTIRQ) { - pr_warn("null_blk: softirq completions not available.\n"); - pr_warn("null_blk: using direct completions.\n"); - irqmode = NULL_IRQ_NONE; - } -#endif if (bs > PAGE_SIZE) { pr_warn("null_blk: invalid block size\n"); pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6a680d4..b1cb3f4 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -110,9 +110,9 @@ static int __virtblk_add_req(struct virtqueue *vq, return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static inline void virtblk_request_done(struct virtblk_req *vbr) +static inline void virtblk_request_done(struct request *req) { - struct request *req = vbr->req; + struct virtblk_req *vbr = req->special; int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { @@ -138,7 +138,7 @@ static void virtblk_done(struct virtqueue *vq) do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { - virtblk_request_done(vbr); + blk_mq_complete_request(vbr->req); req_done = true; } if (unlikely(virtqueue_is_broken(vq))) @@ -479,6 +479,7 @@ static struct blk_mq_ops virtio_mq_ops = { .map_queue = blk_mq_map_queue, .alloc_hctx = blk_mq_alloc_single_hw_queue, .free_hctx = blk_mq_free_single_hw_queue, + .complete = virtblk_request_done, }; static struct blk_mq_reg virtio_mq_reg = { diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4b97b86..64c60ed 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -299,7 +299,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, BUG_ON(num != 0); } -static void unmap_purged_grants(struct work_struct *work) +void xen_blkbk_unmap_purged_grants(struct work_struct *work) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -375,7 +375,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean); - INIT_LIST_HEAD(&blkif->persistent_purge_list); + BUG_ON(!list_empty(&blkif->persistent_purge_list)); root = &blkif->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { @@ -420,7 +420,6 @@ finished: blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants); schedule_work(&blkif->persistent_purge_work); pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total); return; @@ -625,9 +624,23 @@ purge_gnt_list: print_stats(blkif); } - /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + /* Drain pending purge work */ + flush_work(&blkif->persistent_purge_work); + if (log_stats) + print_stats(blkif); + + blkif->xenblkd = NULL; + xen_blkif_put(blkif); + + return 0; +} + +/* + * Remove persistent grants and empty the pool of free pages + */ +void xen_blkbk_free_caches(struct xen_blkif *blkif) +{ /* Free all persistent grant pages */ if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) free_persistent_gnts(blkif, &blkif->persistent_gnts, @@ -636,13 +649,8 @@ purge_gnt_list: BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); blkif->persistent_gnt_c = 0; - if (log_stats) - print_stats(blkif); - - blkif->xenblkd = NULL; - xen_blkif_put(blkif); - - return 0; + /* Since we are shutting down remove all pages from the buffer */ + shrink_free_pagepool(blkif, 0 /* All */); } /* @@ -838,7 +846,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct grant_page **pages = pending_req->indirect_pages; struct xen_blkif *blkif = pending_req->blkif; int indirect_grefs, rc, n, nseg, i; - struct blkif_request_segment_aligned *segments = NULL; + struct blkif_request_segment *segments = NULL; nseg = pending_req->nr_pages; indirect_grefs = INDIRECT_PAGES(nseg); @@ -934,9 +942,7 @@ static void xen_blk_drain_io(struct xen_blkif *blkif) { atomic_set(&blkif->drain, 1); do { - /* The initial value is one, and one refcnt taken at the - * start of the xen_blkif_schedule thread. */ - if (atomic_read(&blkif->refcnt) <= 2) + if (atomic_read(&blkif->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -976,17 +982,30 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) * the proper response on the ring. */ if (atomic_dec_and_test(&pending_req->pendcnt)) { - xen_blkbk_unmap(pending_req->blkif, + struct xen_blkif *blkif = pending_req->blkif; + + xen_blkbk_unmap(blkif, pending_req->segments, pending_req->nr_pages); - make_response(pending_req->blkif, pending_req->id, + make_response(blkif, pending_req->id, pending_req->operation, pending_req->status); - xen_blkif_put(pending_req->blkif); - if (atomic_read(&pending_req->blkif->refcnt) <= 2) { - if (atomic_read(&pending_req->blkif->drain)) - complete(&pending_req->blkif->drain_complete); + free_req(blkif, pending_req); + /* + * Make sure the request is freed before releasing blkif, + * or there could be a race between free_req and the + * cleanup done in xen_blkif_free during shutdown. + * + * NB: The fact that we might try to wake up pending_free_wq + * before drain_complete (in case there's a drain going on) + * it's not a problem with our current implementation + * because we can assure there's no thread waiting on + * pending_free_wq if there's a drain going on, but it has + * to be taken into account if the current model is changed. + */ + if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + complete(&blkif->drain_complete); } - free_req(pending_req->blkif, pending_req); + xen_blkif_put(blkif); } } @@ -1240,6 +1259,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ xen_blkif_get(blkif); + atomic_inc(&blkif->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 8d88075..be05277 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -57,7 +57,7 @@ #define MAX_INDIRECT_SEGMENTS 256 #define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) + (PAGE_SIZE/sizeof(struct blkif_request_segment)) #define MAX_INDIRECT_PAGES \ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) #define INDIRECT_PAGES(_segs) \ @@ -278,6 +278,7 @@ struct xen_blkif { /* for barrier (drain) requests */ struct completion drain_complete; atomic_t drain; + atomic_t inflight; /* One thread per one blkif. */ struct task_struct *xenblkd; unsigned int waiting_reqs; @@ -376,6 +377,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); int xen_blkif_purge_persistent(void *arg); +void xen_blkbk_free_caches(struct xen_blkif *blkif); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); @@ -383,6 +385,7 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, int xen_blkbk_barrier(struct xenbus_transaction xbt, struct backend_info *be, int state); struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); +void xen_blkbk_unmap_purged_grants(struct work_struct *work); static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index c2014a0..9a547e6 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -125,8 +125,11 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) blkif->persistent_gnts.rb_node = NULL; spin_lock_init(&blkif->free_pages_lock); INIT_LIST_HEAD(&blkif->free_pages); + INIT_LIST_HEAD(&blkif->persistent_purge_list); blkif->free_pages_num = 0; atomic_set(&blkif->persistent_gnt_in_use, 0); + atomic_set(&blkif->inflight, 0); + INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); INIT_LIST_HEAD(&blkif->pending_free); @@ -259,6 +262,17 @@ static void xen_blkif_free(struct xen_blkif *blkif) if (!atomic_dec_and_test(&blkif->refcnt)) BUG(); + /* Remove all persistent grants and the cache of ballooned pages. */ + xen_blkbk_free_caches(blkif); + + /* Make sure everything is drained before shutting down */ + BUG_ON(blkif->persistent_gnt_c != 0); + BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); + BUG_ON(blkif->free_pages_num != 0); + BUG_ON(!list_empty(&blkif->persistent_purge_list)); + BUG_ON(!list_empty(&blkif->free_pages)); + BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); + /* Check that there is no request in use */ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { list_del(&req->free_list); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8dcfb54..efe1b47 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -162,7 +162,7 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ #define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) + (PAGE_SIZE/sizeof(struct blkif_request_segment)) #define INDIRECT_GREFS(_segs) \ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) @@ -393,7 +393,7 @@ static int blkif_queue_request(struct request *req) unsigned long id; unsigned int fsect, lsect; int i, ref, n; - struct blkif_request_segment_aligned *segments = NULL; + struct blkif_request_segment *segments = NULL; /* * Used to store if we are able to queue the request by just using @@ -550,7 +550,7 @@ static int blkif_queue_request(struct request *req) } else { n = i % SEGS_PER_INDIRECT_FRAME; segments[n] = - (struct blkif_request_segment_aligned) { + (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; @@ -1904,13 +1904,16 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateReconfiguring: case XenbusStateReconfigured: case XenbusStateUnknown: - case XenbusStateClosed: break; case XenbusStateConnected: blkfront_connect(info); break; + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's Closing state -- fallthrough */ case XenbusStateClosing: blkfront_closing(info); break; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0c707e4..a4c7306 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -210,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_RECLAIMABLE 0 #define GC_MARK_DIRTY 1 #define GC_MARK_METADATA 2 -BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); +#define GC_SECTORS_USED_SIZE 13 +#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 4f6b594..3f74b4b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -23,7 +23,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) for (k = i->start; k < bset_bkey_last(i); k = next) { next = bkey_next(k); - printk(KERN_ERR "block %u key %zi/%u: ", set, + printk(KERN_ERR "block %u key %li/%u: ", set, (uint64_t *) k - i->d, i->keys); if (b->ops->key_dump) @@ -1185,9 +1185,12 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, order); if (!out) { + struct page *outp; + BUG_ON(order > state->page_order); - out = page_address(mempool_alloc(state->pool, GFP_NOIO)); + outp = mempool_alloc(state->pool, GFP_NOIO); + out = page_address(outp); used_mempool = true; order = state->page_order; } diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 98cc0a8..5f9c2a6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1167,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) /* guard against overflow */ SET_GC_SECTORS_USED(g, min_t(unsigned, GC_SECTORS_USED(g) + KEY_SIZE(k), - (1 << 14) - 1)); + MAX_GC_SECTORS_USED)); BUG_ON(!GC_SECTORS_USED(g)); } @@ -1805,7 +1805,7 @@ static bool btree_insert_key(struct btree *b, struct bkey *k, static size_t insert_u64s_remaining(struct btree *b) { - ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); + long ret = bch_btree_keys_u64s_remaining(&b->keys); /* * Might land in the middle of an existing extent and have to split it diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 72cd213..5d5d031 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -353,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (op->bypass) - return bch_data_invalidate(cl); - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); wake_up_gc(op->c); } + if (op->bypass) + return bch_data_invalidate(cl); + /* * Journal writes are marked REQ_FLUSH; if the original write was a * flush, it'll wait on the journal write. diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c6ab693..d8458d4 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -416,7 +416,7 @@ static int btree_bset_stats(struct btree_op *b_op, struct btree *b) return MAP_CONTINUE; } -int bch_bset_print_stats(struct cache_set *c, char *buf) +static int bch_bset_print_stats(struct cache_set *c, char *buf) { struct bset_stats_op op; int ret; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 0bad24d..0129b78 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio) } EXPORT_SYMBOL(bio_integrity_free); +static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) +{ + if (bip->bip_slab == BIO_POOL_NONE) + return BIP_INLINE_VECS; + + return bvec_nr_vecs(bip->bip_slab); +} + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@ -129,7 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv; - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { + if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } @@ -226,7 +234,8 @@ unsigned int bio_integrity_tag_size(struct bio *bio) } EXPORT_SYMBOL(bio_integrity_tag_size); -int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, + int set) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -611,7 +611,6 @@ EXPORT_SYMBOL(bio_clone_fast); struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs) { - unsigned nr_iovecs = 0; struct bvec_iter iter; struct bio_vec bv; struct bio *bio; @@ -638,10 +637,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. */ - bio_for_each_segment(bv, bio_src, iter) - nr_iovecs++; - - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; @@ -650,9 +646,18 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; +integrity_clone: if (bio_integrity(bio_src)) { int ret; diff --git a/include/linux/bio.h b/include/linux/bio.h index 7065452..5a4d39b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio) struct bio_vec bv; struct bvec_iter iter; + /* + * We special case discard/write same, because they interpret bi_size + * differently: + */ + + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + bio_for_each_segment(bv, bio, iter) segs++; @@ -332,6 +343,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set *fs_bio_set; +unsigned int bio_integrity_tag_size(struct bio *bio); static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 161b231..18ba8a6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -83,6 +83,8 @@ struct blk_mq_ops { */ rq_timed_out_fn *timeout; + softirq_done_fn *complete; + /* * Override for hctx allocations (should probably go) */ @@ -119,11 +121,12 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request_queue *, struct request *, bool); +void blk_mq_insert_request(struct request_queue *, struct request *, + bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); @@ -133,6 +136,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); void blk_mq_end_io(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8678c43..4afa4f8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -98,7 +98,7 @@ struct request { struct list_head queuelist; union { struct call_single_data csd; - struct work_struct mq_flush_data; + struct work_struct mq_flush_work; }; struct request_queue *q; @@ -448,13 +448,8 @@ struct request_queue { unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; - union { - struct request flush_rq; - struct { - spinlock_t mq_flush_lock; - struct work_struct mq_flush_work; - }; - }; + struct request *flush_rq; + spinlock_t mq_flush_lock; struct mutex sysfs_lock; diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ae665ac..32ec05a 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -113,13 +113,13 @@ typedef uint64_t blkif_sector_t; * it's less than the number provided by the backend. The indirect_grefs field * in blkif_request_indirect should be filled by the frontend with the * grant references of the pages that are holding the indirect segments. - * This pages are filled with an array of blkif_request_segment_aligned - * that hold the information about the segments. The number of indirect - * pages to use is determined by the maximum number of segments - * a indirect request contains. Every indirect page can contain a maximum - * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)), - * so to calculate the number of indirect pages to use we have to do - * ceil(indirect_segments/512). + * These pages are filled with an array of blkif_request_segment that hold the + * information about the segments. The number of indirect pages to use is + * determined by the number of segments an indirect request contains. Every + * indirect page can contain a maximum of + * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to + * calculate the number of indirect pages to use we have to do + * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))). * * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* * create the "feature-max-indirect-segments" node! @@ -135,13 +135,12 @@ typedef uint64_t blkif_sector_t; #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 -struct blkif_request_segment_aligned { - grant_ref_t gref; /* reference to I/O buffer frame */ - /* @first_sect: first sector in frame to transfer (inclusive). */ - /* @last_sect: last sector in frame to transfer (inclusive). */ - uint8_t first_sect, last_sect; - uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */ -} __attribute__((__packed__)); +struct blkif_request_segment { + grant_ref_t gref; /* reference to I/O buffer frame */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + uint8_t first_sect, last_sect; +}; struct blkif_request_rw { uint8_t nr_segments; /* number of segments */ @@ -151,12 +150,7 @@ struct blkif_request_rw { #endif uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment { - grant_ref_t gref; /* reference to I/O buffer frame */ - /* @first_sect: first sector in frame to transfer (inclusive). */ - /* @last_sect: last sector in frame to transfer (inclusive). */ - uint8_t first_sect, last_sect; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; } __attribute__((__packed__)); struct blkif_request_discard { diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 7be235f..93d145e 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -54,9 +54,7 @@ static inline void move_tags(unsigned *dst, unsigned *dst_nr, /* * Try to steal tags from a remote cpu's percpu freelist. * - * We first check how many percpu freelists have tags - we don't steal tags - * unless enough percpu freelists have tags on them that it's possible more than - * half the total tags could be stuck on remote percpu freelists. + * We first check how many percpu freelists have tags * * Then we iterate through the cpus until we find some tags - we don't attempt * to find the "best" cpu to steal from, to keep cacheline bouncing to a @@ -69,8 +67,7 @@ static inline void steal_tags(struct percpu_ida *pool, struct percpu_ida_cpu *remote; for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags); - cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2; - cpus_have_tags--) { + cpus_have_tags; cpus_have_tags--) { cpu = cpumask_next(cpu, &pool->cpus_have_tags); if (cpu >= nr_cpu_ids) { |