author     Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 14:14:23 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 14:14:23 -0800
commit     caf292ae5bb9d57198ce001d8b762f7abae3a94d
tree       5fd5d6d971503818ab2824407134cf36a80c53d0
parent     8f4385d590d4296ec38e228d17b1d002f6031dd2
parent     fcbf6a087a7e4d3f03d28333678a1010810a53c3
Merge branch 'for-3.19/core' of git://git.kernel.dk/linux-block
Pull block driver core update from Jens Axboe:
"This is the pull request for the core block IO changes for 3.19. Not
a huge round this time, mostly lots of little good fixes:
- Fix a bug in the sysfs blktrace interface that caused a NULL pointer
dereference when enabled/disabled through that API. From Arianna
Avanzini.
- Various updates/fixes/improvements for blk-mq:
- A set of updates from Bart, mostly fixing bugs in the tag
handling.
- Cleanup/code consolidation from Christoph.
- Extend the queue_rq API to be able to handle batched issuing of IO
requests; a sketch of the new interface follows this message. NVMe
will utilize this shortly. From me.
- A few tag and request handling updates from me.
- Cleanup of the preempt handling for running queues from Paolo.
- Prevent running of unmapped hardware queues from Ming Lei.
- Move the kdump memory limiting check to be in the correct
location, from Shaohua.
- Initialize all software queues at init time from Takashi. This
prevents a kobject warning when CPUs are brought online that
weren't online when a queue was registered.
- Single writeback fix for I_DIRTY clearing from Tejun. Queued with
the core IO changes, since it's just a single fix.
- Version X of the __bio_add_page() segment addition retry from
Maurizio. Hope the Xth time is the charm.
- Documentation fixup for IO scheduler merging from Jan.
- Introduce (and use) generic IO stat accounting helpers for non-rq
drivers, from Gu Zheng; a usage sketch follows the commit list
below.
- Kill off artificial limiting of max sectors in a request from
Christoph"
* 'for-3.19/core' of git://git.kernel.dk/linux-block: (26 commits)
bio: modify __bio_add_page() to accept pages that don't start a new segment
blk-mq: Fix uninitialized kobject at CPU hotplugging
blktrace: don't let the sysfs interface remove trace from running list
blk-mq: Use all available hardware queues
blk-mq: Micro-optimize bt_get()
blk-mq: Fix a race between bt_clear_tag() and bt_get()
blk-mq: Avoid that __bt_get_word() wraps multiple times
blk-mq: Fix a use-after-free
blk-mq: prevent unmapped hw queue from being scheduled
blk-mq: re-check for available tags after running the hardware queue
blk-mq: fix hang in bt_get()
blk-mq: move the kdump check to blk_mq_alloc_tag_set
blk-mq: cleanup tag free handling
blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map
blk: introduce generic io stat accounting help function
blk-mq: handle the single queue case in blk_mq_hctx_next_cpu
genhd: check for int overflow in disk_expand_part_tbl()
blk-mq: add blk_mq_free_hctx_request()
blk-mq: export blk_mq_free_request()
blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable
...
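The generic I/O stat accounting helpers introduced above live in block/bio.c and are declared in include/linux/bio.h (see the hunks below). They give bio-based drivers, which never see a struct request, the same per-partition ios/sectors/ticks/in_flight bookkeeping that request-based drivers get from the block core. Below is a hedged sketch of how a make_request-style driver might wrap one bio with them; the mydrv_ names and the synchronous completion are illustrative assumptions, not part of this merge.

/*
 * Sketch only: accounting one bio in a hypothetical bio-based driver.
 * generic_start_io_acct()/generic_end_io_acct() are the new helpers;
 * mydrv_disk and mydrv_do_io() are made-up stand-ins.
 */
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/jiffies.h>

static struct gendisk *mydrv_disk;		/* hypothetical */

static void mydrv_do_io(struct bio *bio)	{ /* perform the transfer */ }

static void mydrv_make_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	unsigned long start_time = jiffies;

	/* bumps ios[rw], sectors[rw] and in_flight for the whole disk */
	generic_start_io_acct(rw, bio_sectors(bio), &mydrv_disk->part0);

	mydrv_do_io(bio);

	/* adds the elapsed ticks and drops in_flight again */
	generic_end_io_acct(rw, &mydrv_disk->part0, start_time);

	bio_endio(bio, 0);
}

The helpers take a struct hd_struct rather than a whole disk so that partition-granular statistics keep working; a driver that knows which partition a bio targets can pass the matching hd_struct instead of part0.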
-rw-r--r--  Documentation/block/biodoc.txt     |   6
-rw-r--r--  block/bio.c                        |  82
-rw-r--r--  block/blk-core.c                   |   3
-rw-r--r--  block/blk-mq-cpumap.c              |   4
-rw-r--r--  block/blk-mq-sysfs.c               |   9
-rw-r--r--  block/blk-mq-tag.c                 |  60
-rw-r--r--  block/blk-mq.c                     | 126
-rw-r--r--  block/blk-mq.h                     |   5
-rw-r--r--  block/blk-settings.c               |   4
-rw-r--r--  block/blk-sysfs.c                  |  12
-rw-r--r--  block/genhd.c                      |  11
-rw-r--r--  drivers/block/aoe/aoeblk.c         |   2
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c  |   5
-rw-r--r--  drivers/block/null_blk.c           |  10
-rw-r--r--  drivers/block/virtio_blk.c         |   7
-rw-r--r--  drivers/scsi/scsi_lib.c            |   5
-rw-r--r--  fs/fs-writeback.c                  |  29
-rw-r--r--  include/linux/bio.h                |   5
-rw-r--r--  include/linux/blk-mq.h             |  10
-rw-r--r--  include/linux/blkdev.h             |   1
-rw-r--r--  kernel/trace/blktrace.c            |   3
21 files changed, 254 insertions(+), 145 deletions(-)
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6b972b2..5aabc08 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -942,7 +942,11 @@ elevator_allow_merge_fn called whenever the block layer determines request safely. The io scheduler may still want to stop a merge at this point if it results in some sort of conflict internally, - this hook allows it to do that. + this hook allows it to do that. Note however + that two *requests* can still be merged at later + time. Currently the io scheduler has no way to + prevent that. It can only learn about the fact + from elevator_merge_req_fn callback. elevator_dispatch_fn* fills the dispatch queue with ready requests. I/O schedulers are free to postpone requests by diff --git a/block/bio.c b/block/bio.c index 3e6e198..471d738 100644 --- a/block/bio.c +++ b/block/bio.c @@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } } + bio->bi_iter.bi_size += len; goto done; } @@ -764,29 +765,32 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page return 0; /* - * we might lose a segment or two here, but rather that than - * make this too complex. + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + bio->bi_iter.bi_size += len; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). */ - while (bio->bi_phys_segments >= queue_max_segments(q)) { + while (bio->bi_phys_segments > queue_max_segments(q)) { if (retried_segments) - return 0; + goto failed; retried_segments = 1; blk_recount_segments(q, bio); } /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* * if queue has other restrictions (eg varying max sector size * depending on offset), it can specify a merge_bvec_fn in the * queue to get further control @@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size, + .bi_size = bio->bi_iter.bi_size - len, .bi_rw = bio->bi_rw, }; @@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) + goto failed; } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); - bio->bi_vcnt++; - bio->bi_phys_segments++; done: - bio->bi_iter.bi_size += len; return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + bio->bi_iter.bi_size -= len; + blk_recount_segments(q, bio); + return 0; } /** @@ -1739,6 +1745,34 @@ void bio_check_pages_dirty(struct bio *bio) } } +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part) +{ + 
int cpu = part_stat_lock(); + + part_round_stats(cpu, part); + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, sectors[rw], sectors); + part_inc_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_start_io_acct); + +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time) +{ + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_end_io_acct); + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { diff --git a/block/blk-core.c b/block/blk-core.c index ea1c4d0..30f6153 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + if (q->mq_ops) + blk_mq_free_queue(q); + spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 1065d7c..5f13f4d 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -17,7 +17,7 @@ static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, const int cpu) { - return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); + return cpu * nr_queues / nr_cpus; } static int get_first_sibling(unsigned int cpu) @@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) unsigned int *map; /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, set->numa_node); if (!map) return NULL; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 371d880..1630a20 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; - int i, j; + int i; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) { + queue_for_each_hw_ctx(q, hctx, i) kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - hctx_for_each_ctx(hctx, ctx, j) - kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); - } + queue_for_each_ctx(q, ctx, i) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } /* see blk_register_queue() */ diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 728b9a4..e3d4e40 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; + bool wrap = last_tag != 0; org_last_tag = last_tag; end = bm->depth; @@ -148,15 +149,16 @@ restart: * We started with an offset, start from 0 to * exhaust the map. 
*/ - if (org_last_tag && last_tag) { - end = last_tag; + if (wrap) { + wrap = false; + end = org_last_tag; last_tag = 0; goto restart; } return -1; } last_tag = tag + 1; - } while (test_and_set_bit_lock(tag, &bm->word)); + } while (test_and_set_bit(tag, &bm->word)); return tag; } @@ -246,14 +248,29 @@ static int bt_get(struct blk_mq_alloc_data *data, if (!(data->gfp & __GFP_WAIT)) return -1; - bs = bt_wait_ptr(bt, hctx); do { + bs = bt_wait_ptr(bt, hctx); prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag); if (tag != -1) break; + /* + * We're out of tags on this hardware queue, kick any + * pending IO submits before going to sleep waiting for + * some to complete. + */ + blk_mq_run_hw_queue(hctx, false); + + /* + * Retry tag allocation after running the hardware queue, + * as running the queue may also have found completions. + */ + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + blk_mq_put_ctx(data->ctx); io_schedule(); @@ -268,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data, hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); } while (1); finish_wait(&bs->wait, &wait); @@ -340,11 +355,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) struct bt_wait_state *bs; int wait_cnt; - /* - * The unlock memory barrier need to order access to req in free - * path and clearing tag bit - */ - clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); bs = bt_wake_ptr(bt); if (!bs) @@ -360,21 +374,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) } } -static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) -{ - BUG_ON(tag >= tags->nr_tags); - - bt_clear_tag(&tags->bitmap_tags, tag); -} - -static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, - unsigned int tag) -{ - BUG_ON(tag >= tags->nr_reserved_tags); - - bt_clear_tag(&tags->breserved_tags, tag); -} - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { @@ -383,10 +382,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; - __blk_mq_put_tag(tags, real_tag); + BUG_ON(real_tag >= tags->nr_tags); + bt_clear_tag(&tags->bitmap_tags, real_tag); *last_tag = real_tag; - } else - __blk_mq_put_reserved_tag(tags, tag); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + bt_clear_tag(&tags->breserved_tags, tag); + } } static void bt_for_each(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq.c b/block/blk-mq.c index 92ceef0..da1ab56 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, blk_mq_queue_exit(q); } -void blk_mq_free_request(struct request *rq) +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; ctx->rq_completed[rq_is_sync(rq)]++; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); + +} +EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + 
blk_mq_free_hctx_request(hctx, rq); } +EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, int error) { @@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv) * If not software queues are currently mapped to this * hardware queue, there's nothing to check */ - if (!hctx->nr_ctx || !hctx->tags) + if (!blk_mq_hw_queue_mapped(hctx)) continue; blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); @@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct request *rq; LIST_HEAD(rq_list); + LIST_HEAD(driver_list); + struct list_head *dptr; int queued; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); @@ -716,16 +726,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } /* + * Start off with dptr being NULL, so we start the first request + * immediately, even if we have more pending. + */ + dptr = NULL; + + /* * Now process all the entries, sending them to the driver. */ queued = 0; while (!list_empty(&rq_list)) { + struct blk_mq_queue_data bd; int ret; rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); + bd.rq = rq; + bd.list = dptr; + bd.last = list_empty(&rq_list); + + ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; + + /* + * We've done the first request. If we have more than 1 + * left in the list, set dptr to defer issue. + */ + if (!dptr && rq_list.next != rq_list.prev) + dptr = &driver_list; } if (!queued) @@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { - int cpu = hctx->next_cpu; + if (hctx->queue->nr_hw_queues == 1) + return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int next_cpu; + int cpu = hctx->next_cpu, next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + + return cpu; } - return cpu; + return hctx->next_cpu; } void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || + !blk_mq_hw_queue_mapped(hctx))) return; - if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) - __blk_mq_run_hw_queue(hctx); - else if (hctx->queue->nr_hw_queues == 1) - kblockd_schedule_delayed_work(&hctx->run_work, 0); - else { - unsigned int cpu; + if (!async) { + int cpu = get_cpu(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); + put_cpu(); + return; + } - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); + put_cpu(); } + + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work, 0); } void blk_mq_run_queues(struct request_queue *q, bool async) @@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); @@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx 
*hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, false); - preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { - unsigned long tmo = msecs_to_jiffies(msecs); - - if (hctx->queue->nr_hw_queues == 1) - kblockd_schedule_delayed_work(&hctx->delay_work, tmo); - else { - unsigned int cpu; + if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + return; - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); - } + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delay_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_queue); @@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } - if (is_sync) { + /* + * If the driver supports defer issued based on 'last', then + * queue it up like normal since we can potentially save some + * CPU this way. + */ + if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; int ret; blk_mq_bio_to_request(rq, bio); @@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq, true); + ret = q->mq_ops->queue_rq(data.hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { @@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); - /* - * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. - */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->queue_depth = min(64U, set->queue_depth); - } - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); @@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. 
+ */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); diff --git a/block/blk-mq.h b/block/blk-mq.h index d567d52..206230e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) +{ + return hctx->nr_ctx && hctx->tags; +} + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index aa02247..6ed2cbe 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ __func__, max_hw_sectors); } - limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; } EXPORT_SYMBOL(blk_limits_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1fac434..935ea2a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) * Currently, its primary task it to free all the &struct request * structures that were allocated to the queue and the queue itself. * - * Caveat: - * Hopefully the low level driver will have finished any - * outstanding requests first... + * Note: + * The low level driver must have finished any outstanding requests first + * via blk_cleanup_queue(). **/ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - blk_sync_queue(q); - blkcg_exit_queue(q); if (q->elevator) { @@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - if (q->mq_ops) - blk_mq_free_queue(q); - else + if (!q->mq_ops) blk_free_flush_queue(q->fq); blk_trace_shutdown(q); diff --git a/block/genhd.c b/block/genhd.c index bd30606..0a536dc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; - int target = partno + 1; + int i, target; size_t size; - int i; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. 
+ */ + target = partno + 1; + if (target < 0) + return -EINVAL; /* disk_max_parts() is zero during initialization, ignore if so */ if (disk_max_parts(disk) && target > disk_max_parts(disk)) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index dd73e1f..46c282f 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(q, 1024); q->backing_dev_info.name = "aoe"; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; d->bufpool = mp; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1bd5f52..3bd7ca9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *rq = bd->rq; int ret; if (unlikely(mtip_check_unal_depth(hctx, rq))) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8001e812..caa6121 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - cmd->rq = rq; + cmd->rq = bd->rq; cmd->nq = hctx->driver_data; - blk_mq_start_request(rq); + blk_mq_start_request(bd->rq); null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1fb9e09..7ef7c09 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -159,10 +159,11 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; @@ -223,7 +224,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, return BLK_MQ_RQ_QUEUE_ERROR; } - if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) notify = true; spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 7e3d954..43318d5 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1947,9 +1947,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 
ef9bef1..2d609a5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * write_inode() */ spin_lock(&inode->i_lock); - /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state &= ~I_DIRTY_PAGES; + dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + inode->i_state &= ~I_DIRTY; + + /* + * Paired with smp_mb() in __mark_inode_dirty(). This allows + * __mark_inode_dirty() to test i_state without grabbing i_lock - + * either they see the I_DIRTY bits cleared or we see the dirtied + * inode. + * + * I_DIRTY_PAGES is always cleared together above even if @mapping + * still has dirty pages. The flag is reinstated after smp_mb() if + * necessary. This guarantees that either __mark_inode_dirty() + * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. + */ + smp_mb(); + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + inode->i_state |= I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); + /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) } /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * Paired with smp_mb() in __writeback_single_inode() for the + * following lockless i_state test. See there for details. */ smp_mb(); - /* avoid the locking if we can */ if ((inode->i_state & flags) == flags) return; diff --git a/include/linux/bio.h b/include/linux/bio.h index 7347f48..efead0b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part); +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time); + #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 15f7034a..8aded9a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -79,7 +79,13 @@ struct blk_mq_tag_set { struct list_head tag_list; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); +struct blk_mq_queue_data { + struct request *rq; + struct list_head *list; + bool last; +}; + +typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); @@ -140,6 +146,7 @@ enum { BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SYSFS_UP = 1 << 3, + BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, @@ -162,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct 
blk_mq_hw_ctx *); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0495e38..92f4b4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1184,7 +1184,6 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, - BLK_DEF_MAX_SECTORS = 1024, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11b9cb3..483cecf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1477,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - spin_lock_irq(&running_trace_lock); - list_del(&bt->running_list); - spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } |