diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 13 | ||||
-rw-r--r-- | block/bio.c | 59 | ||||
-rw-r--r-- | block/blk-cgroup.c | 9 | ||||
-rw-r--r-- | block/blk-core.c | 8 | ||||
-rw-r--r-- | block/blk-map.c | 91 | ||||
-rw-r--r-- | block/blk-merge.c | 34 | ||||
-rw-r--r-- | block/blk-mq-sysfs.c | 9 | ||||
-rw-r--r-- | block/blk-mq.c | 190 | ||||
-rw-r--r-- | block/blk-mq.h | 1 | ||||
-rw-r--r-- | block/blk-settings.c | 4 | ||||
-rw-r--r-- | block/blk-sysfs.c | 5 | ||||
-rw-r--r-- | block/cfq-iosched.c | 43 | ||||
-rw-r--r-- | block/deadline-iosched.c | 3 | ||||
-rw-r--r-- | block/ioctl.c | 38 | ||||
-rw-r--r-- | block/partition-generic.c | 29 |
15 files changed, 327 insertions, 209 deletions
diff --git a/block/Kconfig b/block/Kconfig index 161491d..0363cd7 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_DAX + bool "Block device DAX support" + depends on FS_DAX + depends on BROKEN + help + When DAX support is available (CONFIG_FS_DAX) raw block + devices can also support direct userspace access to the + storage capacity via MMAP(2) similar to a file on a + DAX-enabled filesystem. However, the DAX I/O-path disables + some standard I/O-statistics, and the MMAP(2) path has some + operational differences due to bypassing the page + cache. If in doubt, say N. + config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y diff --git a/block/bio.c b/block/bio.c index dbabd48..f124a0a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -296,13 +296,19 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); -static void bio_chain_endio(struct bio *bio) +static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_endio(parent); + if (!parent->bi_error) + parent->bi_error = bio->bi_error; bio_put(bio); + return parent; +} + +static void bio_chain_endio(struct bio *bio) +{ + bio_endio(__bio_chain_endio(bio)); } /* @@ -874,7 +880,7 @@ int submit_bio_wait(int rw, struct bio *bio) bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; submit_bio(rw, bio); - wait_for_completion(&ret.event); + wait_for_completion_io(&ret.event); return ret.error; } @@ -1090,9 +1096,12 @@ int bio_uncopy_user(struct bio *bio) if (!bio_flagged(bio, BIO_NULL_MAPPED)) { /* * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. */ - if (current->mm && bio_data_dir(bio) == READ) + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); @@ -1739,29 +1748,25 @@ static inline bool bio_remaining_done(struct bio *bio) **/ void bio_endio(struct bio *bio) { - while (bio) { - if (unlikely(!bio_remaining_done(bio))) - break; +again: + if (!bio_remaining_done(bio)) + return; - /* - * Need to have a real endio function for chained bios, - * otherwise various corner cases will break (like stacking - * block devices that save/restore bi_end_io) - however, we want - * to avoid unbounded recursion and blowing the stack. Tail call - * optimization would handle this, but compiling with frame - * pointers also disables gcc's sibling call optimization. - */ - if (bio->bi_end_io == bio_chain_endio) { - struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_put(bio); - bio = parent; - } else { - if (bio->bi_end_io) - bio->bi_end_io(bio); - bio = NULL; - } + /* + * Need to have a real endio function for chained bios, otherwise + * various corner cases will break (like stacking block devices that + * save/restore bi_end_io) - however, we want to avoid unbounded + * recursion and blowing the stack. Tail call optimization would + * handle this, but compiling with frame pointers also disables + * gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + bio = __bio_chain_endio(bio); + goto again; } + + if (bio->bi_end_io) + bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5a37188..66e6f1a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, { struct gendisk *disk; struct blkcg_gq *blkg; + struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { + owner = disk->fops->owner; put_disk(disk); + module_put(owner); return -ENODEV; } @@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ret = PTR_ERR(blkg); rcu_read_unlock(); spin_unlock_irq(disk->queue->queue_lock); + owner = disk->fops->owner; put_disk(disk); + module_put(owner); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { + struct module *owner; + spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); + owner = ctx->disk->fops->owner; put_disk(ctx->disk); + module_put(owner); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-core.c b/block/blk-core.c index 11e371e..827f8ba 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2198,7 +2198,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, true); + blk_mq_insert_request(rq, false, true, false); return 0; } @@ -2455,14 +2455,16 @@ struct request *blk_peek_request(struct request_queue *q) rq = NULL; break; - } else if (ret == BLKPREP_KILL) { + } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { + int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; + rq->cmd_flags |= REQ_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); - __blk_end_request_all(rq, -EIO); + __blk_end_request_all(rq, err); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; diff --git a/block/blk-map.c b/block/blk-map.c index f565e11..a54f054 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } +static int __blk_rq_map_user_iov(struct request *rq, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask, bool copy) +{ + struct request_queue *q = rq->q; + struct bio *bio, *orig_bio; + int ret; + + if (copy) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); + else + bio = bio_map_user_iov(q, iter, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + iov_iter_advance(iter, bio->bi_iter.bi_size); + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (ret) { + bio_endio(bio); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; + } + + return 0; +} + /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct bio *bio; - int unaligned = 0; - struct iov_iter i; struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; + bool copy = (q->dma_pad_mask & iter->count) || map_data; + struct bio *bio = NULL; + struct iov_iter i; + int ret; if (!iter || !iter->count) return -EINVAL; @@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, */ if ((uaddr & queue_dma_alignment(q)) || iovec_gap_to_prv(q, &prv, &iov)) - unaligned = 1; + copy = true; prv.iov_base = iov.iov_base; prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & iter->count) || map_data) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - - if (bio->bi_iter.bi_size != iter->count) { - /* - * Grab an extra reference to this bio, as bio_unmap_user() - * expects to be able to drop it twice as it happens on the - * normal IO completion path - */ - bio_get(bio); - bio_endio(bio); - __blk_rq_unmap_user(bio); - return -EINVAL; - } + i = *iter; + do { + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - - blk_queue_bounce(q, &bio); - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); return 0; + +unmap_rq: + __blk_rq_unmap_user(bio); + rq->bio = NULL; + return -EINVAL; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index 1699df5..2613531 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -70,6 +70,18 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); } +static inline unsigned get_max_io_size(struct request_queue *q, + struct bio *bio) +{ + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + unsigned mask = queue_logical_block_size(q) - 1; + + /* aligned to logical block size */ + sectors &= ~(mask >> 9); + + return sectors; +} + static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, @@ -81,6 +93,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, unsigned front_seg_size = bio->bi_seg_front_size; bool do_split = true; struct bio *new = NULL; + const unsigned max_sectors = get_max_io_size(q, bio); bio_for_each_segment(bv, bio, iter) { /* @@ -90,20 +103,19 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; - if (sectors + (bv.bv_len >> 9) > - blk_max_size_offset(q, bio->bi_iter.bi_sector)) { + if (sectors + (bv.bv_len >> 9) > max_sectors) { /* * Consider this a new segment if we're splitting in * the middle of this vector. */ if (nsegs < queue_max_segments(q) && - sectors < blk_max_size_offset(q, - bio->bi_iter.bi_sector)) { + sectors < max_sectors) { nsegs++; - sectors = blk_max_size_offset(q, - bio->bi_iter.bi_sector); + sectors = max_sectors; } - goto split; + if (sectors) + goto split; + /* Make this single bvec as the 1st segment */ } if (bvprvp && blk_queue_cluster(q)) { @@ -292,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { struct bio_vec end_bv = { NULL }, nxt_bv; - struct bvec_iter iter; if (!blk_queue_cluster(q)) return 0; @@ -304,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - bio_for_each_segment(end_bv, bio, iter) - if (end_bv.bv_len == iter.bi_size) - break; - - nxt_bv = bio_iovec(nxt); + bio_get_last_bvec(bio, &end_bv); + bio_get_first_bvec(nxt, &nxt_bv); if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 1cf1878..431fdda 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -408,17 +408,18 @@ void blk_mq_unregister_disk(struct gendisk *disk) blk_mq_enable_hotplug(); } +void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) +{ + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); +} + static void blk_mq_sysfs_init(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) - kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - queue_for_each_ctx(q, ctx, i) kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c0622f..050f7a1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -544,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - return tags->rqs[tag]; + if (tag < tags->nr_tags) + return tags->rqs[tag]; + + return NULL; } EXPORT_SYMBOL(blk_mq_tag_to_rq); @@ -599,8 +602,10 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) - blk_mq_complete_request(rq, -EIO); + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } return; } @@ -1742,31 +1747,6 @@ static int blk_mq_init_hctx(struct request_queue *q, return -1; } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int i; - - /* - * Initialize hardware queues - */ - queue_for_each_hw_ctx(q, hctx, i) { - if (blk_mq_init_hctx(q, set, hctx, i)) - break; - } - - if (i == q->nr_hw_queues) - return 0; - - /* - * Init failed - */ - blk_mq_exit_hw_queues(q, set, i); - - return 1; -} - static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { @@ -1824,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, continue; hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; @@ -1972,56 +1953,93 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx __percpu *ctx; - unsigned int *map; - int i; - - ctx = alloc_percpu(struct blk_mq_ctx); - if (!ctx) - return ERR_PTR(-ENOMEM); - - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - set->numa_node); - - if (!hctxs) - goto err_percpu; - - map = blk_mq_make_queue_map(set); - if (!map) - goto err_map; + int i, j; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + blk_mq_sysfs_unregister(q); for (i = 0; i < set->nr_hw_queues; i++) { - int node = blk_mq_hw_queue_to_node(map, i); + int node; + if (hctxs[i]) + continue; + + node = blk_mq_hw_queue_to_node(q->mq_map, i); hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); if (!hctxs[i]) - goto err_hctxs; + break; if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, - node)) - goto err_hctxs; + node)) { + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } atomic_set(&hctxs[i]->nr_active, 0); hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; + + if (blk_mq_init_hctx(q, set, hctxs[i], i)) { + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } + blk_mq_hctx_kobj_init(hctxs[i]); + } + for (j = i; j < q->nr_hw_queues; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + if (hctx->tags) { + blk_mq_free_rq_map(set, hctx->tags, j); + set->tags[j] = NULL; + } + blk_mq_exit_hctx(q, set, hctx, j); + free_cpumask_var(hctx->cpumask); + kobject_put(&hctx->kobj); + kfree(hctx->ctxs); + kfree(hctx); + hctxs[j] = NULL; + + } } + q->nr_hw_queues = i; + blk_mq_sysfs_register(q); +} + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + /* mark the queue as mq asap */ + q->mq_ops = set->ops; + + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!q->queue_ctx) + return ERR_PTR(-ENOMEM); + + q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), + GFP_KERNEL, set->numa_node); + if (!q->queue_hw_ctx) + goto err_percpu; + + q->mq_map = blk_mq_make_queue_map(set); + if (!q->mq_map) + goto err_map; + + blk_mq_realloc_hw_ctxs(set, q); + if (!q->nr_hw_queues) + goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = set->nr_hw_queues; - q->mq_map = map; - - q->queue_ctx = ctx; - q->queue_hw_ctx = hctxs; - q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; if (!(set->flags & BLK_MQ_F_SG_MERGE)) @@ -2048,9 +2066,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); - if (blk_mq_init_hw_queues(q, set)) - goto err_hctxs; - get_online_cpus(); mutex_lock(&all_q_mutex); @@ -2064,17 +2079,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(map); - for (i = 0; i < set->nr_hw_queues; i++) { - if (!hctxs[i]) - break; - free_cpumask_var(hctxs[i]->cpumask); - kfree(hctxs[i]); - } + kfree(q->mq_map); err_map: - kfree(hctxs); + kfree(q->queue_hw_ctx); err_percpu: - free_percpu(ctx); + free_percpu(q->queue_ctx); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -2282,9 +2291,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->nr_hw_queues = 1; set->queue_depth = min(64U, set->queue_depth); } + /* + * There is no use for more h/w queues than cpus. + */ + if (set->nr_hw_queues > nr_cpu_ids) + set->nr_hw_queues = nr_cpu_ids; - set->tags = kmalloc_node(set->nr_hw_queues * - sizeof(struct blk_mq_tags *), + set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; @@ -2307,7 +2320,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (set->tags[i]) blk_mq_free_rq_map(set, set->tags[i], i); } @@ -2328,6 +2341,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) ret = 0; queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; ret = blk_mq_tag_update_depth(hctx->tags, nr); if (ret) break; @@ -2339,6 +2354,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +{ + struct request_queue *q; + + if (nr_hw_queues > nr_cpu_ids) + nr_hw_queues = nr_cpu_ids; + if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) + return; + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue(q); + + set->nr_hw_queues = nr_hw_queues; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_realloc_hw_ctxs(set, q); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_mq_queue_reinit(q, cpu_online_mask); + } + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index eaede8e..9087b11 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -57,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); */ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); +extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); extern void blk_mq_rq_timed_out(struct request *req, bool reserved); diff --git a/block/blk-settings.c b/block/blk-settings.c index dd49735..c7bb666 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,8 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors = - BLK_SAFE_MAX_SECTORS; + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e140cc4..dd937630 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) { - unsigned long long val; - val = q->limits.max_hw_discard_sectors << 9; - return sprintf(page, "%llu\n", val); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_hw_discard_sectors << 9); } static ssize_t queue_discard_max_show(struct request_queue *q, char *page) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1f9093e..e3c591d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) return pblkg ? blkg_to_cfqg(pblkg) : NULL; } +static inline bool cfqg_is_descendant(struct cfq_group *cfqg, + struct cfq_group *ancestor) +{ + return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, + cfqg_to_blkg(ancestor)->blkcg->css.cgroup); +} + static inline void cfqg_get(struct cfq_group *cfqg) { return blkg_get(cfqg_to_blkg(cfqg)); @@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) #else /* CONFIG_CFQ_GROUP_IOSCHED */ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } +static inline bool cfqg_is_descendant(struct cfq_group *cfqg, + struct cfq_group *ancestor) +{ + return true; +} static inline void cfqg_get(struct cfq_group *cfqg) { } static inline void cfqg_put(struct cfq_group *cfqg) { } @@ -2897,6 +2909,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_rb_root *st = cfqq->service_tree; struct cfq_io_cq *cic; unsigned long sl, group_idle = 0; @@ -2947,8 +2960,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; } - /* There are other queues in the group, don't do group idle */ - if (group_idle && cfqq->cfqg->nr_cfqq > 1) + /* + * There are other queues in the group or this is the only group and + * it has too big thinktime, don't do group idle. + */ + if (group_idle && + (cfqq->cfqg->nr_cfqq > 1 || + cfq_io_thinktime_big(cfqd, &st->ttime, true))) return; cfq_mark_cfqq_wait_request(cfqq); @@ -3947,16 +3965,27 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; - if (new_cfqq->cfqg != cfqq->cfqg) + /* + * Treat ancestors of current cgroup the same way as current cgroup. + * For anybody else we disallow preemption to guarantee service + * fairness among cgroups. + */ + if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) return false; if (cfq_slice_used(cfqq)) return true; + /* + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. + */ + if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) + return true; + + WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); /* Allow preemption only if we are idling on sync-noidle tree */ if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 2 && RB_EMPTY_ROOT(&cfqq->sort_list)) return true; @@ -3967,12 +3996,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) return true; - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. - */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return true; - /* An idle queue should not be idle now for some reason */ if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) return true; diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index a753df2..d0dd788 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -39,7 +39,6 @@ struct deadline_data { */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ /* @@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); - dd->last_sector = rq_end_sector(rq); - /* * take it off the sort and fifo list, move * to dispatch queue diff --git a/block/ioctl.c b/block/ioctl.c index 77f5d177..d8996bb 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev) return true; } - -static int blkdev_daxset(struct block_device *bdev, unsigned long argp) -{ - unsigned long arg; - int rc = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (get_user(arg, (int __user *)(argp))) - return -EFAULT; - arg = !!arg; - if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) - return 0; - - if (arg) - arg = S_DAX; - - if (arg && !blkdev_dax_capable(bdev)) - return -ENOTTY; - - inode_lock(bdev->bd_inode); - if (bdev->bd_map_count == 0) - inode_set_flags(bdev->bd_inode, arg, S_DAX); - else - rc = -EBUSY; - inode_unlock(bdev->bd_inode); - return rc; -} -#else -static int blkdev_daxset(struct block_device *bdev, int arg) -{ - if (arg) - return -ENOTTY; - return 0; -} #endif static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, @@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXSET: - return blkdev_daxset(bdev, arg); case BLKDAXGET: return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); break; diff --git a/block/partition-generic.c b/block/partition-generic.c index 746935a..5d87019 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/dax.h> #include <linux/blktrace_api.h> #include "partitions/check.h" @@ -216,10 +217,21 @@ static void part_release(struct device *dev) kfree(p); } +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, "PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, + .uevent = part_uevent, }; static void delete_partition_rcu_cb(struct rcu_head *head) @@ -550,13 +562,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; + + return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; |