diff options
265 files changed, 5912 insertions, 6237 deletions
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01ddeaf..9490f28 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -632,7 +632,7 @@ to i/o submission, if the bio fields are likely to be accessed after the i/o is issued (since the bio may otherwise get freed in case i/o completion happens in the meantime). -The bio_clone() routine may be used to duplicate a bio, where the clone +The bio_clone_fast() routine may be used to duplicate a bio, where the clone shares the bio_vec_list with the original bio (i.e. both point to the same bio_vec_list). This would typically be used for splitting i/o requests in lvm or md. diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index 6702630..144809a 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/device.h> +#include <linux/blkdev.h> struct arqb { u64 data; @@ -105,13 +106,14 @@ struct scm_driver { int (*probe) (struct scm_device *scmdev); int (*remove) (struct scm_device *scmdev); void (*notify) (struct scm_device *scmdev, enum scm_event event); - void (*handler) (struct scm_device *scmdev, void *data, int error); + void (*handler) (struct scm_device *scmdev, void *data, + blk_status_t error); }; int scm_driver_register(struct scm_driver *scmdrv); void scm_driver_unregister(struct scm_driver *scmdrv); int eadm_start_aob(struct aob *aob); -void scm_irq_handler(struct aob *aob, int error); +void scm_irq_handler(struct aob *aob, blk_status_t error); #endif /* _ASM_S390_EADM_H */ diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 8541027..b55fe9b 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -534,7 +534,7 @@ static void ubd_handler(void) for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { blk_end_request( (*irq_req_buffer)[count]->req, - 0, + BLK_STS_OK, (*irq_req_buffer)[count]->length ); kfree((*irq_req_buffer)[count]); diff --git a/block/badblocks.c b/block/badblocks.c index 6ebcef2..43c7116 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -533,6 +533,7 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, case 3: if (newline != '\n') return -EINVAL; + /* fall through */ case 2: if (length <= 0) return -EINVAL; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ed93da2..12bbc6b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -725,8 +725,12 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } static void -bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) { + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + if (bic->saved_idle_window) bfq_mark_bfqq_idle_window(bfqq); else @@ -754,6 +758,14 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) /* make sure weight will be updated, however we got here */ bfqq->entity.prio_changed = 1; + + if (likely(!busy)) + return; + + if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; + else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) + bfqd->wr_busy_queues--; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -4290,10 +4302,16 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_put_rq_private(struct request_queue *q, struct request *rq) +static void bfq_finish_request(struct request *rq) { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_queue *bfqq; + struct bfq_data *bfqd; + + if (!rq->elv.icq) + return; + + bfqq = RQ_BFQQ(rq); + bfqd = bfqq->bfqd; if (rq->rq_flags & RQF_STARTED) bfqg_stats_update_completion(bfqq_group(bfqq), @@ -4324,7 +4342,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) */ if (!RB_EMPTY_NODE(&rq->rb_node)) - bfq_remove_request(q, rq); + bfq_remove_request(rq->q, rq); bfq_put_rq_priv_body(bfqq); } @@ -4394,20 +4412,21 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, /* * Allocate bfq data structures associated with this request. */ -static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - struct bio *bio) +static void bfq_prepare_request(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; bool new_queue = false; - bool split = false; + bool bfqq_already_existing = false, split = false; - spin_lock_irq(&bfqd->lock); + if (!rq->elv.icq) + return; + bic = icq_to_bic(rq->elv.icq); - if (!bic) - goto queue_fail; + spin_lock_irq(&bfqd->lock); bfq_check_ioprio_change(bic, bio); @@ -4432,6 +4451,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + else + bfqq_already_existing = true; } } @@ -4457,7 +4478,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, * queue: restore the idle window and the * possible weight raising period. */ - bfq_bfqq_resume_state(bfqq, bic); + bfq_bfqq_resume_state(bfqq, bfqd, bic, + bfqq_already_existing); } } @@ -4465,13 +4487,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfq_handle_burst(bfqd, bfqq); spin_unlock_irq(&bfqd->lock); - - return 0; - -queue_fail: - spin_unlock_irq(&bfqd->lock); - - return 1; } static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) @@ -4950,8 +4965,8 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { - .get_rq_priv = bfq_get_rq_private, - .put_rq_priv = bfq_put_rq_private, + .prepare_request = bfq_prepare_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bio-integrity.c b/block/bio-integrity.c index b5009a8..b8a3a65 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -224,7 +224,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, * @bio: bio to generate/verify integrity metadata for * @proc_fn: Pointer to the relevant processing function */ -static int bio_integrity_process(struct bio *bio, +static blk_status_t bio_integrity_process(struct bio *bio, integrity_processing_fn *proc_fn) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -232,7 +232,7 @@ static int bio_integrity_process(struct bio *bio, struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); - unsigned int ret = 0; + blk_status_t ret = BLK_STS_OK; void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; @@ -369,7 +369,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); + bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; @@ -398,7 +398,7 @@ void bio_integrity_endio(struct bio *bio) * integrity metadata. Restore original bio end_io handler * and run it. */ - if (bio->bi_error) { + if (bio->bi_status) { bio->bi_end_io = bip->bip_end_io; bio_endio(bio); diff --git a/block/bio.c b/block/bio.c index 26b0810..1cfcd0d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -315,8 +315,8 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_error) - parent->bi_error = bio->bi_error; + if (!parent->bi_status) + parent->bi_status = bio->bi_status; bio_put(bio); return parent; } @@ -369,6 +369,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs) struct bio_list punt, nopunt; struct bio *bio; + if (WARN_ON_ONCE(!bs->rescue_workqueue)) + return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on @@ -480,7 +482,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (current->bio_list && (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1]))) + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); @@ -550,7 +553,7 @@ EXPORT_SYMBOL(zero_fill_bio); * * Description: * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. + * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { @@ -599,6 +602,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; @@ -682,6 +686,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, return NULL; bio->bi_bdev = bio_src->bi_bdev; bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -924,7 +929,7 @@ static void submit_bio_wait_endio(struct bio *bio) { struct submit_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->error = blk_status_to_errno(bio->bi_status); complete(&ret->event); } @@ -1823,8 +1828,8 @@ again: } if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), - bio, bio->bi_error); + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, + blk_status_to_errno(bio->bi_status)); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1927,9 +1932,29 @@ void bioset_free(struct bio_set *bs) } EXPORT_SYMBOL(bioset_free); -static struct bio_set *__bioset_create(unsigned int pool_size, - unsigned int front_pad, - bool create_bvec_pool) +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS + * and %BIOSET_NEED_RESCUER + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated + * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to + * dispatch queued requests when the mempool runs out of space. + * + */ +struct bio_set *bioset_create(unsigned int pool_size, + unsigned int front_pad, + int flags) { unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); struct bio_set *bs; @@ -1954,12 +1979,15 @@ static struct bio_set *__bioset_create(unsigned int pool_size, if (!bs->bio_pool) goto bad; - if (create_bvec_pool) { + if (flags & BIOSET_NEED_BVECS) { bs->bvec_pool = biovec_create_pool(pool_size); if (!bs->bvec_pool) goto bad; } + if (!(flags & BIOSET_NEED_RESCUER)) + return bs; + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; @@ -1969,41 +1997,8 @@ bad: bioset_free(bs); return NULL; } - -/** - * bioset_create - Create a bio_set - * @pool_size: Number of bio and bio_vecs to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller - * to ask for a number of bytes to be allocated in front of the bio. - * Front pad allocation is useful for embedding the bio inside - * another structure, to avoid allocating extra data to go with the bio. - * Note that the bio must be embedded at the END of that structure always, - * or things will break badly. - */ -struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) -{ - return __bioset_create(pool_size, front_pad, true); -} EXPORT_SYMBOL(bioset_create); -/** - * bioset_create_nobvec - Create a bio_set without bio_vec mempool - * @pool_size: Number of bio to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Same functionality as bioset_create() except that mempool is not - * created for bio_vecs. Saving some memory for bio_clone_fast() users. - */ -struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad) -{ - return __bioset_create(pool_size, front_pad, false); -} -EXPORT_SYMBOL(bioset_create_nobvec); - #ifdef CONFIG_BLK_CGROUP /** @@ -2118,7 +2113,7 @@ static int __init init_bio(void) bio_integrity_init(); biovec_init_slabs(); - fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!fs_bio_set) panic("bio: can't allocate bios\n"); diff --git a/block/blk-core.c b/block/blk-core.c index a7421b7..af393d5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -129,11 +129,70 @@ void blk_rq_init(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_rq_init); +static const struct { + int errno; + const char *name; +} blk_errors[] = { + [BLK_STS_OK] = { 0, "" }, + [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, + [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, + [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, + [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, + [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, + [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, + [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, + [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + + /* device mapper special case, should not leak out: */ + [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + + /* everything else not covered above: */ + [BLK_STS_IOERR] = { -EIO, "I/O" }, +}; + +blk_status_t errno_to_blk_status(int errno) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { + if (blk_errors[i].errno == errno) + return (__force blk_status_t)i; + } + + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(errno_to_blk_status); + +int blk_status_to_errno(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return -EIO; + return blk_errors[idx].errno; +} +EXPORT_SYMBOL_GPL(blk_status_to_errno); + +static void print_req_error(struct request *req, blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return; + + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", + __func__, blk_errors[idx].name, req->rq_disk ? + req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req)); +} + static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) + unsigned int nbytes, blk_status_t error) { if (error) - bio->bi_error = error; + bio->bi_status = error; if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); @@ -177,10 +236,13 @@ static void blk_delay_work(struct work_struct *work) * Description: * Sometimes queueing needs to be postponed for a little while, to allow * resources to come back. This function will make sure that queueing is - * restarted around the specified time. Queue lock must be held. + * restarted around the specified time. */ void blk_delay_queue(struct request_queue *q, unsigned long msecs) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (likely(!blk_queue_dead(q))) queue_delayed_work(kblockd_workqueue, &q->delay_work, msecs_to_jiffies(msecs)); @@ -198,6 +260,9 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue_async(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + queue_flag_clear(QUEUE_FLAG_STOPPED, q); blk_run_queue_async(q); } @@ -210,11 +275,13 @@ EXPORT_SYMBOL(blk_start_queue_async); * Description: * blk_start_queue() will clear the stop flag on the queue, and call * the request_fn for the queue if it was in a stopped state when - * entered. Also see blk_stop_queue(). Queue lock must be held. + * entered. Also see blk_stop_queue(). **/ void blk_start_queue(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); WARN_ON(!irqs_disabled()); + WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); @@ -233,10 +300,13 @@ EXPORT_SYMBOL(blk_start_queue); * or if it simply chooses not to queue more I/O at one point, it can * call this function to prevent the request_fn from being called until * the driver has signalled it's ready to go again. This happens by calling - * blk_start_queue() to restart queue operations. Queue lock must be held. + * blk_start_queue() to restart queue operations. **/ void blk_stop_queue(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } @@ -289,6 +359,9 @@ EXPORT_SYMBOL(blk_sync_queue); */ inline void __blk_run_queue_uncond(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (unlikely(blk_queue_dead(q))) return; @@ -310,11 +383,13 @@ EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); * @q: The queue to run * * Description: - * See @blk_run_queue. This variant must be called with the queue lock - * held and interrupts disabled. + * See @blk_run_queue. */ void __blk_run_queue(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (unlikely(blk_queue_stopped(q))) return; @@ -328,10 +403,18 @@ EXPORT_SYMBOL(__blk_run_queue); * * Description: * Tells kblockd to perform the equivalent of @blk_run_queue on behalf - * of us. The caller must hold the queue lock. + * of us. + * + * Note: + * Since it is not allowed to run q->delay_work after blk_cleanup_queue() + * has canceled q->delay_work, callers must hold the queue lock to avoid + * race conditions between blk_cleanup_queue() and blk_run_queue_async(). */ void blk_run_queue_async(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); } @@ -349,6 +432,8 @@ void blk_run_queue(struct request_queue *q) { unsigned long flags; + WARN_ON_ONCE(q->mq_ops); + spin_lock_irqsave(q->queue_lock, flags); __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); @@ -377,6 +462,7 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) int i; lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); while (true) { bool drain = false; @@ -455,6 +541,8 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) */ void blk_queue_bypass_start(struct request_queue *q) { + WARN_ON_ONCE(q->mq_ops); + spin_lock_irq(q->queue_lock); q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); @@ -481,6 +569,9 @@ EXPORT_SYMBOL_GPL(blk_queue_bypass_start); * @q: queue of interest * * Leave bypass mode and restore the normal queueing behavior. + * + * Note: although blk_queue_bypass_start() is only called for blk-sq queues, + * this function is called for both blk-sq and blk-mq queues. */ void blk_queue_bypass_end(struct request_queue *q) { @@ -732,7 +823,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q->id < 0) goto fail_q; - q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!q->bio_split) goto fail_id; @@ -878,6 +969,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); int blk_init_allocated_queue(struct request_queue *q) { + WARN_ON_ONCE(q->mq_ops); + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size); if (!q->fq) return -ENOMEM; @@ -1015,6 +1108,8 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) struct request_list *rl; int on_thresh, off_thresh; + WARN_ON_ONCE(q->mq_ops); + spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); @@ -1077,6 +1172,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, int may_queue; req_flags_t rq_flags = RQF_ALLOCED; + lockdep_assert_held(q->queue_lock); + if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); @@ -1250,12 +1347,20 @@ static struct request *get_request(struct request_queue *q, unsigned int op, struct request_list *rl; struct request *rq; + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; + if (op & REQ_NOWAIT) { + blk_put_rl(rl); + return ERR_PTR(-EAGAIN); + } + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; @@ -1283,16 +1388,18 @@ retry: goto retry; } -static struct request *blk_old_get_request(struct request_queue *q, int rw, - gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, + unsigned int op, gfp_t gfp_mask) { struct request *rq; + WARN_ON_ONCE(q->mq_ops); + /* create ioc upfront */ create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, NULL, gfp_mask); + rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1305,14 +1412,24 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, return rq; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) { - if (q->mq_ops) - return blk_mq_alloc_request(q, rw, + struct request *req; + + if (q->mq_ops) { + req = blk_mq_alloc_request(q, op, (gfp_mask & __GFP_DIRECT_RECLAIM) ? 0 : BLK_MQ_REQ_NOWAIT); - else - return blk_old_get_request(q, rw, gfp_mask); + if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) + q->mq_ops->initialize_rq_fn(req); + } else { + req = blk_old_get_request(q, op, gfp_mask); + if (!IS_ERR(req) && q->initialize_rq_fn) + q->initialize_rq_fn(req); + } + + return req; } EXPORT_SYMBOL(blk_get_request); @@ -1328,6 +1445,9 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_requeue_request(struct request_queue *q, struct request *rq) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); @@ -1402,9 +1522,6 @@ static void blk_pm_put_request(struct request *rq) static inline void blk_pm_put_request(struct request *rq) {} #endif -/* - * queue lock must be held - */ void __blk_put_request(struct request_queue *q, struct request *req) { req_flags_t rq_flags = req->rq_flags; @@ -1417,6 +1534,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) return; } + lockdep_assert_held(q->queue_lock); + blk_pm_put_request(req); elv_completed_request(q, req); @@ -1646,6 +1765,7 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio) req->ioprio = ioc->ioprio; else req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->write_hint = bio->bi_write_hint; blk_rq_bio_prep(req->q, req, bio); } EXPORT_SYMBOL_GPL(blk_init_request_from_bio); @@ -1665,10 +1785,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } @@ -1726,7 +1846,10 @@ get_rq: req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { __wbt_done(q->rq_wb, wb_acct); - bio->bi_error = PTR_ERR(req); + if (PTR_ERR(req) == -ENOMEM) + bio->bi_status = BLK_STS_RESOURCE; + else + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); goto out_unlock; } @@ -1881,7 +2004,7 @@ generic_make_request_checks(struct bio *bio) { struct request_queue *q; int nr_sectors = bio_sectors(bio); - int err = -EIO; + blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; struct hd_struct *part; @@ -1900,6 +2023,14 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + /* + * For a REQ_NOWAIT based request, return -EOPNOTSUPP + * if queue is not a request based queue. + */ + + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) + goto not_supported; + part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1924,7 +2055,7 @@ generic_make_request_checks(struct bio *bio) !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { - err = 0; + status = BLK_STS_OK; goto end_io; } } @@ -1976,9 +2107,9 @@ generic_make_request_checks(struct bio *bio) return true; not_supported: - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; end_io: - bio->bi_error = err; + bio->bi_status = status; bio_endio(bio); return false; } @@ -2057,7 +2188,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, false) == 0)) { + if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2082,7 +2213,11 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { - bio_io_error(bio); + if (unlikely(!blk_queue_dying(q) && + (bio->bi_opf & REQ_NOWAIT))) + bio_wouldblock_error(bio); + else + bio_io_error(bio); } bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); @@ -2183,29 +2318,29 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, * @q: the queue to submit the request * @rq: the request being queued */ -int blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; int where = ELEVATOR_INSERT_BACK; if (blk_cloned_rq_check_limits(q, rq)) - return -EIO; + return BLK_STS_IOERR; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) - return -EIO; + return BLK_STS_IOERR; if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); blk_mq_sched_insert_request(rq, false, true, false, false); - return 0; + return BLK_STS_OK; } spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); - return -ENODEV; + return BLK_STS_IOERR; } /* @@ -2222,7 +2357,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); - return 0; + return BLK_STS_OK; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); @@ -2238,9 +2373,6 @@ EXPORT_SYMBOL_GPL(blk_insert_cloned_request); * * Return: * The number of bytes to fail. - * - * Context: - * queue_lock must be held. */ unsigned int blk_rq_err_bytes(const struct request *rq) { @@ -2380,15 +2512,15 @@ void blk_account_io_start(struct request *rq, bool new_io) * Return: * Pointer to the request at the top of @q if available. Null * otherwise. - * - * Context: - * queue_lock must be held. */ struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + while ((rq = __elv_next_request(q)) != NULL) { rq = blk_pm_peek_request(q, rq); @@ -2456,15 +2588,14 @@ struct request *blk_peek_request(struct request_queue *q) rq = NULL; break; } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { - int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; - rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); - __blk_end_request_all(rq, err); + __blk_end_request_all(rq, ret == BLKPREP_INVALID ? + BLK_STS_TARGET : BLK_STS_IOERR); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; @@ -2505,12 +2636,12 @@ void blk_dequeue_request(struct request *rq) * * Block internal functions which don't want to start timer should * call blk_dequeue_request(). - * - * Context: - * queue_lock must be held. */ void blk_start_request(struct request *req) { + lockdep_assert_held(req->q->queue_lock); + WARN_ON_ONCE(req->q->mq_ops); + blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { @@ -2535,14 +2666,14 @@ EXPORT_SYMBOL(blk_start_request); * Return: * Pointer to the request at the top of @q if available. Null * otherwise. - * - * Context: - * queue_lock must be held. */ struct request *blk_fetch_request(struct request_queue *q) { struct request *rq; + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + rq = blk_peek_request(q); if (rq) blk_start_request(rq); @@ -2553,7 +2684,7 @@ EXPORT_SYMBOL(blk_fetch_request); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete @req * * Description: @@ -2572,49 +2703,19 @@ EXPORT_SYMBOL(blk_fetch_request); * %false - this request doesn't have any more data * %true - this request has more data **/ -bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) { int total_bytes; - trace_block_rq_complete(req, error, nr_bytes); + trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); if (!req->bio) return false; - if (error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET)) { - char *error_type; - - switch (error) { - case -ENOLINK: - error_type = "recoverable transport"; - break; - case -EREMOTEIO: - error_type = "critical target"; - break; - case -EBADE: - error_type = "critical nexus"; - break; - case -ETIMEDOUT: - error_type = "timeout"; - break; - case -ENOSPC: - error_type = "critical space allocation"; - break; - case -ENODATA: - error_type = "critical medium"; - break; - case -EIO: - default: - error_type = "I/O"; - break; - } - printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", - __func__, error_type, req->rq_disk ? - req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); - - } + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET))) + print_req_error(req, error); blk_account_io_completion(req, nr_bytes); @@ -2680,7 +2781,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) } EXPORT_SYMBOL_GPL(blk_update_request); -static bool blk_update_bidi_request(struct request *rq, int error, +static bool blk_update_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { @@ -2718,13 +2819,13 @@ void blk_unprep_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_unprep_request); -/* - * queue lock must be held - */ -void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, blk_status_t error) { struct request_queue *q = req->q; + lockdep_assert_held(req->q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (req->rq_flags & RQF_STATS) blk_stat_add(req); @@ -2758,7 +2859,7 @@ EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request * @rq: the request to complete - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * @@ -2772,12 +2873,14 @@ EXPORT_SYMBOL(blk_finish_request); * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool blk_end_bidi_request(struct request *rq, int error, +static bool blk_end_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; + WARN_ON_ONCE(q->mq_ops); + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; @@ -2791,7 +2894,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, /** * __blk_end_bidi_request - Complete a bidi request with queue lock held * @rq: the request to complete - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * @@ -2803,9 +2906,12 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool __blk_end_bidi_request(struct request *rq, int error, +static bool __blk_end_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { + lockdep_assert_held(rq->q->queue_lock); + WARN_ON_ONCE(rq->q->mq_ops); + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; @@ -2817,7 +2923,7 @@ static bool __blk_end_bidi_request(struct request *rq, int error, /** * blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete * * Description: @@ -2828,8 +2934,10 @@ static bool __blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +bool blk_end_request(struct request *rq, blk_status_t error, + unsigned int nr_bytes) { + WARN_ON_ONCE(rq->q->mq_ops); return blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(blk_end_request); @@ -2837,12 +2945,12 @@ EXPORT_SYMBOL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @error: block status code * * Description: * Completely finish @rq. */ -void blk_end_request_all(struct request *rq, int error) +void blk_end_request_all(struct request *rq, blk_status_t error) { bool pending; unsigned int bidi_bytes = 0; @@ -2858,7 +2966,7 @@ EXPORT_SYMBOL(blk_end_request_all); /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete * * Description: @@ -2868,8 +2976,12 @@ EXPORT_SYMBOL(blk_end_request_all); * %false - we are done with this request * %true - still buffers pending for this request **/ -bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +bool __blk_end_request(struct request *rq, blk_status_t error, + unsigned int nr_bytes) { + lockdep_assert_held(rq->q->queue_lock); + WARN_ON_ONCE(rq->q->mq_ops); + return __blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(__blk_end_request); @@ -2877,16 +2989,19 @@ EXPORT_SYMBOL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @error: block status code * * Description: * Completely finish @rq. Must be called with queue lock held. */ -void __blk_end_request_all(struct request *rq, int error) +void __blk_end_request_all(struct request *rq, blk_status_t error) { bool pending; unsigned int bidi_bytes = 0; + lockdep_assert_held(rq->q->queue_lock); + WARN_ON_ONCE(rq->q->mq_ops); + if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); @@ -2898,7 +3013,7 @@ EXPORT_SYMBOL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error + * @error: block status code * * Description: * Complete the current consecutively mapped chunk from @rq. Must @@ -2908,7 +3023,7 @@ EXPORT_SYMBOL(__blk_end_request_all); * %false - we are done with this request * %true - still buffers pending for this request */ -bool __blk_end_request_cur(struct request *rq, int error) +bool __blk_end_request_cur(struct request *rq, blk_status_t error) { return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } @@ -3151,6 +3266,8 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, bool from_schedule) __releases(q->queue_lock) { + lockdep_assert_held(q->queue_lock); + trace_block_unplug(q, depth, !from_schedule); if (from_schedule) @@ -3249,7 +3366,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) * Short-circuit if @q is dead */ if (unlikely(blk_queue_dying(q))) { - __blk_end_request_all(rq, -ENODEV); + __blk_end_request_all(rq, BLK_STS_IOERR); continue; } diff --git a/block/blk-exec.c b/block/blk-exec.c index a9451e3..5c0f3dc 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -16,7 +16,7 @@ * @rq: request to complete * @error: end I/O status of the request */ -static void blk_end_sync_rq(struct request *rq, int error) +static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data; @@ -69,7 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; - __blk_end_request_all(rq, -ENXIO); + __blk_end_request_all(rq, BLK_STS_IOERR); spin_unlock_irq(q->queue_lock); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index c4e0880..ed5fe32 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -164,7 +164,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) */ static bool blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, - unsigned int seq, int error) + unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; @@ -216,7 +216,7 @@ static bool blk_flush_complete_seq(struct request *rq, return kicked | queued; } -static void flush_end_io(struct request *flush_rq, int error) +static void flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -341,11 +341,13 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) return blk_flush_queue_rq(flush_rq, false); } -static void flush_data_end_io(struct request *rq, int error) +static void flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + lockdep_assert_held(q->queue_lock); + /* * Updating q->in_flight[] here for making this tag usable * early. Because in blk_queue_start_tag(), @@ -382,7 +384,7 @@ static void flush_data_end_io(struct request *rq, int error) blk_run_queue_async(q); } -static void mq_flush_data_end_io(struct request *rq, int error) +static void mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx; @@ -411,9 +413,6 @@ static void mq_flush_data_end_io(struct request *rq, int error) * or __blk_mq_run_hw_queue() to dispatch request. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. - * - * CONTEXT: - * spin_lock_irq(q->queue_lock) in !mq case */ void blk_insert_flush(struct request *rq) { @@ -422,6 +421,9 @@ void blk_insert_flush(struct request *rq) unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + if (!q->mq_ops) + lockdep_assert_held(q->queue_lock); + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 0f891a9..feb3057 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -384,9 +384,9 @@ static struct kobj_type integrity_ktype = { .sysfs_ops = &integrity_ops, }; -static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) +static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) { - return 0; + return BLK_STS_OK; } static const struct blk_integrity_profile nop_profile = { diff --git a/block/blk-map.c b/block/blk-map.c index 3b5cb86..2547016 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -16,6 +16,8 @@ */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { + blk_queue_bounce(rq->q, &bio); + if (!rq->bio) { blk_rq_bio_prep(rq->q, rq, bio); } else { @@ -72,15 +74,13 @@ static int __blk_rq_map_user_iov(struct request *rq, map_data->offset += bio->bi_iter.bi_size; orig_bio = bio; - blk_queue_bounce(q, &bio); /* * We link the bounce buffer in and could have to traverse it * later so we have to get a ref to prevent it from being freed */ - bio_get(bio); - ret = blk_rq_append_bio(rq, bio); + bio_get(bio); if (ret) { bio_endio(bio); __blk_rq_unmap_user(orig_bio); @@ -249,7 +249,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return ret; } - blk_queue_bounce(q, &rq->bio); return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3990ae4..99038830 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -108,31 +108,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); - unsigned bvecs = 0; bio_for_each_segment(bv, bio, iter) { /* - * With arbitrary bio size, the incoming bio may be very - * big. We have to split the bio into small bios so that - * each holds at most BIO_MAX_PAGES bvecs because - * bio_clone() can fail to allocate big bvecs. - * - * It should have been better to apply the limit per - * request queue in which bio_clone() is involved, - * instead of globally. The biggest blocker is the - * bio_clone() in bio bounce. - * - * If bio is splitted by this reason, we should have - * allowed to continue bios merging, but don't do - * that now for making the change simple. - * - * TODO: deal with bio bounce's bio_clone() gracefully - * and convert the global limit into per-queue limit. - */ - if (bvecs++ >= BIO_MAX_PAGES) - goto split; - - /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ @@ -202,8 +180,7 @@ split: return do_split ? new : NULL; } -void blk_queue_split(struct request_queue *q, struct bio **bio, - struct bio_set *bs) +void blk_queue_split(struct request_queue *q, struct bio **bio) { struct bio *split, *res; unsigned nsegs; @@ -211,13 +188,13 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, switch (bio_op(*bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - split = blk_bio_discard_split(q, *bio, bs, &nsegs); + split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs); break; case REQ_OP_WRITE_ZEROES: - split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs); + split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs); break; case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, bs, &nsegs); + split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs); break; default: split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); @@ -671,6 +648,9 @@ static void blk_account_io_merge(struct request *req) static struct request *attempt_merge(struct request_queue *q, struct request *req, struct request *next) { + if (!q->mq_ops) + lockdep_assert_held(q->queue_lock); + if (!rq_mergeable(req) || !rq_mergeable(next)) return NULL; @@ -693,6 +673,13 @@ static struct request *attempt_merge(struct request_queue *q, return NULL; /* + * Don't allow merge of different write hints, or for a hint with + * non-hint IO. + */ + if (req->write_hint != next->write_hint) + return NULL; + + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn * will have updated segment counts, update sector @@ -811,6 +798,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; + /* + * Don't allow merge of different write hints, or for a hint with + * non-hint IO. + */ + if (rq->write_hint != bio->bi_write_hint) + return false; + return true; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8e61e86..2cca4fc 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -14,10 +14,15 @@ #include "blk.h" #include "blk-mq.h" -static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, - const int cpu) +static int cpu_to_queue_index(unsigned int nr_queues, const int cpu, + const struct cpumask *online_mask) { - return cpu * nr_queues / nr_cpus; + /* + * Non online CPU will be mapped to queue index 0. + */ + if (!cpumask_test_cpu(cpu, online_mask)) + return 0; + return cpu % nr_queues; } static int get_first_sibling(unsigned int cpu) @@ -36,55 +41,26 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set) unsigned int *map = set->mq_map; unsigned int nr_queues = set->nr_hw_queues; const struct cpumask *online_mask = cpu_online_mask; - unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; - cpumask_var_t cpus; - - if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) - return -ENOMEM; - - cpumask_clear(cpus); - nr_cpus = nr_uniq_cpus = 0; - for_each_cpu(i, online_mask) { - nr_cpus++; - first_sibling = get_first_sibling(i); - if (!cpumask_test_cpu(first_sibling, cpus)) - nr_uniq_cpus++; - cpumask_set_cpu(i, cpus); - } - - queue = 0; - for_each_possible_cpu(i) { - if (!cpumask_test_cpu(i, online_mask)) { - map[i] = 0; - continue; - } + unsigned int cpu, first_sibling; + for_each_possible_cpu(cpu) { /* - * Easy case - we have equal or more hardware queues. Or - * there are no thread siblings to take into account. Do - * 1:1 if enough, or sequential mapping if less. + * First do sequential mapping between CPUs and queues. + * In case we still have CPUs to map, and we have some number of + * threads per cores then map sibling threads to the same queue for + * performace optimizations. */ - if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { - map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); - queue++; - continue; + if (cpu < nr_queues) { + map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask); + } else { + first_sibling = get_first_sibling(cpu); + if (first_sibling == cpu) + map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask); + else + map[cpu] = map[first_sibling]; } - - /* - * Less then nr_cpus queues, and we have some number of - * threads per cores. Map sibling threads to the same - * queue. - */ - first_sibling = get_first_sibling(i); - if (first_sibling == i) { - map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, - queue); - queue++; - } else - map[i] = map[first_sibling]; } - free_cpumask_var(cpus); return 0; } EXPORT_SYMBOL_GPL(blk_mq_map_queues); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 803aed4..9ebc294 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -114,10 +114,12 @@ static ssize_t queue_state_write(void *data, const char __user *buf, blk_mq_run_hw_queues(q, true); } else if (strcmp(op, "start") == 0) { blk_mq_start_stopped_hw_queues(q, true); + } else if (strcmp(op, "kick") == 0) { + blk_mq_kick_requeue_list(q); } else { pr_err("%s: unsupported operation '%s'\n", __func__, op); inval: - pr_err("%s: use either 'run' or 'start'\n", __func__); + pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); return -EINVAL; } return count; @@ -133,6 +135,29 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) } } +static int queue_write_hint_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + int i; + + for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) + seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]); + + return 0; +} + +static ssize_t queue_write_hint_store(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct request_queue *q = data; + int i; + + for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) + q->write_hints[i] = 0; + + return count; +} + static int queue_poll_stat_show(void *data, struct seq_file *m) { struct request_queue *q = data; @@ -267,6 +292,14 @@ static const char *const rqf_name[] = { }; #undef RQF_NAME +#define RQAF_NAME(name) [REQ_ATOM_##name] = #name +static const char *const rqaf_name[] = { + RQAF_NAME(COMPLETE), + RQAF_NAME(STARTED), + RQAF_NAME(POLL_SLEPT), +}; +#undef RQAF_NAME + int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -283,6 +316,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); + seq_puts(m, ", .atomic_flags="); + blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) @@ -298,6 +333,37 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); +static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) + __acquires(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_lock_irq(&q->requeue_lock); + return seq_list_start(&q->requeue_list, *pos); +} + +static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + + return seq_list_next(v, &q->requeue_list, pos); +} + +static void queue_requeue_list_stop(struct seq_file *m, void *v) + __releases(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_unlock_irq(&q->requeue_lock); +} + +static const struct seq_operations queue_requeue_list_seq_ops = { + .start = queue_requeue_list_start, + .next = queue_requeue_list_next, + .stop = queue_requeue_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { @@ -329,6 +395,36 @@ static const struct seq_operations hctx_dispatch_seq_ops = { .show = blk_mq_debugfs_rq_show, }; +struct show_busy_params { + struct seq_file *m; + struct blk_mq_hw_ctx *hctx; +}; + +/* + * Note: the state of a request may change while this function is in progress, + * e.g. due to a concurrent blk_mq_finish_request() call. + */ +static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +{ + const struct show_busy_params *params = data; + + if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && + test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + __blk_mq_debugfs_rq_show(params->m, + list_entry_rq(&rq->queuelist)); +} + +static int hctx_busy_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct show_busy_params params = { .m = m, .hctx = hctx }; + + blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, + ¶ms); + + return 0; +} + static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -655,7 +751,9 @@ const struct file_operations blk_mq_debugfs_fops = { static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { {"poll_stat", 0400, queue_poll_stat_show}, + {"requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops}, {"state", 0600, queue_state_show, queue_state_write}, + {"write_hints", 0600, queue_write_hint_show, queue_write_hint_store}, {}, }; @@ -663,6 +761,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, + {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0ded5e8..7f0dc48 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -31,11 +31,10 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); -static void __blk_mq_sched_assign_ioc(struct request_queue *q, - struct request *rq, - struct bio *bio, - struct io_context *ioc) +void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; + struct io_context *ioc = rq_ioc(bio); struct io_cq *icq; spin_lock_irq(q->queue_lock); @@ -47,25 +46,8 @@ static void __blk_mq_sched_assign_ioc(struct request_queue *q, if (!icq) return; } - + get_io_context(icq->ioc); rq->elv.icq = icq; - if (!blk_mq_sched_get_rq_priv(q, rq, bio)) { - rq->rq_flags |= RQF_ELVPRIV; - get_io_context(icq->ioc); - return; - } - - rq->elv.icq = NULL; -} - -static void blk_mq_sched_assign_ioc(struct request_queue *q, - struct request *rq, struct bio *bio) -{ - struct io_context *ioc; - - ioc = rq_ioc(bio); - if (ioc) - __blk_mq_sched_assign_ioc(q, rq, bio, ioc); } /* @@ -107,71 +89,6 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -struct request *blk_mq_sched_get_request(struct request_queue *q, - struct bio *bio, - unsigned int op, - struct blk_mq_alloc_data *data) -{ - struct elevator_queue *e = q->elevator; - struct request *rq; - - blk_queue_enter_live(q); - data->q = q; - if (likely(!data->ctx)) - data->ctx = blk_mq_get_ctx(q); - if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->ctx->cpu); - - if (e) { - data->flags |= BLK_MQ_REQ_INTERNAL; - - /* - * Flush requests are special and go directly to the - * dispatch list. - */ - if (!op_is_flush(op) && e->type->ops.mq.get_request) { - rq = e->type->ops.mq.get_request(q, op, data); - if (rq) - rq->rq_flags |= RQF_QUEUED; - } else - rq = __blk_mq_alloc_request(data, op); - } else { - rq = __blk_mq_alloc_request(data, op); - } - - if (rq) { - if (!op_is_flush(op)) { - rq->elv.icq = NULL; - if (e && e->type->icq_cache) - blk_mq_sched_assign_ioc(q, rq, bio); - } - data->hctx->queued++; - return rq; - } - - blk_queue_exit(q); - return NULL; -} - -void blk_mq_sched_put_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - - if (rq->rq_flags & RQF_ELVPRIV) { - blk_mq_sched_put_rq_priv(rq->q, rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } - - if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) - e->type->ops.mq.put_request(rq); - else - blk_mq_finish_request(rq); -} - void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -180,7 +97,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) bool did_work = false; LIST_HEAD(rq_list); - if (unlikely(blk_mq_hctx_stopped(hctx))) + /* RCU or SRCU read lock is needed before checking quiesced flag */ + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; hctx->run++; @@ -260,19 +178,73 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. + */ +static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_ctx *ctx, struct bio *bio) +{ + struct request *rq; + int checked = 8; + + lockdep_assert_held(&ctx->lock); + + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { + bool merged = false; + + if (!checked--) + break; + + if (!blk_rq_merge_ok(rq, bio)) + continue; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_back_merge(q, rq, bio); + break; + case ELEVATOR_FRONT_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_front_merge(q, rq, bio); + break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + continue; + } + + if (merged) + ctx->rq_merged++; + return merged; + } + + return false; +} + bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + bool ret = false; - if (e->type->ops.mq.bio_merge) { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - + if (e && e->type->ops.mq.bio_merge) { blk_mq_put_ctx(ctx); return e->type->ops.mq.bio_merge(hctx, bio); } - return false; + if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { + /* default per sw-queue merge */ + spin_lock(&ctx->lock); + ret = blk_mq_attempt_merge(q, ctx, bio); + spin_unlock(&ctx->lock); + } + + blk_mq_put_ctx(ctx); + return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5007ede..9267d0b7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -7,8 +7,7 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)); -struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data); -void blk_mq_sched_put_request(struct request *rq); +void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, @@ -38,35 +37,12 @@ int blk_mq_sched_init(struct request_queue *q); static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { - struct elevator_queue *e = q->elevator; - - if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return false; return __blk_mq_sched_bio_merge(q, bio); } -static inline int blk_mq_sched_get_rq_priv(struct request_queue *q, - struct request *rq, - struct bio *bio) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.mq.get_rq_priv) - return e->type->ops.mq.get_rq_priv(q, rq, bio); - - return 0; -} - -static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, - struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.mq.put_rq_priv) - e->type->ops.mq.put_rq_priv(q, rq); -} - static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/block/blk-mq.c b/block/blk-mq.c index 958ceda..05dfa3f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -42,7 +42,6 @@ static LIST_HEAD(all_q_list); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); -static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync); static int blk_mq_poll_stats_bkt(const struct request *rq) { @@ -154,13 +153,28 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/* + * FIXME: replace the scsi_internal_device_*block_nowait() calls in the + * mpt3sas driver such that this function can be removed. + */ +void blk_mq_quiesce_queue_nowait(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); + /** - * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished + * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() - * callback function is invoked. Additionally, it is not prevented that - * new queue_rq() calls occur unless the queue has been stopped first. + * callback function is invoked. Once this function is returned, we make + * sure no dispatch can happen until the queue is unquiesced via + * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { @@ -168,11 +182,11 @@ void blk_mq_quiesce_queue(struct request_queue *q) unsigned int i; bool rcu = false; - __blk_mq_stop_hw_queues(q, true); + blk_mq_quiesce_queue_nowait(q); queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(&hctx->queue_rq_srcu); + synchronize_srcu(hctx->queue_rq_srcu); else rcu = true; } @@ -181,6 +195,26 @@ void blk_mq_quiesce_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); +/* + * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() + * @q: request queue. + * + * This function recovers queue into the state before quiescing + * which is done by blk_mq_quiesce_queue. + */ +void blk_mq_unquiesce_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + /* dispatch requests which are inserted during quiescing */ + blk_mq_run_hw_queues(q, true); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -204,15 +238,33 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, unsigned int op) +static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, + unsigned int tag, unsigned int op) { + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct request *rq = tags->static_rqs[tag]; + + rq->rq_flags = 0; + + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + rq->tag = tag; + rq->internal_tag = -1; + data->hctx->tags->rqs[rq->tag] = rq; + } + INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = q; - rq->mq_ctx = ctx; + rq->q = data->q; + rq->mq_ctx = data->ctx; rq->cmd_flags = op; - if (blk_queue_io_stat(q)) + if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; @@ -241,44 +293,60 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[op_is_sync(op)]++; + data->ctx->rq_dispatched[op_is_sync(op)]++; + return rq; } -EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, - unsigned int op) +static struct request *blk_mq_get_request(struct request_queue *q, + struct bio *bio, unsigned int op, + struct blk_mq_alloc_data *data) { + struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - tag = blk_mq_get_tag(data); - if (tag != BLK_MQ_TAG_FAIL) { - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + blk_queue_enter_live(q); + data->q = q; + if (likely(!data->ctx)) + data->ctx = blk_mq_get_ctx(q); + if (likely(!data->hctx)) + data->hctx = blk_mq_map_queue(q, data->ctx->cpu); + if (op & REQ_NOWAIT) + data->flags |= BLK_MQ_REQ_NOWAIT; - rq = tags->static_rqs[tag]; + if (e) { + data->flags |= BLK_MQ_REQ_INTERNAL; - if (data->flags & BLK_MQ_REQ_INTERNAL) { - rq->tag = -1; - rq->internal_tag = tag; - } else { - if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } - rq->tag = tag; - rq->internal_tag = -1; - data->hctx->tags->rqs[rq->tag] = rq; - } + /* + * Flush requests are special and go directly to the + * dispatch list. + */ + if (!op_is_flush(op) && e->type->ops.mq.limit_depth) + e->type->ops.mq.limit_depth(op, data); + } - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); - return rq; + tag = blk_mq_get_tag(data); + if (tag == BLK_MQ_TAG_FAIL) { + blk_queue_exit(q); + return NULL; } - return NULL; + rq = blk_mq_rq_ctx_init(data, tag, op); + if (!op_is_flush(op)) { + rq->elv.icq = NULL; + if (e && e->type->ops.mq.prepare_request) { + if (e->type->icq_cache && rq_ioc(bio)) + blk_mq_sched_assign_ioc(rq, bio); + + e->type->ops.mq.prepare_request(rq, bio); + rq->rq_flags |= RQF_ELVPRIV; + } + } + data->hctx->queued++; + return rq; } -EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, unsigned int flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; @@ -289,7 +357,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, if (ret) return ERR_PTR(ret); - rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); + rq = blk_mq_get_request(q, NULL, op, &alloc_data); blk_mq_put_ctx(alloc_data.ctx); blk_queue_exit(q); @@ -304,8 +372,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, } EXPORT_SYMBOL(blk_mq_alloc_request); -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, - unsigned int flags, unsigned int hctx_idx) +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, + unsigned int op, unsigned int flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -340,7 +408,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, cpu = cpumask_first(alloc_data.hctx->cpumask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); + rq = blk_mq_get_request(q, NULL, op, &alloc_data); blk_queue_exit(q); @@ -351,17 +419,28 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct request *rq) +void blk_mq_free_request(struct request *rq) { - const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + const int sched_tag = rq->internal_tag; + if (rq->rq_flags & RQF_ELVPRIV) { + if (e && e->type->ops.mq.finish_request) + e->type->ops.mq.finish_request(rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); wbt_done(q->rq_wb, &rq->issue_stat); - rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); @@ -372,29 +451,9 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_sched_restart(hctx); blk_queue_exit(q); } - -static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - - ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_finish_request(hctx, ctx, rq); -} - -void blk_mq_finish_request(struct request *rq) -{ - blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); -} -EXPORT_SYMBOL_GPL(blk_mq_finish_request); - -void blk_mq_free_request(struct request *rq) -{ - blk_mq_sched_put_request(rq); -} EXPORT_SYMBOL_GPL(blk_mq_free_request); -inline void __blk_mq_end_request(struct request *rq, int error) +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { blk_account_io_done(rq); @@ -409,7 +468,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) } EXPORT_SYMBOL(__blk_mq_end_request); -void blk_mq_end_request(struct request *rq, int error) +void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); @@ -753,50 +812,6 @@ static void blk_mq_timeout_work(struct work_struct *work) blk_queue_exit(q); } -/* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. - */ -static bool blk_mq_attempt_merge(struct request_queue *q, - struct blk_mq_ctx *ctx, struct bio *bio) -{ - struct request *rq; - int checked = 8; - - list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { - bool merged = false; - - if (!checked--) - break; - - if (!blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(q, rq, bio); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(q, rq, bio); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - continue; - } - - if (merged) - ctx->rq_merged++; - return merged; - } - - return false; -} - struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; @@ -968,7 +983,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) { struct blk_mq_hw_ctx *hctx; struct request *rq; - int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; + int errors, queued; if (list_empty(list)) return false; @@ -979,6 +994,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) errors = queued = 0; do { struct blk_mq_queue_data bd; + blk_status_t ret; rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { @@ -1019,25 +1035,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) } ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_MQ_RQ_QUEUE_OK: - queued++; - break; - case BLK_MQ_RQ_QUEUE_BUSY: + if (ret == BLK_STS_RESOURCE) { blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; - default: - pr_err("blk-mq: bad return on queue: %d\n", ret); - case BLK_MQ_RQ_QUEUE_ERROR: + } + + if (unlikely(ret != BLK_STS_OK)) { errors++; - blk_mq_end_request(rq, -EIO); - break; + blk_mq_end_request(rq, BLK_STS_IOERR); + continue; } - if (ret == BLK_MQ_RQ_QUEUE_BUSY) - break; + queued++; } while (!list_empty(list)); hctx->dispatched[queued_to_index(queued)]++; @@ -1075,7 +1086,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before - * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq + * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. */ if (!blk_mq_sched_needs_restart(hctx) && @@ -1100,9 +1111,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } else { might_sleep(); - srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); blk_mq_sched_dispatch_requests(hctx); - srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } } @@ -1134,8 +1145,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { - if (unlikely(blk_mq_hctx_stopped(hctx) || - !blk_mq_hw_queue_mapped(hctx))) + if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) + return; + + if (unlikely(blk_mq_hctx_stopped(hctx))) return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { @@ -1201,34 +1214,39 @@ bool blk_mq_queue_stopped(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_queue_stopped); -static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync) +/* + * This function is often used for pausing .queue_rq() by driver when + * there isn't enough resource or some conditions aren't satisfied, and + * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * + * We do not guarantee that dispatch can be drained or blocked + * after blk_mq_stop_hw_queue() returns. Please use + * blk_mq_quiesce_queue() for that requirement. + */ +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - if (sync) - cancel_delayed_work_sync(&hctx->run_work); - else - cancel_delayed_work(&hctx->run_work); + cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } - -void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) -{ - __blk_mq_stop_hw_queue(hctx, false); -} EXPORT_SYMBOL(blk_mq_stop_hw_queue); -static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync) +/* + * This function is often used for pausing .queue_rq() by driver when + * there isn't enough resource or some conditions aren't satisfied, and + * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * + * We do not guarantee that dispatch can be drained or blocked + * after blk_mq_stop_hw_queues() returns. Please use + * blk_mq_quiesce_queue() for that requirement. + */ +void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) - __blk_mq_stop_hw_queue(hctx, sync); -} - -void blk_mq_stop_hw_queues(struct request_queue *q) -{ - __blk_mq_stop_hw_queues(q, false); + blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); @@ -1295,7 +1313,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { - if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) return; /* @@ -1317,6 +1335,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, { struct blk_mq_ctx *ctx = rq->mq_ctx; + lockdep_assert_held(&ctx->lock); + trace_block_rq_insert(hctx->queue, rq); if (at_head) @@ -1330,6 +1350,8 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct blk_mq_ctx *ctx = rq->mq_ctx; + lockdep_assert_held(&ctx->lock); + __blk_mq_insert_req_list(hctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1427,30 +1449,13 @@ static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) !blk_queue_nomerges(hctx->queue); } -static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct request *rq, struct bio *bio) +static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq) { - if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { - blk_mq_bio_to_request(rq, bio); - spin_lock(&ctx->lock); -insert_rq: - __blk_mq_insert_request(hctx, rq, false); - spin_unlock(&ctx->lock); - return false; - } else { - struct request_queue *q = hctx->queue; - - spin_lock(&ctx->lock); - if (!blk_mq_attempt_merge(q, ctx, bio)) { - blk_mq_bio_to_request(rq, bio); - goto insert_rq; - } - - spin_unlock(&ctx->lock); - __blk_mq_finish_request(hctx, ctx, rq); - return true; - } + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); } static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) @@ -1471,10 +1476,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, .last = true, }; blk_qc_t new_cookie; - int ret; + blk_status_t ret; bool run_queue = true; - if (blk_mq_hctx_stopped(hctx)) { + /* RCU or SRCU read lock is needed before checking quiesced flag */ + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; goto insert; } @@ -1493,18 +1499,19 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, * would have done */ ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) { + switch (ret) { + case BLK_STS_OK: *cookie = new_cookie; return; - } - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + case BLK_STS_RESOURCE: + __blk_mq_requeue_request(rq); + goto insert; + default: *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, -EIO); + blk_mq_end_request(rq, ret); return; } - __blk_mq_requeue_request(rq); insert: blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); } @@ -1521,9 +1528,9 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, might_sleep(); - srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); __blk_mq_try_issue_directly(hctx, rq, cookie, true); - srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } } @@ -1541,7 +1548,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); @@ -1559,9 +1566,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_getrq(q, bio, bio->bi_opf); - rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); + rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); return BLK_QC_T_NONE; } @@ -1639,11 +1648,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, true, true); - } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + } else { blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_queue_io(data.hctx, data.ctx, rq); blk_mq_run_hw_queue(data.hctx, true); - } else - blk_mq_put_ctx(data.ctx); + } return cookie; } @@ -1866,7 +1876,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(&hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->queue_rq_srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -1900,7 +1910,6 @@ static int blk_mq_init_hctx(struct request_queue *q, spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; - hctx->queue_num = hctx_idx; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); @@ -1939,7 +1948,7 @@ static int blk_mq_init_hctx(struct request_queue *q, goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(&hctx->queue_rq_srcu); + init_srcu_struct(hctx->queue_rq_srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2224,6 +2233,20 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) +{ + int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); + + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + __alignof__(struct blk_mq_hw_ctx)) != + sizeof(struct blk_mq_hw_ctx)); + + if (tag_set->flags & BLK_MQ_F_BLOCKING) + hw_ctx_size += sizeof(struct srcu_struct); + + return hw_ctx_size; +} + static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -2238,7 +2261,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, continue; node = blk_mq_hw_queue_to_node(q->mq_map, i); - hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), + hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), GFP_KERNEL, node); if (!hctxs[i]) break; diff --git a/block/blk-mq.h b/block/blk-mq.h index cc67b48..1a06fdf 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,17 +128,6 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data return data->hctx->tags; } -/* - * Internal helpers for request allocation/init/free - */ -void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, unsigned int op); -void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct request *rq); -void blk_mq_finish_request(struct request *rq); -struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, - unsigned int op); - static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_STOPPED, &hctx->state); diff --git a/block/blk-settings.c b/block/blk-settings.c index 4fa81ed3..be1f115 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -172,11 +172,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->nr_batching = BLK_BATCH_REQ; blk_set_default_limits(&q->limits); - - /* - * by default assume old behaviour and bounce for any highmem page - */ - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); } EXPORT_SYMBOL(blk_queue_make_request); diff --git a/block/blk-tag.c b/block/blk-tag.c index 07cc329..2290f65 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -258,15 +258,14 @@ EXPORT_SYMBOL(blk_queue_resize_tags); * all transfers have been done for a request. It's important to call * this function before end_that_request_last(), as that will put the * request back on the free list thus corrupting the internal tag list. - * - * Notes: - * queue lock must be held. **/ void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; unsigned tag = rq->tag; /* negative tags invalid */ + lockdep_assert_held(q->queue_lock); + BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); @@ -307,9 +306,6 @@ EXPORT_SYMBOL(blk_queue_end_tag); * calling this function. The request will also be removed from * the request queue, so it's the drivers responsibility to readd * it if it should need to be restarted for some reason. - * - * Notes: - * queue lock must be held. **/ int blk_queue_start_tag(struct request_queue *q, struct request *rq) { @@ -317,6 +313,8 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) unsigned max_depth; int tag; + lockdep_assert_held(q->queue_lock); + if (unlikely((rq->rq_flags & RQF_QUEUED))) { printk(KERN_ERR "%s: request %p for device [%s] already tagged %d", @@ -389,14 +387,13 @@ EXPORT_SYMBOL(blk_queue_start_tag); * Hardware conditions may dictate a need to stop all pending requests. * In this case, we will safely clear the block side of the tag queue and * readd all requests to the request queue in the right order. - * - * Notes: - * queue lock must be held. **/ void blk_queue_invalidate_tags(struct request_queue *q) { struct list_head *tmp, *n; + lockdep_assert_held(q->queue_lock); + list_for_each_safe(tmp, n, &q->tag_busy_list) blk_requeue_request(q, list_entry_rq(tmp)); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index cbff183..17ec83b 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -189,13 +189,15 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. - * Queue lock must be held for the non-mq case, mq case doesn't care. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; + if (!q->mq_ops) + lockdep_assert_held(q->queue_lock); + /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/block/blk.h b/block/blk.h index 83c8e11..01ebb81 100644 --- a/block/blk.h +++ b/block/blk.h @@ -143,6 +143,8 @@ static inline struct request *__elv_next_request(struct request_queue *q) struct request *rq; struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + WARN_ON_ONCE(q->mq_ops); + while (1) { if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); @@ -334,4 +336,17 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif +#ifdef CONFIG_BOUNCE +extern int init_emergency_isa_pool(void); +extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); +#else +static inline int init_emergency_isa_pool(void) +{ + return 0; +} +static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) +{ +} +#endif /* CONFIG_BOUNCE */ + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index 1cb5dd3..5793c2d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -22,10 +22,12 @@ #include <asm/tlbflush.h> #include <trace/events/block.h> +#include "blk.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 +static struct bio_set *bounce_bio_set, *bounce_bio_split; static mempool_t *page_pool, *isa_page_pool; #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) @@ -40,6 +42,14 @@ static __init int init_emergency_pool(void) BUG_ON(!page_pool); pr_info("pool size: %d pages\n", POOL_SIZE); + bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + BUG_ON(!bounce_bio_set); + if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE)) + BUG_ON(1); + + bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); + BUG_ON(!bounce_bio_split); + return 0; } @@ -143,7 +153,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) mempool_free(bvec->bv_page, pool); } - bio_orig->bi_error = bio->bi_error; + bio_orig->bi_status = bio->bi_status; bio_endio(bio_orig); bio_put(bio); } @@ -163,7 +173,7 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); bounce_end_io(bio, pool); @@ -186,20 +196,31 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, int rw = bio_data_dir(*bio_orig); struct bio_vec *to, from; struct bvec_iter iter; - unsigned i; - - bio_for_each_segment(from, *bio_orig, iter) - if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) - goto bounce; + unsigned i = 0; + bool bounce = false; + int sectors = 0; + + bio_for_each_segment(from, *bio_orig, iter) { + if (i++ < BIO_MAX_PAGES) + sectors += from.bv_len >> 9; + if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) + bounce = true; + } + if (!bounce) + return; - return; -bounce: - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); + if (sectors < bio_sectors(*bio_orig)) { + bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split); + bio_chain(bio, *bio_orig); + generic_make_request(*bio_orig); + *bio_orig = bio; + } + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= queue_bounce_pfn(q)) + if (page_to_pfn(page) <= q->limits.bounce_pfn) continue; to->bv_page = mempool_alloc(pool, q->bounce_gfp); @@ -251,7 +272,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn) + if (q->limits.bounce_pfn >= blk_max_pfn) return; pool = page_pool; } else { @@ -264,5 +285,3 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) */ __blk_queue_bounce(q, bio_orig, pool); } - -EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 0a23dbb..c4513b2 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref) struct bsg_job *job = container_of(kref, struct bsg_job, kref); struct request *rq = job->req; - blk_end_request_all(rq, scsi_req(rq)->result); + blk_end_request_all(rq, BLK_STS_OK); put_device(job->dev); /* release reference for the request */ @@ -202,7 +202,7 @@ static void bsg_request_fn(struct request_queue *q) ret = bsg_create_job(dev, req); if (ret) { scsi_req(req)->result = ret; - blk_end_request_all(req, ret); + blk_end_request_all(req, BLK_STS_OK); spin_lock_irq(q->queue_lock); continue; } @@ -246,6 +246,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); diff --git a/block/bsg.c b/block/bsg.c index 6fd0854..37663b6 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -236,7 +236,6 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) rq = blk_get_request(q, op, GFP_KERNEL); if (IS_ERR(rq)) return rq; - scsi_req_init(rq); ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) @@ -294,14 +293,14 @@ out: * async completion call-back from the block layer, when scsi/ide/whatever * calls end_that_request_last() on a request */ -static void bsg_rq_end_io(struct request *rq, int uptodate) +static void bsg_rq_end_io(struct request *rq, blk_status_t status) { struct bsg_command *bc = rq->end_io_data; struct bsg_device *bd = bc->bd; unsigned long flags; - dprintk("%s: finished rq %p bc %p, bio %p stat %d\n", - bd->name, rq, bc, bc->bio, uptodate); + dprintk("%s: finished rq %p bc %p, bio %p\n", + bd->name, rq, bc, bc->bio); bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); @@ -750,6 +749,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode, #ifdef BSG_DEBUG unsigned char buf[32]; #endif + + if (!blk_queue_scsi_passthrough(rq)) { + WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); + return ERR_PTR(-EINVAL); + } + if (!blk_get_queue(rq)) return ERR_PTR(-ENXIO); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b7e9c7f..3d5c289 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -982,15 +982,6 @@ static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) return min_vdisktime; } -static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) -{ - s64 delta = (s64)(vdisktime - min_vdisktime); - if (delta < 0) - min_vdisktime = vdisktime; - - return min_vdisktime; -} - static void update_min_vdisktime(struct cfq_rb_root *st) { struct cfq_group *cfqg; diff --git a/block/elevator.c b/block/elevator.c index dac99fb..4bb2f0c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -681,6 +681,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) */ if (elv_attempt_insert_merge(q, rq)) break; + /* fall through */ case ELEVATOR_INSERT_SORT: BUG_ON(blk_rq_is_passthrough(rq)); rq->rq_flags |= RQF_SORTED; diff --git a/block/genhd.c b/block/genhd.c index d252d29..7f520fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -36,7 +36,7 @@ struct kobject *block_depr; static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static struct device_type disk_type; +static const struct device_type disk_type; static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); @@ -1183,7 +1183,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static struct device_type disk_type = { +static const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, diff --git a/block/ioprio.c b/block/ioprio.c index 4b120c9..6f5d0b6 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -75,7 +75,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) case IOPRIO_CLASS_RT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* fall through, rt has prio field too */ + /* fall through */ + /* rt has prio field too */ case IOPRIO_CLASS_BE: if (data >= IOPRIO_BE_NR || data < 0) return -EINVAL; diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index b9faabc..a9f6fd3 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -426,33 +426,29 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, } } -static struct request *kyber_get_request(struct request_queue *q, - unsigned int op, - struct blk_mq_alloc_data *data) +static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { - struct kyber_queue_data *kqd = q->elevator->elevator_data; - struct request *rq; - /* * We use the scheduler tags as per-hardware queue queueing tokens. * Async requests can be limited at this stage. */ - if (!op_is_sync(op)) + if (!op_is_sync(op)) { + struct kyber_queue_data *kqd = data->q->elevator->elevator_data; + data->shallow_depth = kqd->async_depth; + } +} - rq = __blk_mq_alloc_request(data, op); - if (rq) - rq_set_domain_token(rq, -1); - return rq; +static void kyber_prepare_request(struct request *rq, struct bio *bio) +{ + rq_set_domain_token(rq, -1); } -static void kyber_put_request(struct request *rq) +static void kyber_finish_request(struct request *rq) { - struct request_queue *q = rq->q; - struct kyber_queue_data *kqd = q->elevator->elevator_data; + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; rq_clear_domain_token(kqd, rq); - blk_mq_finish_request(rq); } static void kyber_completed_request(struct request *rq) @@ -815,8 +811,9 @@ static struct elevator_type kyber_sched = { .exit_sched = kyber_exit_sched, .init_hctx = kyber_init_hctx, .exit_hctx = kyber_exit_hctx, - .get_request = kyber_get_request, - .put_request = kyber_put_request, + .limit_depth = kyber_limit_depth, + .prepare_request = kyber_prepare_request, + .finish_request = kyber_finish_request, .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4a294a5..7440de4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -326,7 +326,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (IS_ERR(rq)) return PTR_ERR(rq); req = scsi_req(rq); - scsi_req_init(rq); if (hdr->cmd_len > BLK_MAX_CDB) { req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); @@ -456,7 +455,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error_free_buffer; } req = scsi_req(rq); - scsi_req_init(rq); cmdlen = COMMAND_SIZE(opcode); @@ -542,7 +540,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - scsi_req_init(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; scsi_req(rq)->cmd[0] = cmd; scsi_req(rq)->cmd[4] = data; @@ -744,10 +741,14 @@ int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, } EXPORT_SYMBOL(scsi_cmd_blk_ioctl); -void scsi_req_init(struct request *rq) +/** + * scsi_req_init - initialize certain fields of a scsi_request structure + * @req: Pointer to a scsi_request structure. + * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members + * of struct scsi_request. + */ +void scsi_req_init(struct scsi_request *req) { - struct scsi_request *req = scsi_req(rq); - memset(req->__cmd, 0, sizeof(req->__cmd)); req->cmd = req->__cmd; req->cmd_len = BLK_MAX_CDB; diff --git a/block/t10-pi.c b/block/t10-pi.c index 680c6d6..3416dad 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -46,8 +46,8 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. */ -static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, - unsigned int type) +static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, + csum_fn *fn, unsigned int type) { unsigned int i; @@ -67,11 +67,11 @@ static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, iter->seed++; } - return 0; + return BLK_STS_OK; } -static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, - unsigned int type) +static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, + csum_fn *fn, unsigned int type) { unsigned int i; @@ -91,7 +91,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, "(rcvd %u)\n", iter->disk_name, (unsigned long long) iter->seed, be32_to_cpu(pi->ref_tag)); - return -EILSEQ; + return BLK_STS_PROTECTION; } break; case 3: @@ -108,7 +108,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, "(rcvd %04x, want %04x)\n", iter->disk_name, (unsigned long long)iter->seed, be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); - return -EILSEQ; + return BLK_STS_PROTECTION; } next: @@ -117,45 +117,45 @@ next: iter->seed++; } - return 0; + return BLK_STS_OK; } -static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_crc_fn, 1); } -static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_ip_fn, 1); } -static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_crc_fn, 1); } -static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_ip_fn, 1); } -static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_crc_fn, 3); } -static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_ip_fn, 3); } -static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_crc_fn, 3); } -static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_ip_fn, 3); } diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 26a51be..245a879 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -3464,7 +3464,7 @@ static inline bool DAC960_ProcessCompletedRequest(DAC960_Command_T *Command, bool SuccessfulIO) { struct request *Request = Command->Request; - int Error = SuccessfulIO ? 0 : -EIO; + blk_status_t Error = SuccessfulIO ? BLK_STS_OK : BLK_STS_IOERR; pci_unmap_sg(Command->Controller->PCIDevice, Command->cmd_sglist, Command->SegmentCount, Command->DmaDirection); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index a328f67..49908c7 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1378,7 +1378,7 @@ static void redo_fd_request(void) struct amiga_floppy_struct *floppy; char *data; unsigned long flags; - int err; + blk_status_t err; next_req: rq = set_next_request(); @@ -1392,7 +1392,7 @@ next_req: next_segment: /* Here someone could investigate to be more efficient */ - for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) { + for (cnt = 0, err = BLK_STS_OK; cnt < blk_rq_cur_sectors(rq); cnt++) { #ifdef DEBUG printk("fd: sector %ld + %d requested for %s\n", blk_rq_pos(rq), cnt, @@ -1400,7 +1400,7 @@ next_segment: #endif block = blk_rq_pos(rq) + cnt; if ((int)block > floppy->blocks) { - err = -EIO; + err = BLK_STS_IOERR; break; } @@ -1413,7 +1413,7 @@ next_segment: #endif if (get_track(drive, track) == -1) { - err = -EIO; + err = BLK_STS_IOERR; break; } @@ -1424,7 +1424,7 @@ next_segment: /* keep the drive spinning while writes are scheduled */ if (!fd_motor_on(drive)) { - err = -EIO; + err = BLK_STS_IOERR; break; } /* diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 027b876..6797e6c 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -388,6 +388,7 @@ aoeblk_gdalloc(void *vp) d->aoemajor, d->aoeminor); goto err_mempool; } + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); spin_lock_irqsave(&d->lock, flags); WARN_ON(!(d->flags & DEVFL_GD_NOW)); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 3c606c0..dc43254 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1070,8 +1070,8 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) d->ip.rq = NULL; do { bio = rq->bio; - bok = !fastfail && !bio->bi_error; - } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size)); + bok = !fastfail && !bio->bi_status; + } while (__blk_end_request(rq, bok ? BLK_STS_OK : BLK_STS_IOERR, bio->bi_iter.bi_size)); /* cf. http://lkml.org/lkml/2006/10/31/28 */ if (!fastfail) @@ -1131,7 +1131,7 @@ ktiocomplete(struct frame *f) ahout->cmdstat, ahin->cmdstat, d->aoemajor, d->aoeminor); noskb: if (buf) - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; goto out; } @@ -1144,7 +1144,7 @@ noskb: if (buf) "aoe: runt data size in read from", (long) d->aoemajor, d->aoeminor, skb->len, n); - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; break; } if (n > f->iter.bi_size) { @@ -1152,7 +1152,7 @@ noskb: if (buf) "aoe: too-large data size in read from", (long) d->aoemajor, d->aoeminor, n, f->iter.bi_size); - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; break; } bvcpy(skb, f->buf->bio, f->iter, n); @@ -1654,7 +1654,7 @@ aoe_failbuf(struct aoedev *d, struct buf *buf) if (buf == NULL) return; buf->iter.bi_size = 0; - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; if (buf->nframesout == 0) aoe_end_buf(d, buf); } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index ffd1947..b28fefb 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -170,7 +170,7 @@ aoe_failip(struct aoedev *d) if (rq == NULL) return; while ((bio = d->ip.nxbio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; d->ip.nxbio = bio->bi_next; n = (unsigned long) rq->special; rq->special = (void *) --n; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index fa69ecd..92da886 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -378,7 +378,7 @@ static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0); static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0); static DEFINE_TIMER(fd_timer, check_change, 0, 0); -static void fd_end_request_cur(int err) +static void fd_end_request_cur(blk_status_t err) { if (!__blk_end_request_cur(fd_request, err)) fd_request = NULL; @@ -620,7 +620,7 @@ static void fd_error( void ) fd_request->error_count++; if (fd_request->error_count >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); } else if (fd_request->error_count == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); @@ -739,7 +739,7 @@ static void do_fd_action( int drive ) } else { /* all sectors finished */ - fd_end_request_cur(0); + fd_end_request_cur(BLK_STS_OK); redo_fd_request(); return; } @@ -1144,7 +1144,7 @@ static void fd_rwsec_done1(int status) } else { /* all sectors finished */ - fd_end_request_cur(0); + fd_end_request_cur(BLK_STS_OK); redo_fd_request(); } return; @@ -1445,7 +1445,7 @@ repeat: if (!UD.connected) { /* drive not connected */ printk(KERN_ERR "Unknown Device: fd%d\n", drive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } @@ -1461,12 +1461,12 @@ repeat: /* user supplied disk type */ if (--type >= NUM_DISK_MINORS) { printk(KERN_WARNING "fd%d: invalid disk format", drive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } if (minor2disktype[type].drive_types > DriveType) { printk(KERN_WARNING "fd%d: unsupported disk format", drive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } type = minor2disktype[type].index; @@ -1476,7 +1476,7 @@ repeat: } if (blk_rq_pos(fd_request) + 1 > UDT->blocks) { - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 57b574f..6112e99 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -418,7 +418,6 @@ static struct brd_device *brd_alloc(int i) blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); - blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index cd37550..02a6119 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1864,7 +1864,8 @@ static void cciss_softirq_done(struct request *rq) /* set the residual count for pc requests */ if (blk_rq_is_passthrough(rq)) scsi_req(rq)->resid_len = c->err_info->ResidualCnt; - blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0); + blk_end_request_all(rq, scsi_req(rq)->result ? + BLK_STS_IOERR : BLK_STS_OK); spin_lock_irqsave(&h->lock, flags); cmd_free(h, c); @@ -1956,6 +1957,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, disk->queue->cmd_size = sizeof(struct scsi_request); disk->queue->request_fn = do_cciss_request; disk->queue->queue_lock = &h->lock; + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, disk->queue); if (blk_init_allocated_queue(disk->queue) < 0) goto cleanup_queue; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 8d7bcfa..e02c45c 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -178,7 +178,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, else submit_bio(bio); wait_until_done_or_force_detached(device, bdev, &device->md_io.done); - if (!bio->bi_error) + if (!bio->bi_status) err = device->md_io.error; out: diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index a804a41..809fd24 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -959,16 +959,16 @@ static void drbd_bm_endio(struct bio *bio) !bm_test_page_unchanged(b->bm_pages[idx])) drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); - if (bio->bi_error) { + if (bio->bi_status) { /* ctx error will hold the completed-last non-zero error code, * in case error codes differ. */ - ctx->error = bio->bi_error; + ctx->error = blk_status_to_errno(bio->bi_status); bm_set_page_io_err(b->bm_pages[idx]); /* Not identical to on disk version of it. * Is BM_PAGE_IO_ERROR enough? */ if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", - bio->bi_error, idx); + bio->bi_status, idx); } else { bm_clear_page_io_err(b->bm_pages[idx]); dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d5da45b..d17b6e6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1441,6 +1441,9 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); +/* And a bio_set for cloning */ +extern struct bio_set *drbd_io_bio_set; + extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); @@ -1627,7 +1630,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device, __release(local); if (!bio->bi_bdev) { drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); - bio->bi_error = -ENODEV; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 84455c3..5fb99e0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -128,6 +128,7 @@ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; mempool_t *drbd_md_io_page_pool; struct bio_set *drbd_md_io_bio_set; +struct bio_set *drbd_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -2098,6 +2099,8 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_io_bio_set) + bioset_free(drbd_io_bio_set); if (drbd_md_io_bio_set) bioset_free(drbd_md_io_bio_set); if (drbd_md_io_page_pool) @@ -2115,6 +2118,7 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_io_bio_set = NULL; drbd_md_io_bio_set = NULL; drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; @@ -2142,6 +2146,7 @@ static int drbd_create_mempools(void) drbd_pp_pool = NULL; drbd_md_io_page_pool = NULL; drbd_md_io_bio_set = NULL; + drbd_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -2165,7 +2170,13 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ - drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_RESCUER); + if (drbd_io_bio_set == NULL) + goto Enomem; + + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0, + BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER); if (drbd_md_io_bio_set == NULL) goto Enomem; @@ -2839,7 +2850,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); q->queue_lock = &resource->req_lock; device->md_io.page = alloc_page(GFP_KERNEL); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 02255a0..ad0fcb4 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2294,7 +2294,7 @@ _check_net_options(struct drbd_connection *connection, struct net_conf *old_net_ static enum drbd_ret_code check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf) { - static enum drbd_ret_code rv; + enum drbd_ret_code rv; struct drbd_peer_device *peer_device; int i; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1b0a2be..c7e95e6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1229,9 +1229,9 @@ void one_flush_endio(struct bio *bio) struct drbd_device *device = octx->device; struct issue_flush_context *ctx = octx->ctx; - if (bio->bi_error) { - ctx->error = bio->bi_error; - drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error); + if (bio->bi_status) { + ctx->error = blk_status_to_errno(bio->bi_status); + drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status); } kfree(octx); bio_put(bio); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 6566243..f6e865b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -203,7 +203,7 @@ void start_new_tl_epoch(struct drbd_connection *connection) void complete_master_bio(struct drbd_device *device, struct bio_and_error *m) { - m->bio->bi_error = m->error; + m->bio->bi_status = errno_to_blk_status(m->error); bio_endio(m->bio); dec_ap_bio(device); } @@ -1157,7 +1157,7 @@ static void drbd_process_discard_req(struct drbd_request *req) if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9, GFP_NOIO, 0)) - req->private_bio->bi_error = -EIO; + req->private_bio->bi_status = BLK_STS_IOERR; bio_endio(req->private_bio); } @@ -1225,7 +1225,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long /* only pass the error to the upper layers. * if user cannot handle io errors, that's not our business. */ drbd_err(device, "could not kmalloc() req\n"); - bio->bi_error = -ENOMEM; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return ERR_PTR(-ENOMEM); } @@ -1560,7 +1560,7 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) struct drbd_device *device = (struct drbd_device *) q->queuedata; unsigned long start_jif; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); start_jif = jiffies; diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index eb49e7f..9e1866a 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -263,7 +263,7 @@ enum drbd_req_state_bits { static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) { struct bio *bio; - bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set); req->private_bio = bio; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1afcb4e..1d8726a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -63,7 +63,7 @@ void drbd_md_endio(struct bio *bio) struct drbd_device *device; device = bio->bi_private; - device->md_io.error = bio->bi_error; + device->md_io.error = blk_status_to_errno(bio->bi_status); /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able * to timeout on the lower level device, and eventually detach from it. @@ -177,13 +177,13 @@ void drbd_peer_request_endio(struct bio *bio) bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES || bio_op(bio) == REQ_OP_DISCARD; - if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) + if (bio->bi_status && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", is_write ? (is_discard ? "discard" : "write") - : "read", bio->bi_error, + : "read", bio->bi_status, (unsigned long long)peer_req->i.sector); - if (bio->bi_error) + if (bio->bi_status) set_bit(__EE_WAS_ERROR, &peer_req->flags); bio_put(bio); /* no need for the bio anymore */ @@ -243,16 +243,16 @@ void drbd_request_endio(struct bio *bio) if (__ratelimit(&drbd_ratelimit_state)) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); - if (!bio->bi_error) + if (!bio->bi_status) drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { switch (bio_op(bio)) { case REQ_OP_WRITE_ZEROES: case REQ_OP_DISCARD: - if (bio->bi_error == -EOPNOTSUPP) + if (bio->bi_status == BLK_STS_NOTSUPP) what = DISCARD_COMPLETED_NOTSUPP; else what = DISCARD_COMPLETED_WITH_ERROR; @@ -272,7 +272,7 @@ void drbd_request_endio(struct bio *bio) } bio_put(req->private_bio); - req->private_bio = ERR_PTR(bio->bi_error); + req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status)); /* not req_mod(), we need irqsave here! */ spin_lock_irqsave(&device->resource->req_lock, flags); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 60d4c76..ce82364 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2202,7 +2202,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) * ============================= */ -static void floppy_end_request(struct request *req, int error) +static void floppy_end_request(struct request *req, blk_status_t error) { unsigned int nr_sectors = current_count_sectors; unsigned int drive = (unsigned long)req->rq_disk->private_data; @@ -2263,7 +2263,7 @@ static void request_done(int uptodate) DRWE->last_error_generation = DRS->generation; } spin_lock_irqsave(q->queue_lock, flags); - floppy_end_request(req, -EIO); + floppy_end_request(req, BLK_STS_IOERR); spin_unlock_irqrestore(q->queue_lock, flags); } } @@ -3780,9 +3780,9 @@ static void floppy_rb0_cb(struct bio *bio) struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; int drive = cbdata->drive; - if (bio->bi_error) { + if (bio->bi_status) { pr_info("floppy: error %d while reading block 0\n", - bio->bi_error); + bio->bi_status); set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); } complete(&cbdata->complete); @@ -4203,6 +4203,7 @@ static int __init do_floppy_init(void) goto out_put_disk; } + blk_queue_bounce_limit(disks[drive]->queue, BLK_BOUNCE_HIGH); blk_queue_max_hw_sectors(disks[drive]->queue, 64); disks[drive]->major = FLOPPY_MAJOR; disks[drive]->first_minor = TOMINOR(drive); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ebbd0c3..0de1144 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -221,7 +221,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) } static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, + loff_t logical_blocksize) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; @@ -233,6 +234,12 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) lo->lo_offset = offset; if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; + if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) { + lo->lo_logical_blocksize = logical_blocksize; + blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize); + blk_queue_logical_block_size(lo->lo_queue, + lo->lo_logical_blocksize); + } set_capacity(lo->lo_disk, x); bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); /* let user-space know about the new size */ @@ -457,7 +464,7 @@ static void lo_complete_rq(struct request *rq) zero_fill_bio(bio); } - blk_mq_end_request(rq, cmd->ret < 0 ? -EIO : 0); + blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK); } static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) @@ -813,6 +820,7 @@ static void loop_config_discard(struct loop_device *lo) struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; + int lo_bits = 9; /* * We use punch hole to reclaim the free space used by the @@ -832,8 +840,11 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); + if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) + lo_bits = blksize_bits(lo->lo_logical_blocksize); + + blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits); + blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } @@ -843,10 +854,16 @@ static void loop_unprepare_queue(struct loop_device *lo) kthread_stop(lo->worker_task); } +static int loop_kthread_worker_fn(void *worker_ptr) +{ + current->flags |= PF_LESS_THROTTLE; + return kthread_worker_fn(worker_ptr); +} + static int loop_prepare_queue(struct loop_device *lo) { kthread_init_worker(&lo->worker); - lo->worker_task = kthread_run(kthread_worker_fn, + lo->worker_task = kthread_run(loop_kthread_worker_fn, &lo->worker, "loop%d", lo->lo_number); if (IS_ERR(lo->worker_task)) return -ENOMEM; @@ -921,6 +938,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->use_dio = false; lo->lo_blocksize = lo_blocksize; + lo->lo_logical_blocksize = 512; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -1086,6 +1104,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); + int lo_flags = lo->lo_flags; if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && @@ -1118,12 +1137,30 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (err) goto exit; + if (info->lo_flags & LO_FLAGS_BLOCKSIZE) { + if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE)) + lo->lo_logical_blocksize = 512; + lo->lo_flags |= LO_FLAGS_BLOCKSIZE; + if (LO_INFO_BLOCKSIZE(info) != 512 && + LO_INFO_BLOCKSIZE(info) != 1024 && + LO_INFO_BLOCKSIZE(info) != 2048 && + LO_INFO_BLOCKSIZE(info) != 4096) + return -EINVAL; + if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize) + return -EINVAL; + } + if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { + lo->lo_sizelimit != info->lo_sizelimit || + lo->lo_flags != lo_flags || + ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) && + lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) { + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit, + LO_INFO_BLOCKSIZE(info))) { err = -EFBIG; goto exit; } + } loop_config_discard(lo); @@ -1306,12 +1343,13 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { return err; } -static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) +static int loop_set_capacity(struct loop_device *lo) { if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit, + lo->lo_logical_blocksize); } static int loop_set_dio(struct loop_device *lo, unsigned long arg) @@ -1369,7 +1407,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_CAPACITY: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_capacity(lo, bdev); + err = loop_set_capacity(lo); break; case LOOP_SET_DIRECT_IO: err = -EPERM; @@ -1645,7 +1683,7 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); @@ -1654,7 +1692,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); if (lo->lo_state != Lo_bound) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; switch (req_op(cmd->rq)) { case REQ_OP_FLUSH: @@ -1669,7 +1707,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, kthread_queue_work(&lo->worker, &cmd->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void loop_handle_cmd(struct loop_cmd *cmd) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index fecd3f9..2c096b9 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -49,6 +49,7 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; unsigned lo_blocksize; + unsigned lo_logical_blocksize; void *key_data; gfp_t old_gfp_mask; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3a779a4..61b046f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -532,7 +532,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, struct smart_attr *attrib); -static void mtip_complete_command(struct mtip_cmd *cmd, int status) +static void mtip_complete_command(struct mtip_cmd *cmd, blk_status_t status) { struct request *req = blk_mq_rq_from_pdu(cmd); @@ -568,7 +568,7 @@ static void mtip_handle_tfe(struct driver_data *dd) if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); - mtip_complete_command(cmd, -EIO); + mtip_complete_command(cmd, BLK_STS_IOERR); return; } @@ -667,7 +667,7 @@ static void mtip_handle_tfe(struct driver_data *dd) tag, fail_reason != NULL ? fail_reason : "unknown"); - mtip_complete_command(cmd, -ENODATA); + mtip_complete_command(cmd, BLK_STS_MEDIUM); continue; } } @@ -690,7 +690,7 @@ static void mtip_handle_tfe(struct driver_data *dd) dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - mtip_complete_command(cmd, -EIO); + mtip_complete_command(cmd, BLK_STS_IOERR); } } print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); @@ -1063,23 +1063,10 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* insert request and run queue */ blk_execute_rq(rq->q, NULL, rq, true); - rv = int_cmd->status; - if (rv < 0) { - if (rv == -ERESTARTSYS) { /* interrupted */ - dev_err(&dd->pdev->dev, - "Internal command [%02X] was interrupted after %u ms\n", - fis->command, - jiffies_to_msecs(jiffies - start)); - rv = -EINTR; - goto exec_ic_exit; - } else if (rv == 0) /* timeout */ - dev_err(&dd->pdev->dev, - "Internal command did not complete [%02X] within timeout of %lu ms\n", - fis->command, timeout); - else - dev_err(&dd->pdev->dev, - "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n", - fis->command, rv, timeout); + if (int_cmd->status) { + dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", + fis->command, int_cmd->status); + rv = -EIO; if (mtip_check_surprise_removal(dd->pdev) || test_bit(MTIP_DDF_REMOVE_PENDING_BIT, @@ -2753,7 +2740,7 @@ static void mtip_abort_cmd(struct request *req, void *data, dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag); clear_bit(req->tag, dd->port->cmds_to_issue); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; mtip_softirq_done_fn(req); } @@ -3597,7 +3584,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) int err; err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_request(rq, err); + blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK); return 0; } @@ -3633,8 +3620,8 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, - struct request *rq) +static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct driver_data *dd = hctx->queue->queuedata; struct mtip_int_cmd *icmd = rq->special; @@ -3642,7 +3629,7 @@ static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, struct mtip_cmd_sg *command_sg; if (mtip_commands_active(dd->port)) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; /* Populate the SG list */ cmd->command_header->opts = @@ -3666,10 +3653,10 @@ static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); mtip_issue_non_ncq_command(dd->port, rq->tag); - return BLK_MQ_RQ_QUEUE_OK; + return 0; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; @@ -3681,15 +3668,14 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, return mtip_issue_reserved_cmd(hctx, rq); if (unlikely(mtip_check_unal_depth(hctx, rq))) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; blk_mq_start_request(rq); ret = mtip_submit_request(hctx, rq); if (likely(!ret)) - return BLK_MQ_RQ_QUEUE_OK; - - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_OK; + return BLK_STS_IOERR; } static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, @@ -3730,7 +3716,7 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, if (reserved) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); - cmd->status = -ETIME; + cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } @@ -3961,7 +3947,7 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd->status = -ENODEV; + cmd->status = BLK_STS_IOERR; blk_mq_complete_request(rq); } diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 37b8e3e..e8286af 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -342,7 +342,7 @@ struct mtip_cmd { int retries; /* The number of retries left for this command. */ int direction; /* Data transfer direction */ - int status; + blk_status_t status; }; /* Structure used to describe a port. */ diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f3f191b..977ec96 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -116,7 +116,7 @@ struct nbd_cmd { int index; int cookie; struct completion send_complete; - int status; + blk_status_t status; }; #if IS_ENABLED(CONFIG_DEBUG_FS) @@ -286,7 +286,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, struct nbd_config *config; if (!refcount_inc_not_zero(&nbd->config_refs)) { - cmd->status = -EIO; + cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } @@ -331,7 +331,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, "Connection timed out\n"); } set_bit(NBD_TIMEDOUT, &config->runtime_flags); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; sock_shutdown(nbd); nbd_config_put(nbd); @@ -400,6 +400,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) unsigned long size = blk_rq_bytes(req); struct bio *bio; u32 type; + u32 nbd_cmd_flags = 0; u32 tag = blk_mq_unique_tag(req); int sent = nsock->sent, skip = 0; @@ -429,6 +430,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return -EIO; } + if (req->cmd_flags & REQ_FUA) + nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the * request. @@ -442,7 +446,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) } cmd->index = index; cmd->cookie = nsock->cookie; - request.type = htonl(type); + request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); @@ -465,7 +469,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) nsock->pending = req; nsock->sent = sent; } - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } dev_err_ratelimited(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); @@ -506,7 +510,7 @@ send_pages: */ nsock->pending = req; nsock->sent = sent; - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", @@ -574,7 +578,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; return cmd; } @@ -599,7 +603,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) */ if (nbd_disconnected(config) || config->num_connections <= 1) { - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; return cmd; } return ERR_PTR(-EIO); @@ -651,7 +655,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved) if (!blk_mq_request_started(req)) return; cmd = blk_mq_rq_to_pdu(req); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); } @@ -740,7 +744,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) nbd_config_put(nbd); return -EINVAL; } - cmd->status = 0; + cmd->status = BLK_STS_OK; again: nsock = config->socks[index]; mutex_lock(&nsock->tx_lock); @@ -794,7 +798,7 @@ out: return ret; } -static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); @@ -818,13 +822,9 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * appropriate. */ ret = nbd_handle_cmd(cmd, hctx->queue_num); - if (ret < 0) - ret = BLK_MQ_RQ_QUEUE_ERROR; - if (!ret) - ret = BLK_MQ_RQ_QUEUE_OK; complete(&cmd->send_complete); - return ret; + return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK; } static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, @@ -910,6 +910,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) continue; } sk_set_memalloc(sock->sk); + sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); old = nsock->sock; @@ -957,8 +958,12 @@ static void nbd_parse_flags(struct nbd_device *nbd) set_disk_ro(nbd->disk, false); if (config->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - if (config->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_write_cache(nbd->disk->queue, true, false); + if (config->flags & NBD_FLAG_SEND_FLUSH) { + if (config->flags & NBD_FLAG_SEND_FUA) + blk_queue_write_cache(nbd->disk->queue, true, true); + else + blk_queue_write_cache(nbd->disk->queue, true, false); + } else blk_queue_write_cache(nbd->disk->queue, false, false); } @@ -1071,6 +1076,7 @@ static int nbd_start_device(struct nbd_device *nbd) return -ENOMEM; } sk_set_memalloc(config->socks[i]->sock->sk); + config->socks[i]->sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); INIT_WORK(&args->work, recv_work); @@ -1305,6 +1311,8 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_READ_ONLY\n"); if (flags & NBD_FLAG_SEND_FLUSH) seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); + if (flags & NBD_FLAG_SEND_FUA) + seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index d946e1e..71f4422 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -35,7 +35,8 @@ struct nullb { struct request_queue *q; struct gendisk *disk; struct nvm_dev *ndev; - struct blk_mq_tag_set tag_set; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; struct hrtimer timer; unsigned int queue_depth; spinlock_t lock; @@ -50,6 +51,7 @@ static struct mutex lock; static int null_major; static int nullb_indexes; static struct kmem_cache *ppa_cache; +static struct blk_mq_tag_set tag_set; enum { NULL_IRQ_NONE = 0, @@ -109,7 +111,7 @@ static int bs = 512; module_param(bs, int, S_IRUGO); MODULE_PARM_DESC(bs, "Block size (in bytes)"); -static int nr_devices = 2; +static int nr_devices = 1; module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -121,6 +123,10 @@ static bool blocking; module_param(blocking, bool, S_IRUGO); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); +static bool shared_tags; +module_param(shared_tags, bool, S_IRUGO); +MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); + static int irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) @@ -229,11 +235,11 @@ static void end_cmd(struct nullb_cmd *cmd) switch (queue_mode) { case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, 0); + blk_mq_end_request(cmd->rq, BLK_STS_OK); return; case NULL_Q_RQ: INIT_LIST_HEAD(&cmd->rq->queuelist); - blk_end_request_all(cmd->rq, 0); + blk_end_request_all(cmd->rq, BLK_STS_OK); break; case NULL_Q_BIO: bio_endio(cmd->bio); @@ -356,7 +362,7 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); @@ -373,34 +379,11 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); null_handle_cmd(cmd); - return BLK_MQ_RQ_QUEUE_OK; -} - -static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) -{ - BUG_ON(!nullb); - BUG_ON(!nq); - - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; -} - -static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int index) -{ - struct nullb *nullb = data; - struct nullb_queue *nq = &nullb->queues[index]; - - hctx->driver_data = nq; - null_init_queue(nullb, nq); - nullb->nr_queues++; - - return 0; + return BLK_STS_OK; } static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, - .init_hctx = null_init_hctx, .complete = null_softirq_done_fn, }; @@ -422,11 +405,12 @@ static void cleanup_queues(struct nullb *nullb) #ifdef CONFIG_NVM -static void null_lnvm_end_io(struct request *rq, int error) +static void null_lnvm_end_io(struct request *rq, blk_status_t status) { struct nvm_rq *rqd = rq->end_io_data; - rqd->error = error; + /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */ + rqd->error = status ? -EIO : 0; nvm_end_io(rqd); blk_put_request(rq); @@ -591,8 +575,8 @@ static void null_del_dev(struct nullb *nullb) else del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); + if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); if (!use_lightnvm) put_disk(nullb->disk); cleanup_queues(nullb); @@ -614,6 +598,32 @@ static const struct block_device_operations null_fops = { .release = null_release, }; +static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +{ + BUG_ON(!nullb); + BUG_ON(!nq); + + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; +} + +static void null_init_queues(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + struct blk_mq_hw_ctx *hctx; + struct nullb_queue *nq; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_ctx || !hctx->tags) + continue; + nq = &nullb->queues[i]; + hctx->driver_data = nq; + null_init_queue(nullb, nq); + nullb->nr_queues++; + } +} + static int setup_commands(struct nullb_queue *nq) { struct nullb_cmd *cmd; @@ -694,6 +704,22 @@ static int null_gendisk_register(struct nullb *nullb) return 0; } +static int null_init_tag_set(struct blk_mq_tag_set *set) +{ + set->ops = &null_mq_ops; + set->nr_hw_queues = submit_queues; + set->queue_depth = hw_queue_depth; + set->numa_node = home_node; + set->cmd_size = sizeof(struct nullb_cmd); + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->driver_data = NULL; + + if (blocking) + set->flags |= BLK_MQ_F_BLOCKING; + + return blk_mq_alloc_tag_set(set); +} + static int null_add_dev(void) { struct nullb *nullb; @@ -715,26 +741,23 @@ static int null_add_dev(void) goto out_free_nullb; if (queue_mode == NULL_Q_MQ) { - nullb->tag_set.ops = &null_mq_ops; - nullb->tag_set.nr_hw_queues = submit_queues; - nullb->tag_set.queue_depth = hw_queue_depth; - nullb->tag_set.numa_node = home_node; - nullb->tag_set.cmd_size = sizeof(struct nullb_cmd); - nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - nullb->tag_set.driver_data = nullb; - - if (blocking) - nullb->tag_set.flags |= BLK_MQ_F_BLOCKING; - - rv = blk_mq_alloc_tag_set(&nullb->tag_set); + if (shared_tags) { + nullb->tag_set = &tag_set; + rv = 0; + } else { + nullb->tag_set = &nullb->__tag_set; + rv = null_init_tag_set(nullb->tag_set); + } + if (rv) goto out_cleanup_queues; - nullb->q = blk_mq_init_queue(&nullb->tag_set); + nullb->q = blk_mq_init_queue(nullb->tag_set); if (IS_ERR(nullb->q)) { rv = -ENOMEM; goto out_cleanup_tags; } + null_init_queues(nullb); } else if (queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); if (!nullb->q) { @@ -787,8 +810,8 @@ static int null_add_dev(void) out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); + if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: cleanup_queues(nullb); out_free_nullb: @@ -821,6 +844,9 @@ static int __init null_init(void) queue_mode = NULL_Q_MQ; } + if (queue_mode == NULL_Q_MQ && shared_tags) + null_init_tag_set(&tag_set); + if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { if (submit_queues < nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.", @@ -881,6 +907,9 @@ static void __exit null_exit(void) } mutex_unlock(&lock); + if (queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); + kmem_cache_destroy(ppa_cache); } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index b1267ef..7b8c636 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -305,6 +305,7 @@ static void pcd_init_units(void) put_disk(disk); continue; } + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); cd->disk = disk; cd->pi = &cd->pia; cd->present = 0; @@ -783,7 +784,7 @@ static void pcd_request(void) ps_set_intr(do_pcd_read, NULL, 0, nice); return; } else { - __blk_end_request_all(pcd_req, -EIO); + __blk_end_request_all(pcd_req, BLK_STS_IOERR); pcd_req = NULL; } } @@ -794,7 +795,7 @@ static void do_pcd_request(struct request_queue *q) pcd_request(); } -static inline void next_request(int err) +static inline void next_request(blk_status_t err) { unsigned long saved_flags; @@ -837,7 +838,7 @@ static void pcd_start(void) if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) { pcd_bufblk = -1; - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } @@ -871,7 +872,7 @@ static void do_pcd_read_drq(void) return; } pcd_bufblk = -1; - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 7d2402f..27a44b9 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -438,7 +438,7 @@ static void run_fsm(void) phase = NULL; spin_lock_irqsave(&pd_lock, saved_flags); if (!__blk_end_request_cur(pd_req, - res == Ok ? 0 : -EIO)) { + res == Ok ? 0 : BLK_STS_IOERR)) { if (!set_next_request()) stop = 1; } @@ -863,6 +863,7 @@ static void pd_probe_drive(struct pd_unit *disk) return; } blk_queue_max_hw_sectors(p->queue, cluster); + blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH); if (disk->drive == -1) { for (disk->drive = 0; disk->drive <= 1; disk->drive++) diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f24ca73..eef7a91 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -293,6 +293,7 @@ static void __init pf_init_units(void) return; } blk_queue_max_segments(disk->queue, cluster); + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); pf->disk = disk; pf->pi = &pf->pia; pf->media_status = PF_NM; @@ -801,7 +802,7 @@ static int set_next_request(void) return pf_req != NULL; } -static void pf_end_request(int err) +static void pf_end_request(blk_status_t err) { if (pf_req && !__blk_end_request_cur(pf_req, err)) pf_req = NULL; @@ -821,7 +822,7 @@ repeat: pf_count = blk_rq_cur_sectors(pf_req); if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { - pf_end_request(-EIO); + pf_end_request(BLK_STS_IOERR); goto repeat; } @@ -836,7 +837,7 @@ repeat: pi_do_claimed(pf_current->pi, do_pf_write); else { pf_busy = 0; - pf_end_request(-EIO); + pf_end_request(BLK_STS_IOERR); goto repeat; } } @@ -868,7 +869,7 @@ static int pf_next_buf(void) return 0; } -static inline void next_request(int err) +static inline void next_request(blk_status_t err) { unsigned long saved_flags; @@ -896,7 +897,7 @@ static void do_pf_read_start(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pf_mask = STAT_DRQ; @@ -915,7 +916,7 @@ static void do_pf_read_drq(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pi_read_block(pf_current->pi, pf_buf, 512); @@ -942,7 +943,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } @@ -955,7 +956,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pi_write_block(pf_current->pi, pf_buf, 512); @@ -975,7 +976,7 @@ static void do_pf_write_done(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pi_disconnect(pf_current->pi); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 205b865..467beca 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -98,6 +98,7 @@ static int write_congestion_on = PKT_WRITE_CONGESTION_ON; static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ static mempool_t *psd_pool; +static struct bio_set *pkt_bio_set; static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ @@ -707,7 +708,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - scsi_req_init(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, @@ -952,9 +952,9 @@ static void pkt_end_io_read(struct bio *bio) pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_iter.bi_sector, bio->bi_error); + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&pkt->io_errors); if (atomic_dec_and_test(&pkt->io_wait)) { atomic_inc(&pkt->run_sm); @@ -969,7 +969,7 @@ static void pkt_end_io_packet_write(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_error); + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); pd->stats.pkt_ended++; @@ -1305,16 +1305,16 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_queue_bio(pd, pkt->w_bio); } -static void pkt_finish_packet(struct packet_data *pkt, int error) +static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) { struct bio *bio; - if (error) + if (status) pkt->cache_valid = 0; /* Finish all bios corresponding to this packet */ while ((bio = bio_list_pop(&pkt->orig_bios))) { - bio->bi_error = error; + bio->bi_status = status; bio_endio(bio); } } @@ -1349,7 +1349,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (atomic_read(&pkt->io_wait) > 0) return; - if (!pkt->w_bio->bi_error) { + if (!pkt->w_bio->bi_status) { pkt_set_state(pkt, PACKET_FINISHED_STATE); } else { pkt_set_state(pkt, PACKET_RECOVERY_STATE); @@ -1366,7 +1366,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data break; case PACKET_FINISHED_STATE: - pkt_finish_packet(pkt, pkt->w_bio->bi_error); + pkt_finish_packet(pkt, pkt->w_bio->bi_status); return; default: @@ -2301,7 +2301,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) struct packet_stacked_data *psd = bio->bi_private; struct pktcdvd_device *pd = psd->pd; - psd->bio->bi_error = bio->bi_error; + psd->bio->bi_status = bio->bi_status; bio_put(bio); bio_endio(psd->bio); mempool_free(psd, psd_pool); @@ -2310,7 +2310,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); + struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set); struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); psd->pd = pd; @@ -2412,9 +2412,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) char b[BDEVNAME_SIZE]; struct bio *split; - blk_queue_bounce(q, &bio); - - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); pd = q->queuedata; if (!pd) { @@ -2455,7 +2453,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) split = bio_split(bio, last_zone - bio->bi_iter.bi_sector, - GFP_NOIO, fs_bio_set); + GFP_NOIO, pkt_bio_set); bio_chain(split, bio); } else { split = bio; @@ -2583,6 +2581,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = bdget(dev); if (!bdev) return -ENOMEM; + if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { + WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); + bdput(bdev); + return -EINVAL; + } ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); if (ret) return ret; @@ -2919,6 +2922,11 @@ static int __init pkt_init(void) sizeof(struct packet_stacked_data)); if (!psd_pool) return -ENOMEM; + pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!pkt_bio_set) { + mempool_destroy(psd_pool); + return -ENOMEM; + } ret = register_blkdev(pktdev_major, DRIVER_NAME); if (ret < 0) { @@ -2951,6 +2959,7 @@ out: unregister_blkdev(pktdev_major, DRIVER_NAME); out2: mempool_destroy(psd_pool); + bioset_free(pkt_bio_set); return ret; } @@ -2964,6 +2973,7 @@ static void __exit pkt_exit(void) unregister_blkdev(pktdev_major, DRIVER_NAME); mempool_destroy(psd_pool); + bioset_free(pkt_bio_set); } MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index a809e3e..075662f 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -158,7 +158,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, __LINE__, op, res); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); return 0; } @@ -180,7 +180,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", __func__, __LINE__, res); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); return 0; } @@ -208,7 +208,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, break; default: blk_dump_rq_flags(req, DEVICE_NAME " bad request"); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } } } @@ -231,7 +231,8 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) struct ps3_storage_device *dev = data; struct ps3disk_private *priv; struct request *req; - int res, read, error; + int res, read; + blk_status_t error; u64 tag, status; const char *op; @@ -269,7 +270,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) if (status) { dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__, __LINE__, op, status); - error = -EIO; + error = BLK_STS_IOERR; } else { dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__, __LINE__, op); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 456b4fe..e0e81ca 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -428,7 +428,7 @@ static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev) kfree(priv->cache.tags); } -static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, +static blk_status_t ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -438,7 +438,7 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, (unsigned int)from, len); if (from >= priv->size) - return -EIO; + return BLK_STS_IOERR; if (len > priv->size - from) len = priv->size - from; @@ -472,14 +472,14 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, return 0; } -static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, +static blk_status_t ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); unsigned int cached, count; if (to >= priv->size) - return -EIO; + return BLK_STS_IOERR; if (len > priv->size - to) len = priv->size - to; @@ -554,7 +554,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, int write = bio_data_dir(bio) == WRITE; const char *op = write ? "write" : "read"; loff_t offset = bio->bi_iter.bi_sector << 9; - int error = 0; + blk_status_t error = 0; struct bio_vec bvec; struct bvec_iter iter; struct bio *next; @@ -578,7 +578,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, if (retlen != len) { dev_err(&dev->core, "Short %s\n", op); - error = -EIO; + error = BLK_STS_IOERR; goto out; } @@ -593,7 +593,7 @@ out: next = bio_list_peek(&priv->list); spin_unlock_irq(&priv->lock); - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); return next; } @@ -606,7 +606,7 @@ static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c16f745..b008b6a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -442,6 +442,8 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; +static struct bio_set *rbd_bio_clone; + static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -1363,7 +1365,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, { struct bio *bio; - bio = bio_clone(bio_src, gfpmask); + bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone); if (!bio) return NULL; /* ENOMEM */ @@ -2293,11 +2295,13 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) rbd_assert(img_request->obj_request != NULL); more = obj_request->which < img_request->obj_request_count - 1; } else { + blk_status_t status = errno_to_blk_status(result); + rbd_assert(img_request->rq != NULL); - more = blk_update_request(img_request->rq, result, xferred); + more = blk_update_request(img_request->rq, status, xferred); if (!more) - __blk_mq_end_request(img_request->rq, result); + __blk_mq_end_request(img_request->rq, status); } return more; @@ -4150,17 +4154,17 @@ err_rq: obj_op_name(op_type), length, offset, result); ceph_put_snap_context(snapc); err: - blk_mq_end_request(rq, result); + blk_mq_end_request(rq, errno_to_blk_status(result)); } -static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct work_struct *work = blk_mq_rq_to_pdu(rq); queue_work(rbd_wq, work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void rbd_free_disk(struct rbd_device *rbd_dev) @@ -6414,8 +6418,16 @@ static int rbd_slab_init(void) if (!rbd_obj_request_cache) goto out_err; + rbd_assert(!rbd_bio_clone); + rbd_bio_clone = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!rbd_bio_clone) + goto out_err_clone; + return 0; +out_err_clone: + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; out_err: kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; @@ -6431,6 +6443,10 @@ static void rbd_slab_exit(void) rbd_assert(rbd_img_request_cache); kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; + + rbd_assert(rbd_bio_clone); + bioset_free(rbd_bio_clone); + rbd_bio_clone = NULL; } static int __init rbd_init(void) diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 9c56636..7f4aceb 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -149,9 +149,9 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) { struct rsxx_cardinfo *card = q->queuedata; struct rsxx_bio_meta *bio_meta; - int st = -EINVAL; + blk_status_t st = BLK_STS_IOERR; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); might_sleep(); @@ -161,15 +161,11 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) if (bio_end_sector(bio) > get_capacity(card->gendisk)) goto req_err; - if (unlikely(card->halt)) { - st = -EFAULT; + if (unlikely(card->halt)) goto req_err; - } - if (unlikely(card->dma_fault)) { - st = (-EFAULT); + if (unlikely(card->dma_fault)) goto req_err; - } if (bio->bi_iter.bi_size == 0) { dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); @@ -178,7 +174,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); if (!bio_meta) { - st = -ENOMEM; + st = BLK_STS_RESOURCE; goto req_err; } @@ -205,7 +201,7 @@ queue_err: kmem_cache_free(bio_meta_pool, bio_meta); req_err: if (st) - bio->bi_error = st; + bio->bi_status = st; bio_endio(bio); return BLK_QC_T_NONE; } @@ -288,7 +284,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) } blk_queue_make_request(card->queue, rsxx_make_request); - blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 5a20385f8..6a1b217 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -611,7 +611,7 @@ static void rsxx_schedule_done(struct work_struct *work) mutex_unlock(&ctrl->work_lock); } -static int rsxx_queue_discard(struct rsxx_cardinfo *card, +static blk_status_t rsxx_queue_discard(struct rsxx_cardinfo *card, struct list_head *q, unsigned int laddr, rsxx_dma_cb cb, @@ -621,7 +621,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card, dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); if (!dma) - return -ENOMEM; + return BLK_STS_RESOURCE; dma->cmd = HW_CMD_BLK_DISCARD; dma->laddr = laddr; @@ -640,7 +640,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card, return 0; } -static int rsxx_queue_dma(struct rsxx_cardinfo *card, +static blk_status_t rsxx_queue_dma(struct rsxx_cardinfo *card, struct list_head *q, int dir, unsigned int dma_off, @@ -655,7 +655,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); if (!dma) - return -ENOMEM; + return BLK_STS_RESOURCE; dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; dma->laddr = laddr; @@ -677,7 +677,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, return 0; } -int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, +blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, @@ -694,7 +694,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, unsigned int dma_len; int dma_cnt[RSXX_MAX_TARGETS]; int tgt; - int st; + blk_status_t st; int i; addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ @@ -769,7 +769,6 @@ bvec_err: for (i = 0; i < card->n_targets; i++) rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], FREE_DMA); - return st; } diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 6bbc64d..277f27e 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -391,7 +391,7 @@ int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); void rsxx_dma_cleanup(void); void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); int rsxx_dma_configure(struct rsxx_cardinfo *card); -int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, +blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 27833e4..d036868 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -451,8 +451,8 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, struct skd_special_context *skspcl); static void skd_request_fn(struct request_queue *rq); static void skd_end_request(struct skd_device *skdev, - struct skd_request_context *skreq, int error); -static int skd_preop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq, blk_status_t status); +static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq); static void skd_postop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq); @@ -491,7 +491,7 @@ static void skd_fail_all_pending(struct skd_device *skdev) if (req == NULL) break; blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } } @@ -545,7 +545,6 @@ static void skd_request_fn(struct request_queue *q) struct request *req = NULL; struct skd_scsi_request *scsi_req; unsigned long io_flags; - int error; u32 lba; u32 count; int data_dir; @@ -716,9 +715,7 @@ static void skd_request_fn(struct request_queue *q) if (!req->bio) goto skip_sg; - error = skd_preop_sg_list(skdev, skreq); - - if (error != 0) { + if (!skd_preop_sg_list(skdev, skreq)) { /* * Complete the native request with error. * Note that the request context is still at the @@ -730,7 +727,7 @@ static void skd_request_fn(struct request_queue *q) */ pr_debug("%s:%s:%d error Out\n", skdev->name, __func__, __LINE__); - skd_end_request(skdev, skreq, error); + skd_end_request(skdev, skreq, BLK_STS_RESOURCE); continue; } @@ -805,7 +802,7 @@ skip_sg: } static void skd_end_request(struct skd_device *skdev, - struct skd_request_context *skreq, int error) + struct skd_request_context *skreq, blk_status_t error) { if (unlikely(error)) { struct request *req = skreq->req; @@ -822,7 +819,7 @@ static void skd_end_request(struct skd_device *skdev, __blk_end_request_all(skreq->req, error); } -static int skd_preop_sg_list(struct skd_device *skdev, +static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { struct request *req = skreq->req; @@ -839,7 +836,7 @@ static int skd_preop_sg_list(struct skd_device *skdev, n_sg = blk_rq_map_sg(skdev->queue, req, sg); if (n_sg <= 0) - return -EINVAL; + return false; /* * Map scatterlist to PCI bus addresses. @@ -847,7 +844,7 @@ static int skd_preop_sg_list(struct skd_device *skdev, */ n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir); if (n_sg <= 0) - return -EINVAL; + return false; SKD_ASSERT(n_sg <= skdev->sgs_per_request); @@ -882,7 +879,7 @@ static int skd_preop_sg_list(struct skd_device *skdev, } } - return 0; + return true; } static void skd_postop_sg_list(struct skd_device *skdev, @@ -2333,7 +2330,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { case SKD_CHECK_STATUS_REPORT_GOOD: case SKD_CHECK_STATUS_REPORT_SMART_ALERT: - skd_end_request(skdev, skreq, 0); + skd_end_request(skdev, skreq, BLK_STS_OK); break; case SKD_CHECK_STATUS_BUSY_IMMINENT: @@ -2355,7 +2352,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, case SKD_CHECK_STATUS_REPORT_ERROR: default: - skd_end_request(skdev, skreq, -EIO); + skd_end_request(skdev, skreq, BLK_STS_IOERR); break; } } @@ -2748,7 +2745,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, * native request. */ if (likely(cmp_status == SAM_STAT_GOOD)) - skd_end_request(skdev, skreq, 0); + skd_end_request(skdev, skreq, BLK_STS_OK); else skd_resolve_req_exception(skdev, skreq); } @@ -3190,7 +3187,7 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue) SKD_MAX_RETRIES) blk_requeue_request(skdev->queue, skreq->req); else - skd_end_request(skdev, skreq, -EIO); + skd_end_request(skdev, skreq, BLK_STS_IOERR); skreq->req = NULL; @@ -4276,6 +4273,7 @@ static int skd_cons_disk(struct skd_device *skdev) rc = -ENOMEM; goto err_out; } + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); skdev->queue = q; disk->queue = q; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 3f3a3ab..6b16ead 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -316,7 +316,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, rqe->req = NULL; - __blk_end_request(req, (desc->status ? -EIO : 0), desc->size); + __blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size); vdc_blk_queue_start(port); } @@ -1023,7 +1023,7 @@ static void vdc_queue_drain(struct vdc_port *port) struct request *req; while ((req = blk_fetch_request(port->disk->queue)) != NULL) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } static void vdc_ldc_reset_timer(unsigned long _arg) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 3064be6..84434d3 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -493,7 +493,7 @@ static inline int swim_read_sector(struct floppy_state *fs, return ret; } -static int floppy_read_sectors(struct floppy_state *fs, +static blk_status_t floppy_read_sectors(struct floppy_state *fs, int req_sector, int sectors_nb, unsigned char *buffer) { @@ -516,7 +516,7 @@ static int floppy_read_sectors(struct floppy_state *fs, ret = swim_read_sector(fs, side, track, sector, buffer); if (try-- == 0) - return -EIO; + return BLK_STS_IOERR; } while (ret != 512); buffer += ret; @@ -553,7 +553,7 @@ static void do_fd_request(struct request_queue *q) req = swim_next_request(swd); while (req) { - int err = -EIO; + blk_status_t err = BLK_STS_IOERR; fs = req->rq_disk->private_data; if (blk_rq_pos(req) >= fs->total_secs) @@ -864,6 +864,8 @@ static int swim_floppy_init(struct swim_priv *swd) put_disk(swd->unit[drive].disk); goto exit_put_disks; } + blk_queue_bounce_limit(swd->unit[drive].disk->queue, + BLK_BOUNCE_HIGH); swd->unit[drive].disk->queue->queuedata = swd; swd->unit[drive].swd = swd; } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index ba4809c..9f931f8 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -257,7 +257,7 @@ static unsigned int floppy_check_events(struct gendisk *disk, unsigned int clearing); static int floppy_revalidate(struct gendisk *disk); -static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes) +static bool swim3_end_request(struct floppy_state *fs, blk_status_t err, unsigned int nr_bytes) { struct request *req = fs->cur_req; int rc; @@ -334,7 +334,7 @@ static void start_request(struct floppy_state *fs) if (fs->mdev->media_bay && check_media_bay(fs->mdev->media_bay) != MB_FD) { swim3_dbg("%s", " media bay absent, dropping req\n"); - swim3_end_request(fs, -ENODEV, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } @@ -350,12 +350,12 @@ static void start_request(struct floppy_state *fs) if (blk_rq_pos(req) >= fs->total_secs) { swim3_dbg(" pos out of bounds (%ld, max is %ld)\n", (long)blk_rq_pos(req), (long)fs->total_secs); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } if (fs->ejected) { swim3_dbg("%s", " disk ejected\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } @@ -364,7 +364,7 @@ static void start_request(struct floppy_state *fs) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) { swim3_dbg("%s", " try to write, disk write protected\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } } @@ -548,7 +548,7 @@ static void act(struct floppy_state *fs) if (fs->retries > 5) { swim3_err("Wrong cylinder in transfer, want: %d got %d\n", fs->req_cyl, fs->cur_cyl); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; return; } @@ -584,7 +584,7 @@ static void scan_timeout(unsigned long data) out_8(&sw->intr_enable, 0); fs->cur_cyl = -1; if (fs->retries > 5) { - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); } else { @@ -608,7 +608,7 @@ static void seek_timeout(unsigned long data) out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); swim3_err("%s", "Seek timeout\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); spin_unlock_irqrestore(&swim3_lock, flags); @@ -637,7 +637,7 @@ static void settle_timeout(unsigned long data) goto unlock; } swim3_err("%s", "Seek settle timeout\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); unlock: @@ -666,7 +666,7 @@ static void xfer_timeout(unsigned long data) swim3_err("Timeout %sing sector %ld\n", (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"), (long)blk_rq_pos(fs->cur_req)); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); spin_unlock_irqrestore(&swim3_lock, flags); @@ -703,7 +703,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) swim3_err("%s", "Seen sector but cyl=ff?\n"); fs->cur_cyl = -1; if (fs->retries > 5) { - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); } else { @@ -786,7 +786,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) swim3_err("Error %sing block %ld (err=%x)\n", rq_data_dir(req) == WRITE? "writ": "read", (long)blk_rq_pos(req), err); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; } } else { @@ -795,7 +795,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid); swim3_err(" state=%d, dir=%x, intr=%x, err=%x\n", fs->state, rq_data_dir(req), intr, err); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); break; @@ -1223,6 +1223,7 @@ static int swim3_attach(struct macio_dev *mdev, put_disk(disk); return -ENOMEM; } + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); disk->queue->queuedata = &floppy_states[index]; if (index == 0) { @@ -1245,7 +1246,7 @@ static int swim3_attach(struct macio_dev *mdev, return 0; } -static struct of_device_id swim3_match[] = +static const struct of_device_id swim3_match[] = { { .name = "swim3", diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index c8e072c..08586dc 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -745,7 +745,7 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host, static inline void carm_end_request_queued(struct carm_host *host, struct carm_request *crq, - int error) + blk_status_t error) { struct request *req = crq->rq; int rc; @@ -791,7 +791,7 @@ static inline void carm_round_robin(struct carm_host *host) } static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq, - int error) + blk_status_t error) { carm_end_request_queued(host, crq, error); if (max_queue == 1) @@ -869,14 +869,14 @@ queue_one_request: sg = &crq->sg[0]; n_elem = blk_rq_map_sg(q, rq, sg); if (n_elem <= 0) { - carm_end_rq(host, crq, -EIO); + carm_end_rq(host, crq, BLK_STS_IOERR); return; /* request with no s/g entries? */ } /* map scatterlist to PCI bus addresses */ n_elem = pci_map_sg(host->pdev, sg, n_elem, pci_dir); if (n_elem <= 0) { - carm_end_rq(host, crq, -EIO); + carm_end_rq(host, crq, BLK_STS_IOERR); return; /* request with no s/g entries? */ } crq->n_elem = n_elem; @@ -937,7 +937,7 @@ queue_one_request: static void carm_handle_array_info(struct carm_host *host, struct carm_request *crq, u8 *mem, - int error) + blk_status_t error) { struct carm_port *port; u8 *msg_data = mem + sizeof(struct carm_array_info); @@ -997,7 +997,7 @@ out: static void carm_handle_scan_chan(struct carm_host *host, struct carm_request *crq, u8 *mem, - int error) + blk_status_t error) { u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET; unsigned int i, dev_count = 0; @@ -1029,7 +1029,7 @@ out: } static void carm_handle_generic(struct carm_host *host, - struct carm_request *crq, int error, + struct carm_request *crq, blk_status_t error, int cur_state, int next_state) { DPRINTK("ENTER\n"); @@ -1045,7 +1045,7 @@ static void carm_handle_generic(struct carm_host *host, } static inline void carm_handle_rw(struct carm_host *host, - struct carm_request *crq, int error) + struct carm_request *crq, blk_status_t error) { int pci_dir; @@ -1067,7 +1067,7 @@ static inline void carm_handle_resp(struct carm_host *host, u32 handle = le32_to_cpu(ret_handle_le); unsigned int msg_idx; struct carm_request *crq; - int error = (status == RMSG_OK) ? 0 : -EIO; + blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR; u8 *mem; VPRINTK("ENTER, handle == 0x%x\n", handle); @@ -1155,7 +1155,7 @@ static inline void carm_handle_resp(struct carm_host *host, err_out: printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", pci_name(host->pdev), crq->msg_type, crq->msg_subtype); - carm_end_rq(host, crq, -EIO); + carm_end_rq(host, crq, BLK_STS_IOERR); } static inline void carm_handle_responses(struct carm_host *host) diff --git a/drivers/block/umem.c b/drivers/block/umem.c index c141cc3..0677d25 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -454,7 +454,7 @@ static void process_page(unsigned long data) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (control & DMASCR_HARD_ERROR) { /* error */ - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; dev_printk(KERN_WARNING, &card->dev->dev, "I/O error on sector %d/%d\n", le32_to_cpu(desc->local_addr)>>9, @@ -529,7 +529,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); spin_lock_irq(&card->lock); *card->biotail = bio; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 553cc4c..0297ad7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -64,15 +64,15 @@ struct virtblk_req { struct scatterlist sg[]; }; -static inline int virtblk_result(struct virtblk_req *vbr) +static inline blk_status_t virtblk_result(struct virtblk_req *vbr) { switch (vbr->status) { case VIRTIO_BLK_S_OK: - return 0; + return BLK_STS_OK; case VIRTIO_BLK_S_UNSUPP: - return -ENOTTY; + return BLK_STS_NOTSUPP; default: - return -EIO; + return BLK_STS_IOERR; } } @@ -214,7 +214,7 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; @@ -246,7 +246,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, break; default: WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); @@ -276,8 +276,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) - return BLK_MQ_RQ_QUEUE_BUSY; - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) @@ -286,7 +286,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, if (notify) virtqueue_notify(vblk->vqs[qid].vq); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } /* return id (s/n) string for *disk to *id_str @@ -307,7 +307,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) goto out; blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); - err = virtblk_result(blk_mq_rq_to_pdu(req)); + err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_put_request(req); return err; @@ -720,9 +720,6 @@ static int virtblk_probe(struct virtio_device *vdev) /* We can handle whatever the host told us to handle. */ blk_queue_max_segments(q, vblk->sg_elems-2); - /* No need to bounce any requests */ - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); - /* No real sector limit. */ blk_queue_max_hw_sectors(q, -1U); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 0e82409..fe7cd58 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1066,20 +1066,17 @@ static void xen_blk_drain_io(struct xen_blkif_ring *ring) atomic_set(&blkif->drain, 0); } -/* - * Completion callback on the bio's. Called as bh->b_end_io() - */ - -static void __end_block_io_op(struct pending_req *pending_req, int error) +static void __end_block_io_op(struct pending_req *pending_req, + blk_status_t error) { /* An error fails the entire request. */ - if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && - (error == -EOPNOTSUPP)) { + if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE && + error == BLK_STS_NOTSUPP) { pr_debug("flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; - } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && - (error == -EOPNOTSUPP)) { + } else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER && + error == BLK_STS_NOTSUPP) { pr_debug("write barrier op failed, not supported\n"); xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; @@ -1103,7 +1100,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) */ static void end_block_io_op(struct bio *bio) { - __end_block_io_op(bio->bi_private, bio->bi_error); + __end_block_io_op(bio->bi_private, bio->bi_status); bio_put(bio); } @@ -1420,7 +1417,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, for (i = 0; i < nbio; i++) bio_put(biolist[i]); atomic_set(&pending_req->pendcnt, 1); - __end_block_io_op(pending_req, -EINVAL); + __end_block_io_op(pending_req, BLK_STS_RESOURCE); msleep(1); /* back off a bit */ return -EIO; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 3945963..c852ed3 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -110,11 +110,6 @@ struct blk_shadow { unsigned long associated_id; }; -struct split_bio { - struct bio *bio; - atomic_t pending; -}; - struct blkif_req { int error; }; @@ -881,7 +876,7 @@ static inline bool blkif_request_flush_invalid(struct request *req, !info->feature_fua)); } -static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { unsigned long flags; @@ -904,16 +899,16 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, flush_requests(rinfo); spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_err: spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; out_busy: spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } static void blkif_complete_rq(struct request *rq) @@ -958,9 +953,6 @@ static void blkif_set_queue_limits(struct blkfront_info *info) /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); - - /* Make sure we don't use bounce buffers. */ - blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); } static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -1601,14 +1593,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) continue; } - blkif_req(req)->error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; + if (bret->status == BLKIF_RSP_OKAY) + blkif_req(req)->error = BLK_STS_OK; + else + blkif_req(req)->error = BLK_STS_IOERR; + switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - blkif_req(req)->error = -EOPNOTSUPP; + blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); @@ -1626,11 +1622,11 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - blkif_req(req)->error = -EOPNOTSUPP; + blkif_req(req)->error = BLK_STS_NOTSUPP; } if (unlikely(blkif_req(req)->error)) { - if (blkif_req(req)->error == -EOPNOTSUPP) - blkif_req(req)->error = 0; + if (blkif_req(req)->error == BLK_STS_NOTSUPP) + blkif_req(req)->error = BLK_STS_OK; info->feature_fua = 0; info->feature_flush = 0; xlvbd_flush(info); @@ -1996,28 +1992,13 @@ static int blkfront_probe(struct xenbus_device *dev, return 0; } -static void split_bio_end(struct bio *bio) -{ - struct split_bio *split_bio = bio->bi_private; - - if (atomic_dec_and_test(&split_bio->pending)) { - split_bio->bio->bi_phys_segments = 0; - split_bio->bio->bi_error = bio->bi_error; - bio_endio(split_bio->bio); - kfree(split_bio); - } - bio_put(bio); -} - static int blkif_recover(struct blkfront_info *info) { - unsigned int i, r_index; + unsigned int r_index; struct request *req, *n; int rc; - struct bio *bio, *cloned_bio; - unsigned int segs, offset; - int pending, size; - struct split_bio *split_bio; + struct bio *bio; + unsigned int segs; blkfront_gather_backend_features(info); /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ @@ -2056,34 +2037,6 @@ static int blkif_recover(struct blkfront_info *info) while ((bio = bio_list_pop(&info->bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ - if (bio_segments(bio) > segs) { - /* - * This bio has more segments than what we can - * handle, we have to split it. - */ - pending = (bio_segments(bio) + segs - 1) / segs; - split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); - BUG_ON(split_bio == NULL); - atomic_set(&split_bio->pending, pending); - split_bio->bio = bio; - for (i = 0; i < pending; i++) { - offset = (i * segs * XEN_PAGE_SIZE) >> 9; - size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9, - (unsigned int)bio_sectors(bio) - offset); - cloned_bio = bio_clone(bio, GFP_NOIO); - BUG_ON(cloned_bio == NULL); - bio_trim(cloned_bio, offset, size); - cloned_bio->bi_private = split_bio; - cloned_bio->bi_end_io = split_bio_end; - submit_bio(cloned_bio); - } - /* - * Now we have to wait for all those smaller bios to - * end, so we can also end the "parent" bio. - */ - continue; - } - /* We don't need to split this bio */ submit_bio(bio); } @@ -2137,7 +2090,7 @@ static int blkfront_resume(struct xenbus_device *dev) merge_bio.tail = shadow[j].request->biotail; bio_list_merge(&info->bio_list, &merge_bio); shadow[j].request->bio = NULL; - blk_mq_end_request(shadow[j].request, 0); + blk_mq_end_request(shadow[j].request, BLK_STS_OK); } } diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 757dce2..14459d6 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -471,7 +471,7 @@ static struct request *ace_get_next_request(struct request_queue *q) if (!blk_rq_is_passthrough(req)) break; blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } return req; } @@ -499,11 +499,11 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Drop all in-flight and pending requests */ if (ace->req) { - __blk_end_request_all(ace->req, -EIO); + __blk_end_request_all(ace->req, BLK_STS_IOERR); ace->req = NULL; } while ((req = blk_fetch_request(ace->queue)) != NULL) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; @@ -728,7 +728,7 @@ static void ace_fsm_dostate(struct ace_device *ace) } /* bio finished; is there another one? */ - if (__blk_end_request_cur(ace->req, 0)) { + if (__blk_end_request_cur(ace->req, BLK_STS_OK)) { /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", * blk_rq_sectors(ace->req), * blk_rq_cur_sectors(ace->req)); @@ -993,6 +993,7 @@ static int ace_setup(struct ace_device *ace) if (ace->queue == NULL) goto err_blk_initq; blk_queue_logical_block_size(ace->queue, 512); + blk_queue_bounce_limit(ace->queue, BLK_BOUNCE_HIGH); /* * Allocate and initialize GD structure diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 968f9e5..41c95c9 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -74,14 +74,14 @@ static void do_z2_request(struct request_queue *q) while (req) { unsigned long start = blk_rq_pos(req) << 9; unsigned long len = blk_rq_cur_bytes(req); - int err = 0; + blk_status_t err = BLK_STS_OK; if (start + len > z2ram_size) { pr_err(DEVICE_NAME ": bad access: block=%llu, " "count=%u\n", (unsigned long long)blk_rq_pos(req), blk_rq_cur_sectors(req)); - err = -EIO; + err = BLK_STS_IOERR; goto done; } while (len) { diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 76c952f..e36d160 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2178,6 +2178,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, if (!q) return -ENXIO; + if (!blk_queue_scsi_passthrough(q)) { + WARN_ONCE(true, + "Attempt read CDDA info through a non-SCSI queue\n"); + return -EINVAL; + } + cdi->last_sense = 0; while (nframes) { @@ -2195,7 +2201,6 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, break; } req = scsi_req(rq); - scsi_req_init(rq); ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL); if (ret) { diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 1372763..6495b03f 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -583,7 +583,8 @@ static int gdrom_set_interrupt_handlers(void) */ static void gdrom_readdisk_dma(struct work_struct *work) { - int err, block, block_cnt; + int block, block_cnt; + blk_status_t err; struct packet_command *read_command; struct list_head *elem, *next; struct request *req; @@ -641,7 +642,7 @@ static void gdrom_readdisk_dma(struct work_struct *work) __raw_writeb(1, GDROM_DMA_STATUS_REG); wait_event_interruptible_timeout(request_queue, gd.transfer == 0, GDROM_DEFAULT_TIMEOUT); - err = gd.transfer ? -EIO : 0; + err = gd.transfer ? BLK_STS_IOERR : BLK_STS_OK; gd.transfer = 0; gd.pending = 0; /* now seek to take the request spinlock @@ -670,11 +671,11 @@ static void gdrom_request(struct request_queue *rq) break; case REQ_OP_WRITE: pr_notice("Read only device - write request ignored\n"); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); break; default: printk(KERN_DEBUG "gdrom: Non-fs request ignored\n"); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); break; } } @@ -812,6 +813,7 @@ static int probe_gdrom(struct platform_device *devptr) err = -ENOMEM; goto probe_fail_requestq; } + blk_queue_bounce_limit(gd.gdrom_rq, BLK_BOUNCE_HIGH); err = probe_gdrom_setupqueue(); if (err) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 5901937..14d1e7d 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -93,7 +93,6 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, int error; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; rq->special = (char *)pc; @@ -200,7 +199,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) memset(sense, 0, sizeof(*sense)); blk_rq_init(rq->q, sense_rq); - scsi_req_init(sense_rq); + scsi_req_init(req); err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, GFP_NOIO); @@ -273,7 +272,7 @@ void ide_retry_pc(ide_drive_t *drive) ide_requeue_and_plug(drive, failed_rq); if (ide_queue_sense_rq(drive, pc)) { blk_start_request(failed_rq); - ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq)); } } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -437,7 +436,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* No more interrupts */ if ((stat & ATA_DRQ) == 0) { - int uptodate, error; + int uptodate; + blk_status_t error; debug_log("Packet command completed, %d bytes transferred\n", blk_rq_bytes(rq)); @@ -490,7 +490,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (ata_misc_request(rq)) { scsi_req(rq)->result = 0; - error = 0; + error = BLK_STS_OK; } else { if (blk_rq_is_passthrough(rq) && uptodate <= 0) { @@ -498,7 +498,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) scsi_req(rq)->result = -EIO; } - error = uptodate ? 0 : -EIO; + error = uptodate ? BLK_STS_OK : BLK_STS_IOERR; } ide_complete_rq(drive, error, blk_rq_bytes(rq)); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 07e5ff3..81e18f96 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -228,7 +228,7 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) scsi_req(failed)->sense_len = scsi_req(rq)->sense_len; cdrom_analyze_sense_data(drive, failed); - if (ide_end_rq(drive, failed, -EIO, blk_rq_bytes(failed))) + if (ide_end_rq(drive, failed, BLK_STS_IOERR, blk_rq_bytes(failed))) BUG(); } else cdrom_analyze_sense_data(drive, NULL); @@ -438,7 +438,6 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, rq = blk_get_request(drive->queue, write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB); ide_req(rq)->type = ATA_PRIV_PC; rq->rq_flags |= rq_flags; @@ -508,7 +507,7 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) nr_bytes -= cmd->last_xfer_len; if (nr_bytes > 0) { - ide_complete_rq(drive, 0, nr_bytes); + ide_complete_rq(drive, BLK_STS_OK, nr_bytes); return true; } @@ -674,7 +673,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_rq_is_scsi(rq) && rc == 0) { scsi_req(rq)->resid_len = 0; - blk_end_request_all(rq, 0); + blk_end_request_all(rq, BLK_STS_OK); hwif->rq = NULL; } else { if (sense && uptodate) @@ -699,7 +698,7 @@ out_end: scsi_req(rq)->resid_len += cmd->last_xfer_len; } - ide_complete_rq(drive, uptodate ? 0 : -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, uptodate ? BLK_STS_OK : BLK_STS_IOERR, blk_rq_bytes(rq)); if (sense && rc == 2) ide_error(drive, "request sense failure", stat); @@ -844,7 +843,7 @@ out_end: if (nsectors == 0) nsectors = 1; - ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); + ide_complete_rq(drive, uptodate ? BLK_STS_OK : BLK_STS_IOERR, nsectors << 9); return ide_stopped; } diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 55cd736..9d26c97 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -304,7 +304,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) int ret; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; blk_execute_rq(drive->queue, cd->disk, rq, 0); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 9b69c32..ef7c8c4 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -166,7 +166,6 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, return setting->set(drive, arg); rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 5; scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 7c06237..241983d 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -478,7 +478,6 @@ static int set_multcount(ide_drive_t *drive, int arg) return -EBUSY; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; drive->mult_req = arg; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 51c8122..54d4d78 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -104,7 +104,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_complete_rq(drive, 0, + ide_complete_rq(drive, BLK_STS_OK, blk_rq_sectors(cmd->rq) << 9); return ide_stopped; } diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 4b7ffd7..47d5f337 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -135,7 +135,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) return ide_stopped; } scsi_req(rq)->result = err; - ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, err ? BLK_STS_IOERR : BLK_STS_OK, blk_rq_bytes(rq)); return ide_stopped; } @@ -143,7 +143,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) } EXPORT_SYMBOL_GPL(ide_error); -static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) +static inline void ide_complete_drive_reset(ide_drive_t *drive, blk_status_t err) { struct request *rq = drive->hwif->rq; @@ -151,7 +151,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) { if (err <= 0 && scsi_req(rq)->result == 0) scsi_req(rq)->result = -EIO; - ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, err, blk_rq_bytes(rq)); } } @@ -191,7 +191,7 @@ static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) } /* done polling */ hwif->polling = 0; - ide_complete_drive_reset(drive, 0); + ide_complete_drive_reset(drive, BLK_STS_OK); return ide_stopped; } @@ -225,7 +225,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; const struct ide_port_ops *port_ops = hwif->port_ops; u8 tmp; - int err = 0; + blk_status_t err = BLK_STS_OK; if (port_ops && port_ops->reset_poll) { err = port_ops->reset_poll(drive); @@ -247,7 +247,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) printk(KERN_ERR "%s: reset timed-out, status=0x%02x\n", hwif->name, tmp); drive->failures++; - err = -EIO; + err = BLK_STS_IOERR; } else { tmp = ide_read_error(drive); @@ -257,7 +257,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) } else { ide_reset_report_error(hwif, tmp); drive->failures++; - err = -EIO; + err = BLK_STS_IOERR; } } out: @@ -392,7 +392,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) if (io_ports->ctl_addr == 0) { spin_unlock_irqrestore(&hwif->lock, flags); - ide_complete_drive_reset(drive, -ENXIO); + ide_complete_drive_reset(drive, BLK_STS_IOERR); return ide_stopped; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 8ac6048..627b1f6 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -143,7 +143,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, done); + ide_complete_rq(drive, BLK_STS_IOERR, done); return ide_stopped; } @@ -248,7 +248,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, if (ata_misc_request(rq)) { scsi_req(rq)->result = 0; - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_OK, blk_rq_bytes(rq)); return ide_stopped; } else goto out_end; @@ -303,7 +303,7 @@ out_end: drive->failed_pc = NULL; if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0) scsi_req(rq)->result = -EIO; - ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 323af72..3a234701 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -54,7 +54,7 @@ #include <linux/uaccess.h> #include <asm/io.h> -int ide_end_rq(ide_drive_t *drive, struct request *rq, int error, +int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error, unsigned int nr_bytes) { /* @@ -112,7 +112,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) } } -int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) +int ide_complete_rq(ide_drive_t *drive, blk_status_t error, unsigned int nr_bytes) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; @@ -122,7 +122,7 @@ int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) * if failfast is set on a request, override number of sectors * and complete the whole request right now */ - if (blk_noretry_request(rq) && error <= 0) + if (blk_noretry_request(rq) && error) nr_bytes = blk_rq_sectors(rq) << 9; rc = ide_end_rq(drive, rq, error, nr_bytes); @@ -149,7 +149,7 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq) scsi_req(rq)->result = -EIO; } - ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq)); } static void ide_tf_set_specify_cmd(ide_drive_t *drive, struct ide_taskfile *tf) @@ -272,7 +272,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, printk("%s: DRIVE_CMD (null)\n", drive->name); #endif scsi_req(rq)->result = 0; - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_OK, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 8c0d172..3661abb 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -126,7 +126,6 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) struct request *rq; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; blk_execute_rq(drive->queue, NULL, rq, 0); err = scsi_req(rq)->result ? -EIO : 0; @@ -224,7 +223,6 @@ static int generic_drive_reset(ide_drive_t *drive) int ret = 0; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 94e3107..1f264d5 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -32,7 +32,6 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) spin_unlock_irq(&hwif->lock); rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); scsi_req(rq)->cmd[0] = REQ_PARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; @@ -48,7 +47,6 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) * timeout has expired, so power management will be reenabled. */ rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT); - scsi_req_init(rq); if (IS_ERR(rq)) goto out; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 0977fc1..544f02d 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -19,7 +19,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; @@ -40,7 +39,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } -static void ide_end_sync_rq(struct request *rq, int error) +static void ide_end_sync_rq(struct request *rq, blk_status_t error) { complete(rq->end_io_data); } @@ -57,7 +56,7 @@ static int ide_pm_execute_rq(struct request *rq) if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; scsi_req(rq)->result = -ENXIO; - __blk_end_request_all(rq, 0); + __blk_end_request_all(rq, BLK_STS_OK); spin_unlock_irq(q->queue_lock); return -ENXIO; } @@ -91,7 +90,6 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_PM_RESUME; rq->rq_flags |= RQF_PREEMPT; rq->special = &rqpm; @@ -235,7 +233,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) drive->hwif->rq = NULL; - if (blk_end_request(rq, 0, 0)) + if (blk_end_request(rq, BLK_STS_OK, 0)) BUG(); } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 0235625..01b2adf 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -741,12 +741,12 @@ static void ide_port_tune_devices(ide_hwif_t *hwif) } } -static int ide_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) +static void ide_initialize_rq(struct request *rq) { struct ide_request *req = blk_mq_rq_to_pdu(rq); + scsi_req_init(&req->sreq); req->sreq.sense = req->sense; - return 0; } /* @@ -771,8 +771,9 @@ static int ide_init_queue(ide_drive_t *drive) return 1; q->request_fn = do_ide_request; - q->init_rq_fn = ide_init_rq; + q->initialize_rq_fn = ide_initialize_rq; q->cmd_size = sizeof(struct ide_request); + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); if (blk_init_allocated_queue(q) < 0) { blk_cleanup_queue(q); return 1; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index a0651f9..fd57e8c 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -474,7 +474,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq)); return ide_stopped; } ide_debug_log(IDE_DBG_SENSE, "retry #%d, cmd: 0x%02x", pc->retries, @@ -855,7 +855,6 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(size < 0 || size % tape->blk_size); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd[13] = cmd; rq->rq_disk = tape->disk; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index d71199d..4efe4c6 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -318,7 +318,7 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) } if (nr_bytes > 0) - ide_complete_rq(drive, 0, nr_bytes); + ide_complete_rq(drive, BLK_STS_OK, nr_bytes); } } @@ -336,7 +336,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) ide_driveid_update(drive); } - ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, err ? BLK_STS_IOERR : BLK_STS_OK, blk_rq_bytes(rq)); } /* @@ -394,7 +394,7 @@ out_end: if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_complete_rq(drive, 0, blk_rq_sectors(cmd->rq) << 9); + ide_complete_rq(drive, BLK_STS_OK, blk_rq_sectors(cmd->rq) << 9); return ide_stopped; out_err: ide_error_cmd(drive, cmd); @@ -433,7 +433,6 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, rq = blk_get_request(drive->queue, (cmd->tf_flags & IDE_TFLAG_WRITE) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; /* diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 6a1849b..57eea5a 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -406,7 +406,7 @@ static int siimage_dma_test_irq(ide_drive_t *drive) * yet. */ -static int sil_sata_reset_poll(ide_drive_t *drive) +static blk_status_t sil_sata_reset_poll(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; void __iomem *sata_status_addr @@ -419,11 +419,11 @@ static int sil_sata_reset_poll(ide_drive_t *drive) if ((sata_stat & 0x03) != 0x03) { printk(KERN_WARNING "%s: reset phy dead, status=0x%08x\n", hwif->name, sata_stat); - return -ENXIO; + return BLK_STS_IOERR; } } - return 0; + return BLK_STS_OK; } /** diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 6a4aa60..ddae430 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -252,8 +252,9 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) } mutex_unlock(&dev->mlock); - if (nvm_reserve_luns(dev, s->lun_begin, s->lun_end)) - return -ENOMEM; + ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); + if (ret) + return ret; t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); if (!t) { @@ -640,6 +641,7 @@ EXPORT_SYMBOL(nvm_max_phys_sects); int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { struct nvm_dev *dev = tgt_dev->parent; + int ret; if (!dev->ops->submit_io) return -ENODEV; @@ -647,7 +649,12 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) nvm_rq_tgt_to_dev(tgt_dev, rqd); rqd->dev = tgt_dev; - return dev->ops->submit_io(dev, rqd); + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io(dev, rqd); + if (ret) + nvm_rq_dev_to_tgt(tgt_dev, rqd); + return ret; } EXPORT_SYMBOL(nvm_submit_io); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 59bcea8..024a8fc 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -31,9 +31,13 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) */ retry: ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); - if (ret == NVM_IO_REQUEUE) { + switch (ret) { + case NVM_IO_REQUEUE: io_schedule(); goto retry; + case NVM_IO_ERR: + pblk_pipeline_stop(pblk); + goto out; } if (unlikely(!bio_has_data(bio))) @@ -58,6 +62,8 @@ retry: atomic_long_add(nr_entries, &pblk->req_writes); #endif + pblk_rl_inserted(&pblk->rl, nr_entries); + out: pblk_write_should_kick(pblk); return ret; diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 5e44768..11fe0c5 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -17,7 +17,6 @@ */ #include "pblk.h" -#include <linux/time.h> static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa) @@ -34,7 +33,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", line->id, pos); - pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb); + pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); } static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) @@ -54,6 +53,8 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) *ppa = rqd->ppa_addr; pblk_mark_bb(pblk, line, ppa); } + + atomic_dec(&pblk->inflight_io); } /* Erase completion assumes that only one block is erased at the time */ @@ -61,13 +62,12 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - up(&pblk->erase_sem); __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, pblk->r_rq_pool); + mempool_free(rqd, pblk->g_rq_pool); } -static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) +void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list = NULL; @@ -88,7 +88,7 @@ static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, spin_unlock(&line->lock); return; } - line->vsc--; + le32_add_cpu(line->vsc, -1); if (line->state == PBLK_LINESTATE_CLOSED) move_list = pblk_line_gc_list(pblk, line); @@ -130,18 +130,6 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) __pblk_map_invalidate(pblk, line, paddr); } -void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - __pblk_map_invalidate(pblk, line, paddr); - - pblk_rb_sync_init(&pblk->rwb, NULL); - line->left_ssecs--; - if (!line->left_ssecs) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws); - pblk_rb_sync_end(&pblk->rwb, NULL); -} - static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, unsigned int nr_secs) { @@ -172,8 +160,8 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { - pool = pblk->r_rq_pool; - rq_size = pblk_r_rq_size; + pool = pblk->g_rq_pool; + rq_size = pblk_g_rq_size; } rqd = mempool_alloc(pool, GFP_KERNEL); @@ -189,7 +177,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) if (rw == WRITE) pool = pblk->w_rq_pool; else - pool = pblk->r_rq_pool; + pool = pblk->g_rq_pool; mempool_free(rqd, pool); } @@ -271,35 +259,26 @@ void pblk_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -void pblk_flush_writer(struct pblk *pblk) +void pblk_wait_for_meta(struct pblk *pblk) { - struct bio *bio; - int ret; - DECLARE_COMPLETION_ONSTACK(wait); - - bio = bio_alloc(GFP_KERNEL, 1); - if (!bio) - return; - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_OP_FLUSH); - bio->bi_private = &wait; - bio->bi_end_io = pblk_end_bio_sync; + do { + if (!atomic_read(&pblk->inflight_io)) + break; - ret = pblk_write_to_cache(pblk, bio, 0); - if (ret == NVM_IO_OK) { - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: flush cache timed out\n"); - } - } else if (ret != NVM_IO_DONE) { - pr_err("pblk: tear down bio failed\n"); - } + schedule(); + } while (1); +} - if (bio->bi_error) - pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error); +static void pblk_flush_writer(struct pblk *pblk) +{ + pblk_rb_flush(&pblk->rwb); + do { + if (!pblk_rb_sync_count(&pblk->rwb)) + break; - bio_put(bio); + pblk_write_kick(pblk); + schedule(); + } while (1); } struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) @@ -307,28 +286,31 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list = NULL; + int vsc = le32_to_cpu(*line->vsc); - if (!line->vsc) { + lockdep_assert_held(&line->lock); + + if (!vsc) { if (line->gc_group != PBLK_LINEGC_FULL) { line->gc_group = PBLK_LINEGC_FULL; move_list = &l_mg->gc_full_list; } - } else if (line->vsc < lm->mid_thrs) { + } else if (vsc < lm->high_thrs) { if (line->gc_group != PBLK_LINEGC_HIGH) { line->gc_group = PBLK_LINEGC_HIGH; move_list = &l_mg->gc_high_list; } - } else if (line->vsc < lm->high_thrs) { + } else if (vsc < lm->mid_thrs) { if (line->gc_group != PBLK_LINEGC_MID) { line->gc_group = PBLK_LINEGC_MID; move_list = &l_mg->gc_mid_list; } - } else if (line->vsc < line->sec_in_line) { + } else if (vsc < line->sec_in_line) { if (line->gc_group != PBLK_LINEGC_LOW) { line->gc_group = PBLK_LINEGC_LOW; move_list = &l_mg->gc_low_list; } - } else if (line->vsc == line->sec_in_line) { + } else if (vsc == line->sec_in_line) { if (line->gc_group != PBLK_LINEGC_EMPTY) { line->gc_group = PBLK_LINEGC_EMPTY; move_list = &l_mg->gc_empty_list; @@ -338,7 +320,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) line->gc_group = PBLK_LINEGC_NONE; move_list = &l_mg->corrupt_list; pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", - line->id, line->vsc, + line->id, vsc, line->sec_in_line, lm->high_thrs, lm->mid_thrs); } @@ -397,6 +379,11 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) #endif } +void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) +{ + pblk->sec_per_write = sec_per_write; +} + int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) { struct nvm_tgt_dev *dev = pblk->dev; @@ -431,21 +418,23 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) } } #endif + + atomic_inc(&pblk->inflight_io); + return nvm_submit_io(dev, rqd); } struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, - gfp_t gfp_mask) + int alloc_type, gfp_t gfp_mask) { struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; void *kaddr = data; struct page *page; struct bio *bio; int i, ret; - if (l_mg->emeta_alloc_type == PBLK_KMALLOC_META) + if (alloc_type == PBLK_KMALLOC_META) return bio_map_kern(dev->q, kaddr, len, gfp_mask); bio = bio_kmalloc(gfp_mask, nr_secs); @@ -478,7 +467,7 @@ out: int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, unsigned long secs_to_flush) { - int max = pblk->max_write_pgs; + int max = pblk->sec_per_write; int min = pblk->min_write_pgs; int secs_to_sync = 0; @@ -492,12 +481,26 @@ int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, return secs_to_sync; } -static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, - int nr_secs) +void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) +{ + u64 addr; + int i; + + addr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + line->cur_sec = addr - nr_secs; + + for (i = 0; i < nr_secs; i++, line->cur_sec--) + WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); +} + +u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) { u64 addr; int i; + lockdep_assert_held(&line->lock); + /* logic error: ppa out-of-bounds. Prevent generating bad address */ if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { WARN(1, "pblk: page allocation out of bounds\n"); @@ -528,27 +531,38 @@ u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) return addr; } +u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) +{ + u64 paddr; + + spin_lock(&line->lock); + paddr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + spin_unlock(&line->lock); + + return paddr; +} + /* * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when * taking the per LUN semaphore. */ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, - u64 paddr, int dir) + void *emeta_buf, u64 paddr, int dir) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; + void *ppa_list, *meta_list; struct bio *bio; struct nvm_rq rqd; - struct ppa_addr *ppa_list; - dma_addr_t dma_ppa_list; - void *emeta = line->emeta; + dma_addr_t dma_ppa_list, dma_meta_list; int min = pblk->min_write_pgs; - int left_ppas = lm->emeta_sec; + int left_ppas = lm->emeta_sec[0]; int id = line->id; int rq_ppas, rq_len; int cmd_op, bio_op; - int flags; int i, j; int ret; DECLARE_COMPLETION_ONSTACK(wait); @@ -556,25 +570,28 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, if (dir == WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - flags = pblk_set_progr_mode(pblk, WRITE); } else if (dir == READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; - flags = pblk_set_read_mode(pblk); } else return -EINVAL; - ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list); - if (!ppa_list) + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &dma_meta_list); + if (!meta_list) return -ENOMEM; + ppa_list = meta_list + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + next_rq: memset(&rqd, 0, sizeof(struct nvm_rq)); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); rq_len = rq_ppas * geo->sec_size; - bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL); + bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, + l_mg->emeta_alloc_type, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); goto free_rqd_dma; @@ -584,27 +601,38 @@ next_rq: bio_set_op_attrs(bio, bio_op, 0); rqd.bio = bio; - rqd.opcode = cmd_op; - rqd.flags = flags; - rqd.nr_ppas = rq_ppas; + rqd.meta_list = meta_list; rqd.ppa_list = ppa_list; + rqd.dma_meta_list = dma_meta_list; rqd.dma_ppa_list = dma_ppa_list; + rqd.opcode = cmd_op; + rqd.nr_ppas = rq_ppas; rqd.end_io = pblk_end_io_sync; rqd.private = &wait; if (dir == WRITE) { + struct pblk_sec_meta *meta_list = rqd.meta_list; + + rqd.flags = pblk_set_progr_mode(pblk, WRITE); for (i = 0; i < rqd.nr_ppas; ) { spin_lock(&line->lock); paddr = __pblk_alloc_page(pblk, line, min); spin_unlock(&line->lock); - for (j = 0; j < min; j++, i++, paddr++) + for (j = 0; j < min; j++, i++, paddr++) { + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); + } } } else { for (i = 0; i < rqd.nr_ppas; ) { struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); int pos = pblk_dev_ppa_to_pos(geo, ppa); + int read_type = PBLK_READ_RANDOM; + + if (pblk_io_aligned(pblk, rq_ppas)) + read_type = PBLK_READ_SEQUENTIAL; + rqd.flags = pblk_set_read_mode(pblk, read_type); while (test_bit(pos, line->blk_bitmap)) { paddr += min; @@ -645,9 +673,11 @@ next_rq: msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: emeta I/O timed out\n"); } + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); - bio_put(bio); + if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META)) + bio_put(bio); if (rqd.error) { if (dir == WRITE) @@ -656,12 +686,12 @@ next_rq: pblk_log_read_err(pblk, &rqd); } - emeta += rq_len; + emeta_buf += rq_len; left_ppas -= rq_ppas; if (left_ppas) goto next_rq; free_rqd_dma: - nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; } @@ -697,21 +727,24 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; flags = pblk_set_progr_mode(pblk, WRITE); - lba_list = pblk_line_emeta_to_lbas(line->emeta); + lba_list = emeta_to_lbas(pblk, line->emeta->buf); } else if (dir == READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; - flags = pblk_set_read_mode(pblk); + flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); } else return -EINVAL; memset(&rqd, 0, sizeof(struct nvm_rq)); - rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) + rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_meta_list); + if (!rqd.meta_list) return -ENOMEM; + rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; + rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; + bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); @@ -729,9 +762,15 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.private = &wait; for (i = 0; i < lm->smeta_sec; i++, paddr++) { + struct pblk_sec_meta *meta_list = rqd.meta_list; + rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - if (dir == WRITE) - lba_list[paddr] = cpu_to_le64(ADDR_EMPTY); + + if (dir == WRITE) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + meta_list[i].lba = lba_list[paddr] = addr_empty; + } } /* @@ -750,6 +789,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: smeta I/O timed out\n"); } + atomic_dec(&pblk->inflight_io); if (rqd.error) { if (dir == WRITE) @@ -759,7 +799,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, } free_ppa_list: - nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; } @@ -771,9 +811,11 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); } -int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line) +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf) { - return pblk_line_submit_emeta_io(pblk, line, line->emeta_ssec, READ); + return pblk_line_submit_emeta_io(pblk, line, emeta_buf, + line->emeta_ssec, READ); } static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -789,7 +831,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq rqd; - int ret; + int ret = 0; DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -824,14 +866,14 @@ out: rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); - return 0; + return ret; } int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_meta *lm = &pblk->lm; struct ppa_addr ppa; - int bit = -1; + int ret, bit = -1; /* Erase only good blocks, one at a time */ do { @@ -850,27 +892,59 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); spin_unlock(&line->lock); - if (pblk_blk_erase_sync(pblk, ppa)) { + ret = pblk_blk_erase_sync(pblk, ppa); + if (ret) { pr_err("pblk: failed to erase line %d\n", line->id); - return -ENOMEM; + return ret; } } while (1); return 0; } +static void pblk_line_setup_metadata(struct pblk_line *line, + struct pblk_line_mgmt *l_mg, + struct pblk_line_meta *lm) +{ + int meta_line; + + lockdep_assert_held(&l_mg->free_lock); + +retry_meta: + meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); + if (meta_line == PBLK_DATA_LINES) { + spin_unlock(&l_mg->free_lock); + io_schedule(); + spin_lock(&l_mg->free_lock); + goto retry_meta; + } + + set_bit(meta_line, &l_mg->meta_bitmap); + line->meta_line = meta_line; + + line->smeta = l_mg->sline_meta[meta_line]; + line->emeta = l_mg->eline_meta[meta_line]; + + memset(line->smeta, 0, lm->smeta_len); + memset(line->emeta->buf, 0, lm->emeta_len[0]); + + line->emeta->mem = 0; + atomic_set(&line->emeta->sync, 0); +} + /* For now lines are always assumed full lines. Thus, smeta former and current * lun bitmaps are omitted. */ -static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line, +static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, struct pblk_line *cur) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct line_smeta *smeta = line->smeta; - struct line_emeta *emeta = line->emeta; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; int nr_blk_line; /* After erasing the line, new bad blocks might appear and we risk @@ -893,42 +967,44 @@ static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line, } /* Run-time metadata */ - line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta); + line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); /* Mark LUNs allocated in this line (all for now) */ bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); - smeta->header.identifier = cpu_to_le32(PBLK_MAGIC); - memcpy(smeta->header.uuid, pblk->instance_uuid, 16); - smeta->header.id = cpu_to_le32(line->id); - smeta->header.type = cpu_to_le16(line->type); - smeta->header.version = cpu_to_le16(1); + smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); + memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); + smeta_buf->header.id = cpu_to_le32(line->id); + smeta_buf->header.type = cpu_to_le16(line->type); + smeta_buf->header.version = cpu_to_le16(1); /* Start metadata */ - smeta->seq_nr = cpu_to_le64(line->seq_nr); - smeta->window_wr_lun = cpu_to_le32(geo->nr_luns); + smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); + smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); /* Fill metadata among lines */ if (cur) { memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); - smeta->prev_id = cpu_to_le32(cur->id); - cur->emeta->next_id = cpu_to_le32(line->id); + smeta_buf->prev_id = cpu_to_le32(cur->id); + cur->emeta->buf->next_id = cpu_to_le32(line->id); } else { - smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); + smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); } /* All smeta must be set at this point */ - smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta)); - smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta)); + smeta_buf->header.crc = cpu_to_le32( + pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); + smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); /* End metadata */ - memcpy(&emeta->header, &smeta->header, sizeof(struct line_header)); - emeta->seq_nr = cpu_to_le64(line->seq_nr); - emeta->nr_lbas = cpu_to_le64(line->sec_in_line); - emeta->nr_valid_lbas = cpu_to_le64(0); - emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY); - emeta->crc = cpu_to_le32(0); - emeta->prev_id = smeta->prev_id; + memcpy(&emeta_buf->header, &smeta_buf->header, + sizeof(struct line_header)); + emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); + emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); + emeta_buf->nr_valid_lbas = cpu_to_le64(0); + emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); + emeta_buf->crc = cpu_to_le32(0); + emeta_buf->prev_id = smeta_buf->prev_id; return 1; } @@ -965,7 +1041,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, /* Mark smeta metadata sectors as bad sectors */ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); off = bit * geo->sec_per_pl; -retry_smeta: bitmap_set(line->map_bitmap, off, lm->smeta_sec); line->sec_in_line -= lm->smeta_sec; line->smeta_ssec = off; @@ -973,8 +1048,7 @@ retry_smeta: if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { pr_debug("pblk: line smeta I/O failed. Retry\n"); - off += geo->sec_per_pl; - goto retry_smeta; + return 1; } bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); @@ -983,8 +1057,8 @@ retry_smeta: * blocks to make sure that there are enough sectors to store emeta */ bit = lm->sec_per_line; - off = lm->sec_per_line - lm->emeta_sec; - bitmap_set(line->invalid_bitmap, off, lm->emeta_sec); + off = lm->sec_per_line - lm->emeta_sec[0]; + bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); while (nr_bb) { off -= geo->sec_per_pl; if (!test_bit(off, line->invalid_bitmap)) { @@ -993,9 +1067,11 @@ retry_smeta: } } - line->sec_in_line -= lm->emeta_sec; + line->sec_in_line -= lm->emeta_sec[0]; line->emeta_ssec = off; - line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line; + line->nr_valid_lbas = 0; + line->left_msecs = line->sec_in_line; + *line->vsc = cpu_to_le32(line->sec_in_line); if (lm->sec_per_line - line->sec_in_line != bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { @@ -1034,14 +1110,20 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_FREE) { + mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + mempool_free(line->map_bitmap, pblk->line_meta_pool); spin_unlock(&line->lock); - WARN(1, "pblk: corrupted line state\n"); - return -EINTR; + WARN(1, "pblk: corrupted line %d, state %d\n", + line->id, line->state); + return -EAGAIN; } + line->state = PBLK_LINESTATE_OPEN; atomic_set(&line->left_eblks, blk_in_line); atomic_set(&line->left_seblks, blk_in_line); + + line->meta_distance = lm->meta_distance; spin_unlock(&line->lock); /* Bad blocks do not need to be erased */ @@ -1091,15 +1173,15 @@ struct pblk_line *pblk_line_get(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line = NULL; - int bit; + struct pblk_line *line; + int ret, bit; lockdep_assert_held(&l_mg->free_lock); -retry_get: +retry: if (list_empty(&l_mg->free_list)) { pr_err("pblk: no free lines\n"); - goto out; + return NULL; } line = list_first_entry(&l_mg->free_list, struct pblk_line, list); @@ -1115,16 +1197,22 @@ retry_get: list_add_tail(&line->list, &l_mg->bad_list); pr_debug("pblk: line %d is bad\n", line->id); - goto retry_get; + goto retry; } - if (pblk_line_prepare(pblk, line)) { - pr_err("pblk: failed to prepare line %d\n", line->id); - list_add(&line->list, &l_mg->free_list); - return NULL; + ret = pblk_line_prepare(pblk, line); + if (ret) { + if (ret == -EAGAIN) { + list_add(&line->list, &l_mg->corrupt_list); + goto retry; + } else { + pr_err("pblk: failed to prepare line %d\n", line->id); + list_add(&line->list, &l_mg->free_list); + l_mg->nr_free_lines++; + return NULL; + } } -out: return line; } @@ -1134,6 +1222,7 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk, struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *retry_line; +retry: spin_lock(&l_mg->free_lock); retry_line = pblk_line_get(pblk); if (!retry_line) { @@ -1150,23 +1239,25 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk, l_mg->data_line = retry_line; spin_unlock(&l_mg->free_lock); - if (pblk_line_erase(pblk, retry_line)) { - spin_lock(&l_mg->free_lock); - l_mg->data_line = NULL; - spin_unlock(&l_mg->free_lock); - return NULL; - } - pblk_rl_free_lines_dec(&pblk->rl, retry_line); + if (pblk_line_erase(pblk, retry_line)) + goto retry; + return retry_line; } +static void pblk_set_space_limit(struct pblk *pblk) +{ + struct pblk_rl *rl = &pblk->rl; + + atomic_set(&rl->rb_space, 0); +} + struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line; - int meta_line; int is_next = 0; spin_lock(&l_mg->free_lock); @@ -1180,30 +1271,37 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) line->type = PBLK_LINETYPE_DATA; l_mg->data_line = line; - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - set_bit(meta_line, &l_mg->meta_bitmap); - line->smeta = l_mg->sline_meta[meta_line].meta; - line->emeta = l_mg->eline_meta[meta_line].meta; - line->meta_line = meta_line; + pblk_line_setup_metadata(line, l_mg, &pblk->lm); /* Allocate next line for preparation */ l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { + if (!l_mg->data_next) { + /* If we cannot get a new line, we need to stop the pipeline. + * Only allow as many writes in as we can store safely and then + * fail gracefully + */ + pblk_set_space_limit(pblk); + + l_mg->data_next = NULL; + } else { l_mg->data_next->seq_nr = l_mg->d_seq_nr++; l_mg->data_next->type = PBLK_LINETYPE_DATA; is_next = 1; } spin_unlock(&l_mg->free_lock); + if (pblk_line_erase(pblk, line)) { + line = pblk_line_retry(pblk, line); + if (!line) + return NULL; + } + pblk_rl_free_lines_dec(&pblk->rl, line); if (is_next) pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - if (pblk_line_erase(pblk, line)) - return NULL; - retry_setup: - if (!pblk_line_set_metadata(pblk, line, NULL)) { + if (!pblk_line_init_metadata(pblk, line, NULL)) { line = pblk_line_retry(pblk, line); if (!line) return NULL; @@ -1222,69 +1320,89 @@ retry_setup: return line; } -struct pblk_line *pblk_line_replace_data(struct pblk *pblk) +static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) +{ + lockdep_assert_held(&pblk->l_mg.free_lock); + + pblk_set_space_limit(pblk); + pblk->state = PBLK_STATE_STOPPING; +} + +void pblk_pipeline_stop(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int ret; + + spin_lock(&l_mg->free_lock); + if (pblk->state == PBLK_STATE_RECOVERING || + pblk->state == PBLK_STATE_STOPPED) { + spin_unlock(&l_mg->free_lock); + return; + } + pblk->state = PBLK_STATE_RECOVERING; + spin_unlock(&l_mg->free_lock); + + pblk_flush_writer(pblk); + pblk_wait_for_meta(pblk); + + ret = pblk_recov_pad(pblk); + if (ret) { + pr_err("pblk: could not close data on teardown(%d)\n", ret); + return; + } + + flush_workqueue(pblk->bb_wq); + pblk_line_close_meta_sync(pblk); + + spin_lock(&l_mg->free_lock); + pblk->state = PBLK_STATE_STOPPED; + l_mg->data_line = NULL; + l_mg->data_next = NULL; + spin_unlock(&l_mg->free_lock); +} + +void pblk_line_replace_data(struct pblk *pblk) { - struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *cur, *new; unsigned int left_seblks; - int meta_line; int is_next = 0; cur = l_mg->data_line; new = l_mg->data_next; if (!new) - return NULL; + return; l_mg->data_line = new; -retry_line: + spin_lock(&l_mg->free_lock); + if (pblk->state != PBLK_STATE_RUNNING) { + l_mg->data_line = NULL; + l_mg->data_next = NULL; + spin_unlock(&l_mg->free_lock); + return; + } + + pblk_line_setup_metadata(new, l_mg, &pblk->lm); + spin_unlock(&l_mg->free_lock); + +retry_erase: left_seblks = atomic_read(&new->left_seblks); if (left_seblks) { /* If line is not fully erased, erase it */ if (atomic_read(&new->left_eblks)) { if (pblk_line_erase(pblk, new)) - return NULL; + return; } else { io_schedule(); } - goto retry_line; + goto retry_erase; } - spin_lock(&l_mg->free_lock); - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; - } - -retry_meta: - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - if (meta_line == PBLK_DATA_LINES) { - spin_unlock(&l_mg->free_lock); - io_schedule(); - spin_lock(&l_mg->free_lock); - goto retry_meta; - } - - set_bit(meta_line, &l_mg->meta_bitmap); - new->smeta = l_mg->sline_meta[meta_line].meta; - new->emeta = l_mg->eline_meta[meta_line].meta; - new->meta_line = meta_line; - - memset(new->smeta, 0, lm->smeta_len); - memset(new->emeta, 0, lm->emeta_len); - spin_unlock(&l_mg->free_lock); - - if (is_next) - pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - retry_setup: - if (!pblk_line_set_metadata(pblk, new, cur)) { + if (!pblk_line_init_metadata(pblk, new, cur)) { new = pblk_line_retry(pblk, new); if (!new) - return NULL; + return; goto retry_setup; } @@ -1292,12 +1410,30 @@ retry_setup: if (!pblk_line_init_bb(pblk, new, 1)) { new = pblk_line_retry(pblk, new); if (!new) - return NULL; + return; goto retry_setup; } - return new; + /* Allocate next line for preparation */ + spin_lock(&l_mg->free_lock); + l_mg->data_next = pblk_line_get(pblk); + if (!l_mg->data_next) { + /* If we cannot get a new line, we need to stop the pipeline. + * Only allow as many writes in as we can store safely and then + * fail gracefully + */ + pblk_stop_writes(pblk, new); + l_mg->data_next = NULL; + } else { + l_mg->data_next->seq_nr = l_mg->d_seq_nr++; + l_mg->data_next->type = PBLK_LINETYPE_DATA; + is_next = 1; + } + spin_unlock(&l_mg->free_lock); + + if (is_next) + pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); } void pblk_line_free(struct pblk *pblk, struct pblk_line *line) @@ -1307,6 +1443,8 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) if (line->invalid_bitmap) mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + *line->vsc = cpu_to_le32(EMPTY_ENTRY); + line->map_bitmap = NULL; line->invalid_bitmap = NULL; line->smeta = NULL; @@ -1339,8 +1477,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->r_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_r_rq_size); + rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); + memset(rqd, 0, pblk_g_rq_size); pblk_setup_e_rq(pblk, rqd, ppa); @@ -1368,7 +1506,8 @@ struct pblk_line *pblk_line_get_data(struct pblk *pblk) return pblk->l_mg.data_line; } -struct pblk_line *pblk_line_get_data_next(struct pblk *pblk) +/* For now, always erase next line */ +struct pblk_line *pblk_line_get_erase(struct pblk *pblk) { return pblk->l_mg.data_next; } @@ -1378,18 +1517,58 @@ int pblk_line_is_full(struct pblk_line *line) return (line->left_msecs == 0); } +void pblk_line_close_meta_sync(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line, *tline; + LIST_HEAD(list); + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return; + } + + list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); + spin_unlock(&l_mg->close_lock); + + list_for_each_entry_safe(line, tline, &list, list) { + struct pblk_emeta *emeta = line->emeta; + + while (emeta->mem < lm->emeta_len[0]) { + int ret; + + ret = pblk_submit_meta_io(pblk, line); + if (ret) { + pr_err("pblk: sync meta line %d failed (%d)\n", + line->id, ret); + return; + } + } + } + + pblk_wait_for_meta(pblk); + flush_workqueue(pblk->close_wq); +} + +static void pblk_line_should_sync_meta(struct pblk *pblk) +{ + if (pblk_rl_is_limit(&pblk->rl)) + pblk_line_close_meta_sync(pblk); +} + void pblk_line_close(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list; - line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta)); - - if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE)) - pr_err("pblk: line %d close I/O failed\n", line->id); +#ifdef CONFIG_NVM_DEBUG + struct pblk_line_meta *lm = &pblk->lm; - WARN(!bitmap_full(line->map_bitmap, line->sec_in_line), + WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), "pblk: corrupt closed line %d\n", line->id); +#endif spin_lock(&l_mg->free_lock); WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); @@ -1410,6 +1589,31 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); + + pblk_gc_should_kick(pblk); +} + +void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + + /* No need for exact vsc value; avoid a big line lock and take aprox. */ + memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); + memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); + + emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); + emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); + + spin_lock(&l_mg->close_lock); + spin_lock(&line->lock); + list_add_tail(&line->list, &l_mg->emeta_list); + spin_unlock(&line->lock); + spin_unlock(&l_mg->close_lock); + + pblk_line_should_sync_meta(pblk); } void pblk_line_close_ws(struct work_struct *work) @@ -1449,7 +1653,8 @@ void pblk_line_mark_bb(struct work_struct *work) } void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *)) + void (*work)(struct work_struct *), + struct workqueue_struct *wq) { struct pblk_line_ws *line_ws; @@ -1462,7 +1667,7 @@ void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, line_ws->priv = priv; INIT_WORK(&line_ws->ws, work); - queue_work(pblk->kw_wq, &line_ws->ws); + queue_work(wq, &line_ws->ws); } void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, @@ -1471,7 +1676,7 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_lun *rlun; - int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun; + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); int ret; /* @@ -1488,10 +1693,10 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, /* If the LUN has been locked for this same request, do no attempt to * lock it again */ - if (test_and_set_bit(lun_id, lun_bitmap)) + if (test_and_set_bit(pos, lun_bitmap)) return; - rlun = &pblk->luns[lun_id]; + rlun = &pblk->luns[pos]; ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000)); if (ret) { switch (ret) { diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index eaf479c..6090d28 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -20,8 +20,7 @@ static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) { - kfree(gc_rq->data); - kfree(gc_rq->lba_list); + vfree(gc_rq->data); kfree(gc_rq); } @@ -37,10 +36,8 @@ static int pblk_gc_write(struct pblk *pblk) return 1; } - list_for_each_entry_safe(gc_rq, tgc_rq, &gc->w_list, list) { - list_move_tail(&gc_rq->list, &w_list); - gc->w_entries--; - } + list_cut_position(&w_list, &gc->w_list, gc->w_list.prev); + gc->w_entries = 0; spin_unlock(&gc->w_lock); list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { @@ -48,9 +45,8 @@ static int pblk_gc_write(struct pblk *pblk) gc_rq->nr_secs, gc_rq->secs_to_gc, gc_rq->line, PBLK_IOTYPE_GC); - kref_put(&gc_rq->line->ref, pblk_line_put); - list_del(&gc_rq->list); + kref_put(&gc_rq->line->ref, pblk_line_put); pblk_gc_free_gc_rq(gc_rq); } @@ -66,52 +62,41 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc) * Responsible for managing all memory related to a gc request. Also in case of * failure */ -static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line, - u64 *lba_list, unsigned int nr_secs) +static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_gc *gc = &pblk->gc; - struct pblk_gc_rq *gc_rq; + struct pblk_line *line = gc_rq->line; void *data; unsigned int secs_to_gc; - int ret = NVM_IO_OK; + int ret = 0; - data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL); + data = vmalloc(gc_rq->nr_secs * geo->sec_size); if (!data) { - ret = NVM_IO_ERR; - goto free_lba_list; + ret = -ENOMEM; + goto out; } /* Read from GC victim block */ - if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs, + if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, &secs_to_gc, line)) { - ret = NVM_IO_ERR; + ret = -EFAULT; goto free_data; } if (!secs_to_gc) - goto free_data; - - gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); - if (!gc_rq) { - ret = NVM_IO_ERR; - goto free_data; - } + goto free_rq; - gc_rq->line = line; gc_rq->data = data; - gc_rq->lba_list = lba_list; - gc_rq->nr_secs = nr_secs; gc_rq->secs_to_gc = secs_to_gc; - kref_get(&line->ref); - retry: spin_lock(&gc->w_lock); - if (gc->w_entries > 256) { + if (gc->w_entries >= PBLK_GC_W_QD) { spin_unlock(&gc->w_lock); - usleep_range(256, 1024); + pblk_gc_writer_kick(&pblk->gc); + usleep_range(128, 256); goto retry; } gc->w_entries++; @@ -120,13 +105,14 @@ retry: pblk_gc_writer_kick(&pblk->gc); - return NVM_IO_OK; + return 0; +free_rq: + kfree(gc_rq); free_data: - kfree(data); -free_lba_list: - kfree(lba_list); - + vfree(data); +out: + kref_put(&line->ref, pblk_line_put); return ret; } @@ -150,140 +136,206 @@ static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) static void pblk_gc_line_ws(struct work_struct *work) { + struct pblk_line_ws *line_rq_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = line_rq_ws->pblk; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line = line_rq_ws->line; + struct pblk_gc_rq *gc_rq = line_rq_ws->priv; + + up(&gc->gc_sem); + + if (pblk_gc_move_valid_secs(pblk, gc_rq)) { + pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n", + line->id, *line->vsc, + gc_rq->nr_secs); + } + + mempool_free(line_rq_ws, pblk->line_ws_pool); +} + +static void pblk_gc_line_prepare_ws(struct work_struct *work) +{ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, ws); struct pblk *pblk = line_ws->pblk; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line = line_ws->line; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; - __le64 *lba_list = line_ws->priv; - u64 *gc_list; - int sec_left; - int nr_ppas, bit; - int put_line = 1; + struct pblk_gc *gc = &pblk->gc; + struct line_emeta *emeta_buf; + struct pblk_line_ws *line_rq_ws; + struct pblk_gc_rq *gc_rq; + __le64 *lba_list; + int sec_left, nr_secs, bit; + int ret; - pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); + emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, + GFP_KERNEL); + if (!emeta_buf) { + pr_err("pblk: cannot use GC emeta\n"); + return; + } - spin_lock(&line->lock); - sec_left = line->vsc; - if (!sec_left) { - /* Lines are erased before being used (l_mg->data_/log_next) */ - spin_unlock(&line->lock); - goto out; + ret = pblk_line_read_emeta(pblk, line, emeta_buf); + if (ret) { + pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret); + goto fail_free_emeta; + } + + /* If this read fails, it means that emeta is corrupted. For now, leave + * the line untouched. TODO: Implement a recovery routine that scans and + * moves all sectors on the line. + */ + lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); + if (!lba_list) { + pr_err("pblk: could not interpret emeta (line %d)\n", line->id); + goto fail_free_emeta; } - spin_unlock(&line->lock); + sec_left = pblk_line_vsc(line); if (sec_left < 0) { pr_err("pblk: corrupted GC line (%d)\n", line->id); - put_line = 0; - pblk_put_line_back(pblk, line); - goto out; + goto fail_free_emeta; } bit = -1; next_rq: - gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL); - if (!gc_list) { - put_line = 0; - pblk_put_line_back(pblk, line); - goto out; - } + gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); + if (!gc_rq) + goto fail_free_emeta; - nr_ppas = 0; + nr_secs = 0; do { bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, bit + 1); if (bit > line->emeta_ssec) break; - gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]); - } while (nr_ppas < pblk->max_write_pgs); + gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); + } while (nr_secs < pblk->max_write_pgs); - if (unlikely(!nr_ppas)) { - kfree(gc_list); + if (unlikely(!nr_secs)) { + kfree(gc_rq); goto out; } - if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) { - pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n", - line->id, line->vsc, - nr_ppas, nr_ppas); - put_line = 0; - pblk_put_line_back(pblk, line); - goto out; - } + gc_rq->nr_secs = nr_secs; + gc_rq->line = line; + + line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + if (!line_rq_ws) + goto fail_free_gc_rq; - sec_left -= nr_ppas; + line_rq_ws->pblk = pblk; + line_rq_ws->line = line; + line_rq_ws->priv = gc_rq; + + down(&gc->gc_sem); + kref_get(&line->ref); + + INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); + queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); + + sec_left -= nr_secs; if (sec_left > 0) goto next_rq; out: - pblk_mfree(line->emeta, l_mg->emeta_alloc_type); + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); mempool_free(line_ws, pblk->line_ws_pool); - atomic_dec(&pblk->gc.inflight_gc); - if (put_line) - kref_put(&line->ref, pblk_line_put); + + kref_put(&line->ref, pblk_line_put); + atomic_dec(&gc->inflight_gc); + + return; + +fail_free_gc_rq: + kfree(gc_rq); +fail_free_emeta: + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + pblk_put_line_back(pblk, line); + kref_put(&line->ref, pblk_line_put); + mempool_free(line_ws, pblk->line_ws_pool); + atomic_dec(&gc->inflight_gc); + + pr_err("pblk: Failed to GC line %d\n", line->id); } static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) { - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; + struct pblk_gc *gc = &pblk->gc; struct pblk_line_ws *line_ws; - __le64 *lba_list; - int ret; - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); - line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type, - GFP_KERNEL); - if (!line->emeta) { - pr_err("pblk: cannot use GC emeta\n"); - goto fail_free_ws; - } - - ret = pblk_line_read_emeta(pblk, line); - if (ret) { - pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret); - goto fail_free_emeta; - } + pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); - /* If this read fails, it means that emeta is corrupted. For now, leave - * the line untouched. TODO: Implement a recovery routine that scans and - * moves all sectors on the line. - */ - lba_list = pblk_recov_get_lba_list(pblk, line->emeta); - if (!lba_list) { - pr_err("pblk: could not interpret emeta (line %d)\n", line->id); - goto fail_free_emeta; - } + line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + if (!line_ws) + return -ENOMEM; line_ws->pblk = pblk; line_ws->line = line; - line_ws->priv = lba_list; - INIT_WORK(&line_ws->ws, pblk_gc_line_ws); - queue_work(pblk->gc.gc_reader_wq, &line_ws->ws); + INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); + queue_work(gc->gc_reader_wq, &line_ws->ws); return 0; +} -fail_free_emeta: - pblk_mfree(line->emeta, l_mg->emeta_alloc_type); -fail_free_ws: - mempool_free(line_ws, pblk->line_ws_pool); - pblk_put_line_back(pblk, line); +static int pblk_gc_read(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + + spin_lock(&gc->r_lock); + if (list_empty(&gc->r_list)) { + spin_unlock(&gc->r_lock); + return 1; + } + + line = list_first_entry(&gc->r_list, struct pblk_line, list); + list_del(&line->list); + spin_unlock(&gc->r_lock); + + pblk_gc_kick(pblk); - return 1; + if (pblk_gc_line(pblk, line)) + pr_err("pblk: failed to GC line %d\n", line->id); + + return 0; } -static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list) +static void pblk_gc_reader_kick(struct pblk_gc *gc) { - struct pblk_line *line, *tline; + wake_up_process(gc->gc_reader_ts); +} - list_for_each_entry_safe(line, tline, gc_list, list) { - if (pblk_gc_line(pblk, line)) - pr_err("pblk: failed to GC line %d\n", line->id); - list_del(&line->list); +static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, + struct list_head *group_list) +{ + struct pblk_line *line, *victim; + int line_vsc, victim_vsc; + + victim = list_first_entry(group_list, struct pblk_line, list); + list_for_each_entry(line, group_list, list) { + line_vsc = le32_to_cpu(*line->vsc); + victim_vsc = le32_to_cpu(*victim->vsc); + if (line_vsc < victim_vsc) + victim = line; } + + return victim; +} + +static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) +{ + unsigned int nr_blocks_free, nr_blocks_need; + + nr_blocks_need = pblk_rl_high_thrs(rl); + nr_blocks_free = pblk_rl_nr_free_blks(rl); + + /* This is not critical, no need to take lock here */ + return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); } /* @@ -296,71 +348,83 @@ static void pblk_gc_run(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line, *tline; - unsigned int nr_blocks_free, nr_blocks_need; + struct pblk_line *line; struct list_head *group_list; - int run_gc, gc_group = 0; - int prev_gc = 0; - int inflight_gc = atomic_read(&gc->inflight_gc); - LIST_HEAD(gc_list); + bool run_gc; + int inflight_gc, gc_group = 0, prev_group = 0; + + do { + spin_lock(&l_mg->gc_lock); + if (list_empty(&l_mg->gc_full_list)) { + spin_unlock(&l_mg->gc_lock); + break; + } + + line = list_first_entry(&l_mg->gc_full_list, + struct pblk_line, list); - spin_lock(&l_mg->gc_lock); - list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) { spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_CLOSED); line->state = PBLK_LINESTATE_GC; spin_unlock(&line->lock); list_del(&line->list); + spin_unlock(&l_mg->gc_lock); + kref_put(&line->ref, pblk_line_put); - } - spin_unlock(&l_mg->gc_lock); + } while (1); - nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl); - nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl); - run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced); + run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); + if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) + return; next_gc_group: group_list = l_mg->gc_lists[gc_group++]; - spin_lock(&l_mg->gc_lock); - while (run_gc && !list_empty(group_list)) { - /* No need to queue up more GC lines than we can handle */ - if (!run_gc || inflight_gc > gc->gc_jobs_active) { + + do { + spin_lock(&l_mg->gc_lock); + if (list_empty(group_list)) { spin_unlock(&l_mg->gc_lock); - pblk_gc_lines(pblk, &gc_list); - return; + break; } - line = list_first_entry(group_list, struct pblk_line, list); - nr_blocks_free += atomic_read(&line->blk_in_line); + line = pblk_gc_get_victim_line(pblk, group_list); spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_CLOSED); line->state = PBLK_LINESTATE_GC; - list_move_tail(&line->list, &gc_list); - atomic_inc(&gc->inflight_gc); - inflight_gc++; spin_unlock(&line->lock); - prev_gc = 1; - run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced); - } - spin_unlock(&l_mg->gc_lock); + list_del(&line->list); + spin_unlock(&l_mg->gc_lock); + + spin_lock(&gc->r_lock); + list_add_tail(&line->list, &gc->r_list); + spin_unlock(&gc->r_lock); - pblk_gc_lines(pblk, &gc_list); + inflight_gc = atomic_inc_return(&gc->inflight_gc); + pblk_gc_reader_kick(gc); - if (!prev_gc && pblk->rl.rb_state > gc_group && - gc_group < PBLK_NR_GC_LISTS) + prev_group = 1; + + /* No need to queue up more GC lines than we can handle */ + run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); + if (!run_gc || inflight_gc >= PBLK_GC_L_QD) + break; + } while (1); + + if (!prev_group && pblk->rl.rb_state > gc_group && + gc_group < PBLK_GC_NR_LISTS) goto next_gc_group; } - -static void pblk_gc_kick(struct pblk *pblk) +void pblk_gc_kick(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; wake_up_process(gc->gc_ts); pblk_gc_writer_kick(gc); + pblk_gc_reader_kick(gc); mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); } @@ -398,42 +462,34 @@ static int pblk_gc_writer_ts(void *data) return 0; } -static void pblk_gc_start(struct pblk *pblk) +static int pblk_gc_reader_ts(void *data) { - pblk->gc.gc_active = 1; + struct pblk *pblk = data; - pr_debug("pblk: gc start\n"); + while (!kthread_should_stop()) { + if (!pblk_gc_read(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; } -int pblk_gc_status(struct pblk *pblk) +static void pblk_gc_start(struct pblk *pblk) { - struct pblk_gc *gc = &pblk->gc; - int ret; - - spin_lock(&gc->lock); - ret = gc->gc_active; - spin_unlock(&gc->lock); - - return ret; + pblk->gc.gc_active = 1; + pr_debug("pblk: gc start\n"); } -static void __pblk_gc_should_start(struct pblk *pblk) +void pblk_gc_should_start(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - lockdep_assert_held(&gc->lock); - if (gc->gc_enabled && !gc->gc_active) pblk_gc_start(pblk); -} -void pblk_gc_should_start(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&gc->lock); - __pblk_gc_should_start(pblk); - spin_unlock(&gc->lock); + pblk_gc_kick(pblk); } /* @@ -442,10 +498,7 @@ void pblk_gc_should_start(struct pblk *pblk) */ static void pblk_gc_stop(struct pblk *pblk, int flush_wq) { - spin_lock(&pblk->gc.lock); pblk->gc.gc_active = 0; - spin_unlock(&pblk->gc.lock); - pr_debug("pblk: gc stop\n"); } @@ -468,20 +521,25 @@ void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, spin_unlock(&gc->lock); } -void pblk_gc_sysfs_force(struct pblk *pblk, int force) +int pblk_gc_sysfs_force(struct pblk *pblk, int force) { struct pblk_gc *gc = &pblk->gc; - int rsv = 0; + + if (force < 0 || force > 1) + return -EINVAL; spin_lock(&gc->lock); - if (force) { - gc->gc_enabled = 1; - rsv = 64; - } - pblk_rl_set_gc_rsc(&pblk->rl, rsv); gc->gc_forced = force; - __pblk_gc_should_start(pblk); + + if (force) + gc->gc_enabled = 1; + else + gc->gc_enabled = 0; spin_unlock(&gc->lock); + + pblk_gc_should_start(pblk); + + return 0; } int pblk_gc_init(struct pblk *pblk) @@ -503,30 +561,58 @@ int pblk_gc_init(struct pblk *pblk) goto fail_free_main_kthread; } + gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, + "pblk-gc-reader-ts"); + if (IS_ERR(gc->gc_reader_ts)) { + pr_err("pblk: could not allocate GC reader kthread\n"); + ret = PTR_ERR(gc->gc_reader_ts); + goto fail_free_writer_kthread; + } + setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk); mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); gc->gc_active = 0; gc->gc_forced = 0; gc->gc_enabled = 1; - gc->gc_jobs_active = 8; gc->w_entries = 0; atomic_set(&gc->inflight_gc, 0); - gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active); + /* Workqueue that reads valid sectors from a line and submit them to the + * GC writer to be recycled. + */ + gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); + if (!gc->gc_line_reader_wq) { + pr_err("pblk: could not allocate GC line reader workqueue\n"); + ret = -ENOMEM; + goto fail_free_reader_kthread; + } + + /* Workqueue that prepare lines for GC */ + gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!gc->gc_reader_wq) { pr_err("pblk: could not allocate GC reader workqueue\n"); ret = -ENOMEM; - goto fail_free_writer_kthread; + goto fail_free_reader_line_wq; } spin_lock_init(&gc->lock); spin_lock_init(&gc->w_lock); + spin_lock_init(&gc->r_lock); + + sema_init(&gc->gc_sem, 128); + INIT_LIST_HEAD(&gc->w_list); + INIT_LIST_HEAD(&gc->r_list); return 0; +fail_free_reader_line_wq: + destroy_workqueue(gc->gc_line_reader_wq); +fail_free_reader_kthread: + kthread_stop(gc->gc_reader_ts); fail_free_writer_kthread: kthread_stop(gc->gc_writer_ts); fail_free_main_kthread: @@ -540,6 +626,7 @@ void pblk_gc_exit(struct pblk *pblk) struct pblk_gc *gc = &pblk->gc; flush_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); del_timer(&gc->gc_timer); pblk_gc_stop(pblk, 1); @@ -547,9 +634,15 @@ void pblk_gc_exit(struct pblk *pblk) if (gc->gc_ts) kthread_stop(gc->gc_ts); - if (pblk->gc.gc_reader_wq) - destroy_workqueue(pblk->gc.gc_reader_wq); + if (gc->gc_reader_wq) + destroy_workqueue(gc->gc_reader_wq); + + if (gc->gc_line_reader_wq) + destroy_workqueue(gc->gc_line_reader_wq); if (gc->gc_writer_ts) kthread_stop(gc->gc_writer_ts); + + if (gc->gc_reader_ts) + kthread_stop(gc->gc_reader_ts); } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ae8cd6d..1b0f612 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -20,9 +20,10 @@ #include "pblk.h" -static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache, - *pblk_w_rq_cache, *pblk_line_meta_cache; +static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, + *pblk_w_rq_cache, *pblk_line_meta_cache; static DECLARE_RWSEM(pblk_lock); +struct bio_set *pblk_bio_set; static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, struct bio *bio) @@ -33,7 +34,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * constraint. Writes can be of arbitrary size. */ if (bio_data_dir(bio) == READ) { - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); ret = pblk_submit_read(pblk, bio); if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED)) bio_put(bio); @@ -46,7 +47,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * available for user I/O. */ if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); } @@ -199,9 +200,9 @@ static int pblk_init_global_caches(struct pblk *pblk) return -ENOMEM; } - pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size, + pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, 0, 0, NULL); - if (!pblk_r_rq_cache) { + if (!pblk_g_rq_cache) { kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); up_write(&pblk_lock); @@ -213,7 +214,7 @@ static int pblk_init_global_caches(struct pblk *pblk) if (!pblk_w_rq_cache) { kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_g_rq_cache); up_write(&pblk_lock); return -ENOMEM; } @@ -225,7 +226,7 @@ static int pblk_init_global_caches(struct pblk *pblk) if (!pblk_line_meta_cache) { kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); up_write(&pblk_lock); return -ENOMEM; @@ -239,27 +240,10 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - int max_write_ppas; - int mod; - pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); - max_write_ppas = pblk->min_write_pgs * geo->nr_luns; - pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? - max_write_ppas : nvm_max_phys_sects(dev); pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * geo->nr_planes * geo->nr_luns; - if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { - pr_err("pblk: cannot support device max_phys_sect\n"); - return -EINVAL; - } - - div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); - if (mod) { - pr_err("pblk: bad configuration of sectors/pages\n"); - return -EINVAL; - } - if (pblk_init_global_caches(pblk)) return -ENOMEM; @@ -267,7 +251,7 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->page_pool) return -ENOMEM; - pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, pblk_blk_ws_cache); if (!pblk->line_ws_pool) goto free_page_pool; @@ -276,41 +260,51 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->rec_pool) goto free_blk_ws_pool; - pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache); - if (!pblk->r_rq_pool) + pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, + pblk_g_rq_cache); + if (!pblk->g_rq_pool) goto free_rec_pool; - pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache); + pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, + pblk_w_rq_cache); if (!pblk->w_rq_pool) - goto free_r_rq_pool; + goto free_g_rq_pool; pblk->line_meta_pool = - mempool_create_slab_pool(16, pblk_line_meta_cache); + mempool_create_slab_pool(PBLK_META_POOL_SIZE, + pblk_line_meta_cache); if (!pblk->line_meta_pool) goto free_w_rq_pool; - pblk->kw_wq = alloc_workqueue("pblk-aux-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!pblk->kw_wq) + pblk->close_wq = alloc_workqueue("pblk-close-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); + if (!pblk->close_wq) goto free_line_meta_pool; + pblk->bb_wq = alloc_workqueue("pblk-bb-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->bb_wq) + goto free_close_wq; + if (pblk_set_ppaf(pblk)) - goto free_kw_wq; + goto free_bb_wq; if (pblk_rwb_init(pblk)) - goto free_kw_wq; + goto free_bb_wq; INIT_LIST_HEAD(&pblk->compl_list); return 0; -free_kw_wq: - destroy_workqueue(pblk->kw_wq); +free_bb_wq: + destroy_workqueue(pblk->bb_wq); +free_close_wq: + destroy_workqueue(pblk->close_wq); free_line_meta_pool: mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); -free_r_rq_pool: - mempool_destroy(pblk->r_rq_pool); +free_g_rq_pool: + mempool_destroy(pblk->g_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); free_blk_ws_pool: @@ -322,19 +316,22 @@ free_page_pool: static void pblk_core_free(struct pblk *pblk) { - if (pblk->kw_wq) - destroy_workqueue(pblk->kw_wq); + if (pblk->close_wq) + destroy_workqueue(pblk->close_wq); + + if (pblk->bb_wq) + destroy_workqueue(pblk->bb_wq); mempool_destroy(pblk->page_pool); mempool_destroy(pblk->line_ws_pool); mempool_destroy(pblk->rec_pool); - mempool_destroy(pblk->r_rq_pool); + mempool_destroy(pblk->g_rq_pool); mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); kmem_cache_destroy(pblk_line_meta_cache); } @@ -344,6 +341,12 @@ static void pblk_luns_free(struct pblk *pblk) kfree(pblk->luns); } +static void pblk_free_line_bitmaps(struct pblk_line *line) +{ + kfree(line->blk_bitmap); + kfree(line->erase_bitmap); +} + static void pblk_lines_free(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -355,8 +358,7 @@ static void pblk_lines_free(struct pblk *pblk) line = &pblk->lines[i]; pblk_line_free(pblk, line); - kfree(line->blk_bitmap); - kfree(line->erase_bitmap); + pblk_free_line_bitmaps(line); } spin_unlock(&l_mg->free_lock); } @@ -368,11 +370,15 @@ static void pblk_line_meta_free(struct pblk *pblk) kfree(l_mg->bb_template); kfree(l_mg->bb_aux); + kfree(l_mg->vsc_list); + spin_lock(&l_mg->free_lock); for (i = 0; i < PBLK_DATA_LINES; i++) { - pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type); - pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type); + kfree(l_mg->sline_meta[i]); + pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); + kfree(l_mg->eline_meta[i]); } + spin_unlock(&l_mg->free_lock); kfree(pblk->lines); } @@ -411,13 +417,31 @@ out: return ret; } -static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line) +static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line, + int blk_per_line) { - struct pblk_line_meta *lm = &pblk->lm; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; struct pblk_lun *rlun; int bb_cnt = 0; int i; + for (i = 0; i < blk_per_line; i++) { + rlun = &pblk->luns[i]; + if (rlun->bb_list[line->id] == NVM_BLK_T_FREE) + continue; + + set_bit(pblk_ppa_to_pos(geo, rlun->bppa), line->blk_bitmap); + bb_cnt++; + } + + return bb_cnt; +} + +static int pblk_alloc_line_bitmaps(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); if (!line->blk_bitmap) return -ENOMEM; @@ -428,16 +452,7 @@ static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line) return -ENOMEM; } - for (i = 0; i < lm->blk_per_line; i++) { - rlun = &pblk->luns[i]; - if (rlun->bb_list[line->id] == NVM_BLK_T_FREE) - continue; - - set_bit(i, line->blk_bitmap); - bb_cnt++; - } - - return bb_cnt; + return 0; } static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns) @@ -505,12 +520,32 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } /* See comment over struct line_emeta definition */ -static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm) +static unsigned int calc_emeta_len(struct pblk *pblk) { - return (sizeof(struct line_emeta) + - ((lm->sec_per_line - lm->emeta_sec) * sizeof(u64)) + - (pblk->l_mg.nr_lines * sizeof(u32)) + - lm->blk_bitmap_len); + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + /* Round to sector size so that lba_list starts on its own sector */ + lm->emeta_sec[1] = DIV_ROUND_UP( + sizeof(struct line_emeta) + lm->blk_bitmap_len, + geo->sec_size); + lm->emeta_len[1] = lm->emeta_sec[1] * geo->sec_size; + + /* Round to sector size so that vsc_list starts on its own sector */ + lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; + lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), + geo->sec_size); + lm->emeta_len[2] = lm->emeta_sec[2] * geo->sec_size; + + lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), + geo->sec_size); + lm->emeta_len[3] = lm->emeta_sec[3] * geo->sec_size; + + lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); + + return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); } static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) @@ -534,6 +569,78 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) atomic_set(&pblk->rl.free_blocks, nr_free_blks); } +static int pblk_lines_alloc_metadata(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + int i; + + /* smeta is always small enough to fit on a kmalloc memory allocation, + * emeta depends on the number of LUNs allocated to the pblk instance + */ + for (i = 0; i < PBLK_DATA_LINES; i++) { + l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); + if (!l_mg->sline_meta[i]) + goto fail_free_smeta; + } + + /* emeta allocates three different buffers for managing metadata with + * in-memory and in-media layouts + */ + for (i = 0; i < PBLK_DATA_LINES; i++) { + struct pblk_emeta *emeta; + + emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL); + if (!emeta) + goto fail_free_emeta; + + if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) { + l_mg->emeta_alloc_type = PBLK_VMALLOC_META; + + emeta->buf = vmalloc(lm->emeta_len[0]); + if (!emeta->buf) { + kfree(emeta); + goto fail_free_emeta; + } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; + } else { + l_mg->emeta_alloc_type = PBLK_KMALLOC_META; + + emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL); + if (!emeta->buf) { + kfree(emeta); + goto fail_free_emeta; + } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; + } + } + + l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); + if (!l_mg->vsc_list) + goto fail_free_emeta; + + for (i = 0; i < l_mg->nr_lines; i++) + l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); + + return 0; + +fail_free_emeta: + while (--i >= 0) { + vfree(l_mg->eline_meta[i]->buf); + kfree(l_mg->eline_meta[i]); + } + +fail_free_smeta: + for (i = 0; i < PBLK_DATA_LINES; i++) + kfree(l_mg->sline_meta[i]); + + return -ENOMEM; +} + static int pblk_lines_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; @@ -542,10 +649,32 @@ static int pblk_lines_init(struct pblk *pblk) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line *line; unsigned int smeta_len, emeta_len; - long nr_bad_blks, nr_meta_blks, nr_free_blks; - int bb_distance; - int i; - int ret; + long nr_bad_blks, nr_free_blks; + int bb_distance, max_write_ppas, mod; + int i, ret; + + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->nr_luns; + pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? + max_write_ppas : nvm_max_phys_sects(dev); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: cannot support device max_phys_sect\n"); + return -EINVAL; + } + + div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); + if (mod) { + pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } + + l_mg->nr_lines = geo->blks_per_lun; + l_mg->log_line = l_mg->data_line = NULL; + l_mg->l_seq_nr = l_mg->d_seq_nr = 0; + l_mg->nr_free_lines = 0; + bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; lm->blk_per_line = geo->nr_luns; @@ -554,20 +683,17 @@ static int pblk_lines_init(struct pblk *pblk) lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->high_thrs = lm->sec_per_line / 2; lm->mid_thrs = lm->sec_per_line / 4; + lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct * line_smeta definition */ - lm->smeta_len = sizeof(struct line_smeta) + - PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len; - i = 1; add_smeta_page: lm->smeta_sec = i * geo->sec_per_pl; lm->smeta_len = lm->smeta_sec * geo->sec_size; - smeta_len = sizeof(struct line_smeta) + - PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len; + smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; if (smeta_len > lm->smeta_len) { i++; goto add_smeta_page; @@ -578,66 +704,28 @@ add_smeta_page: */ i = 1; add_emeta_page: - lm->emeta_sec = i * geo->sec_per_pl; - lm->emeta_len = lm->emeta_sec * geo->sec_size; + lm->emeta_sec[0] = i * geo->sec_per_pl; + lm->emeta_len[0] = lm->emeta_sec[0] * geo->sec_size; - emeta_len = calc_emeta_len(pblk, lm); - if (emeta_len > lm->emeta_len) { + emeta_len = calc_emeta_len(pblk); + if (emeta_len > lm->emeta_len[0]) { i++; goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns - i; - - nr_meta_blks = (lm->smeta_sec + lm->emeta_sec + - (geo->sec_per_blk / 2)) / geo->sec_per_blk; - lm->min_blk_line = nr_meta_blks + 1; - - l_mg->nr_lines = geo->blks_per_lun; - l_mg->log_line = l_mg->data_line = NULL; - l_mg->l_seq_nr = l_mg->d_seq_nr = 0; - l_mg->nr_free_lines = 0; - bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - /* smeta is always small enough to fit on a kmalloc memory allocation, - * emeta depends on the number of LUNs allocated to the pblk instance - */ - l_mg->smeta_alloc_type = PBLK_KMALLOC_META; - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL); - if (!l_mg->sline_meta[i].meta) - while (--i >= 0) { - kfree(l_mg->sline_meta[i].meta); - ret = -ENOMEM; - goto fail; - } + lm->emeta_bb = geo->nr_luns - i; + lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], + geo->sec_per_blk); + if (lm->min_blk_line > lm->blk_per_line) { + pr_err("pblk: config. not supported. Min. LUN in line:%d\n", + lm->blk_per_line); + ret = -EINVAL; + goto fail; } - if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE) { - l_mg->emeta_alloc_type = PBLK_VMALLOC_META; - - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len); - if (!l_mg->eline_meta[i].meta) - while (--i >= 0) { - vfree(l_mg->eline_meta[i].meta); - ret = -ENOMEM; - goto fail; - } - } - } else { - l_mg->emeta_alloc_type = PBLK_KMALLOC_META; - - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->eline_meta[i].meta = - kmalloc(lm->emeta_len, GFP_KERNEL); - if (!l_mg->eline_meta[i].meta) - while (--i >= 0) { - kfree(l_mg->eline_meta[i].meta); - ret = -ENOMEM; - goto fail; - } - } - } + ret = pblk_lines_alloc_metadata(pblk); + if (ret) + goto fail; l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); if (!l_mg->bb_template) { @@ -664,11 +752,14 @@ add_emeta_page: INIT_LIST_HEAD(&l_mg->gc_low_list); INIT_LIST_HEAD(&l_mg->gc_empty_list); + INIT_LIST_HEAD(&l_mg->emeta_list); + l_mg->gc_lists[0] = &l_mg->gc_high_list; l_mg->gc_lists[1] = &l_mg->gc_mid_list; l_mg->gc_lists[2] = &l_mg->gc_low_list; spin_lock_init(&l_mg->free_lock); + spin_lock_init(&l_mg->close_lock); spin_lock_init(&l_mg->gc_lock); pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), @@ -689,10 +780,16 @@ add_emeta_page: line->type = PBLK_LINETYPE_FREE; line->state = PBLK_LINESTATE_FREE; line->gc_group = PBLK_LINEGC_NONE; + line->vsc = &l_mg->vsc_list[i]; spin_lock_init(&line->lock); - nr_bad_blks = pblk_bb_line(pblk, line); + ret = pblk_alloc_line_bitmaps(pblk, line); + if (ret) + goto fail_free_lines; + + nr_bad_blks = pblk_bb_line(pblk, line, lm->blk_per_line); if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) { + pblk_free_line_bitmaps(line); ret = -EINVAL; goto fail_free_lines; } @@ -713,24 +810,20 @@ add_emeta_page: pblk_set_provision(pblk, nr_free_blks); - sema_init(&pblk->erase_sem, 1); - /* Cleanup per-LUN bad block lists - managed within lines on run-time */ for (i = 0; i < geo->nr_luns; i++) kfree(pblk->luns[i].bb_list); return 0; fail_free_lines: - kfree(pblk->lines); + while (--i >= 0) + pblk_free_line_bitmaps(&pblk->lines[i]); fail_free_bb_aux: kfree(l_mg->bb_aux); fail_free_bb_template: kfree(l_mg->bb_template); fail_free_meta: - for (i = 0; i < PBLK_DATA_LINES; i++) { - pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type); - pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type); - } + pblk_line_meta_free(pblk); fail: for (i = 0; i < geo->nr_luns; i++) kfree(pblk->luns[i].bb_list); @@ -754,6 +847,15 @@ static int pblk_writer_init(struct pblk *pblk) static void pblk_writer_stop(struct pblk *pblk) { + /* The pipeline must be stopped and the write buffer emptied before the + * write thread is stopped + */ + WARN(pblk_rb_read_count(&pblk->rwb), + "Stopping not fully persisted write buffer\n"); + + WARN(pblk_rb_sync_count(&pblk->rwb), + "Stopping not fully synced write buffer\n"); + if (pblk->writer_ts) kthread_stop(pblk->writer_ts); del_timer(&pblk->wtimer); @@ -772,10 +874,9 @@ static void pblk_free(struct pblk *pblk) static void pblk_tear_down(struct pblk *pblk) { - pblk_flush_writer(pblk); + pblk_pipeline_stop(pblk); pblk_writer_stop(pblk); pblk_rb_sync_l2p(&pblk->rwb); - pblk_recov_pad(pblk); pblk_rwb_free(pblk); pblk_rl_free(&pblk->rl); @@ -821,6 +922,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->dev = dev; pblk->disk = tdisk; + pblk->state = PBLK_STATE_RUNNING; spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); @@ -836,8 +938,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, atomic_long_set(&pblk->req_writes, 0); atomic_long_set(&pblk->sub_writes, 0); atomic_long_set(&pblk->sync_writes, 0); - atomic_long_set(&pblk->compl_writes, 0); atomic_long_set(&pblk->inflight_reads, 0); + atomic_long_set(&pblk->cache_reads, 0); atomic_long_set(&pblk->sync_reads, 0); atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_writes, 0); @@ -946,11 +1048,20 @@ static struct nvm_tgt_type tt_pblk = { static int __init pblk_module_init(void) { - return nvm_register_tgt_type(&tt_pblk); + int ret; + + pblk_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!pblk_bio_set) + return -ENOMEM; + ret = nvm_register_tgt_type(&tt_pblk); + if (ret) + bioset_free(pblk_bio_set); + return ret; } static void pblk_module_exit(void) { + bioset_free(pblk_bio_set); nvm_unregister_tgt_type(&tt_pblk); } diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 17c1695..fddb924 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -25,9 +25,9 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); - struct line_emeta *emeta = line->emeta; + struct pblk_emeta *emeta = line->emeta; struct pblk_w_ctx *w_ctx; - __le64 *lba_list = pblk_line_emeta_to_lbas(emeta); + __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); u64 paddr; int nr_secs = pblk->min_write_pgs; int i; @@ -51,18 +51,20 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, w_ctx->ppa = ppa_list[i]; meta_list[i].lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); - le64_add_cpu(&line->emeta->nr_valid_lbas, 1); + line->nr_valid_lbas++; } else { - meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); - lba_list[paddr] = cpu_to_le64(ADDR_EMPTY); - pblk_map_pad_invalidate(pblk, line, paddr); + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + lba_list[paddr] = meta_list[i].lba = addr_empty; + __pblk_map_invalidate(pblk, line, paddr); } } if (pblk_line_is_full(line)) { - line = pblk_line_replace_data(pblk); - if (!line) - return; + struct pblk_line *prev_line = line; + + pblk_line_replace_data(pblk); + pblk_line_close_meta(pblk, prev_line); } pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); @@ -91,8 +93,9 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_line *e_line = pblk_line_get_data_next(pblk); + struct pblk_line_meta *lm = &pblk->lm; struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_line *e_line, *d_line; unsigned int map_secs; int min = pblk->min_write_pgs; int i, erase_lun; @@ -102,35 +105,63 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], lun_bitmap, &meta_list[i], map_secs); - erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls + - rqd->ppa_list[i].g.ch; + erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]); - if (!test_bit(erase_lun, e_line->erase_bitmap)) { - if (down_trylock(&pblk->erase_sem)) - continue; + /* line can change after page map. We might also be writing the + * last line. + */ + e_line = pblk_line_get_erase(pblk); + if (!e_line) + return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, + valid_secs, i + min); + spin_lock(&e_line->lock); + if (!test_bit(erase_lun, e_line->erase_bitmap)) { set_bit(erase_lun, e_line->erase_bitmap); atomic_dec(&e_line->left_eblks); + *erase_ppa = rqd->ppa_list[i]; erase_ppa->g.blk = e_line->id; + spin_unlock(&e_line->lock); + /* Avoid evaluating e_line->left_eblks */ return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, valid_secs, i + min); } + spin_unlock(&e_line->lock); } - /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(ppa_empty(*erase_ppa))) { - struct pblk_line_meta *lm = &pblk->lm; + d_line = pblk_line_get_data(pblk); + + /* line can change after page map. We might also be writing the + * last line. + */ + e_line = pblk_line_get_erase(pblk); + if (!e_line) + return; - i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line); - if (i == lm->blk_per_line) + /* Erase blocks that are bad in this line but might not be in next */ + if (unlikely(ppa_empty(*erase_ppa)) && + bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { + int bit = -1; + +retry: + bit = find_next_bit(d_line->blk_bitmap, + lm->blk_per_line, bit + 1); + if (bit >= lm->blk_per_line) return; - set_bit(i, e_line->erase_bitmap); + spin_lock(&e_line->lock); + if (test_bit(bit, e_line->erase_bitmap)) { + spin_unlock(&e_line->lock); + goto retry; + } + spin_unlock(&e_line->lock); + + set_bit(bit, e_line->erase_bitmap); atomic_dec(&e_line->left_eblks); - *erase_ppa = pblk->luns[i].bppa; /* set ch and lun */ + *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ erase_ppa->g.blk = e_line->id; } } diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 045384d..5ecc154 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -150,6 +150,7 @@ try: /* Release flags on context. Protect from writes and reads */ smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); pblk_ppa_set_empty(&w_ctx->ppa); + w_ctx->lba = ADDR_EMPTY; } #define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) @@ -180,6 +181,14 @@ unsigned int pblk_rb_read_count(struct pblk_rb *rb) return pblk_rb_ring_count(mem, subm, rb->nr_entries); } +unsigned int pblk_rb_sync_count(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int sync = READ_ONCE(rb->sync); + + return pblk_rb_ring_count(mem, sync, rb->nr_entries); +} + unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) { unsigned int subm; @@ -199,12 +208,22 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, struct pblk_line *line; struct pblk_rb_entry *entry; struct pblk_w_ctx *w_ctx; + unsigned int user_io = 0, gc_io = 0; unsigned int i; + int flags; for (i = 0; i < to_update; i++) { entry = &rb->entries[*l2p_upd]; w_ctx = &entry->w_ctx; + flags = READ_ONCE(entry->w_ctx.flags); + if (flags & PBLK_IOTYPE_USER) + user_io++; + else if (flags & PBLK_IOTYPE_GC) + gc_io++; + else + WARN(1, "pblk: unknown IO type\n"); + pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, entry->cacheline); @@ -214,6 +233,8 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); } + pblk_rl_out(&pblk->rl, user_io, gc_io); + return 0; } @@ -357,6 +378,9 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, /* Protect syncs */ smp_store_release(&rb->sync_point, sync_point); + if (!bio) + return 0; + spin_lock_irq(&rb->s_lock); bio_list_add(&entry->w_ctx.bios, bio); spin_unlock_irq(&rb->s_lock); @@ -395,6 +419,17 @@ static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, return 1; } +void pblk_rb_flush(struct pblk_rb *rb) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + unsigned int mem = READ_ONCE(rb->mem); + + if (pblk_rb_sync_point_set(rb, NULL, mem)) + return; + + pblk_write_should_kick(pblk); +} + static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, unsigned int *pos, struct bio *bio, int *io_ret) @@ -431,15 +466,16 @@ int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, unsigned int nr_entries, unsigned int *pos) { struct pblk *pblk = container_of(rb, struct pblk, rwb); - int flush_done; + int io_ret; spin_lock(&rb->w_lock); - if (!pblk_rl_user_may_insert(&pblk->rl, nr_entries)) { + io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); + if (io_ret) { spin_unlock(&rb->w_lock); - return NVM_IO_REQUEUE; + return io_ret; } - if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done)) { + if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { spin_unlock(&rb->w_lock); return NVM_IO_REQUEUE; } @@ -447,7 +483,7 @@ int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, pblk_rl_user_in(&pblk->rl, nr_entries); spin_unlock(&rb->w_lock); - return flush_done; + return io_ret; } /* @@ -521,20 +557,18 @@ out: * This function is used by the write thread to form the write bio that will * persist data on the write buffer to the media. */ -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, - struct pblk_c_ctx *c_ctx, - unsigned int pos, - unsigned int nr_entries, - unsigned int count) +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, + struct bio *bio, unsigned int pos, + unsigned int nr_entries, unsigned int count) { struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct request_queue *q = pblk->dev->q; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); struct pblk_rb_entry *entry; struct page *page; - unsigned int pad = 0, read = 0, to_read = nr_entries; - unsigned int user_io = 0, gc_io = 0; + unsigned int pad = 0, to_read = nr_entries; unsigned int i; int flags; - int ret; if (count < nr_entries) { pad = nr_entries - count; @@ -553,15 +587,10 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, */ try: flags = READ_ONCE(entry->w_ctx.flags); - if (!(flags & PBLK_WRITTEN_DATA)) + if (!(flags & PBLK_WRITTEN_DATA)) { + io_schedule(); goto try; - - if (flags & PBLK_IOTYPE_USER) - user_io++; - else if (flags & PBLK_IOTYPE_GC) - gc_io++; - else - WARN(1, "pblk: unknown IO type\n"); + } page = virt_to_page(entry->data); if (!page) { @@ -570,17 +599,17 @@ try: flags |= PBLK_SUBMITTED_ENTRY; /* Release flags on context. Protect from writes */ smp_store_release(&entry->w_ctx.flags, flags); - goto out; + return NVM_IO_ERR; } - ret = bio_add_page(bio, page, rb->seg_size, 0); - if (ret != rb->seg_size) { + if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != + rb->seg_size) { pr_err("pblk: could not add page to write bio\n"); flags &= ~PBLK_WRITTEN_DATA; flags |= PBLK_SUBMITTED_ENTRY; /* Release flags on context. Protect from writes */ smp_store_release(&entry->w_ctx.flags, flags); - goto out; + return NVM_IO_ERR; } if (flags & PBLK_FLUSH_ENTRY) { @@ -607,14 +636,19 @@ try: pos = (pos + 1) & (rb->nr_entries - 1); } - read = to_read; - pblk_rl_out(&pblk->rl, user_io, gc_io); + if (pad) { + if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { + pr_err("pblk: could not pad page in write bio\n"); + return NVM_IO_ERR; + } + } + #ifdef CONFIG_NVM_DEBUG atomic_long_add(pad, &((struct pblk *) (container_of(rb, struct pblk, rwb)))->padded_writes); #endif -out: - return read; + + return NVM_IO_OK; } /* @@ -623,15 +657,17 @@ out: * be directed to disk. */ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - u64 pos, int bio_iter) + struct ppa_addr ppa, int bio_iter) { + struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_rb_entry *entry; struct pblk_w_ctx *w_ctx; + struct ppa_addr l2p_ppa; + u64 pos = pblk_addr_to_cacheline(ppa); void *data; int flags; int ret = 1; - spin_lock(&rb->w_lock); #ifdef CONFIG_NVM_DEBUG /* Caller must ensure that the access will not cause an overflow */ @@ -641,8 +677,14 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, w_ctx = &entry->w_ctx; flags = READ_ONCE(w_ctx->flags); + spin_lock(&rb->w_lock); + spin_lock(&pblk->trans_lock); + l2p_ppa = pblk_trans_map_get(pblk, lba); + spin_unlock(&pblk->trans_lock); + /* Check if the entry has been overwritten or is scheduled to be */ - if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) { + if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || + flags & PBLK_WRITABLE_ENTRY) { ret = 0; goto out; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4a12f14..4e5c48f 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -34,8 +34,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, BUG_ON(!pblk_addr_in_cache(ppa)); #endif - return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, - pblk_addr_to_cacheline(ppa), bio_iter); + return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa, bio_iter); } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -76,6 +75,9 @@ retry: } WARN_ON(test_and_set_bit(i, read_bitmap)); advanced_bio = 1; +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->cache_reads); +#endif } else { /* Read from media non-cached sectors */ rqd->ppa_list[j++] = p; @@ -85,6 +87,11 @@ retry: bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); } + if (pblk_io_aligned(pblk, nr_secs)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + #ifdef CONFIG_NVM_DEBUG atomic_long_add(nr_secs, &pblk->inflight_reads); #endif @@ -94,8 +101,6 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) { int err; - rqd->flags = pblk_set_read_mode(pblk); - err = pblk_submit_io(pblk, rqd); if (err) return NVM_IO_ERR; @@ -107,27 +112,27 @@ static void pblk_end_io_read(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; if (rqd->error) pblk_log_read_err(pblk, rqd); #ifdef CONFIG_NVM_DEBUG else - WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n"); + WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif - if (rqd->nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); bio_put(bio); - if (r_ctx->orig_bio) { + if (r_ctx->private) { + struct bio *orig_bio = r_ctx->private; + #ifdef CONFIG_NVM_DEBUG - WARN_ONCE(r_ctx->orig_bio->bi_error, - "pblk: corrupted read bio\n"); + WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); #endif - bio_endio(r_ctx->orig_bio); - bio_put(r_ctx->orig_bio); + bio_endio(orig_bio); + bio_put(orig_bio); } #ifdef CONFIG_NVM_DEBUG @@ -136,6 +141,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) #endif pblk_free_rqd(pblk, rqd, READ); + atomic_dec(&pblk->inflight_io); } static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, @@ -173,6 +179,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd->end_io = NULL; if (unlikely(nr_secs > 1 && nr_holes == 1)) { @@ -280,9 +287,14 @@ retry: goto retry; } WARN_ON(test_and_set_bit(0, read_bitmap)); +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->cache_reads); +#endif } else { rqd->ppa_addr = ppa; } + + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); } int pblk_submit_read(struct pblk *pblk, struct bio *bio) @@ -316,13 +328,16 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) */ bio_init_idx = pblk_get_bi_idx(bio); + rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd->dma_meta_list); + if (!rqd->meta_list) { + pr_err("pblk: not able to allocate ppa list\n"); + goto fail_rqd_free; + } + if (nr_secs > 1) { - rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("pblk: not able to allocate ppa list\n"); - goto fail_rqd_free; - } + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); } else { @@ -332,6 +347,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bio_get(bio); if (bitmap_full(&read_bitmap, nr_secs)) { bio_endio(bio); + atomic_inc(&pblk->inflight_io); pblk_end_io_read(rqd); return NVM_IO_OK; } @@ -339,17 +355,17 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* All sectors are to be read from the device */ if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; - struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); /* Clone read bio to deal with read errors internally */ - int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set); + int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); if (!int_bio) { pr_err("pblk: could not clone read bio\n"); return NVM_IO_ERR; } rqd->bio = int_bio; - r_ctx->orig_bio = bio; + r_ctx->private = bio; ret = pblk_submit_read_io(pblk, rqd); if (ret) { @@ -445,7 +461,6 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct request_queue *q = dev->q; struct bio *bio; struct nvm_rq rqd; int ret, data_len; @@ -453,22 +468,19 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, memset(&rqd, 0, sizeof(struct nvm_rq)); + rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_meta_list); + if (!rqd.meta_list) + return NVM_IO_ERR; + if (nr_secs > 1) { - rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) - return NVM_IO_ERR; + rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; + rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, nr_secs); - if (*secs_to_gc == 1) { - struct ppa_addr ppa; - - ppa = rqd.ppa_list[0]; - nvm_dev_dma_free(dev->parent, rqd.ppa_list, - rqd.dma_ppa_list); - rqd.ppa_addr = ppa; - } + if (*secs_to_gc == 1) + rqd.ppa_addr = rqd.ppa_list[0]; } else { *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); } @@ -477,7 +489,8 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, goto out; data_len = (*secs_to_gc) * geo->sec_size; - bio = bio_map_kern(q, data, data_len, GFP_KERNEL); + bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, + PBLK_KMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); goto err_free_dma; @@ -490,6 +503,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.end_io = pblk_end_io_sync; rqd.private = &wait; rqd.nr_ppas = *secs_to_gc; + rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; ret = pblk_submit_read_io(pblk, &rqd); @@ -503,6 +517,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: GC read I/O timed out\n"); } + atomic_dec(&pblk->inflight_io); if (rqd.error) { atomic_long_inc(&pblk->read_failed_gc); @@ -518,12 +533,10 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, #endif out: - if (rqd.nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_OK; err_free_dma: - if (rqd.nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_ERR; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index f8f8508..0e48d3e 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -120,18 +120,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, return 0; } -__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta) +__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) { u32 crc; - crc = pblk_calc_emeta_crc(pblk, emeta); - if (le32_to_cpu(emeta->crc) != crc) + crc = pblk_calc_emeta_crc(pblk, emeta_buf); + if (le32_to_cpu(emeta_buf->crc) != crc) return NULL; - if (le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC) + if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) return NULL; - return pblk_line_emeta_to_lbas(emeta); + return emeta_to_lbas(pblk, emeta_buf); } static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) @@ -139,19 +139,20 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; - struct line_emeta *emeta = line->emeta; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; int data_start; int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; int i; - lba_list = pblk_recov_get_lba_list(pblk, emeta); + lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); if (!lba_list) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - nr_data_lbas = lm->sec_per_line - lm->emeta_sec; - nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas); + nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; + nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { struct ppa_addr ppa; @@ -169,7 +170,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) if (test_and_set_bit(i, line->invalid_bitmap)) WARN_ONCE(1, "pblk: rec. double invalidate:\n"); else - line->vsc--; + le32_add_cpu(line->vsc, -1); spin_unlock(&line->lock); continue; @@ -181,7 +182,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) if (nr_valid_lbas != nr_lbas) pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", - line->id, line->emeta->nr_valid_lbas, nr_lbas); + line->id, emeta_buf->nr_valid_lbas, nr_lbas); line->left_msecs = 0; @@ -195,7 +196,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec - + return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - nr_bb * geo->sec_per_blk; } @@ -240,7 +241,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, r_ptr_int = r_ptr; next_read_rq: - memset(rqd, 0, pblk_r_rq_size); + memset(rqd, 0, pblk_g_rq_size); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (!rq_ppas) @@ -256,7 +257,6 @@ next_read_rq: rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; - rqd->flags = pblk_set_read_mode(pblk); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -265,6 +265,11 @@ next_read_rq: rqd->end_io = pblk_end_io_sync; rqd->private = &wait; + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -295,7 +300,7 @@ next_read_rq: pr_err("pblk: L2P recovery read timed out\n"); return -EINTR; } - + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); /* At this point, the read should not fail. If it does, it is a problem @@ -322,47 +327,94 @@ next_read_rq: return 0; } +static void pblk_recov_complete(struct kref *ref) +{ + struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); + + complete(&pad_rq->wait); +} + +static void pblk_end_io_recov(struct nvm_rq *rqd) +{ + struct pblk_pad_rq *pad_rq = rqd->private; + struct pblk *pblk = pad_rq->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + + kref_put(&pad_rq->ref, pblk_recov_complete); + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); + pblk_free_rqd(pblk, rqd, WRITE); +} + static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, - struct pblk_recov_alloc p, int left_ppas) + int left_ppas) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct ppa_addr *ppa_list; struct pblk_sec_meta *meta_list; + struct pblk_pad_rq *pad_rq; struct nvm_rq *rqd; struct bio *bio; void *data; dma_addr_t dma_ppa_list, dma_meta_list; - __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta); + __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); u64 w_ptr = line->cur_sec; - int left_line_ppas = line->left_msecs; - int rq_ppas, rq_len; + int left_line_ppas, rq_ppas, rq_len; int i, j; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); - ppa_list = p.ppa_list; - meta_list = p.meta_list; - rqd = p.rqd; - data = p.data; - dma_ppa_list = p.dma_ppa_list; - dma_meta_list = p.dma_meta_list; + spin_lock(&line->lock); + left_line_ppas = line->left_msecs; + spin_unlock(&line->lock); + + pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); + if (!pad_rq) + return -ENOMEM; + + data = vzalloc(pblk->max_write_pgs * geo->sec_size); + if (!data) { + ret = -ENOMEM; + goto free_rq; + } + + pad_rq->pblk = pblk; + init_completion(&pad_rq->wait); + kref_init(&pad_rq->ref); next_pad_rq: rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); - if (!rq_ppas) - rq_ppas = pblk->min_write_pgs; + if (rq_ppas < pblk->min_write_pgs) { + pr_err("pblk: corrupted pad line %d\n", line->id); + goto free_rq; + } + rq_len = rq_ppas * geo->sec_size; + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); + if (!meta_list) { + ret = -ENOMEM; + goto free_data; + } + + ppa_list = (void *)(meta_list) + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + + rqd = pblk_alloc_rqd(pblk, WRITE); + if (IS_ERR(rqd)) { + ret = PTR_ERR(rqd); + goto fail_free_meta; + } + memset(rqd, 0, pblk_w_rq_size); + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto fail_free_rqd; + } bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - memset(rqd, 0, pblk_r_rq_size); - rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; rqd->flags = pblk_set_progr_mode(pblk, WRITE); @@ -371,8 +423,8 @@ next_pad_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; + rqd->end_io = pblk_end_io_recov; + rqd->private = pad_rq; for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; @@ -390,34 +442,51 @@ next_pad_rq: for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { struct ppa_addr dev_ppa; + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); pblk_map_invalidate(pblk, dev_ppa); - meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); - lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY); + lba_list[w_ptr] = meta_list[i].lba = addr_empty; rqd->ppa_list[i] = dev_ppa; } } + kref_get(&pad_rq->ref); + ret = pblk_submit_io(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); - return ret; + goto free_data; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery write timed out\n"); - } - reinit_completion(&wait); + atomic_dec(&pblk->inflight_io); left_line_ppas -= rq_ppas; left_ppas -= rq_ppas; - if (left_ppas > 0 && left_line_ppas) + if (left_ppas && left_line_ppas) goto next_pad_rq; - return 0; + kref_put(&pad_rq->ref, pblk_recov_complete); + + if (!wait_for_completion_io_timeout(&pad_rq->wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: pad write timed out\n"); + ret = -ETIME; + } + +free_rq: + kfree(pad_rq); +free_data: + vfree(data); + return ret; + +fail_free_rqd: + pblk_free_rqd(pblk, rqd, WRITE); +fail_free_meta: + nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); + kfree(pad_rq); + return ret; } /* When this function is called, it means that not all upper pages have been @@ -456,7 +525,7 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, rec_round = 0; next_rq: - memset(rqd, 0, pblk_r_rq_size); + memset(rqd, 0, pblk_g_rq_size); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (!rq_ppas) @@ -472,7 +541,6 @@ next_rq: rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; - rqd->flags = pblk_set_read_mode(pblk); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -481,6 +549,11 @@ next_rq: rqd->end_io = pblk_end_io_sync; rqd->private = &wait; + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -510,6 +583,7 @@ next_rq: msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: L2P recovery read timed out\n"); } + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); /* This should not happen since the read failed during normal recovery, @@ -544,7 +618,7 @@ next_rq: if (pad_secs > line->left_msecs) pad_secs = line->left_msecs; - ret = pblk_recov_pad_oob(pblk, line, p, pad_secs); + ret = pblk_recov_pad_oob(pblk, line, pad_secs); if (ret) pr_err("pblk: OOB padding failed (err:%d)\n", ret); @@ -552,7 +626,6 @@ next_rq: if (ret) pr_err("pblk: OOB read failed (err:%d)\n", ret); - line->left_ssecs = line->left_msecs; left_ppas = 0; } @@ -591,7 +664,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, *done = 1; next_rq: - memset(rqd, 0, pblk_r_rq_size); + memset(rqd, 0, pblk_g_rq_size); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (!rq_ppas) @@ -607,7 +680,6 @@ next_rq: rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; - rqd->flags = pblk_set_read_mode(pblk); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -616,6 +688,11 @@ next_rq: rqd->end_io = pblk_end_io_sync; rqd->private = &wait; + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -646,6 +723,7 @@ next_rq: msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: L2P recovery read timed out\n"); } + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); /* Reached the end of the written line */ @@ -658,7 +736,6 @@ next_rq: /* Roll back failed sectors */ line->cur_sec -= nr_error_bits; line->left_msecs += nr_error_bits; - line->left_ssecs = line->left_msecs; bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits); left_ppas = 0; @@ -770,8 +847,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line, *tline, *data_line = NULL; - struct line_smeta *smeta; - struct line_emeta *emeta; + struct pblk_smeta *smeta; + struct pblk_emeta *emeta; + struct line_smeta *smeta_buf; int found_lines = 0, recovered_lines = 0, open_lines = 0; int is_next = 0; int meta_line; @@ -784,8 +862,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) spin_lock(&l_mg->free_lock); meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); set_bit(meta_line, &l_mg->meta_bitmap); - smeta = l_mg->sline_meta[meta_line].meta; - emeta = l_mg->eline_meta[meta_line].meta; + smeta = l_mg->sline_meta[meta_line]; + emeta = l_mg->eline_meta[meta_line]; + smeta_buf = (struct line_smeta *)smeta; spin_unlock(&l_mg->free_lock); /* Order data lines using their sequence number */ @@ -796,33 +875,33 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) memset(smeta, 0, lm->smeta_len); line->smeta = smeta; - line->lun_bitmap = ((void *)(smeta)) + + line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); /* Lines that cannot be read are assumed as not written here */ if (pblk_line_read_smeta(pblk, line)) continue; - crc = pblk_calc_smeta_crc(pblk, smeta); - if (le32_to_cpu(smeta->crc) != crc) + crc = pblk_calc_smeta_crc(pblk, smeta_buf); + if (le32_to_cpu(smeta_buf->crc) != crc) continue; - if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC) + if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) continue; - if (le16_to_cpu(smeta->header.version) != 1) { + if (le16_to_cpu(smeta_buf->header.version) != 1) { pr_err("pblk: found incompatible line version %u\n", - smeta->header.version); + smeta_buf->header.version); return ERR_PTR(-EINVAL); } /* The first valid instance uuid is used for initialization */ if (!valid_uuid) { - memcpy(pblk->instance_uuid, smeta->header.uuid, 16); + memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16); valid_uuid = 1; } - if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) { + if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { pr_debug("pblk: ignore line %u due to uuid mismatch\n", i); continue; @@ -830,9 +909,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) /* Update line metadata */ spin_lock(&line->lock); - line->id = le32_to_cpu(line->smeta->header.id); - line->type = le16_to_cpu(line->smeta->header.type); - line->seq_nr = le64_to_cpu(line->smeta->seq_nr); + line->id = le32_to_cpu(smeta_buf->header.id); + line->type = le16_to_cpu(smeta_buf->header.type); + line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); spin_unlock(&line->lock); /* Update general metadata */ @@ -848,7 +927,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) pblk_recov_line_add_ordered(&recov_list, line); found_lines++; pr_debug("pblk: recovering data line %d, seq:%llu\n", - line->id, smeta->seq_nr); + line->id, smeta_buf->seq_nr); } if (!found_lines) { @@ -868,15 +947,15 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) recovered_lines++; /* Calculate where emeta starts based on the line bb */ - off = lm->sec_per_line - lm->emeta_sec; + off = lm->sec_per_line - lm->emeta_sec[0]; nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); off -= nr_bb * geo->sec_per_pl; - memset(emeta, 0, lm->emeta_len); - line->emeta = emeta; line->emeta_ssec = off; + line->emeta = emeta; + memset(line->emeta->buf, 0, lm->emeta_len[0]); - if (pblk_line_read_emeta(pblk, line)) { + if (pblk_line_read_emeta(pblk, line, line->emeta->buf)) { pblk_recov_l2p_from_oob(pblk, line); goto next; } @@ -941,58 +1020,26 @@ out: } /* - * Pad until smeta can be read on current data line + * Pad current line */ -void pblk_recov_pad(struct pblk *pblk) +int pblk_recov_pad(struct pblk *pblk) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; struct pblk_line *line; struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct nvm_rq *rqd; - struct pblk_recov_alloc p; - struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; + int left_msecs; + int ret = 0; spin_lock(&l_mg->free_lock); line = l_mg->data_line; + left_msecs = line->left_msecs; spin_unlock(&l_mg->free_lock); - rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) - return; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) - goto free_rqd; - - ppa_list = (void *)(meta_list) + pblk_dma_meta_size; - dma_ppa_list = dma_meta_list + pblk_dma_meta_size; - - data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL); - if (!data) - goto free_meta_list; - - p.ppa_list = ppa_list; - p.meta_list = meta_list; - p.rqd = rqd; - p.data = data; - p.dma_ppa_list = dma_ppa_list; - p.dma_meta_list = dma_meta_list; - - if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs)) { - pr_err("pblk: Tear down padding failed\n"); - goto free_data; + ret = pblk_recov_pad_oob(pblk, line, left_msecs); + if (ret) { + pr_err("pblk: Tear down padding failed (%d)\n", ret); + return ret; } - pblk_line_close(pblk, line); - -free_data: - kfree(data); -free_meta_list: - nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); -free_rqd: - pblk_free_rqd(pblk, rqd, READ); + pblk_line_close_meta(pblk, line); + return ret; } diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index ab7cbb1..2e6a536 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -23,11 +23,35 @@ static void pblk_rl_kick_u_timer(struct pblk_rl *rl) mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); } +int pblk_rl_is_limit(struct pblk_rl *rl) +{ + int rb_space; + + rb_space = atomic_read(&rl->rb_space); + + return (rb_space == 0); +} + int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) { int rb_user_cnt = atomic_read(&rl->rb_user_cnt); + int rb_space = atomic_read(&rl->rb_space); - return (!(rb_user_cnt + nr_entries > rl->rb_user_max)); + if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) + return NVM_IO_ERR; + + if (rb_user_cnt >= rl->rb_user_max) + return NVM_IO_REQUEUE; + + return NVM_IO_OK; +} + +void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) +{ + int rb_space = atomic_read(&rl->rb_space); + + if (unlikely(rb_space >= 0)) + atomic_sub(nr_entries, &rl->rb_space); } int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) @@ -37,7 +61,7 @@ int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) /* If there is no user I/O let GC take over space on the write buffer */ rb_user_active = READ_ONCE(rl->rb_user_active); - return (!(rb_gc_cnt + nr_entries > rl->rb_gc_max && rb_user_active)); + return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active)); } void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) @@ -77,33 +101,32 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) unsigned long free_blocks = pblk_rl_nr_free_blks(rl); if (free_blocks >= rl->high) { - rl->rb_user_max = max - rl->rb_gc_rsv; - rl->rb_gc_max = rl->rb_gc_rsv; + rl->rb_user_max = max; + rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; } else if (free_blocks < rl->high) { int shift = rl->high_pw - rl->rb_windows_pw; int user_windows = free_blocks >> shift; int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW; - int gc_max; rl->rb_user_max = user_max; - gc_max = max - rl->rb_user_max; - rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv); - - if (free_blocks > rl->low) - rl->rb_state = PBLK_RL_MID; - else - rl->rb_state = PBLK_RL_LOW; + rl->rb_gc_max = max - user_max; + + if (free_blocks <= rl->rsv_blocks) { + rl->rb_user_max = 0; + rl->rb_gc_max = max; + } + + /* In the worst case, we will need to GC lines in the low list + * (high valid sector count). If there are lines to GC on high + * or mid lists, these will be prioritized + */ + rl->rb_state = PBLK_RL_LOW; } return rl->rb_state; } -void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv) -{ - rl->rb_gc_rsv = rl->rb_gc_max = rsv; -} - void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) { struct pblk *pblk = container_of(rl, struct pblk, rl); @@ -122,11 +145,15 @@ void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) { - struct pblk *pblk = container_of(rl, struct pblk, rl); int blk_in_line = atomic_read(&line->blk_in_line); - int ret; atomic_sub(blk_in_line, &rl->free_blocks); +} + +void pblk_gc_should_kick(struct pblk *pblk) +{ + struct pblk_rl *rl = &pblk->rl; + int ret; /* Rates will not change that often - no need to lock update */ ret = pblk_rl_update_rates(rl, rl->rb_budget); @@ -136,11 +163,16 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) pblk_gc_should_stop(pblk); } -int pblk_rl_gc_thrs(struct pblk_rl *rl) +int pblk_rl_high_thrs(struct pblk_rl *rl) { return rl->high; } +int pblk_rl_low_thrs(struct pblk_rl *rl) +{ + return rl->low; +} + int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) { return rl->rb_user_max; @@ -161,24 +193,36 @@ void pblk_rl_free(struct pblk_rl *rl) void pblk_rl_init(struct pblk_rl *rl, int budget) { + struct pblk *pblk = container_of(rl, struct pblk, rl); + struct pblk_line_meta *lm = &pblk->lm; + int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; unsigned int rb_windows; rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; - rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; rl->high_pw = get_count_order(rl->high); + rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; + if (rl->low < min_blocks) + rl->low = min_blocks; + + rl->rsv_blocks = min_blocks; + /* This will always be a power-of-2 */ rb_windows = budget / PBLK_MAX_REQ_ADDRS; - rl->rb_windows_pw = get_count_order(rb_windows) + 1; + rl->rb_windows_pw = get_count_order(rb_windows); /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; - atomic_set(&rl->rb_user_cnt, 0); rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; + + atomic_set(&rl->rb_user_cnt, 0); atomic_set(&rl->rb_gc_cnt, 0); + atomic_set(&rl->rb_space, -1); setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl); + rl->rb_user_active = 0; + rl->rb_gc_active = 0; } diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index f0af1d1..95fb434 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -49,30 +49,26 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; int free_blocks, total_blocks; int rb_user_max, rb_user_cnt; - int rb_gc_max, rb_gc_rsv, rb_gc_cnt, rb_budget, rb_state; + int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; free_blocks = atomic_read(&pblk->rl.free_blocks); rb_user_max = pblk->rl.rb_user_max; rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); rb_gc_max = pblk->rl.rb_gc_max; - rb_gc_rsv = pblk->rl.rb_gc_rsv; rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); rb_budget = pblk->rl.rb_budget; rb_state = pblk->rl.rb_state; - total_blocks = geo->blks_per_lun * geo->nr_luns; + total_blocks = pblk->rl.total_blocks; return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", + "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", rb_user_cnt, rb_user_max, rb_gc_cnt, rb_gc_max, - rb_gc_rsv, rb_state, rb_budget, pblk->rl.low, @@ -150,11 +146,11 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) ssize_t sz = 0; int nr_free_lines; int cur_data, cur_log; - int free_line_cnt = 0, closed_line_cnt = 0; + int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; int d_line_cnt = 0, l_line_cnt = 0; int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; - int free = 0, bad = 0, cor = 0; - int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; + int bad = 0, cor = 0; + int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; int map_weight = 0, meta_weight = 0; spin_lock(&l_mg->free_lock); @@ -166,6 +162,11 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) free_line_cnt++; spin_unlock(&l_mg->free_lock); + spin_lock(&l_mg->close_lock); + list_for_each_entry(line, &l_mg->emeta_list, list) + emeta_line_cnt++; + spin_unlock(&l_mg->close_lock); + spin_lock(&l_mg->gc_lock); list_for_each_entry(line, &l_mg->gc_full_list, list) { if (line->type == PBLK_LINETYPE_DATA) @@ -212,8 +213,6 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) gc_empty++; } - list_for_each_entry(line, &l_mg->free_list, list) - free++; list_for_each_entry(line, &l_mg->bad_list, list) bad++; list_for_each_entry(line, &l_mg->corrupt_list, list) @@ -224,8 +223,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) if (l_mg->data_line) { cur_sec = l_mg->data_line->cur_sec; msecs = l_mg->data_line->left_msecs; - ssecs = l_mg->data_line->left_ssecs; - vsc = l_mg->data_line->vsc; + vsc = le32_to_cpu(*l_mg->data_line->vsc); sec_in_line = l_mg->data_line->sec_in_line; meta_weight = bitmap_weight(&l_mg->meta_bitmap, PBLK_DATA_LINES); @@ -235,17 +233,20 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) spin_unlock(&l_mg->free_lock); if (nr_free_lines != free_line_cnt) - pr_err("pblk: corrupted free line list\n"); + pr_err("pblk: corrupted free line list:%d/%d\n", + nr_free_lines, free_line_cnt); sz = snprintf(page, PAGE_SIZE - sz, "line: nluns:%d, nblks:%d, nsecs:%d\n", geo->nr_luns, lm->blk_per_line, lm->sec_per_line); sz += snprintf(page + sz, PAGE_SIZE - sz, - "lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n", + "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", cur_data, cur_log, - free, nr_free_lines, bad, cor, + nr_free_lines, + emeta_line_cnt, meta_weight, closed_line_cnt, + bad, cor, d_line_cnt, l_line_cnt, l_mg->nr_lines); @@ -255,9 +256,10 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) atomic_read(&pblk->gc.inflight_gc)); sz += snprintf(page + sz, PAGE_SIZE - sz, - "data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n", - cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line, - map_weight, lm->sec_per_line, meta_weight); + "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", + cur_data, cur_sec, msecs, vsc, sec_in_line, + map_weight, lm->sec_per_line, + atomic_read(&pblk->inflight_io)); return sz; } @@ -274,7 +276,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) lm->smeta_len, lm->smeta_sec); sz += snprintf(page + sz, PAGE_SIZE - sz, "emeta - len:%d, sec:%d, bb_start:%d\n", - lm->emeta_len, lm->emeta_sec, + lm->emeta_len[0], lm->emeta_sec[0], lm->emeta_bb); sz += snprintf(page + sz, PAGE_SIZE - sz, "bitmap lengths: sec:%d, blk:%d, lun:%d\n", @@ -290,6 +292,11 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) return sz; } +static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); +} + #ifdef CONFIG_NVM_DEBUG static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) { @@ -303,52 +310,51 @@ static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) atomic_long_read(&pblk->padded_wb), atomic_long_read(&pblk->sub_writes), atomic_long_read(&pblk->sync_writes), - atomic_long_read(&pblk->compl_writes), atomic_long_read(&pblk->recov_writes), atomic_long_read(&pblk->recov_gc_writes), atomic_long_read(&pblk->recov_gc_reads), + atomic_long_read(&pblk->cache_reads), atomic_long_read(&pblk->sync_reads)); } #endif -static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page, - size_t len) +static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, + size_t len) { - struct pblk_gc *gc = &pblk->gc; size_t c_len; - int value; + int force; c_len = strcspn(page, "\n"); if (c_len >= len) return -EINVAL; - if (kstrtouint(page, 0, &value)) + if (kstrtouint(page, 0, &force)) return -EINVAL; - spin_lock(&gc->lock); - pblk_rl_set_gc_rsc(&pblk->rl, value); - spin_unlock(&gc->lock); + pblk_gc_sysfs_force(pblk, force); return len; } -static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, - size_t len) +static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, + const char *page, size_t len) { size_t c_len; - int force; + int sec_per_write; c_len = strcspn(page, "\n"); if (c_len >= len) return -EINVAL; - if (kstrtouint(page, 0, &force)) + if (kstrtouint(page, 0, &sec_per_write)) return -EINVAL; - if (force < 0 || force > 1) + if (sec_per_write < pblk->min_write_pgs + || sec_per_write > pblk->max_write_pgs + || sec_per_write % pblk->min_write_pgs != 0) return -EINVAL; - pblk_gc_sysfs_force(pblk, force); + pblk_set_sec_per_write(pblk, sec_per_write); return len; } @@ -398,9 +404,9 @@ static struct attribute sys_gc_force = { .mode = 0200, }; -static struct attribute sys_gc_rl_max = { - .name = "gc_rl_max", - .mode = 0200, +static struct attribute sys_max_sec_per_write = { + .name = "max_sec_per_write", + .mode = 0644, }; #ifdef CONFIG_NVM_DEBUG @@ -416,7 +422,7 @@ static struct attribute *pblk_attrs[] = { &sys_errors_attr, &sys_gc_state, &sys_gc_force, - &sys_gc_rl_max, + &sys_max_sec_per_write, &sys_rb_attr, &sys_stats_ppaf_attr, &sys_lines_attr, @@ -448,6 +454,8 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, return pblk_sysfs_lines(pblk, buf); else if (strcmp(attr->name, "lines_info") == 0) return pblk_sysfs_lines_info(pblk, buf); + else if (strcmp(attr->name, "max_sec_per_write") == 0) + return pblk_sysfs_get_sec_per_write(pblk, buf); #ifdef CONFIG_NVM_DEBUG else if (strcmp(attr->name, "stats") == 0) return pblk_sysfs_stats_debug(pblk, buf); @@ -460,10 +468,10 @@ static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, { struct pblk *pblk = container_of(kobj, struct pblk, kobj); - if (strcmp(attr->name, "gc_rl_max") == 0) - return pblk_sysfs_rate_store(pblk, buf, len); - else if (strcmp(attr->name, "gc_force") == 0) + if (strcmp(attr->name, "gc_force") == 0) return pblk_sysfs_gc_force(pblk, buf, len); + else if (strcmp(attr->name, "max_sec_per_write") == 0) + return pblk_sysfs_set_sec_per_write(pblk, buf, len); return 0; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index aef6fd7..d62a8f4 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -17,18 +17,6 @@ #include "pblk.h" -static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line) -{ -#ifdef CONFIG_NVM_DEBUG - atomic_long_inc(&pblk->sync_writes); -#endif - - /* Counter protected by rb sync lock */ - line->left_ssecs--; - if (!line->left_ssecs) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws); -} - static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx) { @@ -39,21 +27,14 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, for (i = 0; i < c_ctx->nr_valid; i++) { struct pblk_w_ctx *w_ctx; - struct ppa_addr p; - struct pblk_line *line; w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i); - - p = rqd->ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(p)]; - pblk_sync_line(pblk, line); - while ((original_bio = bio_list_pop(&w_ctx->bios))) bio_endio(original_bio); } #ifdef CONFIG_NVM_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes); + atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); #endif ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); @@ -169,7 +150,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) } INIT_WORK(&recovery->ws_rec, pblk_submit_rec); - queue_work(pblk->kw_wq, &recovery->ws_rec); + queue_work(pblk->close_wq, &recovery->ws_rec); out: pblk_complete_write(pblk, rqd, c_ctx); @@ -186,14 +167,50 @@ static void pblk_end_io_write(struct nvm_rq *rqd) } #ifdef CONFIG_NVM_DEBUG else - WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n"); + WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); #endif pblk_complete_write(pblk, rqd, c_ctx); + atomic_dec(&pblk->inflight_io); +} + +static void pblk_end_io_write_meta(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); + struct pblk_line *line = m_ctx->private; + struct pblk_emeta *emeta = line->emeta; + int pos = pblk_ppa_to_pos(geo, rqd->ppa_list[0]); + struct pblk_lun *rlun = &pblk->luns[pos]; + int sync; + + up(&rlun->wr_sem); + + if (rqd->error) { + pblk_log_write_err(pblk, rqd); + pr_err("pblk: metadata I/O failed. Line %d\n", line->id); + } +#ifdef CONFIG_NVM_DEBUG + else + WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); +#endif + + sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); + if (sync == emeta->nr_entries) + pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, + pblk->close_wq); + + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, READ); + + atomic_dec(&pblk->inflight_io); } static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int nr_secs) + unsigned int nr_secs, + nvm_end_io_fn(*end_io)) { struct nvm_tgt_dev *dev = pblk->dev; @@ -202,7 +219,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, rqd->nr_ppas = nr_secs; rqd->flags = pblk_set_progr_mode(pblk, WRITE); rqd->private = pblk; - rqd->end_io = pblk_end_io_write; + rqd->end_io = end_io; rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &rqd->dma_meta_list); @@ -219,11 +236,10 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) + struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa) { struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *e_line = pblk_line_get_data_next(pblk); - struct ppa_addr erase_ppa; + struct pblk_line *e_line = pblk_line_get_erase(pblk); unsigned int valid = c_ctx->nr_valid; unsigned int padded = c_ctx->nr_padded; unsigned int nr_secs = valid + padded; @@ -231,40 +247,23 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, int ret = 0; lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) { - ret = -ENOMEM; - goto out; - } + if (!lun_bitmap) + return -ENOMEM; c_ctx->lun_bitmap = lun_bitmap; - ret = pblk_alloc_w_rq(pblk, rqd, nr_secs); + ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); if (ret) { kfree(lun_bitmap); - goto out; + return ret; } - ppa_set_empty(&erase_ppa); if (likely(!e_line || !atomic_read(&e_line->left_eblks))) pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); else pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, &erase_ppa); - -out: - if (unlikely(e_line && !ppa_empty(erase_ppa))) { - if (pblk_blk_erase_async(pblk, erase_ppa)) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int bit; - - atomic_inc(&e_line->left_eblks); - bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch; - WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); - up(&pblk->erase_sem); - } - } + valid, erase_ppa); - return ret; + return 0; } int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -280,7 +279,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, c_ctx->lun_bitmap = lun_bitmap; - ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas); + ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write); if (ret) return ret; @@ -311,16 +310,237 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, return secs_to_sync; } +static inline int pblk_valid_meta_ppa(struct pblk *pblk, + struct pblk_line *meta_line, + struct ppa_addr *ppa_list, int nr_ppas) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line *data_line; + struct ppa_addr ppa, ppa_opt; + u64 paddr; + int i; + + data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])]; + paddr = pblk_lookup_page(pblk, meta_line); + ppa = addr_to_gen_ppa(pblk, paddr, 0); + + if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap)) + return 1; + + /* Schedule a metadata I/O that is half the distance from the data I/O + * with regards to the number of LUNs forming the pblk instance. This + * balances LUN conflicts across every I/O. + * + * When the LUN configuration changes (e.g., due to GC), this distance + * can align, which would result on a LUN deadlock. In this case, modify + * the distance to not be optimal, but allow metadata I/Os to succeed. + */ + ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); + if (unlikely(ppa_opt.ppa == ppa.ppa)) { + data_line->meta_distance--; + return 0; + } + + for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) + if (ppa_list[i].g.ch == ppa_opt.g.ch && + ppa_list[i].g.lun == ppa_opt.g.lun) + return 1; + + if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) { + for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) + if (ppa_list[i].g.ch == ppa.g.ch && + ppa_list[i].g.lun == ppa.g.lun) + return 0; + + return 1; + } + + return 0; +} + +int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = meta_line->emeta; + struct pblk_g_ctx *m_ctx; + struct pblk_lun *rlun; + struct bio *bio; + struct nvm_rq *rqd; + void *data; + u64 paddr; + int rq_ppas = pblk->min_write_pgs; + int id = meta_line->id; + int rq_len; + int i, j; + int ret; + + rqd = pblk_alloc_rqd(pblk, READ); + if (IS_ERR(rqd)) { + pr_err("pblk: cannot allocate write req.\n"); + return PTR_ERR(rqd); + } + m_ctx = nvm_rq_to_pdu(rqd); + m_ctx->private = meta_line; + + rq_len = rq_ppas * geo->sec_size; + data = ((void *)emeta->buf) + emeta->mem; + + bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, + l_mg->emeta_alloc_type, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto fail_free_rqd; + } + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd->bio = bio; + + ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); + if (ret) + goto fail_free_bio; + + for (i = 0; i < rqd->nr_ppas; ) { + spin_lock(&meta_line->lock); + paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); + spin_unlock(&meta_line->lock); + for (j = 0; j < rq_ppas; j++, i++, paddr++) + rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); + } + + rlun = &pblk->luns[pblk_ppa_to_pos(geo, rqd->ppa_list[0])]; + ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000)); + if (ret) { + pr_err("pblk: lun semaphore timed out (%d)\n", ret); + goto fail_free_bio; + } + + emeta->mem += rq_len; + if (emeta->mem >= lm->emeta_len[0]) { + spin_lock(&l_mg->close_lock); + list_del(&meta_line->list); + WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line), + "pblk: corrupt meta line %d\n", meta_line->id); + spin_unlock(&l_mg->close_lock); + } + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: emeta I/O submission failed: %d\n", ret); + goto fail_rollback; + } + + return NVM_IO_OK; + +fail_rollback: + spin_lock(&l_mg->close_lock); + pblk_dealloc_page(pblk, meta_line, rq_ppas); + list_add(&meta_line->list, &meta_line->list); + spin_unlock(&l_mg->close_lock); +fail_free_bio: + if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) + bio_put(bio); +fail_free_rqd: + pblk_free_rqd(pblk, rqd, READ); + return ret; +} + +static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, + int prev_n) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *meta_line; + + spin_lock(&l_mg->close_lock); +retry: + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return 0; + } + meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); + if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) + goto retry; + spin_unlock(&l_mg->close_lock); + + if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) + return 0; + + return pblk_submit_meta_io(pblk, meta_line); +} + +static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct ppa_addr erase_ppa; + int err; + + ppa_set_empty(&erase_ppa); + + /* Assign lbas to ppas and populate request structure */ + err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); + if (err) { + pr_err("pblk: could not setup write request: %d\n", err); + return NVM_IO_ERR; + } + + if (likely(ppa_empty(erase_ppa))) { + /* Submit metadata write for previous data line */ + err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas); + if (err) { + pr_err("pblk: metadata I/O submission failed: %d", err); + return NVM_IO_ERR; + } + + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } + } else { + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } + + /* Submit available erase for next data line */ + if (pblk_blk_erase_async(pblk, erase_ppa)) { + struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int bit; + + atomic_inc(&e_line->left_eblks); + bit = pblk_ppa_to_pos(geo, erase_ppa); + WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); + } + } + + return NVM_IO_OK; +} + +static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; + + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); +} + static int pblk_submit_write(struct pblk *pblk) { struct bio *bio; struct nvm_rq *rqd; - struct pblk_c_ctx *c_ctx; - unsigned int pgs_read; unsigned int secs_avail, secs_to_sync, secs_to_com; unsigned int secs_to_flush; unsigned long pos; - int err; /* If there are no sectors in the cache, flushes (bios without data) * will be cleared on the cache threads @@ -338,7 +558,6 @@ static int pblk_submit_write(struct pblk *pblk) pr_err("pblk: cannot allocate write req.\n"); return 1; } - c_ctx = nvm_rq_to_pdu(rqd); bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); if (!bio) { @@ -358,29 +577,14 @@ static int pblk_submit_write(struct pblk *pblk) secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos, - secs_to_sync, secs_avail); - if (!pgs_read) { + if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, + secs_avail)) { pr_err("pblk: corrupted write bio\n"); goto fail_put_bio; } - if (c_ctx->nr_padded) - if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded)) - goto fail_put_bio; - - /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, c_ctx); - if (err) { - pr_err("pblk: could not setup write request\n"); - goto fail_free_bio; - } - - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: I/O submission failed: %d\n", err); + if (pblk_submit_io_set(pblk, rqd)) goto fail_free_bio; - } #ifdef CONFIG_NVM_DEBUG atomic_long_add(secs_to_sync, &pblk->sub_writes); @@ -389,8 +593,7 @@ static int pblk_submit_write(struct pblk *pblk) return 0; fail_free_bio: - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded); + pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); fail_free_rqd: diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 99f3186..1593138 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,6 +40,12 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) +#define PBLK_WS_POOL_SIZE (128) +#define PBLK_META_POOL_SIZE (128) +#define PBLK_READ_REQ_POOL_SIZE (1024) + +#define PBLK_NR_CLOSE_JOBS (4) + #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) #define PBLK_COMMAND_TIMEOUT_MS 30000 @@ -72,11 +78,15 @@ enum { PBLK_BLK_ST_CLOSED = 0x2, }; +struct pblk_sec_meta { + u64 reserved; + __le64 lba; +}; + /* The number of GC lists and the rate-limiter states go together. This way the * rate-limiter can dictate how much GC is needed based on resource utilization. */ -#define PBLK_NR_GC_LISTS 3 -#define PBLK_MAX_GC_JOBS 32 +#define PBLK_GC_NR_LISTS 3 enum { PBLK_RL_HIGH = 1, @@ -84,14 +94,9 @@ enum { PBLK_RL_LOW = 3, }; -struct pblk_sec_meta { - u64 reserved; - __le64 lba; -}; - #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) -/* write completion context */ +/* write buffer completion context */ struct pblk_c_ctx { struct list_head list; /* Head for out-of-order completion */ @@ -101,9 +106,16 @@ struct pblk_c_ctx { unsigned int nr_padded; }; -/* Read context */ -struct pblk_r_ctx { - struct bio *orig_bio; +/* generic context */ +struct pblk_g_ctx { + void *private; +}; + +/* Pad context */ +struct pblk_pad_rq { + struct pblk *pblk; + struct completion wait; + struct kref ref; }; /* Recovery context */ @@ -195,29 +207,39 @@ struct pblk_lun { struct pblk_gc_rq { struct pblk_line *line; void *data; - u64 *lba_list; + u64 lba_list[PBLK_MAX_REQ_ADDRS]; int nr_secs; int secs_to_gc; struct list_head list; }; struct pblk_gc { + /* These states are not protected by a lock since (i) they are in the + * fast path, and (ii) they are not critical. + */ int gc_active; int gc_enabled; int gc_forced; - int gc_jobs_active; - atomic_t inflight_gc; struct task_struct *gc_ts; struct task_struct *gc_writer_ts; + struct task_struct *gc_reader_ts; + + struct workqueue_struct *gc_line_reader_wq; struct workqueue_struct *gc_reader_wq; + struct timer_list gc_timer; + struct semaphore gc_sem; + atomic_t inflight_gc; int w_entries; + struct list_head w_list; + struct list_head r_list; spinlock_t lock; spinlock_t w_lock; + spinlock_t r_lock; }; struct pblk_rl { @@ -229,10 +251,8 @@ struct pblk_rl { */ unsigned int high_pw; /* High rounded up as a power of 2 */ -#define PBLK_USER_HIGH_THRS 2 /* Begin write limit at 50 percent - * available blks - */ -#define PBLK_USER_LOW_THRS 20 /* Aggressive GC at 5% available blocks */ +#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ +#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ int rb_windows_pw; /* Number of rate windows in the write buffer * given as a power-of-2. This guarantees that @@ -244,13 +264,19 @@ struct pblk_rl { */ int rb_budget; /* Total number of entries available for I/O */ int rb_user_max; /* Max buffer entries available for user I/O */ - atomic_t rb_user_cnt; /* User I/O buffer counter */ int rb_gc_max; /* Max buffer entries available for GC I/O */ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ int rb_state; /* Rate-limiter current state */ + + atomic_t rb_user_cnt; /* User I/O buffer counter */ atomic_t rb_gc_cnt; /* GC I/O buffer counter */ + atomic_t rb_space; /* Space limit in case of reaching capacity */ + + int rsv_blocks; /* Reserved blocks for GC */ int rb_user_active; + int rb_gc_active; + struct timer_list u_timer; unsigned long long nr_secs; @@ -258,8 +284,6 @@ struct pblk_rl { atomic_t free_blocks; }; -#define PBLK_LINE_NR_LUN_BITMAP 2 -#define PBLK_LINE_NR_SEC_BITMAP 2 #define PBLK_LINE_EMPTY (~0U) enum { @@ -310,16 +334,19 @@ struct line_smeta { __le32 window_wr_lun; /* Number of parallel LUNs to write */ __le32 rsvd[2]; + + __le64 lun_bitmap[]; }; /* - * Metadata Layout: - * 1. struct pblk_emeta - * 2. nr_lbas u64 forming lba list - * 3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line) - * 4. nr_luns bits (u64 format) forming line bad block bitmap - * - * 3. and 4. will be part of FTL log + * Metadata layout in media: + * First sector: + * 1. struct line_emeta + * 2. bad block bitmap (u64 * window_wr_lun) + * Mid sectors (start at lbas_sector): + * 3. nr_lbas (u64) forming lba list + * Last sectors (start at vsc_sector): + * 4. u32 valid sector count (vsc) for all lines (~0U: free line) */ struct line_emeta { struct line_header header; @@ -339,6 +366,23 @@ struct line_emeta { __le32 next_id; /* Line id for next line */ __le64 nr_lbas; /* Number of lbas mapped in line */ __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ + __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ +}; + +struct pblk_emeta { + struct line_emeta *buf; /* emeta buffer in media format */ + int mem; /* Write offset - points to next + * writable entry in memory + */ + atomic_t sync; /* Synced - backpointer that signals the + * last entry that has been successfully + * persisted to media + */ + unsigned int nr_entries; /* Number of emeta entries */ +}; + +struct pblk_smeta { + struct line_smeta *buf; /* smeta buffer in persistent format */ }; struct pblk_line { @@ -355,9 +399,12 @@ struct pblk_line { unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ - struct line_smeta *smeta; /* Start metadata */ - struct line_emeta *emeta; /* End metadata */ + struct pblk_smeta *smeta; /* Start metadata */ + struct pblk_emeta *emeta; /* End medatada */ + int meta_line; /* Metadata line id */ + int meta_distance; /* Distance between data and metadata */ + u64 smeta_ssec; /* Sector where smeta starts */ u64 emeta_ssec; /* Sector where emeta starts */ @@ -374,9 +421,10 @@ struct pblk_line { atomic_t left_seblks; /* Blocks left for sync erasing */ int left_msecs; /* Sectors left for mapping */ - int left_ssecs; /* Sectors left to sync */ unsigned int cur_sec; /* Sector map pointer */ - unsigned int vsc; /* Valid sector count in line */ + unsigned int nr_valid_lbas; /* Number of valid lbas in line */ + + __le32 *vsc; /* Valid sector count in line */ struct kref ref; /* Write buffer L2P references */ @@ -385,13 +433,15 @@ struct pblk_line { #define PBLK_DATA_LINES 4 -enum{ +enum { PBLK_KMALLOC_META = 1, PBLK_VMALLOC_META = 2, }; -struct pblk_line_metadata { - void *meta; +enum { + PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ + PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ + PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ }; struct pblk_line_mgmt { @@ -404,7 +454,7 @@ struct pblk_line_mgmt { struct list_head bad_list; /* Full lines bad */ /* GC lists - use gc_lock */ - struct list_head *gc_lists[PBLK_NR_GC_LISTS]; + struct list_head *gc_lists[PBLK_GC_NR_LISTS]; struct list_head gc_high_list; /* Full lines ready to GC, high isc */ struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ struct list_head gc_low_list; /* Full lines ready to GC, low isc */ @@ -417,13 +467,16 @@ struct pblk_line_mgmt { struct pblk_line *log_next; /* Next FTL log line */ struct pblk_line *data_next; /* Next data line */ + struct list_head emeta_list; /* Lines queued to schedule emeta */ + + __le32 *vsc_list; /* Valid sector counts for all lines */ + /* Metadata allocation type: VMALLOC | KMALLOC */ - int smeta_alloc_type; int emeta_alloc_type; /* Pre-allocated metadata for data lines */ - struct pblk_line_metadata sline_meta[PBLK_DATA_LINES]; - struct pblk_line_metadata eline_meta[PBLK_DATA_LINES]; + struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; + struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; unsigned long meta_bitmap; /* Helpers for fast bitmap calculations */ @@ -434,25 +487,40 @@ struct pblk_line_mgmt { unsigned long l_seq_nr; /* Log line unique sequence number */ spinlock_t free_lock; + spinlock_t close_lock; spinlock_t gc_lock; }; struct pblk_line_meta { unsigned int smeta_len; /* Total length for smeta */ - unsigned int smeta_sec; /* Sectors needed for smeta*/ - unsigned int emeta_len; /* Total length for emeta */ - unsigned int emeta_sec; /* Sectors needed for emeta*/ + unsigned int smeta_sec; /* Sectors needed for smeta */ + + unsigned int emeta_len[4]; /* Lengths for emeta: + * [0]: Total length + * [1]: struct line_emeta length + * [2]: L2P portion length + * [3]: vsc list length + */ + unsigned int emeta_sec[4]; /* Sectors needed for emeta. Same layout + * as emeta_len + */ + unsigned int emeta_bb; /* Boundary for bb that affects emeta */ + + unsigned int vsc_list_len; /* Length for vsc list */ unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ unsigned int blk_bitmap_len; /* Length for block bitmap in line */ unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ unsigned int blk_per_line; /* Number of blocks in a full line */ unsigned int sec_per_line; /* Number of sectors in a line */ + unsigned int dsec_per_line; /* Number of data sectors in a line */ unsigned int min_blk_line; /* Min. number of good blocks in line */ unsigned int mid_thrs; /* Threshold for GC mid list */ unsigned int high_thrs; /* Threshold for GC high list */ + + unsigned int meta_distance; /* Distance between data and metadata */ }; struct pblk_addr_format { @@ -470,6 +538,13 @@ struct pblk_addr_format { u8 sec_offset; }; +enum { + PBLK_STATE_RUNNING = 0, + PBLK_STATE_STOPPING = 1, + PBLK_STATE_RECOVERING = 2, + PBLK_STATE_STOPPED = 3, +}; + struct pblk { struct nvm_tgt_dev *dev; struct gendisk *disk; @@ -487,6 +562,8 @@ struct pblk { struct pblk_rb rwb; + int state; /* pblk line state */ + int min_write_pgs; /* Minimum amount of pages required by controller */ int max_write_pgs; /* Maximum amount of pages supported by controller */ int pgs_in_buffer; /* Number of pages that need to be held in buffer to @@ -499,7 +576,7 @@ struct pblk { /* pblk provisioning values. Used by rate limiter */ struct pblk_rl rl; - struct semaphore erase_sem; + int sec_per_write; unsigned char instance_uuid[16]; #ifdef CONFIG_NVM_DEBUG @@ -511,8 +588,8 @@ struct pblk { atomic_long_t req_writes; /* Sectors stored on write buffer */ atomic_long_t sub_writes; /* Sectors submitted from buffer */ atomic_long_t sync_writes; /* Sectors synced to media */ - atomic_long_t compl_writes; /* Sectors completed in write bio */ atomic_long_t inflight_reads; /* Inflight sector read requests */ + atomic_long_t cache_reads; /* Read requests that hit the cache */ atomic_long_t sync_reads; /* Completed sector read requests */ atomic_long_t recov_writes; /* Sectors submitted from recovery */ atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ @@ -528,6 +605,8 @@ struct pblk { atomic_long_t write_failed; atomic_long_t erase_failed; + atomic_t inflight_io; /* General inflight I/O counter */ + struct task_struct *writer_ts; /* Simple translation map of logical addresses to physical addresses. @@ -542,11 +621,13 @@ struct pblk { mempool_t *page_pool; mempool_t *line_ws_pool; mempool_t *rec_pool; - mempool_t *r_rq_pool; + mempool_t *g_rq_pool; mempool_t *w_rq_pool; mempool_t *line_meta_pool; - struct workqueue_struct *kw_wq; + struct workqueue_struct *close_wq; + struct workqueue_struct *bb_wq; + struct timer_list wtimer; struct pblk_gc gc; @@ -559,7 +640,7 @@ struct pblk_line_ws { struct work_struct ws; }; -#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx)) +#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) /* @@ -579,18 +660,17 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, unsigned int pos); struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); +void pblk_rb_flush(struct pblk_rb *rb); void pblk_rb_sync_l2p(struct pblk_rb *rb); -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, - struct pblk_c_ctx *c_ctx, - unsigned int pos, - unsigned int nr_entries, - unsigned int count); +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, + struct bio *bio, unsigned int pos, + unsigned int nr_entries, unsigned int count); unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, struct list_head *list, unsigned int max); int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - u64 pos, int bio_iter); + struct ppa_addr ppa, int bio_iter); unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); @@ -601,6 +681,7 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); unsigned int pblk_rb_read_count(struct pblk_rb *rb); +unsigned int pblk_rb_sync_count(struct pblk_rb *rb); unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); int pblk_rb_tear_down_check(struct pblk_rb *rb); @@ -612,40 +693,50 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); * pblk core */ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); +void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); -void pblk_flush_writer(struct pblk *pblk); +void pblk_wait_for_meta(struct pblk *pblk); struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, - gfp_t gfp_mask); + int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -struct pblk_line *pblk_line_replace_data(struct pblk *pblk); +void pblk_line_replace_data(struct pblk *pblk); int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); struct pblk_line *pblk_line_get_data(struct pblk *pblk); -struct pblk_line *pblk_line_get_data_next(struct pblk *pblk); +struct pblk_line *pblk_line_get_erase(struct pblk *pblk); int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); int pblk_line_is_full(struct pblk_line *line); void pblk_line_free(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_ws(struct work_struct *work); +void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); void pblk_line_close(struct pblk *pblk, struct pblk_line *line); +void pblk_line_close_meta_sync(struct pblk *pblk); +void pblk_line_close_ws(struct work_struct *work); +void pblk_pipeline_stop(struct pblk *pblk); void pblk_line_mark_bb(struct work_struct *work); void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *)); + void (*work)(struct work_struct *), + struct workqueue_struct *wq); u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); -int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line); +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf); int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); void pblk_line_put(struct kref *ref); struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); +u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); +void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); +u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, unsigned long secs_to_flush); void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, @@ -656,11 +747,11 @@ void pblk_end_bio_sync(struct bio *bio); void pblk_end_io_sync(struct nvm_rq *rqd); int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int nr_pages); -void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr); void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, int nr_pages); void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); +void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr); void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); @@ -702,6 +793,7 @@ void pblk_write_should_kick(struct pblk *pblk); /* * pblk read path */ +extern struct bio_set *pblk_bio_set; int pblk_submit_read(struct pblk *pblk, struct bio *bio); int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, unsigned int nr_secs, unsigned int *secs_to_gc, @@ -711,7 +803,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, */ void pblk_submit_rec(struct work_struct *work); struct pblk_line *pblk_recov_l2p(struct pblk *pblk); -void pblk_recov_pad(struct pblk *pblk); +int pblk_recov_pad(struct pblk *pblk); __le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, struct pblk_rec_ctx *recovery, u64 *comp_bits, @@ -720,33 +812,40 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, /* * pblk gc */ -#define PBLK_GC_TRIES 3 +#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ +#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ +#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ +#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); -int pblk_gc_status(struct pblk *pblk); +void pblk_gc_should_kick(struct pblk *pblk); +void pblk_gc_kick(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); -void pblk_gc_sysfs_force(struct pblk *pblk, int force); +int pblk_gc_sysfs_force(struct pblk *pblk, int force); /* * pblk rate limiter */ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); -int pblk_rl_gc_thrs(struct pblk_rl *rl); +int pblk_rl_high_thrs(struct pblk_rl *rl); +int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); +void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv); int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); +void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); +int pblk_rl_is_limit(struct pblk_rl *rl); /* * pblk sysfs @@ -774,9 +873,30 @@ static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) return c_ctx - sizeof(struct nvm_rq); } -static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta) +static inline void *emeta_to_bb(struct line_emeta *emeta) +{ + return emeta->bb_bitmap; +} + +static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta) +{ + return ((void *)emeta + pblk->lm.emeta_len[1]); +} + +static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) { - return (emeta) + 1; + return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); +} + +static inline int pblk_line_vsc(struct pblk_line *line) +{ + int vsc; + + spin_lock(&line->lock); + vsc = le32_to_cpu(*line->vsc); + spin_unlock(&line->lock); + + return vsc; } #define NVM_MEM_PAGE_WRITE (8) @@ -917,6 +1037,14 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) ppa_addr->ppa = ADDR_EMPTY; } +static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) +{ + if (lppa.ppa == rppa.ppa) + return true; + + return false; +} + static inline int pblk_addr_in_cache(struct ppa_addr ppa) { return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); @@ -964,11 +1092,11 @@ static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr, } static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, - struct line_smeta *smeta) + struct line_header *header) { u32 crc = ~(u32)0; - crc = crc32_le(crc, (unsigned char *)smeta + sizeof(crc), + crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), sizeof(struct line_header) - sizeof(crc)); return crc; @@ -996,7 +1124,7 @@ static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, crc = crc32_le(crc, (unsigned char *)emeta + sizeof(struct line_header) + sizeof(crc), - lm->emeta_len - + lm->emeta_len[0] - sizeof(struct line_header) - sizeof(crc)); return crc; @@ -1016,9 +1144,27 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type) return flags; } -static inline int pblk_set_read_mode(struct pblk *pblk) +enum { + PBLK_READ_RANDOM = 0, + PBLK_READ_SEQUENTIAL = 1, +}; + +static inline int pblk_set_read_mode(struct pblk *pblk, int type) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int flags; + + flags = NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE; + if (type == PBLK_READ_SEQUENTIAL) + flags |= geo->plane_mode >> 1; + + return flags; +} + +static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) { - return NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE; + return !(nr_secs % pblk->min_write_pgs); } #ifdef CONFIG_NVM_DEBUG diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf0e28a..267f01a 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -279,8 +279,8 @@ static void rrpc_end_sync_bio(struct bio *bio) { struct completion *waiting = bio->bi_private; - if (bio->bi_error) - pr_err("nvm: gc request failed (%u).\n", bio->bi_error); + if (bio->bi_status) + pr_err("nvm: gc request failed (%u).\n", bio->bi_status); complete(waiting); } @@ -359,7 +359,7 @@ try: goto finished; } wait_for_completion_io(&wait); - if (bio->bi_error) { + if (bio->bi_status) { rrpc_inflight_laddr_release(rrpc, rqd); goto finished; } @@ -385,7 +385,7 @@ try: wait_for_completion_io(&wait); rrpc_inflight_laddr_release(rrpc, rqd); - if (bio->bi_error) + if (bio->bi_status) goto finished; bio_reset(bio); @@ -994,7 +994,7 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio) struct nvm_rq *rqd; int err; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (bio_op(bio) == REQ_OP_DISCARD) { rrpc_discard(rrpc, bio); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c3ea03c..dee542f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -849,10 +849,11 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_count_io_errors(struct cache *, int, const char *); +void bch_count_io_errors(struct cache *, blk_status_t, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - int, const char *); -void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); + blk_status_t, const char *); +void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, + const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 450d0e8..866dcf7 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -307,7 +307,7 @@ static void bch_btree_node_read(struct btree *b) bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); - if (bio->bi_error) + if (bio->bi_status) set_btree_node_io_error(b); bch_bbio_free(bio, b->c); @@ -374,10 +374,10 @@ static void btree_node_write_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io); - if (bio->bi_error) + if (bio->bi_status) set_btree_node_io_error(b); - bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree"); + bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); closure_put(cl); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 06f5505..35a5a72 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -110,7 +110,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_clone(bio, GFP_NOIO); + check = bio_clone_kmalloc(bio, GFP_NOIO); if (!check) return; check->bi_opf = REQ_OP_READ; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index db45a88..6a9b850 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -50,7 +50,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ -void bch_count_io_errors(struct cache *ca, int error, const char *m) +void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) { /* * The halflife of an error is: @@ -103,7 +103,7 @@ void bch_count_io_errors(struct cache *ca, int error, const char *m) } void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - int error, const char *m) + blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); struct cache *ca = PTR_CACHE(c, &b->key, 0); @@ -132,7 +132,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, } void bch_bbio_endio(struct cache_set *c, struct bio *bio, - int error, const char *m) + blk_status_t error, const char *m) { struct closure *cl = bio->bi_private; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1198e53..0352d05 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -549,7 +549,7 @@ static void journal_write_endio(struct bio *bio) { struct journal_write *w = bio->bi_private; - cache_set_err_on(bio->bi_error, w->c, "journal io error"); + cache_set_err_on(bio->bi_status, w->c, "journal io error"); closure_put(&w->c->journal.io); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 13b8a90..f633b30 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -63,14 +63,14 @@ static void read_moving_endio(struct bio *bio) struct moving_io *io = container_of(bio->bi_private, struct moving_io, cl); - if (bio->bi_error) - io->op.error = bio->bi_error; + if (bio->bi_status) + io->op.status = bio->bi_status; else if (!KEY_DIRTY(&b->key) && ptr_stale(io->op.c, &b->key, 0)) { - io->op.error = -EINTR; + io->op.status = BLK_STS_IOERR; } - bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -92,7 +92,7 @@ static void write_moving(struct closure *cl) struct moving_io *io = container_of(cl, struct moving_io, cl); struct data_insert_op *op = &io->op; - if (!op->error) { + if (!op->status) { moving_init(io); io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 709c9cc..019b3df 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -81,7 +81,7 @@ static void bch_data_insert_keys(struct closure *cl) if (ret == -ESRCH) { op->replace_collision = true; } else if (ret) { - op->error = -ENOMEM; + op->status = BLK_STS_RESOURCE; op->insert_data_done = true; } @@ -178,17 +178,17 @@ static void bch_data_insert_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - if (bio->bi_error) { + if (bio->bi_status) { /* TODO: We could try to recover from this. */ if (op->writeback) - op->error = bio->bi_error; + op->status = bio->bi_status; else if (!op->replace) set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache"); + bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) @@ -488,15 +488,15 @@ static void bch_cache_read_endio(struct bio *bio) * from the backing device. */ - if (bio->bi_error) - s->iop.error = bio->bi_error; + if (bio->bi_status) + s->iop.status = bio->bi_status; else if (!KEY_DIRTY(&b->key) && ptr_stale(s->iop.c, &b->key, 0)) { atomic_long_inc(&s->iop.c->cache_read_races); - s->iop.error = -EINTR; + s->iop.status = BLK_STS_IOERR; } - bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache"); } /* @@ -593,9 +593,9 @@ static void request_endio(struct bio *bio) { struct closure *cl = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); - s->iop.error = bio->bi_error; + s->iop.status = bio->bi_status; /* Only cache read errors are recoverable */ s->recoverable = false; } @@ -611,7 +611,7 @@ static void bio_complete(struct search *s) &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); - s->orig_bio->bi_error = s->iop.error; + s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); s->orig_bio = NULL; } @@ -664,7 +664,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.inode = d->id; s->iop.write_point = hash_long((unsigned long) current, 16); s->iop.write_prio = 0; - s->iop.error = 0; + s->iop.status = 0; s->iop.flags = 0; s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; @@ -707,7 +707,7 @@ static void cached_dev_read_error(struct closure *cl) /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); - s->iop.error = 0; + s->iop.status = 0; do_bio_hook(s, s->orig_bio); /* XXX: invalidate cache */ @@ -767,7 +767,7 @@ static void cached_dev_read_done_bh(struct closure *cl) !s->cache_miss, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->iop.error) + if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->iop.bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 1ff3687..7689176 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -10,7 +10,7 @@ struct data_insert_op { unsigned inode; uint16_t write_point; uint16_t write_prio; - short error; + blk_status_t status; union { uint16_t flags; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e57353e..8352fad 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -271,7 +271,7 @@ static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, bio->bi_error, "writing superblock"); + bch_count_io_errors(ca, bio->bi_status, "writing superblock"); closure_put(&ca->set->sb_write); } @@ -321,7 +321,7 @@ static void uuid_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct cache_set *c = container_of(cl, struct cache_set, uuid_write); - cache_set_err_on(bio->bi_error, c, "accessing uuids"); + cache_set_err_on(bio->bi_status, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } @@ -494,7 +494,7 @@ static void prio_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - cache_set_err_on(bio->bi_error, ca->set, "accessing priorities"); + cache_set_err_on(bio->bi_status, ca->set, "accessing priorities"); bch_bbio_free(bio, ca->set); closure_put(&ca->prio); } @@ -782,7 +782,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, minor *= BCACHE_MINORS; - if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { ida_simple_remove(&bcache_minor, minor); return -ENOMEM; @@ -1516,7 +1518,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sizeof(struct bbio) + sizeof(struct bio_vec) * bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || - !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ac2e48..42c66e7 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -167,7 +167,7 @@ static void dirty_endio(struct bio *bio) struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; - if (bio->bi_error) + if (bio->bi_status) SET_KEY_DIRTY(&w->key, false); closure_put(&io->cl); @@ -195,7 +195,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - bio->bi_error, "reading dirty data from cache"); + bio->bi_status, "reading dirty data from cache"); dirty_endio(bio); } diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index ae7da2c..82d2738 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -229,7 +229,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell, int error) + struct dm_bio_prison_cell *cell, blk_status_t error) { struct bio_list bios; struct bio *bio; @@ -238,7 +238,7 @@ void dm_cell_error(struct dm_bio_prison *prison, dm_cell_release(prison, cell, &bios); while ((bio = bio_list_pop(&bios))) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } } diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h index cddd4ac..cec52ac 100644 --- a/drivers/md/dm-bio-prison-v1.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -91,7 +91,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell, int error); + struct dm_bio_prison_cell *cell, blk_status_t error); /* * Visits the cell and then releases. Guarantees no new inmates are diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 840c149..850ff6c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -145,8 +145,8 @@ struct dm_buffer { enum data_mode data_mode; unsigned char list_mode; /* LIST_* */ unsigned hold_count; - int read_error; - int write_error; + blk_status_t read_error; + blk_status_t write_error; unsigned long state; unsigned long last_accessed; struct dm_bufio_client *c; @@ -555,7 +555,7 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_error = error ? -EIO : 0; + b->bio.bi_status = error ? BLK_STS_IOERR : 0; b->bio.bi_end_io(&b->bio); } @@ -588,7 +588,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, r = dm_io(&io_req, 1, ®ion, NULL); if (r) { - b->bio.bi_error = r; + b->bio.bi_status = errno_to_blk_status(r); end_io(&b->bio); } } @@ -596,7 +596,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, static void inline_endio(struct bio *bio) { bio_end_io_t *end_fn = bio->bi_private; - int error = bio->bi_error; + blk_status_t status = bio->bi_status; /* * Reset the bio to free any attached resources @@ -604,7 +604,7 @@ static void inline_endio(struct bio *bio) */ bio_reset(bio); - bio->bi_error = error; + bio->bi_status = status; end_fn(bio); } @@ -685,11 +685,12 @@ static void write_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->write_error = bio->bi_error; - if (unlikely(bio->bi_error)) { + b->write_error = bio->bi_status; + if (unlikely(bio->bi_status)) { struct dm_bufio_client *c = b->c; - int error = bio->bi_error; - (void)cmpxchg(&c->async_write_error, 0, error); + + (void)cmpxchg(&c->async_write_error, 0, + blk_status_to_errno(bio->bi_status)); } BUG_ON(!test_bit(B_WRITING, &b->state)); @@ -1063,7 +1064,7 @@ static void read_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->read_error = bio->bi_error; + b->read_error = bio->bi_status; BUG_ON(!test_bit(B_READING, &b->state)); @@ -1107,7 +1108,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { - int error = b->read_error; + int error = blk_status_to_errno(b->read_error); dm_bufio_release(b); @@ -1257,7 +1258,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); */ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { - int a, f; + blk_status_t a; + int f; unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d682a05..c5ea03f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -119,7 +119,7 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct continuation { struct work_struct ws; - int input; + blk_status_t input; }; static inline void init_continuation(struct continuation *k, @@ -145,7 +145,7 @@ struct batcher { /* * The operation that everyone is waiting for. */ - int (*commit_op)(void *context); + blk_status_t (*commit_op)(void *context); void *commit_context; /* @@ -171,8 +171,7 @@ struct batcher { static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); - - int r; + blk_status_t r; unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; @@ -205,7 +204,7 @@ static void __commit(struct work_struct *_ws) while ((bio = bio_list_pop(&bios))) { if (r) { - bio->bi_error = r; + bio->bi_status = r; bio_endio(bio); } else b->issue_op(bio, b->issue_context); @@ -213,7 +212,7 @@ static void __commit(struct work_struct *_ws) } static void batcher_init(struct batcher *b, - int (*commit_op)(void *), + blk_status_t (*commit_op)(void *), void *commit_context, void (*issue_op)(struct bio *bio, void *), void *issue_context, @@ -955,7 +954,7 @@ static void writethrough_endio(struct bio *bio) dm_unhook_bio(&pb->hook_info, bio); - if (bio->bi_error) { + if (bio->bi_status) { bio_endio(bio); return; } @@ -1220,7 +1219,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); if (read_err || write_err) - mg->k.input = -EIO; + mg->k.input = BLK_STS_IOERR; queue_continuation(mg->cache->wq, &mg->k); } @@ -1266,8 +1265,8 @@ static void overwrite_endio(struct bio *bio) dm_unhook_bio(&pb->hook_info, bio); - if (bio->bi_error) - mg->k.input = bio->bi_error; + if (bio->bi_status) + mg->k.input = bio->bi_status; queue_continuation(mg->cache->wq, &mg->k); } @@ -1323,8 +1322,10 @@ static void mg_complete(struct dm_cache_migration *mg, bool success) if (mg->overwrite_bio) { if (success) force_set_dirty(cache, cblock); + else if (mg->k.input) + mg->overwrite_bio->bi_status = mg->k.input; else - mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + mg->overwrite_bio->bi_status = BLK_STS_IOERR; bio_endio(mg->overwrite_bio); } else { if (success) @@ -1504,7 +1505,7 @@ static void mg_copy(struct work_struct *ws) r = copy(mg, is_policy_promote); if (r) { DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = -EIO; + mg->k.input = BLK_STS_IOERR; mg_complete(mg, false); } } @@ -1907,12 +1908,12 @@ static int commit(struct cache *cache, bool clean_shutdown) /* * Used by the batcher. */ -static int commit_op(void *context) +static blk_status_t commit_op(void *context) { struct cache *cache = context; if (dm_cache_changed_this_transaction(cache->cmd)) - return commit(cache, false); + return errno_to_blk_status(commit(cache, false)); return 0; } @@ -2018,7 +2019,7 @@ static void requeue_deferred_bios(struct cache *cache) bio_list_init(&cache->deferred_bios); while ((bio = bio_list_pop(&bios))) { - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); } } @@ -2820,7 +2821,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } -static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) +static int cache_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct cache *cache = ti->private; unsigned long flags; @@ -2838,7 +2840,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) bio_drop_shared_lock(cache, bio); accounted_complete(cache, bio); - return 0; + return DM_ENDIO_DONE; } static int write_dirty_bitset(struct cache *cache) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ebf9e72..9e1b72e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -71,7 +71,7 @@ struct dm_crypt_io { struct convert_context ctx; atomic_t io_pending; - int error; + blk_status_t error; sector_t sector; struct rb_node rb_node; @@ -1292,7 +1292,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ -static int crypt_convert(struct crypt_config *cc, +static blk_status_t crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { unsigned int tag_offset = 0; @@ -1343,13 +1343,13 @@ static int crypt_convert(struct crypt_config *cc, */ case -EBADMSG: atomic_dec(&ctx->cc_pending); - return -EILSEQ; + return BLK_STS_PROTECTION; /* * There was an error while processing the request. */ default: atomic_dec(&ctx->cc_pending); - return -EIO; + return BLK_STS_IOERR; } } @@ -1463,7 +1463,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *base_bio = io->base_bio; - int error = io->error; + blk_status_t error = io->error; if (!atomic_dec_and_test(&io->io_pending)) return; @@ -1476,7 +1476,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) else kfree(io->integrity_metadata); - base_bio->bi_error = error; + base_bio->bi_status = error; bio_endio(base_bio); } @@ -1502,7 +1502,7 @@ static void crypt_endio(struct bio *clone) struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; unsigned rw = bio_data_dir(clone); - int error; + blk_status_t error; /* * free the processed pages @@ -1510,7 +1510,7 @@ static void crypt_endio(struct bio *clone) if (rw == WRITE) crypt_free_buffer_pages(cc, clone); - error = clone->bi_error; + error = clone->bi_status; bio_put(clone); if (rw == READ && !error) { @@ -1570,7 +1570,7 @@ static void kcryptd_io_read_work(struct work_struct *work) crypt_inc_pending(io); if (kcryptd_io_read(io, GFP_NOIO)) - io->error = -ENOMEM; + io->error = BLK_STS_RESOURCE; crypt_dec_pending(io); } @@ -1656,7 +1656,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) sector_t sector; struct rb_node **rbp, *parent; - if (unlikely(io->error < 0)) { + if (unlikely(io->error)) { crypt_free_buffer_pages(cc, clone); bio_put(clone); crypt_dec_pending(io); @@ -1697,7 +1697,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) struct bio *clone; int crypt_finished; sector_t sector = io->sector; - int r; + blk_status_t r; /* * Prevent io from disappearing until this function completes. @@ -1707,7 +1707,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { - io->error = -EIO; + io->error = BLK_STS_IOERR; goto dec; } @@ -1718,7 +1718,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); - if (r < 0) + if (r) io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); @@ -1740,7 +1740,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io) static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; - int r = 0; + blk_status_t r; crypt_inc_pending(io); @@ -1748,7 +1748,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) io->sector); r = crypt_convert(cc, &io->ctx); - if (r < 0) + if (r) io->error = r; if (atomic_dec_and_test(&io->ctx.cc_pending)) @@ -1781,9 +1781,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (error == -EBADMSG) { DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); - io->error = -EILSEQ; + io->error = BLK_STS_PROTECTION; } else if (error < 0) - io->error = -EIO; + io->error = BLK_STS_IOERR; crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); @@ -2677,7 +2677,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->bs = bioset_create(MIN_IOS, 0); + cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad; @@ -2795,10 +2796,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) * and is aligned to this size as defined in IO hints. */ if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0)) - return -EIO; + return DM_MAPIO_KILL; if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1))) - return -EIO; + return DM_MAPIO_KILL; io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 13305a1..3d04d5c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -321,7 +321,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == READ) { if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags)) - return -EIO; + return DM_MAPIO_KILL; goto map_bio; } @@ -349,7 +349,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) /* * By default, error all I/O. */ - return -EIO; + return DM_MAPIO_KILL; } map_bio: @@ -358,12 +358,13 @@ map_bio: return DM_MAPIO_REMAPPED; } -static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - if (!error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) { /* @@ -377,11 +378,11 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) * Error read during the down_interval if drop_writes * and error_writes were not configured. */ - return -EIO; + *error = BLK_STS_IOERR; } } - return error; + return DM_ENDIO_DONE; } static void flakey_status(struct dm_target *ti, status_type_t type, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 93b1810..1b224aa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -246,7 +246,7 @@ struct dm_integrity_io { unsigned metadata_offset; atomic_t in_flight; - int bi_error; + blk_status_t bi_status; struct completion *completion; @@ -1118,8 +1118,8 @@ static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io * static void do_endio(struct dm_integrity_c *ic, struct bio *bio) { int r = dm_integrity_failed(ic); - if (unlikely(r) && !bio->bi_error) - bio->bi_error = r; + if (unlikely(r) && !bio->bi_status) + bio->bi_status = errno_to_blk_status(r); bio_endio(bio); } @@ -1127,7 +1127,7 @@ static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *di { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); - if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic))) submit_flush_bio(ic, dio); else do_endio(ic, bio); @@ -1146,9 +1146,9 @@ static void dec_in_flight(struct dm_integrity_io *dio) bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); - if (unlikely(dio->bi_error) && !bio->bi_error) - bio->bi_error = dio->bi_error; - if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + if (unlikely(dio->bi_status) && !bio->bi_status) + bio->bi_status = dio->bi_status; + if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { dio->range.logical_sector += dio->range.n_sectors; bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); INIT_WORK(&dio->work, integrity_bio_wait); @@ -1322,7 +1322,7 @@ skip_io: dec_in_flight(dio); return; error: - dio->bi_error = r; + dio->bi_status = errno_to_blk_status(r); dec_in_flight(dio); } @@ -1335,7 +1335,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) sector_t area, offset; dio->ic = ic; - dio->bi_error = 0; + dio->bi_status = 0; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { submit_flush_bio(ic, dio); @@ -1356,13 +1356,13 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", (unsigned long long)dio->range.logical_sector, bio_sectors(bio), (unsigned long long)ic->provided_data_sectors); - return -EIO; + return DM_MAPIO_KILL; } if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", ic->sectors_per_block, (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); - return -EIO; + return DM_MAPIO_KILL; } if (ic->sectors_per_block > 1) { @@ -1372,7 +1372,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", bv.bv_offset, bv.bv_len, ic->sectors_per_block); - return -EIO; + return DM_MAPIO_KILL; } } } @@ -1387,18 +1387,18 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) wanted_tag_size *= ic->tag_size; if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); - return -EIO; + return DM_MAPIO_KILL; } } } else { if (unlikely(bip != NULL)) { DMERR("Unexpected integrity data when using internal hash"); - return -EIO; + return DM_MAPIO_KILL; } } if (unlikely(ic->mode == 'R') && unlikely(dio->write)) - return -EIO; + return DM_MAPIO_KILL; get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 8d5ca30..2503960 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -58,7 +58,8 @@ struct dm_io_client *dm_io_client_create(void) if (!client->pool) goto bad; - client->bios = bioset_create(min_ios, 0); + client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)); if (!client->bios) goto bad; @@ -124,7 +125,7 @@ static void complete_io(struct io *io) fn(error_bits, context); } -static void dec_count(struct io *io, unsigned int region, int error) +static void dec_count(struct io *io, unsigned int region, blk_status_t error) { if (error) set_bit(region, &io->error_bits); @@ -137,9 +138,9 @@ static void endio(struct bio *bio) { struct io *io; unsigned region; - int error; + blk_status_t error; - if (bio->bi_error && bio_data_dir(bio) == READ) + if (bio->bi_status && bio_data_dir(bio) == READ) zero_fill_bio(bio); /* @@ -147,7 +148,7 @@ static void endio(struct bio *bio) */ retrieve_io_and_region_from_bio(bio, &io, ®ion); - error = bio->bi_error; + error = bio->bi_status; bio_put(bio); dec_count(io, region, error); @@ -319,7 +320,7 @@ static void do_region(int op, int op_flags, unsigned region, if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { atomic_inc(&io->count); - dec_count(io, region, -EOPNOTSUPP); + dec_count(io, region, BLK_STS_NOTSUPP); return; } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 4dfe386..a1da0eb 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -150,10 +150,10 @@ static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { unsigned long flags; - DMERR("Error writing log block, error=%d", bio->bi_error); + DMERR("Error writing log block, error=%d", bio->bi_status); spin_lock_irqsave(&lc->blocks_lock, flags); lc->logging_enabled = false; spin_unlock_irqrestore(&lc->blocks_lock, flags); @@ -586,7 +586,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) spin_lock_irq(&lc->blocks_lock); lc->logging_enabled = false; spin_unlock_irq(&lc->blocks_lock); - return -ENOMEM; + return DM_MAPIO_KILL; } INIT_LIST_HEAD(&block->list); pb->block = block; @@ -639,7 +639,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) spin_lock_irq(&lc->blocks_lock); lc->logging_enabled = false; spin_unlock_irq(&lc->blocks_lock); - return -ENOMEM; + return DM_MAPIO_KILL; } src = kmap_atomic(bv.bv_page); @@ -664,7 +664,8 @@ map_bio: return DM_MAPIO_REMAPPED; } -static int normal_end_io(struct dm_target *ti, struct bio *bio, int error) +static int normal_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct log_writes_c *lc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); @@ -686,7 +687,7 @@ static int normal_end_io(struct dm_target *ti, struct bio *bio, int error) spin_unlock_irqrestore(&lc->blocks_lock, flags); } - return error; + return DM_ENDIO_DONE; } /* diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 3df056b..0e8ab5b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -559,13 +559,13 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_REQUEUE; dm_report_EIO(m); - return -EIO; + return DM_MAPIO_KILL; } mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; - bio->bi_error = 0; + bio->bi_status = 0; bio->bi_bdev = pgpath->path.dev->bdev; bio->bi_opf |= REQ_FAILFAST_TRANSPORT; @@ -621,11 +621,19 @@ static void process_queued_bios(struct work_struct *work) blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); - if (r < 0 || r == DM_MAPIO_REQUEUE) { - bio->bi_error = r; + switch (r) { + case DM_MAPIO_KILL: + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + case DM_MAPIO_REQUEUE: + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); - } else if (r == DM_MAPIO_REMAPPED) + break; + case DM_MAPIO_REMAPPED: generic_make_request(bio); + break; + } } blk_finish_plug(&plug); } @@ -1442,22 +1450,15 @@ static void activate_path_work(struct work_struct *work) activate_or_offline_path(pgpath); } -static int noretry_error(int error) +static int noretry_error(blk_status_t error) { switch (error) { - case -EBADE: - /* - * EBADE signals an reservation conflict. - * We shouldn't fail the path here as we can communicate with - * the target. We should failover to the next path, but in - * doing so we might be causing a ping-pong between paths. - * So just return the reservation conflict error. - */ - case -EOPNOTSUPP: - case -EREMOTEIO: - case -EILSEQ: - case -ENODATA: - case -ENOSPC: + case BLK_STS_NOTSUPP: + case BLK_STS_NOSPC: + case BLK_STS_TARGET: + case BLK_STS_NEXUS: + case BLK_STS_MEDIUM: + case BLK_STS_RESOURCE: return 1; } @@ -1466,7 +1467,7 @@ static int noretry_error(int error) } static int multipath_end_io(struct dm_target *ti, struct request *clone, - int error, union map_info *map_context) + blk_status_t error, union map_info *map_context) { struct dm_mpath_io *mpio = get_mpio(map_context); struct pgpath *pgpath = mpio->pgpath; @@ -1493,7 +1494,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (error == -EIO) + if (error == BLK_STS_IOERR) dm_report_EIO(m); /* complete with the original error */ r = DM_ENDIO_DONE; @@ -1510,24 +1511,26 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, return r; } -static int do_end_io_bio(struct multipath *m, struct bio *clone, - int error, struct dm_mpath_io *mpio) +static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, + blk_status_t *error) { + struct multipath *m = ti->private; + struct dm_mpath_io *mpio = get_mpio_from_bio(clone); + struct pgpath *pgpath = mpio->pgpath; unsigned long flags; + int r = DM_ENDIO_DONE; - if (!error) - return 0; /* I/O complete */ - - if (noretry_error(error)) - return error; + if (!*error || noretry_error(*error)) + goto done; - if (mpio->pgpath) - fail_path(mpio->pgpath); + if (pgpath) + fail_path(pgpath); if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { dm_report_EIO(m); - return -EIO; + *error = BLK_STS_IOERR; + goto done; } /* Queue for the daemon to resubmit */ @@ -1539,23 +1542,11 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) queue_work(kmultipathd, &m->process_queued_bios); - return DM_ENDIO_INCOMPLETE; -} - -static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error) -{ - struct multipath *m = ti->private; - struct dm_mpath_io *mpio = get_mpio_from_bio(clone); - struct pgpath *pgpath; - struct path_selector *ps; - int r; - - BUG_ON(!mpio); - - r = do_end_io_bio(m, clone, error, mpio); - pgpath = mpio->pgpath; + r = DM_ENDIO_INCOMPLETE; +done: if (pgpath) { - ps = &pgpath->pg->ps; + struct path_selector *ps = &pgpath->pg->ps; + if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4da8858..a4fbd91 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -491,9 +491,9 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio) * If device is suspended, complete the bio. */ if (dm_noflush_suspending(ms->ti)) - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; else - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; @@ -627,7 +627,7 @@ static void write_callback(unsigned long error, void *context) * degrade the array. */ if (bio_op(bio) == REQ_OP_DISCARD) { - bio->bi_error = -EOPNOTSUPP; + bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); return; } @@ -1210,14 +1210,14 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) - return r; + return DM_MAPIO_KILL; /* * If region is not in-sync queue the bio. */ if (!r || (r == -EWOULDBLOCK)) { if (bio->bi_opf & REQ_RAHEAD) - return -EWOULDBLOCK; + return DM_MAPIO_KILL; queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; @@ -1229,7 +1229,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) */ m = choose_mirror(ms, bio->bi_iter.bi_sector); if (unlikely(!m)) - return -EIO; + return DM_MAPIO_KILL; dm_bio_record(&bio_record->details, bio); bio_record->m = m; @@ -1239,7 +1239,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } -static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) +static int mirror_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { int rw = bio_data_dir(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; @@ -1255,16 +1256,16 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) if (!(bio->bi_opf & REQ_PREFLUSH) && bio_op(bio) != REQ_OP_DISCARD) dm_rh_dec(ms->rh, bio_record->write_region); - return error; + return DM_ENDIO_DONE; } - if (error == -EOPNOTSUPP) + if (*error == BLK_STS_NOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) + if (bio->bi_opf & REQ_RAHEAD) goto out; - if (unlikely(error)) { + if (unlikely(*error)) { if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary @@ -1272,7 +1273,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) * mirror in-sync. */ DMERR_LIMIT("Mirror read failed."); - return -EIO; + return DM_ENDIO_DONE; } m = bio_record->m; @@ -1291,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; - bio->bi_error = 0; + bio->bi_status = 0; queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; @@ -1302,7 +1303,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) out: bio_record->details.bi_bdev = NULL; - return error; + return DM_ENDIO_DONE; } static void mirror_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b639fa7..c6ebc5b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -71,7 +71,7 @@ static void dm_old_start_queue(struct request_queue *q) static void dm_mq_start_queue(struct request_queue *q) { - blk_mq_start_stopped_hw_queues(q, true); + blk_mq_unquiesce_queue(q); blk_mq_kick_requeue_list(q); } @@ -119,7 +119,7 @@ static void end_clone_bio(struct bio *clone) struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; unsigned int nr_bytes = info->orig->bi_iter.bi_size; - int error = clone->bi_error; + blk_status_t error = clone->bi_status; bio_put(clone); @@ -158,7 +158,7 @@ static void end_clone_bio(struct bio *clone) * Do not use blk_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ - blk_update_request(tio->orig, 0, nr_bytes); + blk_update_request(tio->orig, BLK_STS_OK, nr_bytes); } static struct dm_rq_target_io *tio_from_request(struct request *rq) @@ -216,7 +216,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * Must be called without clone's queue lock held, * see end_clone_request() for more details. */ -static void dm_end_request(struct request *clone, int error) +static void dm_end_request(struct request *clone, blk_status_t error) { int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; @@ -285,7 +285,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ rq_completed(md, rw, false); } -static void dm_done(struct request *clone, int error, bool mapped) +static void dm_done(struct request *clone, blk_status_t error, bool mapped) { int r = DM_ENDIO_DONE; struct dm_rq_target_io *tio = clone->end_io_data; @@ -298,7 +298,7 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } - if (unlikely(error == -EREMOTEIO)) { + if (unlikely(error == BLK_STS_TARGET)) { if (req_op(clone) == REQ_OP_WRITE_SAME && !clone->q->limits.max_write_same_sectors) disable_write_same(tio->md); @@ -358,7 +358,7 @@ static void dm_softirq_done(struct request *rq) * Complete the clone and the original request with the error status * through softirq context. */ -static void dm_complete_request(struct request *rq, int error) +static void dm_complete_request(struct request *rq, blk_status_t error) { struct dm_rq_target_io *tio = tio_from_request(rq); @@ -375,7 +375,7 @@ static void dm_complete_request(struct request *rq, int error) * Target's rq_end_io() function isn't called. * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. */ -static void dm_kill_unmapped_request(struct request *rq, int error) +static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) { rq->rq_flags |= RQF_FAILED; dm_complete_request(rq, error); @@ -384,7 +384,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) /* * Called with the clone's queue lock held (in the case of .request_fn) */ -static void end_clone_request(struct request *clone, int error) +static void end_clone_request(struct request *clone, blk_status_t error) { struct dm_rq_target_io *tio = clone->end_io_data; @@ -401,7 +401,7 @@ static void end_clone_request(struct request *clone, int error) static void dm_dispatch_clone_request(struct request *clone, struct request *rq) { - int r; + blk_status_t r; if (blk_queue_io_stat(clone->q)) clone->rq_flags |= RQF_IO_STAT; @@ -506,7 +506,7 @@ static int map_request(struct dm_rq_target_io *tio) break; case DM_MAPIO_KILL: /* The target wants to complete the I/O */ - dm_kill_unmapped_request(rq, -EIO); + dm_kill_unmapped_request(rq, BLK_STS_IOERR); break; default: DMWARN("unimplemented target map return value: %d", r); @@ -727,7 +727,7 @@ static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return __dm_rq_init_rq(set->driver_data, rq); } -static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; @@ -744,7 +744,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, } if (ti->type->busy && ti->type->busy(ti)) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; dm_start_request(md, rq); @@ -762,10 +762,10 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, rq_end_stats(md, rq); rq_completed(md, rq_data_dir(rq), false); blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static const struct blk_mq_ops dm_mq_ops = { diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index f0020d2..9813922 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -24,7 +24,7 @@ struct dm_rq_target_io { struct dm_target *ti; struct request *orig, *clone; struct kthread_work work; - int error; + blk_status_t error; union map_info info; struct dm_stats_aux stats_aux; unsigned long duration_jiffies; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index e152d98..1ba4104 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1590,7 +1590,7 @@ static void full_bio_end_io(struct bio *bio) { void *callback_data = bio->bi_private; - dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0); + dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0); } static void start_full_bio(struct dm_snap_pending_exception *pe, @@ -1690,7 +1690,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ if (!s->valid) - return -EIO; + return DM_MAPIO_KILL; /* FIXME: should only take write lock if we need * to copy an exception */ @@ -1698,7 +1698,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { - r = -EIO; + r = DM_MAPIO_KILL; goto out_unlock; } @@ -1723,7 +1723,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); - r = -EIO; + r = DM_MAPIO_KILL; goto out_unlock; } @@ -1741,7 +1741,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) DMERR("Snapshot overflowed: Unable to allocate exception."); } else __invalidate_snapshot(s, -ENOMEM); - r = -EIO; + r = DM_MAPIO_KILL; goto out_unlock; } } @@ -1851,14 +1851,15 @@ out_unlock: return r; } -static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error) +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct dm_snapshot *s = ti->private; if (is_bio_tracked(bio)) stop_tracking_chunk(s, bio); - return 0; + return DM_ENDIO_DONE; } static void snapshot_merge_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 7515248..11621a0 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -375,20 +375,21 @@ static void stripe_status(struct dm_target *ti, status_type_t type, } } -static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { unsigned i; char major_minor[16]; struct stripe_c *sc = ti->private; - if (!error) - return 0; /* I/O complete */ + if (!*error) + return DM_ENDIO_DONE; /* I/O complete */ - if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) - return error; + if (bio->bi_opf & REQ_RAHEAD) + return DM_ENDIO_DONE; - if (error == -EOPNOTSUPP) - return error; + if (*error == BLK_STS_NOTSUPP) + return DM_ENDIO_DONE; memset(major_minor, 0, sizeof(major_minor)); sprintf(major_minor, "%d:%d", @@ -409,7 +410,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) schedule_work(&sc->trigger_event); } - return error; + return DM_ENDIO_DONE; } static int stripe_iterate_devices(struct dm_target *ti, diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index b242b75..c0d7e60 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -128,7 +128,7 @@ static void io_err_dtr(struct dm_target *tt) static int io_err_map(struct dm_target *tt, struct bio *bio) { - return -EIO; + return DM_MAPIO_KILL; } static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 28808e5..9dec2f8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -383,8 +383,8 @@ static void end_discard(struct discard_op *op, int r) * Even if r is set, there could be sub discards in flight that we * need to wait for. */ - if (r && !op->parent_bio->bi_error) - op->parent_bio->bi_error = r; + if (r && !op->parent_bio->bi_status) + op->parent_bio->bi_status = errno_to_blk_status(r); bio_endio(op->parent_bio); } @@ -450,22 +450,20 @@ static void cell_release_no_holder(struct pool *pool, } static void cell_error_with_code(struct pool *pool, - struct dm_bio_prison_cell *cell, int error_code) + struct dm_bio_prison_cell *cell, blk_status_t error_code) { dm_cell_error(pool->prison, cell, error_code); dm_bio_prison_free_cell(pool->prison, cell); } -static int get_pool_io_error_code(struct pool *pool) +static blk_status_t get_pool_io_error_code(struct pool *pool) { - return pool->out_of_data_space ? -ENOSPC : -EIO; + return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR; } static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) { - int error = get_pool_io_error_code(pool); - - cell_error_with_code(pool, cell, error); + cell_error_with_code(pool, cell, get_pool_io_error_code(pool)); } static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -475,7 +473,7 @@ static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) { - cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); + cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE); } /*----------------------------------------------------------------*/ @@ -555,17 +553,18 @@ static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) bio_list_init(master); } -static void error_bio_list(struct bio_list *bios, int error) +static void error_bio_list(struct bio_list *bios, blk_status_t error) { struct bio *bio; while ((bio = bio_list_pop(bios))) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } } -static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, + blk_status_t error) { struct bio_list bios; unsigned long flags; @@ -608,11 +607,11 @@ static void requeue_io(struct thin_c *tc) __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - error_bio_list(&bios, DM_ENDIO_REQUEUE); + error_bio_list(&bios, BLK_STS_DM_REQUEUE); requeue_deferred_cells(tc); } -static void error_retry_list_with_code(struct pool *pool, int error) +static void error_retry_list_with_code(struct pool *pool, blk_status_t error) { struct thin_c *tc; @@ -624,9 +623,7 @@ static void error_retry_list_with_code(struct pool *pool, int error) static void error_retry_list(struct pool *pool) { - int error = get_pool_io_error_code(pool); - - error_retry_list_with_code(pool, error); + error_retry_list_with_code(pool, get_pool_io_error_code(pool)); } /* @@ -774,7 +771,7 @@ struct dm_thin_new_mapping { */ atomic_t prepare_actions; - int err; + blk_status_t status; struct thin_c *tc; dm_block_t virt_begin, virt_end; dm_block_t data_block; @@ -814,7 +811,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) { struct dm_thin_new_mapping *m = context; - m->err = read_err || write_err ? -EIO : 0; + m->status = read_err || write_err ? BLK_STS_IOERR : 0; complete_mapping_preparation(m); } @@ -825,7 +822,7 @@ static void overwrite_endio(struct bio *bio) bio->bi_end_io = m->saved_bi_end_io; - m->err = bio->bi_error; + m->status = bio->bi_status; complete_mapping_preparation(m); } @@ -925,7 +922,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) struct bio *bio = m->bio; int r; - if (m->err) { + if (m->status) { cell_error(pool, m->cell); goto out; } @@ -1495,7 +1492,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&tc->lock, flags); } -static int should_error_unserviceable_bio(struct pool *pool) +static blk_status_t should_error_unserviceable_bio(struct pool *pool) { enum pool_mode m = get_pool_mode(pool); @@ -1503,27 +1500,27 @@ static int should_error_unserviceable_bio(struct pool *pool) case PM_WRITE: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); - return -EIO; + return BLK_STS_IOERR; case PM_OUT_OF_DATA_SPACE: - return pool->pf.error_if_no_space ? -ENOSPC : 0; + return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; case PM_READ_ONLY: case PM_FAIL: - return -EIO; + return BLK_STS_IOERR; default: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); - return -EIO; + return BLK_STS_IOERR; } } static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { - int error = should_error_unserviceable_bio(pool); + blk_status_t error = should_error_unserviceable_bio(pool); if (error) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } else retry_on_resume(bio); @@ -1533,7 +1530,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c { struct bio *bio; struct bio_list bios; - int error; + blk_status_t error; error = should_error_unserviceable_bio(pool); if (error) { @@ -2071,7 +2068,8 @@ static void process_thin_deferred_bios(struct thin_c *tc) unsigned count = 0; if (tc->requeue_mode) { - error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); + error_thin_bio_list(tc, &tc->deferred_bio_list, + BLK_STS_DM_REQUEUE); return; } @@ -2322,7 +2320,7 @@ static void do_no_space_timeout(struct work_struct *ws) if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { pool->pf.error_if_no_space = true; notify_of_pool_mode_change_to_oods(pool); - error_retry_list_with_code(pool, -ENOSPC); + error_retry_list_with_code(pool, BLK_STS_NOSPC); } } @@ -2624,7 +2622,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) thin_hook_bio(tc, bio); if (tc->requeue_mode) { - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -4177,7 +4175,8 @@ static int thin_map(struct dm_target *ti, struct bio *bio) return thin_bio_map(ti, bio); } -static int thin_endio(struct dm_target *ti, struct bio *bio, int err) +static int thin_endio(struct dm_target *ti, struct bio *bio, + blk_status_t *err) { unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -4212,7 +4211,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) if (h->cell) cell_defer_no_holder(h->tc, h->cell); - return 0; + return DM_ENDIO_DONE; } static void thin_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 1ec9b2c..b46705e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -538,13 +538,13 @@ static int verity_verify_io(struct dm_verity_io *io) /* * End one "io" structure with a given error. */ -static void verity_finish_io(struct dm_verity_io *io, int error) +static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) { struct dm_verity *v = io->v; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_error = error; + bio->bi_status = status; verity_fec_finish_io(io); @@ -555,15 +555,15 @@ static void verity_work(struct work_struct *w) { struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); - verity_finish_io(io, verity_verify_io(io)); + verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error && !verity_fec_is_enabled(io->v)) { - verity_finish_io(io, bio->bi_error); + if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + verity_finish_io(io, bio->bi_status); return; } @@ -643,17 +643,17 @@ static int verity_map(struct dm_target *ti, struct bio *bio) if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { DMERR_LIMIT("unaligned io"); - return -EIO; + return DM_MAPIO_KILL; } if (bio_end_sector(bio) >> (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { DMERR_LIMIT("io out of range"); - return -EIO; + return DM_MAPIO_KILL; } if (bio_data_dir(bio) == WRITE) - return -EIO; + return DM_MAPIO_KILL; io = dm_per_bio_data(bio, ti->per_io_data_size); io->v = v; diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index b616f11..b65ca8d 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -39,7 +39,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) case REQ_OP_READ: if (bio->bi_opf & REQ_RAHEAD) { /* readahead of null bytes only wastes buffer cache */ - return -EIO; + return DM_MAPIO_KILL; } zero_fill_bio(bio); break; @@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) /* writes get silently dropped */ break; default: - return -EIO; + return DM_MAPIO_KILL; } bio_endio(bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 37ccd73..4029460 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -63,7 +63,7 @@ static struct workqueue_struct *deferred_remove_workqueue; */ struct dm_io { struct mapped_device *md; - int error; + blk_status_t status; atomic_t io_count; struct bio *bio; unsigned long start_time; @@ -768,23 +768,24 @@ static int __noflush_suspending(struct mapped_device *md) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -static void dec_pending(struct dm_io *io, int error) +static void dec_pending(struct dm_io *io, blk_status_t error) { unsigned long flags; - int io_error; + blk_status_t io_error; struct bio *bio; struct mapped_device *md = io->md; /* Push-back supersedes any I/O errors */ if (unlikely(error)) { spin_lock_irqsave(&io->endio_lock, flags); - if (!(io->error > 0 && __noflush_suspending(md))) - io->error = error; + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(md))) + io->status = error; spin_unlock_irqrestore(&io->endio_lock, flags); } if (atomic_dec_and_test(&io->io_count)) { - if (io->error == DM_ENDIO_REQUEUE) { + if (io->status == BLK_STS_DM_REQUEUE) { /* * Target requested pushing back the I/O. */ @@ -793,16 +794,16 @@ static void dec_pending(struct dm_io *io, int error) bio_list_add_head(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ - io->error = -EIO; + io->status = BLK_STS_IOERR; spin_unlock_irqrestore(&md->deferred_lock, flags); } - io_error = io->error; + io_error = io->status; bio = io->bio; end_io_acct(io); free_io(md, io); - if (io_error == DM_ENDIO_REQUEUE) + if (io_error == BLK_STS_DM_REQUEUE) return; if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { @@ -814,7 +815,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - bio->bi_error = io_error; + bio->bi_status = io_error; bio_endio(bio); } } @@ -838,31 +839,13 @@ void disable_write_zeroes(struct mapped_device *md) static void clone_endio(struct bio *bio) { - int error = bio->bi_error; - int r = error; + blk_status_t error = bio->bi_status; struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (endio) { - r = endio(tio->ti, bio, error); - if (r < 0 || r == DM_ENDIO_REQUEUE) - /* - * error and requeue request are handled - * in dec_pending(). - */ - error = r; - else if (r == DM_ENDIO_INCOMPLETE) - /* The target will handle the io */ - return; - else if (r) { - DMWARN("unimplemented target endio return value: %d", r); - BUG(); - } - } - - if (unlikely(r == -EREMOTEIO)) { + if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) disable_write_same(md); @@ -871,6 +854,23 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } + if (endio) { + int r = endio(tio->ti, bio, &error); + switch (r) { + case DM_ENDIO_REQUEUE: + error = BLK_STS_DM_REQUEUE; + /*FALLTHRU*/ + case DM_ENDIO_DONE: + break; + case DM_ENDIO_INCOMPLETE: + /* The target will handle the io */ + return; + default: + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } + } + free_tio(tio); dec_pending(io, error); } @@ -1036,7 +1036,8 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) while ((bio = bio_list_pop(&list))) { struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { + if (unlikely(!bs) || bs == fs_bio_set || + !bs->rescue_workqueue) { bio_list_add(¤t->bio_list[i], bio); continue; } @@ -1084,18 +1085,24 @@ static void __map_bio(struct dm_target_io *tio) r = ti->type->map(ti, clone); dm_offload_end(&o); - if (r == DM_MAPIO_REMAPPED) { + switch (r) { + case DM_MAPIO_SUBMITTED: + break; + case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, tio->io->bio->bi_bdev->bd_dev, sector); - generic_make_request(clone); - } else if (r < 0 || r == DM_MAPIO_REQUEUE) { - /* error the io and bail out, or requeue it if needed */ - dec_pending(tio->io, r); + break; + case DM_MAPIO_KILL: + dec_pending(tio->io, BLK_STS_IOERR); + free_tio(tio); + break; + case DM_MAPIO_REQUEUE: + dec_pending(tio->io, BLK_STS_DM_REQUEUE); free_tio(tio); - } else if (r != DM_MAPIO_SUBMITTED) { + break; + default: DMWARN("unimplemented target map return value: %d", r); BUG(); } @@ -1360,7 +1367,7 @@ static void __split_and_process_bio(struct mapped_device *md, ci.map = map; ci.md = md; ci.io = alloc_io(md); - ci.io->error = 0; + ci.io->status = 0; atomic_set(&ci.io->io_count, 1); ci.io->bio = bio; ci.io->md = md; @@ -1527,7 +1534,6 @@ void dm_init_normal_md_queue(struct mapped_device *md) * Initialize aspects of queue that aren't relevant for blk-mq */ md->queue->backing_dev_info->congested_fn = dm_any_congested; - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); } static void cleanup_mapped_device(struct mapped_device *md) @@ -2654,7 +2660,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu BUG(); } - pools->bs = bioset_create_nobvec(pool_size, front_pad); + pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER); if (!pools->bs) goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index 84e76eb..31bcbfb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -185,7 +185,7 @@ static int start_readonly; static bool create_on_open = true; /* bio_clone_mddev - * like bio_clone, but with a local bio set + * like bio_clone_bioset, but with a local bio set */ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, @@ -265,7 +265,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) unsigned int sectors; int cpu; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); @@ -273,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) } if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) - bio->bi_error = -EROFS; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } @@ -719,8 +719,8 @@ static void super_written(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; - if (bio->bi_error) { - pr_err("md: super_written gets error=%d\n", bio->bi_error); + if (bio->bi_status) { + pr_err("md: super_written gets error=%d\n", bio->bi_status); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -801,7 +801,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, submit_bio_wait(bio); - ret = !bio->bi_error; + ret = !bio->bi_status; bio_put(bio); return ret; } @@ -5428,7 +5428,7 @@ int md_run(struct mddev *mddev) } if (mddev->bio_set == NULL) { - mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!mddev->bio_set) return -ENOMEM; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e95d521..68d036e 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -73,12 +73,12 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) +static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) { struct bio *bio = mp_bh->master_bio; struct mpconf *conf = mp_bh->mddev->private; - bio->bi_error = err; + bio->bi_status = status; bio_endio(bio); mempool_free(mp_bh, conf->pool); } @@ -89,7 +89,7 @@ static void multipath_end_request(struct bio *bio) struct mpconf *conf = mp_bh->mddev->private; struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - if (!bio->bi_error) + if (!bio->bi_status) multipath_end_bh_io(mp_bh, 0); else if (!(bio->bi_opf & REQ_RAHEAD)) { /* @@ -102,7 +102,7 @@ static void multipath_end_request(struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else - multipath_end_bh_io(mp_bh, bio->bi_error); + multipath_end_bh_io(mp_bh, bio->bi_status); rdev_dec_pending(rdev, conf->mddev); } @@ -347,7 +347,7 @@ static void multipathd(struct md_thread *thread) pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_iter.bi_sector); - multipath_end_bh_io(mp_bh, -EIO); + multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { pr_err("multipath: %s: redirecting sector %llu to another IO path\n", bdevname(bio->bi_bdev,b), diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e1a7e3d..98ca2c1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -277,7 +277,7 @@ static void call_bio_endio(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); /* @@ -335,7 +335,7 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) static void raid1_end_read_request(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r1bio *r1_bio = bio->bi_private; struct r1conf *conf = r1_bio->mddev->private; struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; @@ -426,12 +426,12 @@ static void raid1_end_write_request(struct bio *bio) struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; - discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_error && !discard_error) { + if (bio->bi_status && !discard_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -802,7 +802,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1856,7 +1856,7 @@ static void end_sync_read(struct bio *bio) * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (!bio->bi_error) + if (!bio->bi_status) set_bit(R1BIO_Uptodate, &r1_bio->state); if (atomic_dec_and_test(&r1_bio->remaining)) @@ -1865,7 +1865,7 @@ static void end_sync_read(struct bio *bio) static void end_sync_write(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2058,7 +2058,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) idx ++; } set_bit(R1BIO_Uptodate, &r1_bio->state); - bio->bi_error = 0; + bio->bi_status = 0; return 1; } @@ -2082,16 +2082,16 @@ static void process_checks(struct r1bio *r1_bio) for (i = 0; i < conf->raid_disks * 2; i++) { int j; int size; - int error; + blk_status_t status; struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) continue; /* fixup the bio for reuse, but preserve errno */ - error = b->bi_error; + status = b->bi_status; bio_reset(b); - b->bi_error = error; + b->bi_status = status; b->bi_vcnt = vcnt; b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + @@ -2113,7 +2113,7 @@ static void process_checks(struct r1bio *r1_bio) } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - !r1_bio->bios[primary]->bi_error) { + !r1_bio->bios[primary]->bi_status) { r1_bio->bios[primary]->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[primary].rdev, mddev); break; @@ -2123,7 +2123,7 @@ static void process_checks(struct r1bio *r1_bio) int j; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - int error = sbio->bi_error; + blk_status_t status = sbio->bi_status; struct page **ppages = get_resync_pages(pbio)->pages; struct page **spages = get_resync_pages(sbio)->pages; struct bio_vec *bi; @@ -2132,12 +2132,12 @@ static void process_checks(struct r1bio *r1_bio) if (sbio->bi_end_io != end_sync_read) continue; /* Now we can 'fixup' the error value */ - sbio->bi_error = 0; + sbio->bi_status = 0; bio_for_each_segment_all(bi, sbio, j) page_len[j] = bi->bv_len; - if (!error) { + if (!status) { for (j = vcnt; j-- ; ) { if (memcmp(page_address(ppages[j]), page_address(spages[j]), @@ -2149,7 +2149,7 @@ static void process_checks(struct r1bio *r1_bio) if (j >= 0) atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && !error)) { + && !status)) { /* No need to write to this device. */ sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); @@ -2400,11 +2400,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) continue; - if (!bio->bi_error && + if (!bio->bi_status && test_bit(R1BIO_MadeGood, &r1_bio->state)) { rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } - if (bio->bi_error && + if (bio->bi_status && test_bit(R1BIO_WriteError, &r1_bio->state)) { if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) md_error(conf->mddev, rdev); @@ -2955,7 +2955,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->r1bio_pool) goto abort; - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto abort; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 797ed60..57a250f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -336,7 +336,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct r10conf *conf = r10_bio->mddev->private; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); /* @@ -389,7 +389,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, static void raid10_end_read_request(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; int slot, dev; struct md_rdev *rdev; @@ -477,7 +477,7 @@ static void raid10_end_write_request(struct bio *bio) struct bio *to_put = NULL; bool discard_error; - discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -491,7 +491,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_error && !discard_error) { + if (bio->bi_status && !discard_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -913,7 +913,7 @@ static void flush_pending_writes(struct r10conf *conf) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1098,7 +1098,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1888,7 +1888,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) { struct r10conf *conf = r10_bio->mddev->private; - if (!bio->bi_error) + if (!bio->bi_status) set_bit(R10BIO_Uptodate, &r10_bio->state); else /* The write handler will notice the lack of @@ -1972,7 +1972,7 @@ static void end_sync_write(struct bio *bio) else rdev = conf->mirrors[d].rdev; - if (bio->bi_error) { + if (bio->bi_status) { if (repl) md_error(mddev, rdev); else { @@ -2021,7 +2021,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) /* find the first device with a block */ for (i=0; i<conf->copies; i++) - if (!r10_bio->devs[i].bio->bi_error) + if (!r10_bio->devs[i].bio->bi_status) break; if (i == conf->copies) @@ -2050,7 +2050,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tpages = get_resync_pages(tbio)->pages; d = r10_bio->devs[i].devnum; rdev = conf->mirrors[d].rdev; - if (!r10_bio->devs[i].bio->bi_error) { + if (!r10_bio->devs[i].bio->bi_status) { /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; @@ -2633,7 +2633,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev = conf->mirrors[dev].rdev; if (r10_bio->devs[m].bio == NULL) continue; - if (!r10_bio->devs[m].bio->bi_error) { + if (!r10_bio->devs[m].bio->bi_status) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2649,7 +2649,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (r10_bio->devs[m].repl_bio == NULL) continue; - if (!r10_bio->devs[m].repl_bio->bi_error) { + if (!r10_bio->devs[m].repl_bio->bi_status) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2675,7 +2675,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) r10_bio->devs[m].addr, r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); - } else if (bio != NULL && bio->bi_error) { + } else if (bio != NULL && bio->bi_status) { fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); @@ -3267,7 +3267,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { @@ -3309,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; sector = r10_bio->devs[i].addr; bio->bi_next = biolist; @@ -3375,7 +3375,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); - bio->bi_error = 0; + bio->bi_status = 0; generic_make_request(bio); } } @@ -3552,7 +3552,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) if (!conf->r10bio_pool) goto out; - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto out; @@ -4397,7 +4397,7 @@ read_more: read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); - read_bio->bi_error = 0; + read_bio->bi_status = 0; read_bio->bi_vcnt = 0; read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; @@ -4641,7 +4641,7 @@ static void end_reshape_write(struct bio *bio) rdev = conf->mirrors[d].rdev; } - if (bio->bi_error) { + if (bio->bi_status) { /* FIXME should record badblock */ md_error(mddev, rdev); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0a7af8b..bfa1e90 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -572,7 +572,7 @@ static void r5l_log_endio(struct bio *bio) struct r5l_log *log = io->log; unsigned long flags; - if (bio->bi_error) + if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); bio_put(bio); @@ -1247,7 +1247,7 @@ static void r5l_log_flush_endio(struct bio *bio) unsigned long flags; struct r5l_io_unit *io; - if (bio->bi_error) + if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); spin_lock_irqsave(&log->io_list_lock, flags); @@ -3063,7 +3063,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_pool) goto io_pool; - log->bs = bioset_create(R5L_POOL_SIZE, 0); + log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!log->bs) goto io_bs; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index ccce92e..77cce35 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -397,7 +397,7 @@ static void ppl_log_endio(struct bio *bio) pr_debug("%s: seq: %llu\n", __func__, io->seq); - if (bio->bi_error) + if (bio->bi_status) md_error(ppl_conf->mddev, log->rdev); list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { @@ -1150,7 +1150,7 @@ int ppl_init_log(struct r5conf *conf) goto err; } - ppl_conf->bs = bioset_create(conf->raid_disks, 0); + ppl_conf->bs = bioset_create(conf->raid_disks, 0, 0); if (!ppl_conf->bs) { ret = -ENOMEM; goto err; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ec0f951..62c965b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2476,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi) pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2496,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (!bi->bi_error) { + if (!bi->bi_status) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2613,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi) } pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2621,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi) } if (replacement) { - if (bi->bi_error) + if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (bi->bi_error) { + if (bi->bi_status) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2649,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && bi->bi_error && !replacement) + if (sh->batch_head && bi->bi_status && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); bio_reset(bi); @@ -3381,7 +3381,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = nextbi; @@ -3403,7 +3403,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = bi2; @@ -3429,7 +3429,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; bio_endio(bi); bi = nextbi; } @@ -5154,7 +5154,7 @@ static void raid5_align_endio(struct bio *bi) struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; - int error = bi->bi_error; + blk_status_t error = bi->bi_status; bio_put(bi); @@ -5731,7 +5731,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; break; } } @@ -6943,7 +6943,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto abort; conf->mddev = mddev; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 99e651c..22de7f5 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1921,12 +1921,13 @@ static void msb_io_work(struct work_struct *work) spin_lock_irqsave(&msb->q_lock, flags); if (len) - if (!__blk_end_request(msb->req, 0, len)) + if (!__blk_end_request(msb->req, BLK_STS_OK, len)) msb->req = NULL; if (error && msb->req) { + blk_status_t ret = errno_to_blk_status(error); dbg_verbose("IO: ending one sector of the request with error"); - if (!__blk_end_request(msb->req, error, msb->page_size)) + if (!__blk_end_request(msb->req, ret, msb->page_size)) msb->req = NULL; } @@ -2014,7 +2015,7 @@ static void msb_submit_req(struct request_queue *q) WARN_ON(!msb->io_queue_stopped); while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, -ENODEV); + __blk_end_request_all(req, BLK_STS_IOERR); return; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c00d8a2..8897962 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -709,7 +709,8 @@ try_again: msb->req_sg); if (!msb->seg_count) { - chunk = __blk_end_request_cur(msb->block_req, -ENOMEM); + chunk = __blk_end_request_cur(msb->block_req, + BLK_STS_RESOURCE); continue; } @@ -776,7 +777,8 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error) if (error && !t_len) t_len = blk_rq_cur_bytes(msb->block_req); - chunk = __blk_end_request(msb->block_req, error, t_len); + chunk = __blk_end_request(msb->block_req, + errno_to_blk_status(error), t_len); error = mspro_block_issue_req(card, chunk); @@ -838,7 +840,7 @@ static void mspro_block_submit_req(struct request_queue *q) if (msb->eject) { while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, -ENODEV); + __blk_end_request_all(req, BLK_STS_IOERR); return; } diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 8273b07..6ff94a9 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1184,9 +1184,10 @@ static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req) struct mmc_card *card = md->queue.card; unsigned int from, nr, arg; int err = 0, type = MMC_BLK_DISCARD; + blk_status_t status = BLK_STS_OK; if (!mmc_can_erase(card)) { - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; goto fail; } @@ -1212,10 +1213,12 @@ static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req) if (!err) err = mmc_erase(card, from, nr, arg); } while (err == -EIO && !mmc_blk_reset(md, card->host, type)); - if (!err) + if (err) + status = BLK_STS_IOERR; + else mmc_blk_reset_success(md, type); fail: - blk_end_request(req, err, blk_rq_bytes(req)); + blk_end_request(req, status, blk_rq_bytes(req)); } static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq, @@ -1225,9 +1228,10 @@ static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq, struct mmc_card *card = md->queue.card; unsigned int from, nr, arg; int err = 0, type = MMC_BLK_SECDISCARD; + blk_status_t status = BLK_STS_OK; if (!(mmc_can_secure_erase_trim(card))) { - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; goto out; } @@ -1254,8 +1258,10 @@ retry: err = mmc_erase(card, from, nr, arg); if (err == -EIO) goto out_retry; - if (err) + if (err) { + status = BLK_STS_IOERR; goto out; + } if (arg == MMC_SECURE_TRIM1_ARG) { if (card->quirks & MMC_QUIRK_INAND_CMD38) { @@ -1270,8 +1276,10 @@ retry: err = mmc_erase(card, from, nr, MMC_SECURE_TRIM2_ARG); if (err == -EIO) goto out_retry; - if (err) + if (err) { + status = BLK_STS_IOERR; goto out; + } } out_retry: @@ -1280,7 +1288,7 @@ out_retry: if (!err) mmc_blk_reset_success(md, type); out: - blk_end_request(req, err, blk_rq_bytes(req)); + blk_end_request(req, status, blk_rq_bytes(req)); } static void mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) @@ -1290,10 +1298,7 @@ static void mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) int ret = 0; ret = mmc_flush_cache(card); - if (ret) - ret = -EIO; - - blk_end_request_all(req, ret); + blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK); } /* @@ -1641,7 +1646,7 @@ static void mmc_blk_rw_cmd_abort(struct mmc_queue *mq, struct mmc_card *card, { if (mmc_card_removed(card)) req->rq_flags |= RQF_QUIET; - while (blk_end_request(req, -EIO, blk_rq_cur_bytes(req))); + while (blk_end_request(req, BLK_STS_IOERR, blk_rq_cur_bytes(req))); mmc_queue_req_free(mq, mqrq); } @@ -1661,7 +1666,7 @@ static void mmc_blk_rw_try_restart(struct mmc_queue *mq, struct request *req, */ if (mmc_card_removed(mq->card)) { req->rq_flags |= RQF_QUIET; - blk_end_request_all(req, -EIO); + blk_end_request_all(req, BLK_STS_IOERR); mmc_queue_req_free(mq, mqrq); return; } @@ -1743,7 +1748,7 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req) */ mmc_blk_reset_success(md, type); - req_pending = blk_end_request(old_req, 0, + req_pending = blk_end_request(old_req, BLK_STS_OK, brq->data.bytes_xfered); /* * If the blk_end_request function returns non-zero even @@ -1811,7 +1816,7 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req) * time, so we only reach here after trying to * read a single sector. */ - req_pending = blk_end_request(old_req, -EIO, + req_pending = blk_end_request(old_req, BLK_STS_IOERR, brq->data.blksz); if (!req_pending) { mmc_queue_req_free(mq, mq_rq); @@ -1860,7 +1865,7 @@ void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) ret = mmc_blk_part_switch(card, md); if (ret) { if (req) { - blk_end_request_all(req, -EIO); + blk_end_request_all(req, BLK_STS_IOERR); } goto out; } diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 5c37b6b..b659a28 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -133,7 +133,7 @@ static void mmc_request_fn(struct request_queue *q) if (!mq) { while ((req = blk_fetch_request(q)) != NULL) { req->rq_flags |= RQF_QUIET; - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } return; } @@ -388,7 +388,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, mmc_queue_setup_discard(mq->queue, card); if (card->bouncesz) { - blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); blk_queue_max_segments(mq->queue, card->bouncesz / 512); blk_queue_max_segment_size(mq->queue, card->bouncesz); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 6b8d5cd..f336a9b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -73,7 +73,7 @@ static void blktrans_dev_put(struct mtd_blktrans_dev *dev) } -static int do_blktrans_request(struct mtd_blktrans_ops *tr, +static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, struct mtd_blktrans_dev *dev, struct request *req) { @@ -84,33 +84,37 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, nsect = blk_rq_cur_bytes(req) >> tr->blkshift; buf = bio_data(req->bio); - if (req_op(req) == REQ_OP_FLUSH) - return tr->flush(dev); + if (req_op(req) == REQ_OP_FLUSH) { + if (tr->flush(dev)) + return BLK_STS_IOERR; + return BLK_STS_OK; + } if (blk_rq_pos(req) + blk_rq_cur_sectors(req) > get_capacity(req->rq_disk)) - return -EIO; + return BLK_STS_IOERR; switch (req_op(req)) { case REQ_OP_DISCARD: - return tr->discard(dev, block, nsect); + if (tr->discard(dev, block, nsect)) + return BLK_STS_IOERR; + return BLK_STS_OK; case REQ_OP_READ: for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->readsect(dev, block, buf)) - return -EIO; + return BLK_STS_IOERR; rq_flush_dcache_pages(req); - return 0; + return BLK_STS_OK; case REQ_OP_WRITE: if (!tr->writesect) - return -EIO; + return BLK_STS_IOERR; rq_flush_dcache_pages(req); for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->writesect(dev, block, buf)) - return -EIO; - return 0; + return BLK_STS_IOERR; default: - return -EIO; + return BLK_STS_IOERR; } } @@ -132,7 +136,7 @@ static void mtd_blktrans_work(struct work_struct *work) spin_lock_irq(rq->queue_lock); while (1) { - int res; + blk_status_t res; dev->bg_stop = false; if (!req && !(req = blk_fetch_request(rq))) { @@ -178,7 +182,7 @@ static void mtd_blktrans_request(struct request_queue *rq) if (!dev) while ((req = blk_fetch_request(rq)) != NULL) - __blk_end_request_all(req, -ENODEV); + __blk_end_request_all(req, BLK_STS_IOERR); else queue_work(dev->wq, &dev->work); } @@ -413,6 +417,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); + blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, new->rq); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, new->rq); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 5497e65..c3963f8 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -313,10 +313,10 @@ static void ubiblock_do_work(struct work_struct *work) ret = ubiblock_read(pdu); rq_flush_dcache_pages(req); - blk_mq_end_request(req, ret); + blk_mq_end_request(req, errno_to_blk_status(ret)); } -static int ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *req = bd->rq; @@ -327,9 +327,9 @@ static int ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx, case REQ_OP_READ: ubi_sgl_init(&pdu->usgl); queue_work(dev->wq, &pdu->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; default: - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } } diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 822198a..f12d23c 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -186,7 +186,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) * another kernel subsystem, and we just pass it through. */ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; goto out; } @@ -205,7 +205,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) "io error in %s sector %lld, len %d,\n", (rw == READ) ? "READ" : "WRITE", (unsigned long long) iter.bi_sector, len); - bio->bi_error = err; + bio->bi_status = errno_to_blk_status(err); break; } } @@ -273,7 +273,6 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) blk_queue_make_request(q, nd_blk_make_request); blk_queue_max_hw_sectors(q, UINT_MAX); - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); q->queuedata = nsblk; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 983718b..b6ba061 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1210,7 +1210,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) * another kernel subsystem, and we just pass it through. */ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; goto out; } @@ -1232,7 +1232,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) (op_is_write(bio_op(bio))) ? "WRITE" : "READ", (unsigned long long) iter.bi_sector, len); - bio->bi_error = err; + bio->bi_status = errno_to_blk_status(err); break; } } @@ -1297,7 +1297,6 @@ static int btt_blk_init(struct btt *btt) blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); - blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue); btt->btt_queue->queuedata = btt; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c544d46..6b577af 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -49,19 +49,19 @@ static struct nd_region *to_region(struct pmem_device *pmem) return to_nd_region(to_dev(pmem)->parent); } -static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, - unsigned int len) +static blk_status_t pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) { struct device *dev = to_dev(pmem); sector_t sector; long cleared; - int rc = 0; + blk_status_t rc = BLK_STS_OK; sector = (offset - pmem->data_offset) / 512; cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len); if (cleared < len) - rc = -EIO; + rc = BLK_STS_IOERR; if (cleared > 0 && cleared / 512) { cleared /= 512; dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__, @@ -84,7 +84,7 @@ static void write_pmem(void *pmem_addr, struct page *page, kunmap_atomic(mem); } -static int read_pmem(struct page *page, unsigned int off, +static blk_status_t read_pmem(struct page *page, unsigned int off, void *pmem_addr, unsigned int len) { int rc; @@ -93,15 +93,15 @@ static int read_pmem(struct page *page, unsigned int off, rc = memcpy_mcsafe(mem + off, pmem_addr, len); kunmap_atomic(mem); if (rc) - return -EIO; - return 0; + return BLK_STS_IOERR; + return BLK_STS_OK; } -static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { - int rc = 0; + blk_status_t rc = BLK_STS_OK; bool bad_pmem = false; phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; @@ -111,7 +111,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (!is_write) { if (unlikely(bad_pmem)) - rc = -EIO; + rc = BLK_STS_IOERR; else { rc = read_pmem(page, off, pmem_addr, len); flush_dcache_page(page); @@ -149,7 +149,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { - int rc = 0; + blk_status_t rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -166,7 +166,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) bvec.bv_offset, op_is_write(bio_op(bio)), iter.bi_sector); if (rc) { - bio->bi_error = rc; + bio->bi_status = rc; break; } } @@ -184,7 +184,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, bool is_write) { struct pmem_device *pmem = bdev->bd_queue->queuedata; - int rc; + blk_status_t rc; rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector); @@ -197,7 +197,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, if (rc == 0) page_endio(page, is_write, 0); - return rc; + return blk_status_to_errno(rc); } /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ @@ -343,7 +343,6 @@ static int pmem_attach_disk(struct device *dev, blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_max_hw_sectors(q, UINT_MAX); - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); q->queuedata = pmem; diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 90745a6..46d6cb1 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -13,18 +13,6 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. -config BLK_DEV_NVME_SCSI - bool "SCSI emulation for NVMe device nodes" - depends on NVME_CORE - ---help--- - This adds support for the SG_IO ioctl on the NVMe character - and block devices nodes, as well as a translation for a small - number of selected SCSI commands to NVMe commands to the NVMe - driver. If you don't know what this means you probably want - to say N here, unless you run a distro that abuses the SCSI - emulation to provide stable device names for mount by id, like - some OpenSuSE and SLES versions. - config NVME_FABRICS tristate diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index f1a7d94..cc0aacb 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o -nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 903d581..d70df1d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -27,7 +27,6 @@ #include <linux/nvme_ioctl.h> #include <linux/t10-pi.h> #include <linux/pm_qos.h> -#include <scsi/sg.h> #include <asm/unaligned.h> #include "nvme.h" @@ -45,7 +44,7 @@ module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); EXPORT_SYMBOL_GPL(nvme_io_timeout); -unsigned char shutdown_timeout = 5; +static unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); @@ -65,34 +64,53 @@ static bool force_apst; module_param(force_apst, bool, 0644); MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); +static bool streams; +module_param(streams, bool, 0644); +MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); + +struct workqueue_struct *nvme_wq; +EXPORT_SYMBOL_GPL(nvme_wq); + static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; -static int nvme_error_status(struct request *req) +int nvme_reset_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_reset_ctrl); + +static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_reset_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->reset_work); + return ret; +} + +static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { case NVME_SC_SUCCESS: - return 0; + return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - - /* - * XXX: these errors are a nasty side-band protocol to - * drivers/md/dm-mpath.c:noretry_error() that aren't documented - * anywhere.. - */ - case NVME_SC_CMD_SEQ_ERROR: - return -EILSEQ; + return BLK_STS_NOSPC; case NVME_SC_ONCS_NOT_SUPPORTED: - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: - return -ENODATA; + return BLK_STS_MEDIUM; + default: + return BLK_STS_IOERR; } } @@ -165,7 +183,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: - case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -283,6 +300,105 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); +static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + + c.directive.opcode = nvme_admin_directive_send; + c.directive.nsid = cpu_to_le32(0xffffffff); + c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; + c.directive.dtype = NVME_DIR_IDENTIFY; + c.directive.tdtype = NVME_DIR_STREAMS; + c.directive.endir = enable ? NVME_DIR_ENDIR : 0; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); +} + +static int nvme_disable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, false); +} + +static int nvme_enable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, true); +} + +static int nvme_get_stream_params(struct nvme_ctrl *ctrl, + struct streams_directive_params *s, u32 nsid) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + memset(s, 0, sizeof(*s)); + + c.directive.opcode = nvme_admin_directive_recv; + c.directive.nsid = cpu_to_le32(nsid); + c.directive.numd = sizeof(*s); + c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; + c.directive.dtype = NVME_DIR_STREAMS; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); +} + +static int nvme_configure_directives(struct nvme_ctrl *ctrl) +{ + struct streams_directive_params s; + int ret; + + if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) + return 0; + if (!streams) + return 0; + + ret = nvme_enable_streams(ctrl); + if (ret) + return ret; + + ret = nvme_get_stream_params(ctrl, &s, 0xffffffff); + if (ret) + return ret; + + ctrl->nssa = le16_to_cpu(s.nssa); + if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + dev_info(ctrl->device, "too few streams (%u) available\n", + ctrl->nssa); + nvme_disable_streams(ctrl); + return 0; + } + + ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); + return 0; +} + +/* + * Check if 'req' has a write hint associated with it. If it does, assign + * a valid namespace stream to the write. + */ +static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, + struct request *req, u16 *control, + u32 *dsmgmt) +{ + enum rw_hint streamid = req->write_hint; + + if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) + streamid = 0; + else { + streamid--; + if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) + return; + + *control |= NVME_RW_DTYPE_STREAMS; + *dsmgmt |= streamid << 16; + } + + if (streamid < ARRAY_SIZE(req->q->write_hints)) + req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -291,7 +407,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, cmnd->common.nsid = cpu_to_le32(ns->ns_id); } -static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, +static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; @@ -300,7 +416,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); if (!range) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; __rq_for_each_bio(bio, req) { u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); @@ -314,7 +430,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, if (WARN_ON_ONCE(n != segments)) { kfree(range); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } memset(cmnd, 0, sizeof(*cmnd)); @@ -328,15 +444,26 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, req->special_vec.bv_len = sizeof(*range) * segments; req->rq_flags |= RQF_SPECIAL_PAYLOAD; - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } -static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) +static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) { + struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; u32 dsmgmt = 0; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns && ns->ms && + (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && + !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) + return BLK_STS_NOTSUPP; + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) @@ -351,6 +478,9 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) + nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); + if (ns->ms) { switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: @@ -370,12 +500,13 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + return 0; } -int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd) { - int ret = BLK_MQ_RQ_QUEUE_OK; + blk_status_t ret = BLK_STS_OK; if (!(req->rq_flags & RQF_DONTPREP)) { nvme_req(req)->retries = 0; @@ -398,11 +529,11 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, break; case REQ_OP_READ: case REQ_OP_WRITE: - nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd); break; default: WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } cmd->common.command_id = req->tag; @@ -555,15 +686,16 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, result, timeout); } -static void nvme_keep_alive_end_io(struct request *rq, int error) +static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; blk_mq_free_request(rq); - if (error) { + if (status) { dev_err(ctrl->device, - "failed nvme_keep_alive_end_io error=%d\n", error); + "failed nvme_keep_alive_end_io error=%d\n", + status); return; } @@ -599,7 +731,7 @@ static void nvme_keep_alive_work(struct work_struct *work) if (nvme_keep_alive(ctrl)) { /* allocation failure, reset the controller */ dev_err(ctrl->device, "keep-alive failed\n"); - ctrl->ops->reset_ctrl(ctrl); + nvme_reset_ctrl(ctrl); return; } } @@ -623,7 +755,7 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); -int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; int error; @@ -643,6 +775,77 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) +{ + struct nvme_command c = { }; + int status; + void *data; + int pos; + int len; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; + + data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, data, + NVME_IDENTIFY_DATA_SIZE); + if (status) + goto free_data; + + for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { + struct nvme_ns_id_desc *cur = data + pos; + + if (cur->nidl == 0) + break; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_EUI64_LEN; + memcpy(ns->eui, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_NGUID_LEN; + memcpy(ns->nguid, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_UUID_LEN; + uuid_copy(&ns->uuid, data + pos + sizeof(*cur)); + break; + default: + /* Skip unnkown types */ + len = cur->nidl; + break; + } + + len += sizeof(*cur); + } +free_data: + kfree(data); + return status; +} + static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) { struct nvme_command c = { }; @@ -653,7 +856,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } -int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, +static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id) { struct nvme_command c = { }; @@ -675,26 +878,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, return error; } -int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - void *buffer, size_t buflen, u32 *result) -{ - struct nvme_command c; - union nvme_result res; - int ret; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.fid = cpu_to_le32(fid); - - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, buffer, buflen, 0, - NVME_QID_ANY, 0, 0); - if (ret >= 0 && result) - *result = le32_to_cpu(res.u32); - return ret; -} - -int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, +static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, void *buffer, size_t buflen, u32 *result) { struct nvme_command c; @@ -713,28 +897,6 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, return ret; } -int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) -{ - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), - - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; -} - int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) { u32 q_count = (*count - 1) | ((*count - 1) << 16); @@ -752,7 +914,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) * access to the admin queue, as that might be only way to fix them up. */ if (status > 0) { - dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); + dev_err(ctrl->device, "Could not set queue count (%d)\n", status); *count = 0; } else { nr_io_queues = min(result & 0xffff, result >> 16) + 1; @@ -870,12 +1032,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); -#ifdef CONFIG_BLK_DEV_NVME_SCSI - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); -#endif default: #ifdef CONFIG_NVM if (ns->ndev) @@ -892,10 +1048,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } return nvme_ioctl(bdev, mode, cmd, arg); } #else @@ -983,6 +1135,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +static void nvme_set_chunk_size(struct nvme_ns *ns) +{ + u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); + blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); +} + static void nvme_config_discard(struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -991,8 +1149,15 @@ static void nvme_config_discard(struct nvme_ns *ns) BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; + if (ctrl->nr_streams && ns->sws && ns->sgs) { + unsigned int sz = logical_block_size * ns->sws * ns->sgs; + + ns->queue->limits.discard_alignment = sz; + ns->queue->limits.discard_granularity = sz; + } else { + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + } blk_queue_max_discard_sectors(ns->queue, UINT_MAX); blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); @@ -1016,7 +1181,15 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); + memcpy(ns->nguid, (*id)->nguid, sizeof(ns->nguid)); + if (ns->ctrl->vs >= NVME_VS(1, 3, 0)) { + /* Don't treat error as fatal we potentially + * already have a NGUID or EUI-64 + */ + if (nvme_identify_ns_descs(ns, ns->ns_id)) + dev_warn(ns->ctrl->device, + "%s: Identify Descriptors failed\n", __func__); + } return 0; } @@ -1024,6 +1197,7 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; u16 bs; /* @@ -1034,12 +1208,15 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift = 9; bs = 1 << ns->lba_shift; + ns->noiob = le16_to_cpu(id->noiob); blk_mq_freeze_queue(disk->queue); - if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) + if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); + if (ns->noiob) + nvme_set_chunk_size(ns); if (ns->ms && !blk_get_integrity(disk) && !ns->ext) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) @@ -1047,7 +1224,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); } @@ -1283,7 +1460,7 @@ EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { - unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + unsigned long timeout = jiffies + (shutdown_timeout * HZ); u32 csts; int ret; @@ -1372,7 +1549,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl) if (!table) return; - if (ctrl->ps_max_latency_us == 0) { + if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */ apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); @@ -1528,6 +1705,31 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, string_matches(id->fr, q->fr, sizeof(id->fr)); } +static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + size_t nqnlen; + int off; + + nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); + if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { + strcpy(ctrl->subnqn, id->subnqn); + return; + } + + if (ctrl->vs >= NVME_VS(1, 2, 1)) + dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); + + /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, + "nqn.2014.08.org.nvmexpress:%4x%4x", + le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); + memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); + off += sizeof(id->sn); + memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); + off += sizeof(id->mn); + memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1539,7 +1741,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; u32 max_hw_sectors; - u8 prev_apsta; + bool prev_apst_enabled; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1563,6 +1765,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + nvme_init_subnqn(ctrl, id); + if (!ctrl->identified) { /* * Check for quirks. Quirk can depend on firmware version, @@ -1582,7 +1786,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { - dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); + dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; } @@ -1607,16 +1811,17 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->kas = le16_to_cpu(id->kas); ctrl->npss = id->npss; - prev_apsta = ctrl->apsta; + ctrl->apsta = id->apsta; + prev_apst_enabled = ctrl->apst_enabled; if (ctrl->quirks & NVME_QUIRK_NO_APST) { if (force_apst && id->apsta) { - dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); - ctrl->apsta = 1; + dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->apst_enabled = true; } else { - ctrl->apsta = 0; + ctrl->apst_enabled = false; } } else { - ctrl->apsta = id->apsta; + ctrl->apst_enabled = id->apsta; } memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); @@ -1634,22 +1839,25 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = -EINVAL; if (!ctrl->opts->discovery_nqn && !ctrl->kas) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "keep-alive support is mandatory for fabrics\n"); ret = -EINVAL; } } else { ctrl->cntlid = le16_to_cpu(id->cntlid); + ctrl->hmpre = le32_to_cpu(id->hmpre); + ctrl->hmmin = le32_to_cpu(id->hmmin); } kfree(id); - if (ctrl->apsta && !prev_apsta) + if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); - else if (!ctrl->apsta && prev_apsta) + else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); nvme_configure_apst(ctrl); + nvme_configure_directives(ctrl); ctrl->identified = true; @@ -1735,7 +1943,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: dev_warn(ctrl->device, "resetting controller\n"); - return ctrl->ops->reset_ctrl(ctrl); + return nvme_reset_ctrl_sync(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); case NVME_IOCTL_RESCAN: @@ -1761,7 +1969,7 @@ static ssize_t nvme_sysfs_reset(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); int ret; - ret = ctrl->ops->reset_ctrl(ctrl); + ret = nvme_reset_ctrl_sync(ctrl); if (ret < 0) return ret; return count; @@ -1787,8 +1995,8 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); - if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) - return sprintf(buf, "eui.%16phN\n", ns->uuid); + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + return sprintf(buf, "eui.%16phN\n", ns->nguid); if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) return sprintf(buf, "eui.%8phN\n", ns->eui); @@ -1803,11 +2011,28 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); +static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + return sprintf(buf, "%pU\n", ns->nguid); +} +static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->uuid); + + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ + if (uuid_is_null(&ns->uuid)) { + printk_ratelimited(KERN_WARNING + "No UUID available providing old NGUID\n"); + return sprintf(buf, "%pU\n", ns->nguid); + } + return sprintf(buf, "%pU\n", &ns->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -1830,6 +2055,7 @@ static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); static struct attribute *nvme_ns_attrs[] = { &dev_attr_wwid.attr, &dev_attr_uuid.attr, + &dev_attr_nguid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, NULL, @@ -1842,7 +2068,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { - if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + if (uuid_is_null(&ns->uuid) || + !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { + if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) return 0; } if (a == &dev_attr_eui.attr) { @@ -1931,8 +2162,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", - ctrl->ops->get_subsysnqn(ctrl)); + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -1961,24 +2191,16 @@ static struct attribute *nvme_dev_attrs[] = { NULL }; -#define CHECK_ATTR(ctrl, a, name) \ - if ((a) == &dev_attr_##name.attr && \ - !(ctrl)->ops->get_##name) \ - return 0 - static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - if (a == &dev_attr_delete_controller.attr) { - if (!ctrl->ops->delete_ctrl) - return 0; - } - - CHECK_ATTR(ctrl, a, subsysnqn); - CHECK_ATTR(ctrl, a, address); + if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) + return 0; + if (a == &dev_attr_address.attr && !ctrl->ops->get_address) + return 0; return a->mode; } @@ -2019,6 +2241,32 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) return ret; } +static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) +{ + struct streams_directive_params s; + int ret; + + if (!ctrl->nr_streams) + return 0; + + ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); + if (ret) + return ret; + + ns->sws = le32_to_cpu(s.sws); + ns->sgs = le16_to_cpu(s.sgs); + + if (ns->sws) { + unsigned int bs = 1 << ns->lba_shift; + + blk_queue_io_min(ns->queue, bs * ns->sws); + if (ns->sgs) + blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs); + } + + return 0; +} + static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; @@ -2048,6 +2296,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); + nvme_setup_streams_ns(ctrl, ns); sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); @@ -2056,7 +2305,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_nvm_ns_supported(ns, id) && nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__); + dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__); goto out_free_id; } @@ -2231,7 +2480,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl) * removal. */ if (ctrl->state == NVME_CTRL_LIVE) - schedule_work(&ctrl->scan_work); + queue_work(nvme_wq, &ctrl->scan_work); } EXPORT_SYMBOL_GPL(nvme_queue_scan); @@ -2286,7 +2535,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, /*FALLTHRU*/ case NVME_SC_ABORT_REQ: ++ctrl->event_limit; - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); break; default: break; @@ -2309,7 +2558,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { ctrl->event_limit = NVME_NR_AERS; - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); @@ -2442,6 +2691,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ctrl->admin_q); + /* Forcibly start all queues to avoid having stuck requests */ blk_mq_start_hw_queues(ctrl->admin_q); @@ -2455,6 +2707,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ns->queue); + /* * Forcibly start all queues to avoid having stuck requests. * Note that we must ensure the queues are not stopped @@ -2533,7 +2788,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_unquiesce_queue(ns->queue); blk_mq_kick_requeue_list(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); @@ -2544,10 +2799,15 @@ int __init nvme_core_init(void) { int result; + nvme_wq = alloc_workqueue("nvme-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_wq) + return -ENOMEM; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", &nvme_dev_fops); if (result < 0) - return result; + goto destroy_wq; else if (result > 0) nvme_char_major = result; @@ -2559,8 +2819,10 @@ int __init nvme_core_init(void) return 0; - unregister_chrdev: +unregister_chrdev: __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +destroy_wq: + destroy_workqueue(nvme_wq); return result; } @@ -2568,6 +2830,7 @@ void nvme_core_exit(void) { class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + destroy_workqueue(nvme_wq); } MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index c190d7e..2e582a2 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -58,7 +58,6 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) kref_init(&host->ref); memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); - uuid_gen(&host->id); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -75,7 +74,6 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref); - uuid_gen(&host->id); snprintf(host->nqn, NVMF_NQN_SIZE, "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); @@ -128,16 +126,6 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) EXPORT_SYMBOL_GPL(nvmf_get_address); /** - * nvmf_get_subsysnqn() - Get subsystem NQN - * @ctrl: Host NVMe controller instance which we got the NQN - */ -const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl) -{ - return ctrl->opts->subsysnqn; -} -EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn); - -/** * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. * @ctrl: Host NVMe controller instance maintaining the admin * queue used to submit the property read command to @@ -337,6 +325,24 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, } } break; + + case NVME_SC_CONNECT_INVALID_HOST: + dev_err(ctrl->device, + "Connect for subsystem %s is not allowed, hostnqn: %s\n", + data->subsysnqn, data->hostnqn); + break; + + case NVME_SC_CONNECT_CTRL_BUSY: + dev_err(ctrl->device, + "Connect command failed: controller is busy or not available\n"); + break; + + case NVME_SC_CONNECT_FORMAT: + dev_err(ctrl->device, + "Connect incompatible format: %d", + cmd->connect.recfmt); + break; + default: dev_err(ctrl->device, "Connect command failed, error wo/DNR bit: %d\n", @@ -376,13 +382,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.opcode = nvme_fabrics_command; cmd.connect.fctype = nvme_fabrics_type_connect; cmd.connect.qid = 0; - - /* - * fabrics spec sets a minimum of depth 32 for admin queue, - * so set the queue with this depth always until - * justification otherwise. - */ - cmd.connect.sqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + cmd.connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); /* * Set keep-alive timeout in seconds granularity (ms * 1000) @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) { if (ctrl->opts->max_reconnects != -1 && - ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects) + ctrl->nr_reconnects < ctrl->opts->max_reconnects) return true; return false; @@ -547,6 +547,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, + { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -558,6 +559,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, int token, ret = 0; size_t nqnlen = 0; int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; + uuid_t hostid; /* Set defaults */ opts->queue_size = NVMF_DEF_QUEUE_SIZE; @@ -568,6 +570,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, if (!options) return -ENOMEM; + uuid_gen(&hostid); + while ((p = strsep(&o, ",\n")) != NULL) { if (!*p) continue; @@ -724,6 +728,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->host_traddr = p; break; + case NVMF_OPT_HOST_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (uuid_parse(p, &hostid)) { + ret = -EINVAL; + goto out; + } + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -743,6 +758,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->host = nvmf_default_host; } + uuid_copy(&opts->host->id, &hostid); + out: if (!opts->discovery_nqn && !opts->kato) opts->kato = NVME_DEFAULT_KATO; @@ -803,7 +820,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ - NVMF_OPT_KATO | NVMF_OPT_HOSTNQN) + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ + NVMF_OPT_HOST_ID) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) @@ -854,6 +872,15 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } + if (strcmp(ctrl->subnqn, opts->subsysnqn)) { + dev_warn(ctrl->device, + "controller returned incorrect NQN: \"%s\".\n", + ctrl->subnqn); + mutex_unlock(&nvmf_transports_mutex); + ctrl->ops->delete_ctrl(ctrl); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&nvmf_transports_mutex); return ctrl; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 29be760..bf33663 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -56,6 +56,7 @@ enum { NVMF_OPT_RECONNECT_DELAY = 1 << 9, NVMF_OPT_HOST_TRADDR = 1 << 10, NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, + NVMF_OPT_HOST_ID = 1 << 12, }; /** @@ -80,7 +81,6 @@ enum { * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. * @kato: Keep-alive timeout. * @host: Virtual NVMe host, contains the NQN and Host ID. - * @nr_reconnects: number of reconnect attempted since the last ctrl failure * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero means remove * immediately; @@ -98,7 +98,6 @@ struct nvmf_ctrl_options { bool discovery_nqn; unsigned int kato; struct nvmf_host *host; - int nr_reconnects; int max_reconnects; }; @@ -140,7 +139,6 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); -const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5ee4c71..ed87214 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -36,7 +36,7 @@ */ #define NVME_FC_NR_AEN_COMMANDS 1 #define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) #define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) enum nvme_fc_queue_flags { @@ -161,12 +161,12 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set tag_set; struct work_struct delete_work; - struct work_struct reset_work; struct delayed_work connect_work; struct kref ref; u32 flags; u32 iocnt; + wait_queue_head_t ioabort_wait; struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; @@ -214,7 +214,6 @@ static LIST_HEAD(nvme_fc_lport_list); static DEFINE_IDA(nvme_fc_local_port_cnt); static DEFINE_IDA(nvme_fc_ctrl_cnt); -static struct workqueue_struct *nvme_fc_wq; @@ -1241,8 +1240,10 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, spin_lock_irqsave(&ctrl->lock, flags); if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; + if (ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); + } } if (op->flags & FCOP_FLAGS_RELEASED) complete_rq = true; @@ -1449,18 +1450,8 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, { struct nvme_fc_ctrl *ctrl = set->driver_data; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1]; - - return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); -} - -static int -nvme_fc_init_admin_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct nvme_fc_ctrl *ctrl = set->driver_data; - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_queue *queue = &ctrl->queues[0]; + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); } @@ -1758,16 +1749,16 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { + /* only proceed if in LIVE state - e.g. on first error */ + if (ctrl->ctrl.state != NVME_CTRL_LIVE) + return; + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport association error detected: %s\n", ctrl->cnum, errmsg); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); - /* stop the queues on error, cleanup is in reset thread */ - if (ctrl->queue_count > 1) - nvme_stop_queues(&ctrl->ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " @@ -1775,10 +1766,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) return; } - if (!queue_work(nvme_fc_wq, &ctrl->reset_work)) - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: error_recovery: Failed to schedule " - "reset work\n", ctrl->cnum); + nvme_reset_ctrl(&ctrl->ctrl); } static enum blk_eh_timer_return @@ -1887,7 +1875,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, * level FC exchange resource that is also outstanding. This must be * considered in all cleanup operations. */ -static int +static blk_status_t nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op, u32 data_len, enum nvmefc_fcp_datadir io_dir) @@ -1902,10 +1890,10 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * the target device is present */ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; if (!nvme_fc_ctrl_get(ctrl)) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; /* format the FC-NVME CMD IU and fcp_req */ cmdiu->connection_id = cpu_to_be64(queue->connection_id); @@ -1953,8 +1941,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, if (ret < 0) { nvme_cleanup_cmd(op->rq); nvme_fc_ctrl_put(ctrl); - return (ret == -ENOMEM || ret == -EAGAIN) ? - BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; + if (ret == -ENOMEM || ret == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } } @@ -1971,28 +1960,26 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, queue->lldd_handle, &op->fcp_req); if (ret) { - if (op->rq) { /* normal request */ + if (op->rq) /* normal request */ nvme_fc_unmap_data(ctrl, op->rq, op); - nvme_cleanup_cmd(op->rq); - } /* else - aen. no cleanup needed */ nvme_fc_ctrl_put(ctrl); if (ret != -EBUSY) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; if (op->rq) { blk_mq_stop_hw_queues(op->rq->q); blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY); } - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } -static int +static blk_status_t nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2005,7 +1992,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; u32 data_len; - int ret; + blk_status_t ret; ret = nvme_setup_cmd(ns, rq, sqe); if (ret) @@ -2060,7 +2047,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct nvme_fc_fcp_op *aen_op; unsigned long flags; bool terminating = false; - int ret; + blk_status_t ret; if (aer_idx > NVME_FC_NR_AEN_COMMANDS) return; @@ -2092,7 +2079,6 @@ __nvme_fc_final_op_cleanup(struct request *rq) op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); - nvme_cleanup_cmd(rq); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); nvme_fc_ctrl_put(ctrl); @@ -2310,7 +2296,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) int ret; bool changed; - ++ctrl->ctrl.opts->nr_reconnects; + ++ctrl->ctrl.nr_reconnects; /* * Create the admin queue @@ -2407,7 +2393,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->ctrl.opts->nr_reconnects = 0; + ctrl->ctrl.nr_reconnects = 0; if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); @@ -2493,11 +2479,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) /* wait for all io that had to be aborted */ spin_lock_irqsave(&ctrl->lock, flags); - while (ctrl->iocnt) { - spin_unlock_irqrestore(&ctrl->lock, flags); - msleep(1000); - spin_lock_irqsave(&ctrl->lock, flags); - } + wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock); ctrl->flags &= ~FCCTRL_TERMIO; spin_unlock_irqrestore(&ctrl->lock, flags); @@ -2527,7 +2509,7 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) struct nvme_fc_ctrl *ctrl = container_of(work, struct nvme_fc_ctrl, delete_work); - cancel_work_sync(&ctrl->reset_work); + cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); /* @@ -2554,7 +2536,7 @@ __nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return true; - if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return true; return false; @@ -2581,7 +2563,7 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) ret = __nvme_fc_del_ctrl(ctrl); if (!ret) - flush_workqueue(nvme_fc_wq); + flush_workqueue(nvme_wq); nvme_put_ctrl(&ctrl->ctrl); @@ -2606,13 +2588,13 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) dev_info(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, + queue_delayed_work(nvme_wq, &ctrl->connect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", - ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); + ctrl->cnum, ctrl->ctrl.nr_reconnects); WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); } } @@ -2621,7 +2603,7 @@ static void nvme_fc_reset_ctrl_work(struct work_struct *work) { struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, reset_work); + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); int ret; /* will block will waiting for io to terminate */ @@ -2635,29 +2617,6 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); } -/* - * called by the nvme core layer, for sysfs interface that requests - * a reset of the nvme controller - */ -static int -nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!queue_work(nvme_fc_wq, &ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; -} - static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", .module = THIS_MODULE, @@ -2665,11 +2624,9 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_fc_reset_nvme_ctrl, .free_ctrl = nvme_fc_nvme_ctrl_freed, .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_del_nvme_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, .get_address = nvmf_get_address, }; @@ -2695,7 +2652,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) static const struct blk_mq_ops nvme_fc_admin_mq_ops = { .queue_rq = nvme_fc_queue_rq, .complete = nvme_fc_complete_rq, - .init_request = nvme_fc_init_admin_request, + .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, .reinit_request = nvme_fc_reinit_request, .init_hctx = nvme_fc_init_admin_hctx, @@ -2740,7 +2697,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, kref_init(&ctrl->ref); INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); spin_lock_init(&ctrl->lock); @@ -2807,6 +2764,9 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + /* as we're past the point where we transition to the ref * counting teardown path, if we return a bad pointer here, * the calling routine, thinking it's prior to the @@ -2965,20 +2925,7 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { - int ret; - - nvme_fc_wq = create_workqueue("nvme_fc_wq"); - if (!nvme_fc_wq) - return -ENOMEM; - - ret = nvmf_register_transport(&nvme_fc_transport); - if (ret) - goto err; - - return 0; -err: - destroy_workqueue(nvme_fc_wq); - return ret; + return nvmf_register_transport(&nvme_fc_transport); } static void __exit nvme_fc_exit_module(void) @@ -2989,8 +2936,6 @@ static void __exit nvme_fc_exit_module(void) nvmf_unregister_transport(&nvme_fc_transport); - destroy_workqueue(nvme_fc_wq); - ida_destroy(&nvme_fc_local_port_cnt); ida_destroy(&nvme_fc_ctrl_cnt); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index f5df78e..be85413 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -242,7 +242,7 @@ static inline void _nvme_nvm_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); } @@ -480,7 +480,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, rqd->bio->bi_iter.bi_sector)); } -static void nvme_nvm_end_io(struct request *rq, int error) +static void nvme_nvm_end_io(struct request *rq, blk_status_t status) { struct nvm_rq *rqd = rq->end_io_data; @@ -509,7 +509,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); if (IS_ERR(rq)) { kfree(cmd); - return -ENOMEM; + return PTR_ERR(rq); } rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; @@ -571,13 +571,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -static void nvme_nvm_end_user_vio(struct request *rq, int error) -{ - struct completion *waiting = rq->end_io_data; - - complete(waiting); -} - static int nvme_nvm_submit_user_cmd(struct request_queue *q, struct nvme_ns *ns, struct nvme_nvm_command *vcmd, @@ -608,7 +601,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - rq->end_io_data = &wait; if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); @@ -662,9 +654,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, } submit: - blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); - - wait_for_completion_io(&wait); + blk_execute_rq(q, NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) ret = -EINTR; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9d6a070..d70ff0f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,12 +27,11 @@ extern unsigned char nvme_io_timeout; extern unsigned char admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) -extern unsigned char shutdown_timeout; -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) - #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 +extern struct workqueue_struct *nvme_wq; + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -131,6 +130,7 @@ struct nvme_ctrl { struct device *device; /* char device */ struct list_head node; struct ida ns_ida; + struct work_struct reset_work; struct opal_dev *opal_dev; @@ -138,6 +138,7 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; + char subnqn[NVMF_NQN_SIZE]; u16 cntlid; u32 ctrl_config; @@ -147,6 +148,8 @@ struct nvme_ctrl { u16 oncs; u16 vid; u16 oacs; + u16 nssa; + u16 nr_streams; atomic_t abort_limit; u8 event_limit; u8 vwc; @@ -165,6 +168,10 @@ struct nvme_ctrl { /* Power saving configuration */ u64 ps_max_latency_us; + bool apst_enabled; + + u32 hmpre; + u32 hmmin; /* Fabrics only */ u16 sqsize; @@ -172,12 +179,10 @@ struct nvme_ctrl { u32 iorcsz; u16 icdoff; u16 maxcmd; + int nr_reconnects; struct nvmf_ctrl_options *opts; }; -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ struct nvme_ns { struct list_head list; @@ -189,14 +194,18 @@ struct nvme_ns { int instance; u8 eui[8]; - u8 uuid[16]; + u8 nguid[16]; + uuid_t uuid; unsigned ns_id; int lba_shift; u16 ms; + u16 sgs; + u32 sws; bool ext; u8 pi_type; unsigned long flags; + u16 noiob; #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 @@ -214,11 +223,9 @@ struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); - int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); int (*delete_ctrl)(struct nvme_ctrl *ctrl); - const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; @@ -296,7 +303,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid); -int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); @@ -310,23 +317,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, - struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - void *buffer, size_t buflen, u32 *result); -int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - void *buffer, size_t buflen, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); - -struct sg_io_hdr; - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); -int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); -int nvme_sg_get_version_num(int __user *ip); +int nvme_reset_ctrl(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 40c7581..33c3b9d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -17,28 +17,15 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> -#include <linux/cpu.h> -#include <linux/delay.h> #include <linux/dmi.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/idr.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> -#include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/pci.h> #include <linux/poison.h> -#include <linux/ptrace.h> -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/t10-pi.h> #include <linux/timer.h> #include <linux/types.h> @@ -49,7 +36,6 @@ #include "nvme.h" #define NVME_Q_DEPTH 1024 -#define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) @@ -66,12 +52,14 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static struct workqueue_struct *nvme_workq; +static unsigned int max_host_mem_size_mb = 128; +module_param(max_host_mem_size_mb, uint, 0444); +MODULE_PARM_DESC(max_host_mem_size_mb, + "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); struct nvme_dev; struct nvme_queue; -static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); @@ -92,9 +80,8 @@ struct nvme_dev { int q_depth; u32 db_stride; void __iomem *bar; - struct work_struct reset_work; + unsigned long bar_mapped_size; struct work_struct remove_work; - struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -104,10 +91,18 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; struct completion ioq_wait; + + /* shadow doorbell buffer support: */ u32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; u32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; + + /* host memory buffer support: */ + u64 host_mem_size; + u32 nr_host_mem_descs; + struct nvme_host_mem_buf_desc *host_mem_descs; + void **host_mem_desc_bufs; }; static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -185,8 +180,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); @@ -350,19 +345,6 @@ static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_i nvmeq->tags = NULL; } -static int nvme_admin_init_request(struct blk_mq_tag_set *set, - struct request *req, unsigned int hctx_idx, - unsigned int numa_node) -{ - struct nvme_dev *dev = set->driver_data; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[0]; - - BUG_ON(!nvmeq); - iod->nvmeq = nvmeq; - return 0; -} - static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -382,7 +364,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, { struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; + struct nvme_queue *nvmeq = dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -427,7 +410,7 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } -static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) +static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); @@ -436,7 +419,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); if (!iod->sg) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } else { iod->sg = iod->inline_sg; } @@ -446,7 +429,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) iod->nents = 0; iod->length = size; - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void nvme_free_iod(struct nvme_dev *dev, struct request *req) @@ -616,21 +599,21 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req) return true; } -static int nvme_map_data(struct nvme_dev *dev, struct request *req, +static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; - int ret = BLK_MQ_RQ_QUEUE_ERROR; + blk_status_t ret = BLK_STS_IOERR; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); if (!iod->nents) goto out; - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, DMA_ATTR_NO_WARN)) goto out; @@ -638,7 +621,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, if (!nvme_setup_prps(dev, req)) goto out_unmap; - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_STS_IOERR; if (blk_integrity_rq(req)) { if (blk_rq_count_integrity_sg(q, req->bio) != 1) goto out_unmap; @@ -658,7 +641,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_unmap: dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); @@ -688,7 +671,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) /* * NOTE: ns is NULL when called on the admin queue. */ -static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; @@ -696,47 +679,34 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; - int ret = BLK_MQ_RQ_QUEUE_OK; - - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. - */ - if (ns && ns->ms && !blk_integrity_rq(req)) { - if (!(ns->pi_type && ns->ms == 8) && - !blk_rq_is_passthrough(req)) { - blk_mq_end_request(req, -EFAULT); - return BLK_MQ_RQ_QUEUE_OK; - } - } + blk_status_t ret; ret = nvme_setup_cmd(ns, req, &cmnd); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; ret = nvme_init_iod(req, dev); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) goto out_free_cmd; - if (blk_rq_nr_phys_segments(req)) + if (blk_rq_nr_phys_segments(req)) { ret = nvme_map_data(dev, req, &cmnd); - - if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out_cleanup_iod; + if (ret) + goto out_cleanup_iod; + } blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_STS_IOERR; spin_unlock_irq(&nvmeq->q_lock); goto out_cleanup_iod; } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); out_free_cmd: @@ -759,65 +729,75 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; } -static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) +static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) { - u16 head, phase; - - head = nvmeq->cq_head; - phase = nvmeq->cq_phase; - - while (nvme_cqe_valid(nvmeq, head, phase)) { - struct nvme_completion cqe = nvmeq->cqes[head]; - struct request *req; - - if (++head == nvmeq->q_depth) { - head = 0; - phase = !phase; - } - - if (tag && *tag == cqe.command_id) - *tag = -1; + u16 head = nvmeq->cq_head; - if (unlikely(cqe.command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d completed on queue %d\n", - cqe.command_id, le16_to_cpu(cqe.sq_id)); - continue; - } + if (likely(nvmeq->cq_vector >= 0)) { + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, + nvmeq->dbbuf_cq_ei)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + } +} - /* - * AEN requests are special as they don't time out and can - * survive any kind of queue freeze and often don't respond to - * aborts. We don't even bother to allocate a struct request - * for them but rather special case them here. - */ - if (unlikely(nvmeq->qid == 0 && - cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(&nvmeq->dev->ctrl, - cqe.status, &cqe.result); - continue; - } +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + struct request *req; - req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); - nvme_end_request(req, cqe.status, cqe.result); + if (unlikely(cqe->command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; } - if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&nvmeq->dev->ctrl, + cqe->status, &cqe->result); return; + } - if (likely(nvmeq->cq_vector >= 0)) - if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, - nvmeq->dbbuf_cq_ei)) - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); - nvmeq->cq_head = head; - nvmeq->cq_phase = phase; + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + nvme_end_request(req, cqe->status, cqe->result); +} - nvmeq->cqe_seen = 1; +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { + *cqe = nvmeq->cqes[nvmeq->cq_head]; + + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; + } + return true; + } + return false; } static void nvme_process_cq(struct nvme_queue *nvmeq) { - __nvme_process_cq(nvmeq, NULL); + struct nvme_completion cqe; + int consumed = 0; + + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + } + + if (consumed) { + nvme_ring_cq_doorbell(nvmeq); + nvmeq->cqe_seen = 1; + } } static irqreturn_t nvme_irq(int irq, void *data) @@ -842,16 +822,28 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { - spin_lock_irq(&nvmeq->q_lock); - __nvme_process_cq(nvmeq, &tag); - spin_unlock_irq(&nvmeq->q_lock); + struct nvme_completion cqe; + int found = 0, consumed = 0; - if (tag == -1) - return 1; - } + if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return 0; - return 0; + spin_lock_irq(&nvmeq->q_lock); + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + + if (tag == cqe.command_id) { + found = 1; + break; + } + } + + if (consumed) + nvme_ring_cq_doorbell(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + + return found; } static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -939,7 +931,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static void abort_endio(struct request *req, int error) +static void abort_endio(struct request *req, blk_status_t error) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = iod->nvmeq; @@ -950,6 +942,51 @@ static void abort_endio(struct request *req, int error) blk_mq_free_request(req); } +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. */ + if (dev->ctrl.state == NVME_CTRL_RESETTING) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) +{ + /* Read a config register to help see what died. */ + u16 pci_status; + int result; + + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, + &pci_status); + if (result == PCIBIOS_SUCCESSFUL) + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", + csts, pci_status); + else + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", + csts, result); +} + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -957,6 +994,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Reset immediately if the controller is failed + */ + if (nvme_should_reset(dev, csts)) { + nvme_warn_reset(dev, csts); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + return BLK_EH_HANDLED; + } /* * Did we miss an interrupt? @@ -993,7 +1041,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); /* * Mark the request as handled, since the inline shutdown @@ -1247,7 +1295,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { .complete = nvme_pci_complete_rq, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, - .init_request = nvme_admin_init_request, + .init_request = nvme_init_request, .timeout = nvme_timeout, }; @@ -1311,6 +1359,32 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return 0; } +static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (size <= dev->bar_mapped_size) + return 0; + if (size > pci_resource_len(pdev, 0)) + return -ENOMEM; + if (dev->bar) + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (!dev->bar) { + dev->bar_mapped_size = 0; + return -ENOMEM; + } + dev->bar_mapped_size = size; + dev->dbs = dev->bar + NVME_REG_DBS; + + return 0; +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1318,6 +1392,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; + result = nvme_remap_bar(dev, db_bar_size(dev, 0)); + if (result < 0) + return result; + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? NVME_CAP_NSSRC(cap) : 0; @@ -1358,66 +1436,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) -{ - - /* If true, indicates loss of adapter communication, possibly by a - * NVMe Subsystem reset. - */ - bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - - /* If there is a reset ongoing, we shouldn't reset again. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) - return false; - - /* We shouldn't reset unless the controller is on fatal error state - * _or_ if we lost the communication with it. - */ - if (!(csts & NVME_CSTS_CFS) && !nssro) - return false; - - /* If PCI error recovery process is happening, we cannot reset or - * the recovery mechanism will surely fail. - */ - if (pci_channel_offline(to_pci_dev(dev->dev))) - return false; - - return true; -} - -static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) -{ - /* Read a config register to help see what died. */ - u16 pci_status; - int result; - - result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, - &pci_status); - if (result == PCIBIOS_SUCCESSFUL) - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", - csts, pci_status); - else - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", - csts, result); -} - -static void nvme_watchdog_timer(unsigned long data) -{ - struct nvme_dev *dev = (struct nvme_dev *)data; - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - /* Skip controllers under certain specific conditions. */ - if (nvme_should_reset(dev, csts)) { - if (!nvme_reset(dev)) - nvme_warn_reset(dev, csts); - return; - } - - mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); -} - static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i, max; @@ -1514,16 +1532,168 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) } } -static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) +{ + size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs); + struct nvme_command c; + u64 dma_addr; + int ret; + + dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len, + DMA_TO_DEVICE); + if (dma_mapping_error(dev->dev, dma_addr)) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); + c.features.dword11 = cpu_to_le32(bits); + c.features.dword12 = cpu_to_le32(dev->host_mem_size >> + ilog2(dev->ctrl.page_size)); + c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); + c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); + c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); + + ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); + if (ret) { + dev_warn(dev->ctrl.device, + "failed to set host mem (err %d, flags %#x).\n", + ret, bits); + } + dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE); + return ret; +} + +static void nvme_free_host_mem(struct nvme_dev *dev) +{ + int i; + + for (i = 0; i < dev->nr_host_mem_descs; i++) { + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; + size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr)); + } + + kfree(dev->host_mem_desc_bufs); + dev->host_mem_desc_bufs = NULL; + kfree(dev->host_mem_descs); + dev->host_mem_descs = NULL; +} + +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { - return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); + struct nvme_host_mem_buf_desc *descs; + u32 chunk_size, max_entries, i = 0; + void **bufs; + u64 size, tmp; + + /* start big and work our way down */ + chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER); +retry: + tmp = (preferred + chunk_size - 1); + do_div(tmp, chunk_size); + max_entries = tmp; + descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL); + if (!descs) + goto out; + + bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); + if (!bufs) + goto out_free_descs; + + for (size = 0; size < preferred; size += chunk_size) { + u32 len = min_t(u64, chunk_size, preferred - size); + dma_addr_t dma_addr; + + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + if (!bufs[i]) + break; + + descs[i].addr = cpu_to_le64(dma_addr); + descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + i++; + } + + if (!size || (min && size < min)) { + dev_warn(dev->ctrl.device, + "failed to allocate host memory buffer.\n"); + goto out_free_bufs; + } + + dev_info(dev->ctrl.device, + "allocated %lld MiB host memory buffer.\n", + size >> ilog2(SZ_1M)); + dev->nr_host_mem_descs = i; + dev->host_mem_size = size; + dev->host_mem_descs = descs; + dev->host_mem_desc_bufs = bufs; + return 0; + +out_free_bufs: + while (--i >= 0) { + size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr)); + } + + kfree(bufs); +out_free_descs: + kfree(descs); +out: + /* try a smaller chunk size if we failed early */ + if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) { + chunk_size /= 2; + goto retry; + } + dev->host_mem_descs = NULL; + return -ENOMEM; +} + +static void nvme_setup_host_mem(struct nvme_dev *dev) +{ + u64 max = (u64)max_host_mem_size_mb * SZ_1M; + u64 preferred = (u64)dev->ctrl.hmpre * 4096; + u64 min = (u64)dev->ctrl.hmmin * 4096; + u32 enable_bits = NVME_HOST_MEM_ENABLE; + + preferred = min(preferred, max); + if (min > max) { + dev_warn(dev->ctrl.device, + "min host memory (%lld MiB) above limit (%d MiB).\n", + min >> ilog2(SZ_1M), max_host_mem_size_mb); + nvme_free_host_mem(dev); + return; + } + + /* + * If we already have a buffer allocated check if we can reuse it. + */ + if (dev->host_mem_descs) { + if (dev->host_mem_size >= min) + enable_bits |= NVME_HOST_MEM_RETURN; + else + nvme_free_host_mem(dev); + } + + if (!dev->host_mem_descs) { + if (nvme_alloc_host_mem(dev, min, preferred)) + return; + } + + if (nvme_set_host_mem(dev, enable_bits)) + nvme_free_host_mem(dev); } static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); - int result, nr_io_queues, size; + int result, nr_io_queues; + unsigned long size; nr_io_queues = num_online_cpus(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); @@ -1542,20 +1712,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_release_cmb(dev); } - size = db_bar_size(dev, nr_io_queues); - if (size > 8192) { - iounmap(dev->bar); - do { - dev->bar = ioremap(pci_resource_start(pdev, 0), size); - if (dev->bar) - break; - if (!--nr_io_queues) - return -ENOMEM; - size = db_bar_size(dev, nr_io_queues); - } while (1); - dev->dbs = dev->bar + 4096; - adminq->q_db = dev->dbs; - } + do { + size = db_bar_size(dev, nr_io_queues); + result = nvme_remap_bar(dev, size); + if (!result) + break; + if (!--nr_io_queues) + return -ENOMEM; + } while (1); + adminq->q_db = dev->dbs; /* Deregister the admin queue's interrupt */ pci_free_irq(pdev, 0, adminq); @@ -1586,7 +1751,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return nvme_create_io_queues(dev); } -static void nvme_del_queue_end(struct request *req, int error) +static void nvme_del_queue_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1594,7 +1759,7 @@ static void nvme_del_queue_end(struct request *req, int error) complete(&nvmeq->dev->ioq_wait); } -static void nvme_del_cq_end(struct request *req, int error) +static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1799,8 +1964,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); - del_timer_sync(&dev->watchdog_timer); - mutex_lock(&dev->shutdown_lock); if (pci_is_enabled(pdev)) { u32 csts = readl(dev->bar + NVME_REG_CSTS); @@ -1816,8 +1979,20 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead && shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + if (!dead) { + if (shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + + /* + * If the controller is still alive tell it to stop using the + * host memory buffer. In theory the shutdown / reset should + * make sure that it doesn't access the host memoery anymore, + * but I'd rather be safe than sorry.. + */ + if (dev->host_mem_descs) + nvme_set_host_mem(dev, 0); + + } nvme_stop_queues(&dev->ctrl); queues = dev->online_queues - 1; @@ -1900,7 +2075,8 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + struct nvme_dev *dev = + container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; @@ -1949,6 +2125,9 @@ static void nvme_reset_work(struct work_struct *work) "unable to allocate dma for dbbuf\n"); } + if (dev->ctrl.hmpre) + nvme_setup_host_mem(dev); + result = nvme_setup_io_queues(dev); if (result) goto out; @@ -1962,8 +2141,6 @@ static void nvme_reset_work(struct work_struct *work) if (dev->online_queues > 1) nvme_queue_async_events(&dev->ctrl); - mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); - /* * Keep the controller around but remove all namespaces if we don't have * any working I/O queue. @@ -2003,17 +2180,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } -static int nvme_reset(struct nvme_dev *dev) -{ - if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) - return -ENODEV; - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - if (!queue_work(nvme_workq, &dev->reset_work)) - return -EBUSY; - return 0; -} - static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { *val = readl(to_nvme_dev(ctrl)->bar + off); @@ -2032,16 +2198,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } -static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - int ret = nvme_reset(dev); - - if (!ret) - flush_work(&dev->reset_work); - return ret; -} - static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, @@ -2049,7 +2205,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, - .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, .submit_async_event = nvme_pci_submit_async_event, }; @@ -2061,8 +2216,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (pci_request_mem_regions(pdev, "nvme")) return -ENODEV; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) + if (nvme_remap_bar(dev, NVME_REG_DBS + 4096)) goto release; return 0; @@ -2116,10 +2270,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto free; - INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, - (unsigned long)dev); mutex_init(&dev->shutdown_lock); init_completion(&dev->ioq_wait); @@ -2137,7 +2289,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - queue_work(nvme_workq, &dev->reset_work); + queue_work(nvme_wq, &dev->ctrl.reset_work); return 0; release_pools: @@ -2158,7 +2310,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) if (prepare) nvme_dev_disable(dev, false); else - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2178,7 +2330,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - cancel_work_sync(&dev->reset_work); + cancel_work_sync(&dev->ctrl.reset_work); pci_set_drvdata(pdev, NULL); if (!pci_device_is_present(pdev)) { @@ -2186,9 +2338,10 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_disable(dev, false); } - flush_work(&dev->reset_work); + flush_work(&dev->ctrl.reset_work); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); @@ -2229,7 +2382,7 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_reset(ndev); + nvme_reset_ctrl(&ndev->ctrl); return 0; } #endif @@ -2268,7 +2421,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); return PCI_ERS_RESULT_RECOVERED; } @@ -2324,22 +2477,12 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result; - - nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); - if (!nvme_workq) - return -ENOMEM; - - result = pci_register_driver(&nvme_driver); - if (result) - destroy_workqueue(nvme_workq); - return result; + return pci_register_driver(&nvme_driver); } static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - destroy_workqueue(nvme_workq); _nvme_check_size(); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 24397d3..6d4119d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -48,7 +48,7 @@ */ #define NVME_RDMA_NR_AEN_COMMANDS 1 #define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) struct nvme_rdma_device { struct ib_device *dev; @@ -80,10 +80,8 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_CONNECTED = (1 << 0), - NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1), - NVME_RDMA_Q_DELETING = (1 << 2), - NVME_RDMA_Q_LIVE = (1 << 3), + NVME_RDMA_Q_LIVE = 0, + NVME_RDMA_Q_DELETING = 1, }; struct nvme_rdma_queue { @@ -103,9 +101,6 @@ struct nvme_rdma_queue { }; struct nvme_rdma_ctrl { - /* read and written in the hot path */ - spinlock_t lock; - /* read only in the hot path */ struct nvme_rdma_queue *queues; u32 queue_count; @@ -113,7 +108,6 @@ struct nvme_rdma_ctrl { /* other member variables */ struct blk_mq_tag_set tag_set; struct work_struct delete_work; - struct work_struct reset_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -145,8 +139,6 @@ static DEFINE_MUTEX(device_list_mutex); static LIST_HEAD(nvme_rdma_ctrl_list); static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); -static struct workqueue_struct *nvme_rdma_wq; - /* * Disabling this option makes small I/O goes faster, but is fundamentally * unsafe. With it turned off we will have to register a global rkey that @@ -301,10 +293,12 @@ out: return ret; } -static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, - struct request *rq, unsigned int queue_idx) +static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) { + struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; @@ -315,22 +309,13 @@ static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, DMA_TO_DEVICE); } -static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx) -{ - return __nvme_rdma_exit_request(set->driver_data, rq, hctx_idx + 1); -} - -static void nvme_rdma_exit_admin_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx) -{ - return __nvme_rdma_exit_request(set->driver_data, rq, 0); -} - -static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, - struct request *rq, unsigned int queue_idx) +static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) { + struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; @@ -358,20 +343,6 @@ out_free_qe: return -ENOMEM; } -static int nvme_rdma_init_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) -{ - return __nvme_rdma_init_request(set->driver_data, rq, hctx_idx + 1); -} - -static int nvme_rdma_init_admin_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) -{ - return __nvme_rdma_init_request(set->driver_data, rq, 0); -} - static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -469,9 +440,6 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) struct nvme_rdma_device *dev; struct ib_device *ibdev; - if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags)) - return; - dev = queue->device; ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); @@ -483,17 +451,21 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) nvme_rdma_dev_put(dev); } -static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, - struct nvme_rdma_device *dev) +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { - struct ib_device *ibdev = dev->dev; + struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); - int ret; - queue->device = dev; + queue->device = nvme_rdma_find_get_device(queue->cm_id); + if (!queue->device) { + dev_err(queue->cm_id->device->dev.parent, + "no client data found!\n"); + return -ECONNREFUSED; + } + ibdev = queue->device->dev; /* * The admin queue is barely used once the controller is live, so don't @@ -506,12 +478,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, /* +1 for ib_stop_cq */ - queue->ib_cq = ib_alloc_cq(dev->dev, queue, - cq_factor * queue->queue_size + 1, comp_vector, - IB_POLL_SOFTIRQ); + queue->ib_cq = ib_alloc_cq(ibdev, queue, + cq_factor * queue->queue_size + 1, + comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); - goto out; + goto out_put_dev; } ret = nvme_rdma_create_qp(queue, send_wr_factor); @@ -524,7 +496,6 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, ret = -ENOMEM; goto out_destroy_qp; } - set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags); return 0; @@ -532,7 +503,8 @@ out_destroy_qp: ib_destroy_qp(queue->qp); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); -out: +out_put_dev: + nvme_rdma_dev_put(queue->device); return ret; } @@ -583,12 +555,10 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, } clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); - set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); return 0; out_destroy_cm_id: - nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); return ret; } @@ -718,11 +688,11 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) if (nvmf_should_reconnect(&ctrl->ctrl)) { dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + queue_delayed_work(nvme_wq, &ctrl->reconnect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_rdma_wq, &ctrl->delete_work); + queue_work(nvme_wq, &ctrl->delete_work); } } @@ -733,7 +703,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) bool changed; int ret; - ++ctrl->ctrl.opts->nr_reconnects; + ++ctrl->ctrl.nr_reconnects; if (ctrl->queue_count > 1) { nvme_rdma_free_io_queues(ctrl); @@ -749,7 +719,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto requeue; - ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + ret = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); if (ret) goto requeue; @@ -777,7 +747,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->ctrl.opts->nr_reconnects = 0; + ctrl->ctrl.nr_reconnects = 0; if (ctrl->queue_count > 1) { nvme_queue_scan(&ctrl->ctrl); @@ -790,7 +760,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", - ctrl->ctrl.opts->nr_reconnects); + ctrl->ctrl.nr_reconnects); nvme_rdma_reconnect_or_remove(ctrl); } @@ -802,10 +772,8 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); - for (i = 0; i < ctrl->queue_count; i++) { - clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags); + for (i = 0; i < ctrl->queue_count; i++) clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - } if (ctrl->queue_count > 1) nvme_stop_queues(&ctrl->ctrl); @@ -833,7 +801,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) return; - queue_work(nvme_rdma_wq, &ctrl->err_work); + queue_work(nvme_wq, &ctrl->err_work); } static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, @@ -1278,21 +1246,11 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev; int ret; - dev = nvme_rdma_find_get_device(queue->cm_id); - if (!dev) { - dev_err(queue->cm_id->device->dev.parent, - "no client data found!\n"); - return -ECONNREFUSED; - } - - ret = nvme_rdma_create_queue_ib(queue, dev); - if (ret) { - nvme_rdma_dev_put(dev); - goto out; - } + ret = nvme_rdma_create_queue_ib(queue); + if (ret) + return ret; ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { @@ -1306,7 +1264,6 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) out_destroy_queue: nvme_rdma_destroy_queue_ib(queue); -out: return ret; } @@ -1334,8 +1291,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) * specified by the Fabrics standard. */ if (priv.qid == 0) { - priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH); - priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); } else { /* * current interpretation of the fabrics spec @@ -1383,12 +1340,14 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, complete(&queue->cm_done); return 0; case RDMA_CM_EVENT_REJECTED: + nvme_rdma_destroy_queue_ib(queue); cm_error = nvme_rdma_conn_rejected(queue, ev); break; - case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: + nvme_rdma_destroy_queue_ib(queue); + case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); cm_error = -ECONNRESET; @@ -1435,8 +1394,8 @@ nvme_rdma_timeout(struct request *rq, bool reserved) /* * We cannot accept any other command until the Connect command has completed. */ -static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, - struct request *rq) +static inline blk_status_t +nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq) { if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { struct nvme_command *cmd = nvme_req(rq)->cmd; @@ -1452,16 +1411,15 @@ static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, * failover. */ if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING) - return -EIO; - else - return -EAGAIN; + return BLK_STS_IOERR; + return BLK_STS_RESOURCE; /* try again later */ } } return 0; } -static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; @@ -1472,28 +1430,29 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *c = sqe->data; bool flush = false; struct ib_device *dev; - int ret; + blk_status_t ret; + int err; WARN_ON_ONCE(rq->tag < 0); ret = nvme_rdma_queue_is_ready(queue, rq); if (unlikely(ret)) - goto err; + return ret; dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); ret = nvme_setup_cmd(ns, rq, c); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; blk_mq_start_request(rq); - ret = nvme_rdma_map_data(queue, rq, c); - if (ret < 0) { + err = nvme_rdma_map_data(queue, rq, c); + if (err < 0) { dev_err(queue->ctrl->ctrl.device, - "Failed to map data (%d)\n", ret); + "Failed to map data (%d)\n", err); nvme_cleanup_cmd(rq); goto err; } @@ -1503,17 +1462,18 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (req_op(rq) == REQ_OP_FLUSH) flush = true; - ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); - if (ret) { + if (err) { nvme_rdma_unmap_data(queue, rq); goto err; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; err: - return (ret == -ENOMEM || ret == -EAGAIN) ? - BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; + if (err == -ENOMEM || err == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -1523,7 +1483,6 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) struct ib_wc wc; int found = 0; - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { struct ib_cqe *cqe = wc.wr_cqe; @@ -1560,8 +1519,8 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, - .init_request = nvme_rdma_init_admin_request, - .exit_request = nvme_rdma_exit_admin_request, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_admin_hctx, .timeout = nvme_rdma_timeout, @@ -1571,7 +1530,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) { int error; - error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); if (error) return error; @@ -1672,7 +1631,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_rdma_free_io_queues(ctrl); } - if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags)) + if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) nvme_shutdown_ctrl(&ctrl->ctrl); blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); @@ -1709,7 +1668,7 @@ static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!queue_work(nvme_rdma_wq, &ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return -EBUSY; return 0; @@ -1743,8 +1702,8 @@ static void nvme_rdma_remove_ctrl_work(struct work_struct *work) static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, reset_work); + struct nvme_rdma_ctrl *ctrl = + container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); int ret; bool changed; @@ -1785,22 +1744,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) del_dead_ctrl: /* Deleting this dead controller... */ dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work)); -} - -static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!queue_work(nvme_rdma_wq, &ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; + WARN_ON(!queue_work(nvme_wq, &ctrl->delete_work)); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { @@ -1810,11 +1754,9 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_rdma_reset_ctrl, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_del_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, .get_address = nvmf_get_address, }; @@ -1919,8 +1861,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work); - spin_lock_init(&ctrl->lock); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; @@ -1939,12 +1880,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, /* sanity check icdoff */ if (ctrl->ctrl.icdoff) { dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + ret = -EINVAL; goto out_remove_admin_queue; } /* sanity check keyed sgls */ if (!(ctrl->ctrl.sgls & (1 << 20))) { dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + ret = -EINVAL; goto out_remove_admin_queue; } @@ -2033,7 +1976,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) } mutex_unlock(&nvme_rdma_ctrl_mutex); - flush_workqueue(nvme_rdma_wq); + flush_workqueue(nvme_wq); } static struct ib_client nvme_rdma_ib_client = { @@ -2046,13 +1989,9 @@ static int __init nvme_rdma_init_module(void) { int ret; - nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); - if (!nvme_rdma_wq) - return -ENOMEM; - ret = ib_register_client(&nvme_rdma_ib_client); if (ret) - goto err_destroy_wq; + return ret; ret = nvmf_register_transport(&nvme_rdma_transport); if (ret) @@ -2062,8 +2001,6 @@ static int __init nvme_rdma_init_module(void) err_unreg_client: ib_unregister_client(&nvme_rdma_ib_client); -err_destroy_wq: - destroy_workqueue(nvme_rdma_wq); return ret; } @@ -2071,7 +2008,6 @@ static void __exit nvme_rdma_cleanup_module(void) { nvmf_unregister_transport(&nvme_rdma_transport); ib_unregister_client(&nvme_rdma_ib_client); - destroy_workqueue(nvme_rdma_wq); } module_init(nvme_rdma_init_module); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c deleted file mode 100644 index 1f7671e..0000000 --- a/drivers/nvme/host/scsi.c +++ /dev/null @@ -1,2460 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -/* - * Refer to the SCSI-NVMe Translation spec for details on how - * each command is translated. - */ - -#include <linux/bio.h> -#include <linux/bitops.h> -#include <linux/blkdev.h> -#include <linux/compat.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/idr.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kthread.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/poison.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <asm/unaligned.h> -#include <scsi/sg.h> -#include <scsi/scsi.h> -#include <scsi/scsi_request.h> - -#include "nvme.h" - -static int sg_version_num = 30534; /* 2 digits for each component */ - -/* VPD Page Codes */ -#define VPD_SUPPORTED_PAGES 0x00 -#define VPD_SERIAL_NUMBER 0x80 -#define VPD_DEVICE_IDENTIFIERS 0x83 -#define VPD_EXTENDED_INQUIRY 0x86 -#define VPD_BLOCK_LIMITS 0xB0 -#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 - -/* format unit paramter list offsets */ -#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 -#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 -#define FORMAT_UNIT_PROT_INT_OFFSET 3 -#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 -#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 - -/* Misc. defines */ -#define FIXED_SENSE_DATA 0x70 -#define DESC_FORMAT_SENSE_DATA 0x72 -#define FIXED_SENSE_DATA_ADD_LENGTH 10 -#define LUN_ENTRY_SIZE 8 -#define LUN_DATA_HEADER_SIZE 8 -#define ALL_LUNS_RETURNED 0x02 -#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 -#define RESTRICTED_LUNS_RETURNED 0x00 -#define DOWNLOAD_SAVE_ACTIVATE 0x05 -#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E -#define ACTIVATE_DEFERRED_MICROCODE 0x0F -#define FORMAT_UNIT_IMMED_MASK 0x2 -#define FORMAT_UNIT_IMMED_OFFSET 1 -#define KELVIN_TEMP_FACTOR 273 -#define FIXED_FMT_SENSE_DATA_SIZE 18 -#define DESC_FMT_SENSE_DATA_SIZE 8 - -/* SCSI/NVMe defines and bit masks */ -#define INQ_STANDARD_INQUIRY_PAGE 0x00 -#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 -#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 -#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 -#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 -#define INQ_BDEV_LIMITS_PAGE 0xB0 -#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 -#define INQ_SERIAL_NUMBER_LENGTH 0x14 -#define INQ_NUM_SUPPORTED_VPD_PAGES 6 -#define VERSION_SPC_4 0x06 -#define ACA_UNSUPPORTED 0 -#define STANDARD_INQUIRY_LENGTH 36 -#define ADDITIONAL_STD_INQ_LENGTH 31 -#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C -#define RESERVED_FIELD 0 - -/* Mode Sense/Select defines */ -#define MODE_PAGE_INFO_EXCEP 0x1C -#define MODE_PAGE_CACHING 0x08 -#define MODE_PAGE_CONTROL 0x0A -#define MODE_PAGE_POWER_CONDITION 0x1A -#define MODE_PAGE_RETURN_ALL 0x3F -#define MODE_PAGE_BLK_DES_LEN 0x08 -#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 -#define MODE_PAGE_CACHING_LEN 0x14 -#define MODE_PAGE_CONTROL_LEN 0x0C -#define MODE_PAGE_POW_CND_LEN 0x28 -#define MODE_PAGE_INF_EXC_LEN 0x0C -#define MODE_PAGE_ALL_LEN 0x54 -#define MODE_SENSE6_MPH_SIZE 4 -#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 -#define MODE_SENSE_PAGE_CODE_OFFSET 2 -#define MODE_SENSE_PAGE_CODE_MASK 0x3F -#define MODE_SENSE_LLBAA_MASK 0x10 -#define MODE_SENSE_LLBAA_SHIFT 4 -#define MODE_SENSE_DBD_MASK 8 -#define MODE_SENSE_DBD_SHIFT 3 -#define MODE_SENSE10_MPH_SIZE 8 -#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 -#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 -#define MODE_SELECT_6_BD_OFFSET 3 -#define MODE_SELECT_10_BD_OFFSET 6 -#define MODE_SELECT_10_LLBAA_OFFSET 4 -#define MODE_SELECT_10_LLBAA_MASK 1 -#define MODE_SELECT_6_MPH_SIZE 4 -#define MODE_SELECT_10_MPH_SIZE 8 -#define CACHING_MODE_PAGE_WCE_MASK 0x04 -#define MODE_SENSE_BLK_DESC_ENABLED 0 -#define MODE_SENSE_BLK_DESC_COUNT 1 -#define MODE_SELECT_PAGE_CODE_MASK 0x3F -#define SHORT_DESC_BLOCK 8 -#define LONG_DESC_BLOCK 16 -#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 -#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A -#define MODE_PAGE_CACHING_LEN_FIELD 0x12 -#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A -#define MODE_SENSE_PC_CURRENT_VALUES 0 - -/* Log Sense defines */ -#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 -#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 -#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F -#define LOG_PAGE_TEMPERATURE_PAGE 0x0D -#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 -#define LOG_SENSE_CDB_PC_MASK 0xC0 -#define LOG_SENSE_CDB_PC_SHIFT 6 -#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 -#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F -#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 -#define LOG_INFO_EXCP_PAGE_LENGTH 0xC -#define REMAINING_TEMP_PAGE_LENGTH 0xC -#define LOG_TEMP_PAGE_LENGTH 0x10 -#define LOG_TEMP_UNKNOWN 0xFF -#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 - -/* Read Capacity defines */ -#define READ_CAP_10_RESP_SIZE 8 -#define READ_CAP_16_RESP_SIZE 32 - -/* NVMe Namespace and Command Defines */ -#define BYTES_TO_DWORDS 4 -#define NVME_MAX_FIRMWARE_SLOT 7 - -/* Report LUNs defines */ -#define REPORT_LUNS_FIRST_LUN_OFFSET 8 - -/* SCSI ADDITIONAL SENSE Codes */ - -#define SCSI_ASC_NO_SENSE 0x00 -#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 -#define SCSI_ASC_LUN_NOT_READY 0x04 -#define SCSI_ASC_WARNING 0x0B -#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 -#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D -#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 -#define SCSI_ASC_ILLEGAL_COMMAND 0x20 -#define SCSI_ASC_ILLEGAL_BLOCK 0x21 -#define SCSI_ASC_INVALID_CDB 0x24 -#define SCSI_ASC_INVALID_LUN 0x25 -#define SCSI_ASC_INVALID_PARAMETER 0x26 -#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 -#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 - -/* SCSI ADDITIONAL SENSE Code Qualifiers */ - -#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 -#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 -#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 -#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 -#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 -#define SCSI_ASCQ_INVALID_LUN_ID 0x09 - -/* copied from drivers/usb/gadget/function/storage_common.h */ -static inline u32 get_unaligned_be24(u8 *buf) -{ - return 0xffffff & (u32) get_unaligned_be32(buf - 1); -} - -/* Struct to gather data that needs to be extracted from a SCSI CDB. - Not conforming to any particular CDB variant, but compatible with all. */ - -struct nvme_trans_io_cdb { - u8 fua; - u8 prot_info; - u64 lba; - u32 xfer_len; -}; - - -/* Internal Helper Functions */ - - -/* Copy data to userspace memory */ - -static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, - unsigned long n) -{ - int i; - void *index = from; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - if (copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec))) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - if (copy_to_user(sgl.iov_base, index, xfer_len)) - return -EFAULT; - - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return 0; - } - - if (copy_to_user(hdr->dxferp, from, n)) - return -EFAULT; - return 0; -} - -/* Copy data from userspace memory */ - -static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, - unsigned long n) -{ - int i; - void *index = to; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - if (copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec))) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - if (copy_from_user(index, sgl.iov_base, xfer_len)) - return -EFAULT; - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return 0; - } - - if (copy_from_user(to, hdr->dxferp, n)) - return -EFAULT; - return 0; -} - -/* Status/Sense Buffer Writeback */ - -static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, - u8 asc, u8 ascq) -{ - u8 xfer_len; - u8 resp[DESC_FMT_SENSE_DATA_SIZE]; - - if (scsi_status_is_good(status)) { - hdr->status = SAM_STAT_GOOD; - hdr->masked_status = GOOD; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - hdr->sb_len_wr = 0; - } else { - hdr->status = status; - hdr->masked_status = status >> 1; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - - memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); - resp[0] = DESC_FORMAT_SENSE_DATA; - resp[1] = sense_key; - resp[2] = asc; - resp[3] = ascq; - - xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); - hdr->sb_len_wr = xfer_len; - if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) - return -EFAULT; - } - - return 0; -} - -/* - * Take a status code from a lowlevel routine, and if it was a positive NVMe - * error code update the sense data based on it. In either case the passed - * in value is returned again, unless an -EFAULT from copy_to_user overrides - * it. - */ -static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) -{ - u8 status, sense_key, asc, ascq; - int res; - - /* For non-nvme (Linux) errors, simply return the error code */ - if (nvme_sc < 0) - return nvme_sc; - - /* Mask DNR, More, and reserved fields */ - switch (nvme_sc & 0x7FF) { - /* Generic Command Status */ - case NVME_SC_SUCCESS: - status = SAM_STAT_GOOD; - sense_key = NO_SENSE; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_OPCODE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_COMMAND; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_FIELD: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_DATA_XFER_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_POWER_LOSS: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_WARNING; - ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; - break; - case NVME_SC_INTERNAL: - status = SAM_STAT_CHECK_CONDITION; - sense_key = HARDWARE_ERROR; - asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_REQ: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_QUEUE: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_FAIL: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_MISSING: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_NS: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - case NVME_SC_LBA_RANGE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_BLOCK; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_CAP_EXCEEDED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_NS_NOT_READY: - status = SAM_STAT_CHECK_CONDITION; - sense_key = NOT_READY; - asc = SCSI_ASC_LUN_NOT_READY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Command Specific Status */ - case NVME_SC_INVALID_FORMAT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_FORMAT_COMMAND_FAILED; - ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; - break; - case NVME_SC_BAD_ATTRIBUTES: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Media Errors */ - case NVME_SC_WRITE_FAULT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_READ_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_UNRECOVERED_READ_ERROR; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_GUARD_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; - break; - case NVME_SC_APPTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; - break; - case NVME_SC_REFTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; - break; - case NVME_SC_COMPARE_FAILED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MISCOMPARE; - asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ACCESS_DENIED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - - /* Unspecified/Default */ - case NVME_SC_CMDID_CONFLICT: - case NVME_SC_CMD_SEQ_ERROR: - case NVME_SC_CQ_INVALID: - case NVME_SC_QID_INVALID: - case NVME_SC_QUEUE_SIZE: - case NVME_SC_ABORT_LIMIT: - case NVME_SC_ABORT_MISSING: - case NVME_SC_ASYNC_LIMIT: - case NVME_SC_FIRMWARE_SLOT: - case NVME_SC_FIRMWARE_IMAGE: - case NVME_SC_INVALID_VECTOR: - case NVME_SC_INVALID_LOG_PAGE: - default: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - } - - res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); - return res ? res : nvme_sc; -} - -/* INQUIRY Helper Functions */ - -static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ns *id_ns; - int res; - int nvme_sc; - int xfer_len; - u8 resp_data_format = 0x02; - u8 protect; - u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(ctrl->firmware_rev); - - /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - if (id_ns->dps) - protect = 0x01; - else - protect = 0; - kfree(id_ns); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[2] = VERSION_SPC_4; - inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ - inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; - inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ - inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ - strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], ctrl->model, 16); - - while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) - fw_offset--; - fw_offset -= 4; - strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ - inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ - inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; - inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; - inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; - inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; - inq_response[9] = INQ_BDEV_LIMITS_PAGE; - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_trans_unit_serial_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ - inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - struct nvme_id_ns *id_ns; - int nvme_sc, res; - size_t len; - void *eui; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - - if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } - - if (bitmap_empty(eui, len * 8)) { - res = -EOPNOTSUPP; - goto out_free_id; - } - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[3] = 4 + len; /* Page Length */ - - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - - res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); -out_free_id: - kfree(id_ns); - return res; -} - -static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ctrl *id_ctrl; - int nvme_sc, res; - - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[3] = 0x48; /* Page Length */ - - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); - memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); - sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); - memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); - - res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); - kfree(id_ctrl); - return res; -} - -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int alloc_len) -{ - int res; - - if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) { - res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); - if (res != -EOPNOTSUPP) - return res; - } - - return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); -} - -static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res; - int nvme_sc; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ctrl *id_ctrl; - struct nvme_id_ns *id_ns; - int xfer_len; - u8 microcode = 0x80; - u8 spt; - u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; - u8 grd_chk, app_chk, ref_chk, protect; - u8 uask_sup = 0x20; - u8 v_sup; - u8 luiclr = 0x01; - - inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) - return -ENOMEM; - - nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free_inq; - - spt = spt_lut[id_ns->dpc & 0x07] << 3; - if (id_ns->dps) - protect = 0x01; - else - protect = 0; - kfree(id_ns); - - grd_chk = protect << 2; - app_chk = protect << 1; - ref_chk = protect; - - nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free_inq; - - v_sup = id_ctrl->vwc; - kfree(id_ctrl); - - memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; - inq_response[5] = uask_sup; - inq_response[6] = v_sup; - inq_response[7] = luiclr; - inq_response[8] = 0; - inq_response[9] = 0; - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free_inq: - kfree(inq_response); - return res; -} - -static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - __be32 max_sectors = cpu_to_be32( - nvme_block_nr(ns, queue_max_hw_sectors(ns->queue))); - __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors); - __be32 discard_desc_count = cpu_to_be32(0x100); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = VPD_BLOCK_LIMITS; - inq_response[3] = 0x3c; /* Page Length */ - memcpy(&inq_response[8], &max_sectors, sizeof(u32)); - memcpy(&inq_response[20], &max_discard, sizeof(u32)); - - if (max_discard) - memcpy(&inq_response[24], &discard_desc_count, sizeof(u32)); - - return nvme_trans_copy_to_user(hdr, inq_response, 0x3c); -} - -static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res; - int xfer_len; - - inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ - inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ - inq_response[6] = 0x00; /* Form Factor */ - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - kfree(inq_response); - out_mem: - return res; -} - -/* LOG SENSE Helper Functions */ - -static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - - log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; - log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; - - xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - kfree(log_response); - out_mem: - return res; -} - -static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, - struct sg_io_hdr *hdr, int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - struct nvme_smart_log *smart_log; - u8 temp_c; - u16 temp_k; - - log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) - return -ENOMEM; - - res = nvme_get_log_page(ns->ctrl, &smart_log); - if (res < 0) - goto out_free_response; - - if (res != NVME_SC_SUCCESS) { - temp_c = LOG_TEMP_UNKNOWN; - } else { - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c = temp_k - KELVIN_TEMP_FACTOR; - } - kfree(smart_log); - - log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; - /* Informational Exceptions Log Parameter 1 Start */ - /* Parameter Code=0x0000 bytes 4,5 */ - log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ - log_response[7] = 0x04; /* PARAMETER LENGTH */ - /* Add sense Code and qualifier = 0x00 each */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[10] = temp_c; - - xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - out_free_response: - kfree(log_response); - return res; -} - -static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - struct nvme_smart_log *smart_log; - u32 feature_resp; - u8 temp_c_cur, temp_c_thresh; - u16 temp_k; - - log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) - return -ENOMEM; - - res = nvme_get_log_page(ns->ctrl, &smart_log); - if (res < 0) - goto out_free_response; - - if (res != NVME_SC_SUCCESS) { - temp_c_cur = LOG_TEMP_UNKNOWN; - } else { - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; - } - kfree(smart_log); - - /* Get Features for Temp Threshold */ - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, - &feature_resp); - if (res != NVME_SC_SUCCESS) - temp_c_thresh = LOG_TEMP_UNKNOWN; - else - temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; - - log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_TEMP_PAGE_LENGTH; - /* Temperature Log Parameter 1 (Temperature) Start */ - /* Parameter Code = 0x0000 */ - log_response[6] = 0x01; /* Format and Linking = 01b */ - log_response[7] = 0x02; /* Parameter Length */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[9] = temp_c_cur; - /* Temperature Log Parameter 2 (Reference Temperature) Start */ - log_response[11] = 0x01; /* Parameter Code = 0x0001 */ - log_response[12] = 0x01; /* Format and Linking = 01b */ - log_response[13] = 0x02; /* Parameter Length */ - /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ - log_response[15] = temp_c_thresh; - - xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - out_free_response: - kfree(log_response); - return res; -} - -/* MODE SENSE Helper Functions */ - -static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, - u16 mode_data_length, u16 blk_desc_len) -{ - /* Quick check to make sure I don't stomp on my own memory... */ - if ((cdb10 && len < 8) || (!cdb10 && len < 4)) - return -EINVAL; - - if (cdb10) { - resp[0] = (mode_data_length & 0xFF00) >> 8; - resp[1] = (mode_data_length & 0x00FF); - resp[3] = 0x10 /* DPOFUA */; - resp[4] = llbaa; - resp[5] = RESERVED_FIELD; - resp[6] = (blk_desc_len & 0xFF00) >> 8; - resp[7] = (blk_desc_len & 0x00FF); - } else { - resp[0] = (mode_data_length & 0x00FF); - resp[2] = 0x10 /* DPOFUA */; - resp[3] = (blk_desc_len & 0x00FF); - } - - return 0; -} - -static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len, u8 llbaa) -{ - int res; - int nvme_sc; - struct nvme_id_ns *id_ns; - u8 flbas; - u32 lba_length; - - if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) - return -EINVAL; - else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) - return -EINVAL; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - - if (llbaa == 0) { - __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); - /* Byte 4 is reserved */ - __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); - - memcpy(resp, &tmp_cap, sizeof(u32)); - memcpy(&resp[4], &tmp_len, sizeof(u32)); - } else { - __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); - __be32 tmp_len = cpu_to_be32(lba_length); - - memcpy(resp, &tmp_cap, sizeof(u64)); - /* Bytes 8, 9, 10, 11 are reserved */ - memcpy(&resp[12], &tmp_len, sizeof(u32)); - } - - kfree(id_ns); - return res; -} - -static int nvme_trans_fill_control_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_CONTROL_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_CONTROL; - resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; - resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, - * D_SENSE=1, GLTSD=1, RLEC=0 */ - resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ - /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ - resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ - /* resp[6] and [7] are obsolete, thus zero */ - resp[8] = 0xFF; /* Busy timeout period = 0xffff */ - resp[9] = 0xFF; - /* Bytes 10,11: Extended selftest completion time = 0x0000 */ - - return 0; -} - -static int nvme_trans_fill_caching_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res = 0; - int nvme_sc; - u32 feature_resp; - u8 vwc; - - if (len < MODE_PAGE_CACHING_LEN) - return -EINVAL; - - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, - &feature_resp); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - vwc = feature_resp & 0x00000001; - - resp[0] = MODE_PAGE_CACHING; - resp[1] = MODE_PAGE_CACHING_LEN_FIELD; - resp[2] = vwc << 2; - return 0; -} - -static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_POW_CND_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_POWER_CONDITION; - resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; - /* All other bytes are zero */ - - return 0; -} - -static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_INF_EXC_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_INFO_EXCEP; - resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; - resp[2] = 0x88; - /* All other bytes are zero */ - - return 0; -} - -static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res; - u16 mode_pages_offset_1 = 0; - u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; - - mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; - mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; - mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; - - res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], - MODE_PAGE_CACHING_LEN); - if (res) - return res; - res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], - MODE_PAGE_CONTROL_LEN); - if (res) - return res; - res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], - MODE_PAGE_POW_CND_LEN); - if (res) - return res; - return nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], - MODE_PAGE_INF_EXC_LEN); -} - -static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) -{ - if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { - /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ - return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; - } else { - return 0; - } -} - -static int nvme_trans_mode_page_create(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *cmd, - u16 alloc_len, u8 cdb10, - int (*mode_page_fill_func) - (struct nvme_ns *, - struct sg_io_hdr *hdr, u8 *, int), - u16 mode_pages_tot_len) -{ - int res; - int xfer_len; - u8 *response; - u8 dbd, llbaa; - u16 resp_size; - int mph_size; - u16 mode_pages_offset_1; - u16 blk_desc_len, blk_desc_offset, mode_data_length; - - dbd = (cmd[1] & MODE_SENSE_DBD_MASK) >> MODE_SENSE_DBD_SHIFT; - llbaa = (cmd[1] & MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT; - mph_size = cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE; - - blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); - - resp_size = mph_size + blk_desc_len + mode_pages_tot_len; - /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ - mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; - - blk_desc_offset = mph_size; - mode_pages_offset_1 = blk_desc_offset + blk_desc_len; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, - llbaa, mode_data_length, blk_desc_len); - if (res) - goto out_free; - if (blk_desc_len > 0) { - res = nvme_trans_fill_blk_desc(ns, hdr, - &response[blk_desc_offset], - blk_desc_len, llbaa); - if (res) - goto out_free; - } - res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], - mode_pages_tot_len); - if (res) - goto out_free; - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - out_free: - kfree(response); - out_mem: - return res; -} - -/* Read Capacity Helper Functions */ - -static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, - u8 cdb16) -{ - u8 flbas; - u32 lba_length; - u64 rlba; - u8 prot_en; - u8 p_type_lut[4] = {0, 0, 1, 2}; - __be64 tmp_rlba; - __be32 tmp_rlba_32; - __be32 tmp_len; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - rlba = le64_to_cpup(&id_ns->nsze) - 1; - (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); - - if (!cdb16) { - if (rlba > 0xFFFFFFFF) - rlba = 0xFFFFFFFF; - tmp_rlba_32 = cpu_to_be32(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba_32, sizeof(u32)); - memcpy(&response[4], &tmp_len, sizeof(u32)); - } else { - tmp_rlba = cpu_to_be64(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba, sizeof(u64)); - memcpy(&response[8], &tmp_len, sizeof(u32)); - response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; - /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ - /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ - /* Bytes 16-31 - Reserved */ - } -} - -/* Start Stop Unit Helper Functions */ - -static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 buffer_id) -{ - struct nvme_command c; - int nvme_sc; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_activate_fw; - c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 opcode, u32 tot_len, u32 offset, - u8 buffer_id) -{ - int nvme_sc; - struct nvme_command c; - - if (hdr->iovec_count > 0) { - /* Assuming SGL is not allowed for this command */ - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_download_fw; - c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); - c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - - nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, - hdr->dxferp, tot_len, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -/* Mode Select Helper Functions */ - -static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, - u16 *bd_len, u8 *llbaa) -{ - if (cdb10) { - /* 10 Byte CDB */ - *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + - parm_list[MODE_SELECT_10_BD_OFFSET + 1]; - *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & - MODE_SELECT_10_LLBAA_MASK; - } else { - /* 6 Byte CDB */ - *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; - } -} - -static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, - u16 idx, u16 bd_len, u8 llbaa) -{ - /* Store block descriptor info if a FORMAT UNIT comes later */ - /* TODO Saving 1st BD info; what to do if multiple BD received? */ - if (llbaa == 0) { - /* Standard Block Descriptor - spc4r34 7.5.5.1 */ - ns->mode_select_num_blocks = - (parm_list[idx + 1] << 16) + - (parm_list[idx + 2] << 8) + - (parm_list[idx + 3]); - - ns->mode_select_block_len = - (parm_list[idx + 5] << 16) + - (parm_list[idx + 6] << 8) + - (parm_list[idx + 7]); - } else { - /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ - ns->mode_select_num_blocks = - (((u64)parm_list[idx + 0]) << 56) + - (((u64)parm_list[idx + 1]) << 48) + - (((u64)parm_list[idx + 2]) << 40) + - (((u64)parm_list[idx + 3]) << 32) + - (((u64)parm_list[idx + 4]) << 24) + - (((u64)parm_list[idx + 5]) << 16) + - (((u64)parm_list[idx + 6]) << 8) + - ((u64)parm_list[idx + 7]); - - ns->mode_select_block_len = - (parm_list[idx + 12] << 24) + - (parm_list[idx + 13] << 16) + - (parm_list[idx + 14] << 8) + - (parm_list[idx + 15]); - } -} - -static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *mode_page, u8 page_code) -{ - int res = 0; - int nvme_sc; - unsigned dword11; - - switch (page_code) { - case MODE_PAGE_CACHING: - dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, - dword11, NULL, 0, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - break; - case MODE_PAGE_CONTROL: - break; - case MODE_PAGE_POWER_CONDITION: - /* Verify the OS is not trying to set timers */ - if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - return res; -} - -static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u16 parm_list_len, u8 pf, - u8 sp, u8 cdb10) -{ - int res; - u8 *parm_list; - u16 bd_len; - u8 llbaa = 0; - u16 index, saved_index; - u8 page_code; - u16 mp_size; - - /* Get parm list from data-in/out buffer */ - parm_list = kmalloc(parm_list_len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - - res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); - if (res) - goto out_mem; - - nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); - index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); - - if (bd_len != 0) { - /* Block Descriptors present, parse */ - nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); - index += bd_len; - } - saved_index = index; - - /* Multiple mode pages may be present; iterate through all */ - /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - if ((page_code != MODE_PAGE_CACHING) && - (page_code != MODE_PAGE_CONTROL) && - (page_code != MODE_PAGE_POWER_CONDITION)) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - index += mp_size; - } while (index < parm_list_len); - - /* In 2nd Iteration, do the NVME Commands */ - index = saved_index; - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], - page_code); - if (res) - break; - index += mp_size; - } while (index < parm_list_len); - - out_mem: - kfree(parm_list); - out: - return res; -} - -/* Format Unit Helper Functions */ - -static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int res = 0; - int nvme_sc; - u8 flbas; - - /* - * SCSI Expects a MODE SELECT would have been issued prior to - * a FORMAT UNIT, and the block size and number would be used - * from the block descriptor in it. If a MODE SELECT had not - * been issued, FORMAT shall use the current values for both. - */ - - if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { - struct nvme_id_ns *id_ns; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - if (ns->mode_select_num_blocks == 0) - ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); - if (ns->mode_select_block_len == 0) { - flbas = (id_ns->flbas) & 0x0F; - ns->mode_select_block_len = - (1 << (id_ns->lbaf[flbas].ds)); - } - - kfree(id_ns); - } - - return 0; -} - -static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, - u8 format_prot_info, u8 *nvme_pf_code) -{ - int res; - u8 *parm_list; - u8 pf_usage, pf_code; - - parm_list = kmalloc(len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - res = nvme_trans_copy_from_user(hdr, parm_list, len); - if (res) - goto out_mem; - - if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & - FORMAT_UNIT_IMMED_MASK) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - - if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && - (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & - FORMAT_UNIT_PROT_FIELD_USAGE_MASK; - pf_code = (pf_usage << 2) | format_prot_info; - switch (pf_code) { - case 0: - *nvme_pf_code = 0; - break; - case 2: - *nvme_pf_code = 1; - break; - case 3: - *nvme_pf_code = 2; - break; - case 7: - *nvme_pf_code = 3; - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out_mem: - kfree(parm_list); - out: - return res; -} - -static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 prot_info) -{ - int res; - int nvme_sc; - struct nvme_id_ns *id_ns; - u8 i; - u8 nlbaf; - u8 selected_lbaf = 0xFF; - u32 cdw10 = 0; - struct nvme_command c; - - /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - nlbaf = id_ns->nlbaf; - - for (i = 0; i < nlbaf; i++) { - if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { - selected_lbaf = i; - break; - } - } - if (selected_lbaf > 0x0F) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - cdw10 |= prot_info << 5; - cdw10 |= selected_lbaf & 0x0F; - memset(&c, 0, sizeof(c)); - c.format.opcode = nvme_admin_format_nvm; - c.format.nsid = cpu_to_le32(ns->ns_id); - c.format.cdw10 = cpu_to_le32(cdw10); - - nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); - res = nvme_trans_status_code(hdr, nvme_sc); - - kfree(id_ns); - return res; -} - -static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, - u32 max_blocks) -{ - /* If using iovecs, send one nvme command per vector */ - if (hdr->iovec_count > 0) - return hdr->iovec_count; - else if (cdb_info->xfer_len > max_blocks) - return ((cdb_info->xfer_len - 1) / max_blocks) + 1; - else - return 1; -} - -static u16 nvme_trans_io_get_control(struct nvme_ns *ns, - struct nvme_trans_io_cdb *cdb_info) -{ - u16 control = 0; - - /* When Protection information support is added, implement here */ - - if (cdb_info->fua > 0) - control |= NVME_RW_FUA; - - return control; -} - -static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, u8 is_write) -{ - int nvme_sc = NVME_SC_SUCCESS; - u32 num_cmds; - u64 unit_len; - u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ - u32 retcode; - u32 i = 0; - u64 nvme_offset = 0; - void __user *next_mapping_addr; - struct nvme_command c; - u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); - u16 control; - u32 max_blocks = queue_max_hw_sectors(ns->queue) >> (ns->lba_shift - 9); - - num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); - - /* - * This loop handles two cases. - * First, when an SGL is used in the form of an iovec list: - * - Use iov_base as the next mapping address for the nvme command_id - * - Use iov_len as the data transfer length for the command. - * Second, when we have a single buffer - * - If larger than max_blocks, split into chunks, offset - * each nvme command accordingly. - */ - for (i = 0; i < num_cmds; i++) { - memset(&c, 0, sizeof(c)); - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - retcode = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (retcode) - return -EFAULT; - unit_len = sgl.iov_len; - unit_num_blocks = unit_len >> ns->lba_shift; - next_mapping_addr = sgl.iov_base; - } else { - unit_num_blocks = min((u64)max_blocks, - (cdb_info->xfer_len - nvme_offset)); - unit_len = unit_num_blocks << ns->lba_shift; - next_mapping_addr = hdr->dxferp + - ((1 << ns->lba_shift) * nvme_offset); - } - - c.rw.opcode = opcode; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); - c.rw.length = cpu_to_le16(unit_num_blocks - 1); - control = nvme_trans_io_get_control(ns, cdb_info); - c.rw.control = cpu_to_le16(control); - - if (get_capacity(ns->disk) - unit_num_blocks < - cdb_info->lba + nvme_offset) { - nvme_sc = NVME_SC_LBA_RANGE; - break; - } - nvme_sc = nvme_submit_user_cmd(ns->queue, &c, - next_mapping_addr, unit_len, NULL, 0); - if (nvme_sc) - break; - - nvme_offset += unit_num_blocks; - } - - return nvme_trans_status_code(hdr, nvme_sc); -} - - -/* SCSI Command Translation Functions */ - -static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, - u8 *cmd) -{ - int res = 0; - struct nvme_trans_io_cdb cdb_info = { 0, }; - u8 opcode = cmd[0]; - u64 xfer_bytes; - u64 sum_iov_len = 0; - struct sg_iovec sgl; - int i; - size_t not_copied; - - /* - * The FUA and WPROTECT fields are not supported in 6-byte CDBs, - * but always in the same place for all others. - */ - switch (opcode) { - case WRITE_6: - case READ_6: - break; - default: - cdb_info.fua = cmd[1] & 0x8; - cdb_info.prot_info = (cmd[1] & 0xe0) >> 5; - if (cdb_info.prot_info && !ns->pi_type) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } - - switch (opcode) { - case WRITE_6: - case READ_6: - cdb_info.lba = get_unaligned_be24(&cmd[1]); - cdb_info.xfer_len = cmd[4]; - if (cdb_info.xfer_len == 0) - cdb_info.xfer_len = 256; - break; - case WRITE_10: - case READ_10: - cdb_info.lba = get_unaligned_be32(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be16(&cmd[7]); - break; - case WRITE_12: - case READ_12: - cdb_info.lba = get_unaligned_be32(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be32(&cmd[6]); - break; - case WRITE_16: - case READ_16: - cdb_info.lba = get_unaligned_be64(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be32(&cmd[10]); - break; - default: - /* Will never really reach here */ - res = -EIO; - goto out; - } - - /* Calculate total length of transfer (in bytes) */ - if (hdr->iovec_count > 0) { - for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) - return -EFAULT; - sum_iov_len += sgl.iov_len; - /* IO vector sizes should be multiples of block size */ - if (sgl.iov_len % (1 << ns->lba_shift) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - } - } else { - sum_iov_len = hdr->dxfer_len; - } - - /* As Per sg ioctl howto, if the lengths differ, use the lower one */ - xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); - - /* If block count and actual data buffer size dont match, error out */ - if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { - res = -EINVAL; - goto out; - } - - /* Check for 0 length transfer - it is not illegal */ - if (cdb_info.xfer_len == 0) - goto out; - - /* Send NVMe IO Command(s) */ - res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); - if (res) - goto out; - - out: - return res; -} - -static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u8 evpd; - u8 page_code; - int alloc_len; - u8 *inq_response; - - evpd = cmd[1] & 0x01; - page_code = cmd[2]; - alloc_len = get_unaligned_be16(&cmd[3]); - - inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH), - GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - if (evpd == 0) { - if (page_code == INQ_STANDARD_INQUIRY_PAGE) { - res = nvme_trans_standard_inquiry_page(ns, hdr, - inq_response, alloc_len); - } else { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } else { - switch (page_code) { - case VPD_SUPPORTED_PAGES: - res = nvme_trans_supported_vpd_pages(ns, hdr, - inq_response, alloc_len); - break; - case VPD_SERIAL_NUMBER: - res = nvme_trans_unit_serial_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_DEVICE_IDENTIFIERS: - res = nvme_trans_device_id_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_EXTENDED_INQUIRY: - res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); - break; - case VPD_BLOCK_LIMITS: - res = nvme_trans_bdev_limits_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_BLOCK_DEV_CHARACTERISTICS: - res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - } - kfree(inq_response); - out_mem: - return res; -} - -static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u16 alloc_len; - u8 pc; - u8 page_code; - - if (cmd[1] != LOG_SENSE_CDB_SP_NOT_ENABLED) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - page_code = cmd[2] & LOG_SENSE_CDB_PAGE_CODE_MASK; - pc = (cmd[2] & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; - if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - alloc_len = get_unaligned_be16(&cmd[7]); - switch (page_code) { - case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: - res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); - break; - case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: - res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); - break; - case LOG_PAGE_TEMPERATURE_PAGE: - res = nvme_trans_log_temperature(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - u8 cdb10 = 0; - u16 parm_list_len; - u8 page_format; - u8 save_pages; - - page_format = cmd[1] & MODE_SELECT_CDB_PAGE_FORMAT_MASK; - save_pages = cmd[1] & MODE_SELECT_CDB_SAVE_PAGES_MASK; - - if (cmd[0] == MODE_SELECT) { - parm_list_len = cmd[4]; - } else { - parm_list_len = cmd[7]; - cdb10 = 1; - } - - if (parm_list_len != 0) { - /* - * According to SPC-4 r24, a paramter list length field of 0 - * shall not be considered an error - */ - return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, - page_format, save_pages, cdb10); - } - - return 0; -} - -static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u16 alloc_len; - u8 cdb10 = 0; - - if (cmd[0] == MODE_SENSE) { - alloc_len = cmd[4]; - } else { - alloc_len = get_unaligned_be16(&cmd[7]); - cdb10 = 1; - } - - if ((cmd[2] & MODE_SENSE_PAGE_CONTROL_MASK) != - MODE_SENSE_PC_CURRENT_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - switch (cmd[2] & MODE_SENSE_PAGE_CODE_MASK) { - case MODE_PAGE_CACHING: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_caching_page, - MODE_PAGE_CACHING_LEN); - break; - case MODE_PAGE_CONTROL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_control_page, - MODE_PAGE_CONTROL_LEN); - break; - case MODE_PAGE_POWER_CONDITION: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_pow_cnd_page, - MODE_PAGE_POW_CND_LEN); - break; - case MODE_PAGE_INFO_EXCEP: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_inf_exc_page, - MODE_PAGE_INF_EXC_LEN); - break; - case MODE_PAGE_RETURN_ALL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_all_pages, - MODE_PAGE_ALL_LEN); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u8 cdb16) -{ - int res; - int nvme_sc; - u32 alloc_len; - u32 resp_size; - u32 xfer_len; - struct nvme_id_ns *id_ns; - u8 *response; - - if (cdb16) { - alloc_len = get_unaligned_be32(&cmd[10]); - resp_size = READ_CAP_16_RESP_SIZE; - } else { - alloc_len = READ_CAP_10_RESP_SIZE; - resp_size = READ_CAP_10_RESP_SIZE; - } - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_free_id; - } - nvme_trans_fill_read_cap(response, id_ns, cdb16); - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_free_id: - kfree(id_ns); - return res; -} - -static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - int nvme_sc; - u32 alloc_len, xfer_len, resp_size; - u8 *response; - struct nvme_id_ctrl *id_ctrl; - u32 ll_length, lun_id; - u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; - __be32 tmp_len; - - switch (cmd[2]) { - default: - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - case ALL_LUNS_RETURNED: - case ALL_WELL_KNOWN_LUNS_RETURNED: - case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; - resp_size = ll_length + LUN_DATA_HEADER_SIZE; - - alloc_len = get_unaligned_be32(&cmd[6]); - if (alloc_len < resp_size) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_free_id; - } - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_free_id; - } - - /* The first LUN ID will always be 0 per the SAM spec */ - for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { - /* - * Set the LUN Id and then increment to the next LUN - * location in the parameter data. - */ - __be64 tmp_id = cpu_to_be64(lun_id); - memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); - lun_id_offset += LUN_ENTRY_SIZE; - } - tmp_len = cpu_to_be32(ll_length); - memcpy(response, &tmp_len, sizeof(u32)); - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_free_id: - kfree(id_ctrl); - return res; -} - -static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u8 alloc_len, xfer_len, resp_size; - u8 desc_format; - u8 *response; - - desc_format = cmd[1] & 0x01; - alloc_len = cmd[4]; - - resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : - (FIXED_FMT_SENSE_DATA_SIZE)); - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out; - } - - if (desc_format) { - /* Descriptor Format Sense Data */ - response[0] = DESC_FORMAT_SENSE_DATA; - response[1] = NO_SENSE; - /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ - response[2] = SCSI_ASC_NO_SENSE; - response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ - } else { - /* Fixed Format Sense Data */ - response[0] = FIXED_SENSE_DATA; - /* Byte 1 = Obsolete */ - response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ - /* Bytes 3-6 - Information - set to zero */ - response[7] = FIXED_SENSE_DATA_ADD_LENGTH; - /* Bytes 8-11 - Cmd Specific Information - set to zero */ - response[12] = SCSI_ASC_NO_SENSE; - response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* Byte 14 = Field Replaceable Unit Code = 0 */ - /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out: - return res; -} - -static int nvme_trans_synchronize_cache(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int nvme_sc; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u8 parm_hdr_len = 0; - u8 nvme_pf_code = 0; - u8 format_prot_info, long_list, format_data; - - format_prot_info = (cmd[1] & 0xc0) >> 6; - long_list = cmd[1] & 0x20; - format_data = cmd[1] & 0x10; - - if (format_data != 0) { - if (format_prot_info != 0) { - if (long_list == 0) - parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; - else - parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; - } - } else if (format_data == 0 && format_prot_info != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - /* Get parm header from data-in/out buffer */ - /* - * According to the translation spec, the only fields in the parameter - * list we are concerned with are in the header. So allocate only that. - */ - if (parm_hdr_len > 0) { - res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, - format_prot_info, &nvme_pf_code); - if (res) - goto out; - } - - /* Attempt to activate any previously downloaded firmware image */ - res = nvme_trans_send_activate_fw_cmd(ns, hdr, 0); - - /* Determine Block size and count and send format command */ - res = nvme_trans_fmt_set_blk_size_count(ns, hdr); - if (res) - goto out; - - res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); - - out: - return res; -} - -static int nvme_trans_test_unit_ready(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *cmd) -{ - if (nvme_ctrl_ready(ns->ctrl)) - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - NOT_READY, SCSI_ASC_LUN_NOT_READY, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - else - return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); -} - -static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u32 buffer_offset, parm_list_length; - u8 buffer_id, mode; - - parm_list_length = get_unaligned_be24(&cmd[6]); - if (parm_list_length % BYTES_TO_DWORDS != 0) { - /* NVMe expects Firmware file to be a whole number of DWORDS */ - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - buffer_id = cmd[2]; - if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - mode = cmd[1] & 0x1f; - buffer_offset = get_unaligned_be24(&cmd[3]); - - switch (mode) { - case DOWNLOAD_SAVE_ACTIVATE: - res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - if (res) - goto out; - res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); - break; - case DOWNLOAD_SAVE_DEFER_ACTIVATE: - res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - break; - case ACTIVATE_DEFERRED_MICROCODE: - res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -struct scsi_unmap_blk_desc { - __be64 slba; - __be32 nlb; - u32 resv; -}; - -struct scsi_unmap_parm_list { - __be16 unmap_data_len; - __be16 unmap_blk_desc_data_len; - u32 resv; - struct scsi_unmap_blk_desc desc[0]; -}; - -static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - struct scsi_unmap_parm_list *plist; - struct nvme_dsm_range *range; - struct nvme_command c; - int i, nvme_sc, res; - u16 ndesc, list_len; - - list_len = get_unaligned_be16(&cmd[7]); - if (!list_len) - return -EINVAL; - - plist = kmalloc(list_len, GFP_KERNEL); - if (!plist) - return -ENOMEM; - - res = nvme_trans_copy_from_user(hdr, plist, list_len); - if (res) - goto out; - - ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; - if (!ndesc || ndesc > 256) { - res = -EINVAL; - goto out; - } - - range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL); - if (!range) { - res = -ENOMEM; - goto out; - } - - for (i = 0; i < ndesc; i++) { - range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); - range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); - range[i].cattr = 0; - } - - memset(&c, 0, sizeof(c)); - c.dsm.opcode = nvme_cmd_dsm; - c.dsm.nsid = cpu_to_le32(ns->ns_id); - c.dsm.nr = cpu_to_le32(ndesc - 1); - c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range, - ndesc * sizeof(*range)); - res = nvme_trans_status_code(hdr, nvme_sc); - - kfree(range); - out: - kfree(plist); - return res; -} - -static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) -{ - u8 cmd[16]; - int retcode; - unsigned int opcode; - - if (hdr->cmdp == NULL) - return -EMSGSIZE; - if (hdr->cmd_len > sizeof(cmd)) - return -EINVAL; - if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) - return -EFAULT; - - /* - * Prime the hdr with good status for scsi commands that don't require - * an nvme command for translation. - */ - retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); - if (retcode) - return retcode; - - opcode = cmd[0]; - - switch (opcode) { - case READ_6: - case READ_10: - case READ_12: - case READ_16: - retcode = nvme_trans_io(ns, hdr, 0, cmd); - break; - case WRITE_6: - case WRITE_10: - case WRITE_12: - case WRITE_16: - retcode = nvme_trans_io(ns, hdr, 1, cmd); - break; - case INQUIRY: - retcode = nvme_trans_inquiry(ns, hdr, cmd); - break; - case LOG_SENSE: - retcode = nvme_trans_log_sense(ns, hdr, cmd); - break; - case MODE_SELECT: - case MODE_SELECT_10: - retcode = nvme_trans_mode_select(ns, hdr, cmd); - break; - case MODE_SENSE: - case MODE_SENSE_10: - retcode = nvme_trans_mode_sense(ns, hdr, cmd); - break; - case READ_CAPACITY: - retcode = nvme_trans_read_capacity(ns, hdr, cmd, 0); - break; - case SERVICE_ACTION_IN_16: - switch (cmd[1]) { - case SAI_READ_CAPACITY_16: - retcode = nvme_trans_read_capacity(ns, hdr, cmd, 1); - break; - default: - goto out; - } - break; - case REPORT_LUNS: - retcode = nvme_trans_report_luns(ns, hdr, cmd); - break; - case REQUEST_SENSE: - retcode = nvme_trans_request_sense(ns, hdr, cmd); - break; - case SYNCHRONIZE_CACHE: - retcode = nvme_trans_synchronize_cache(ns, hdr); - break; - case FORMAT_UNIT: - retcode = nvme_trans_format_unit(ns, hdr, cmd); - break; - case TEST_UNIT_READY: - retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); - break; - case WRITE_BUFFER: - retcode = nvme_trans_write_buffer(ns, hdr, cmd); - break; - case UNMAP: - retcode = nvme_trans_unmap(ns, hdr, cmd); - break; - default: - out: - retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - return retcode; -} - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) -{ - struct sg_io_hdr hdr; - int retcode; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) - return -EFAULT; - if (hdr.interface_id != 'S') - return -EINVAL; - - /* - * A positive return code means a NVMe status, which has been - * translated to sense data. - */ - retcode = nvme_scsi_translate(ns, &hdr); - if (retcode < 0) - return retcode; - if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) - return -EFAULT; - return 0; -} - -int nvme_sg_get_version_num(int __user *ip) -{ - return put_user(sg_version_num, ip); -} diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index ff1f970..35f930d 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -336,7 +336,7 @@ out: static void nvmet_execute_identify_nslist(struct nvmet_req *req) { - static const int buf_size = 4096; + static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); @@ -367,6 +367,64 @@ out: nvmet_req_complete(req, status); } +static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, + void *id, off_t *off) +{ + struct nvme_ns_id_desc desc = { + .nidt = type, + .nidl = len, + }; + u16 status; + + status = nvmet_copy_to_sgl(req, *off, &desc, sizeof(desc)); + if (status) + return status; + *off += sizeof(desc); + + status = nvmet_copy_to_sgl(req, *off, id, len); + if (status) + return status; + *off += len; + + return 0; +} + +static void nvmet_execute_identify_desclist(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + u16 status = 0; + off_t off = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + if (memchr_inv(&ns->uuid, 0, sizeof(ns->uuid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, + NVME_NIDT_UUID_LEN, + &ns->uuid, &off); + if (status) + goto out_put_ns; + } + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID, + NVME_NIDT_NGUID_LEN, + &ns->nguid, &off); + if (status) + goto out_put_ns; + } + + if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, + off) != NVME_IDENTIFY_DATA_SIZE - off) + status = NVME_SC_INTERNAL | NVME_SC_DNR; +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + /* * A "mimimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. We couldn't really @@ -504,7 +562,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) } break; case nvme_admin_identify: - req->data_len = 4096; + req->data_len = NVME_IDENTIFY_DATA_SIZE; switch (cmd->identify.cns) { case NVME_ID_CNS_NS: req->execute = nvmet_execute_identify_ns; @@ -515,6 +573,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_ID_CNS_NS_ACTIVE_LIST: req->execute = nvmet_execute_identify_nslist; return 0; + case NVME_ID_CNS_NS_DESC_LIST: + req->execute = nvmet_execute_identify_desclist; + return 0; } break; case nvme_admin_abort_cmd: diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index be8c800..a358ecd 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -305,11 +305,41 @@ out_unlock: CONFIGFS_ATTR(nvmet_ns_, device_path); +static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); +} + +static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret = 0; + + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + + if (uuid_parse(page, &ns->uuid)) + ret = -EINVAL; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) { return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); } +CONFIGFS_ATTR(nvmet_ns_, device_uuid); + static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, const char *page, size_t count) { @@ -379,6 +409,7 @@ CONFIGFS_ATTR(nvmet_ns_, enable); static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_device_uuid, &nvmet_ns_attr_enable, NULL, }; @@ -619,8 +650,45 @@ out_unlock: CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); +static ssize_t nvmet_subsys_version_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + if (NVME_TERTIARY(subsys->ver)) + return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver), + (int)NVME_TERTIARY(subsys->ver)); + else + return snprintf(page, PAGE_SIZE, "%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver)); +} + +static ssize_t nvmet_subsys_version_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + int major, minor, tertiary = 0; + int ret; + + + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); + if (ret != 2 && ret != 3) + return -EINVAL; + + down_write(&nvmet_config_sem); + subsys->ver = NVME_VS(major, minor, tertiary); + up_write(&nvmet_config_sem); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, version); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, + &nvmet_subsys_attr_version, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index eb9399a..b5b4ac1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -380,6 +380,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; + uuid_gen(&ns->uuid); return ns; } @@ -926,7 +927,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return NULL; - subsys->ver = NVME_VS(1, 2, 1); /* NVMe 1.2.1 */ + subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ switch (type) { case NVME_NQN_NVME: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 1aaf597..8f3b57b 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -53,7 +53,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, e->portid = port->disc_addr.portid; /* we support only dynamic controllers */ e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); - e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->asqsz = cpu_to_le16(NVME_AQ_DEPTH); e->subtype = type; memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); @@ -185,7 +185,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: - req->data_len = 4096; + req->data_len = NVME_IDENTIFY_DATA_SIZE; switch (cmd->identify.cns) { case NVME_ID_CNS_CTRL: req->execute = diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 2006fae..7692a96 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2096,20 +2096,22 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, /* clear any response payload */ memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); + fod->data_sg = NULL; + fod->data_sg_cnt = 0; + ret = nvmet_req_init(&fod->req, &fod->queue->nvme_cq, &fod->queue->nvme_sq, &nvmet_fc_tgt_fcp_ops); - if (!ret) { /* bad SQE content or invalid ctrl state */ - nvmet_fc_abort_op(tgtport, fod); + if (!ret) { + /* bad SQE content or invalid ctrl state */ + /* nvmet layer has already called op done to send rsp. */ return; } /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); - fod->data_sg = NULL; - fod->data_sg_cnt = 0; if (fod->total_length) { ret = nvmet_fc_alloc_tgt_pgs(fod); if (ret) { diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 294a661..1bb9d5b 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -569,7 +569,6 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *tgt_fcpreq) { struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); - int active; /* * mark aborted only in case there were 2 threads in transport @@ -577,7 +576,6 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, * after the abort request */ spin_lock(&tfcp_req->reqlock); - active = tfcp_req->active; tfcp_req->aborted = true; spin_unlock(&tfcp_req->reqlock); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index c77940d..4012879 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -21,7 +21,7 @@ static void nvmet_bio_done(struct bio *bio) struct nvmet_req *req = bio->bi_private; nvmet_req_complete(req, - bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); if (bio != &req->inline_bio) bio_put(bio); @@ -145,7 +145,7 @@ static void nvmet_execute_discard(struct nvmet_req *req) bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; if (status) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else { submit_bio(bio); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e503cff..5f55c68 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -21,8 +21,6 @@ #include "../host/nvme.h" #include "../host/fabrics.h" -#define NVME_LOOP_AQ_DEPTH 256 - #define NVME_LOOP_MAX_SEGMENTS 256 /* @@ -31,7 +29,7 @@ */ #define NVME_LOOP_NR_AEN_COMMANDS 1 #define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) struct nvme_loop_iod { struct nvme_request nvme_req; @@ -45,7 +43,6 @@ struct nvme_loop_iod { }; struct nvme_loop_ctrl { - spinlock_t lock; struct nvme_loop_queue *queues; u32 queue_count; @@ -59,7 +56,6 @@ struct nvme_loop_ctrl { struct nvmet_ctrl *target_ctrl; struct work_struct delete_work; - struct work_struct reset_work; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -151,7 +147,7 @@ nvme_loop_timeout(struct request *rq, bool reserved) struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); /* queue error recovery */ - schedule_work(&iod->queue->ctrl->reset_work); + nvme_reset_ctrl(&iod->queue->ctrl->ctrl); /* fail with DNR on admin cmd timeout */ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; @@ -159,17 +155,17 @@ nvme_loop_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; } -static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_loop_queue *queue = hctx->driver_data; struct request *req = bd->rq; struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); - int ret; + blk_status_t ret; ret = nvme_setup_cmd(ns, req, &iod->cmd); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; @@ -179,16 +175,15 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_cleanup_cmd(req); blk_mq_start_request(req); nvme_loop_queue_response(&iod->req); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } if (blk_rq_bytes(req)) { iod->sg_table.sgl = iod->first_sgl; - ret = sg_alloc_table_chained(&iod->sg_table, + if (sg_alloc_table_chained(&iod->sg_table, blk_rq_nr_phys_segments(req), - iod->sg_table.sgl); - if (ret) - return BLK_MQ_RQ_QUEUE_BUSY; + iod->sg_table.sgl)) + return BLK_STS_RESOURCE; iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); @@ -197,7 +192,7 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); schedule_work(&iod->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) @@ -234,15 +229,10 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), - hctx_idx + 1); -} + struct nvme_loop_ctrl *ctrl = set->driver_data; -static int nvme_loop_init_admin_request(struct blk_mq_tag_set *set, - struct request *req, unsigned int hctx_idx, - unsigned int numa_node) -{ - return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), 0); + return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), + (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); } static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -280,7 +270,7 @@ static const struct blk_mq_ops nvme_loop_mq_ops = { static const struct blk_mq_ops nvme_loop_admin_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, - .init_request = nvme_loop_init_admin_request, + .init_request = nvme_loop_init_request, .init_hctx = nvme_loop_init_admin_hctx, .timeout = nvme_loop_timeout, }; @@ -467,7 +457,7 @@ static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!schedule_work(&ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return -EBUSY; return 0; @@ -501,8 +491,8 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) static void nvme_loop_reset_ctrl_work(struct work_struct *work) { - struct nvme_loop_ctrl *ctrl = container_of(work, - struct nvme_loop_ctrl, reset_work); + struct nvme_loop_ctrl *ctrl = + container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); bool changed; int ret; @@ -540,21 +530,6 @@ out_disable: nvme_put_ctrl(&ctrl->ctrl); } -static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!schedule_work(&ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; -} - static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .name = "loop", .module = THIS_MODULE, @@ -562,11 +537,9 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_loop_reset_ctrl, .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, .delete_ctrl = nvme_loop_del_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ -629,15 +602,13 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) goto out_put_ctrl; - spin_lock_init(&ctrl->lock); - ret = -ENOMEM; ctrl->ctrl.sqsize = opts->queue_size - 1; @@ -766,7 +737,7 @@ static void __exit nvme_loop_cleanup_module(void) __nvme_loop_del_ctrl(ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); - flush_scheduled_work(); + flush_workqueue(nvme_wq); } module_init(nvme_loop_init_module); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 8ff6e43..747bbdb 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -47,6 +47,7 @@ struct nvmet_ns { u32 blksize_shift; loff_t size; u8 nguid[16]; + uuid_t uuid; bool enabled; struct nvmet_subsys *subsys; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 9e45cde..56a4cba 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1027,7 +1027,7 @@ nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; queue->send_queue_size = le16_to_cpu(req->hrqsize); - if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH) return NVME_RDMA_CM_INVALID_HSQSIZE; /* XXX: Should we enforce some kind of max for IO queues? */ @@ -1307,53 +1307,44 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, /** * nvme_rdma_device_removal() - Handle RDMA device removal + * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) - * @addr: nvmet address (cm_id context) * * DEVICE_REMOVAL event notifies us that the RDMA device is about - * to unplug so we should take care of destroying our RDMA resources. - * This event will be generated for each allocated cm_id. + * to unplug. Note that this event can be generated on a normal + * queue cm_id and/or a device bound listener cm_id (where in this + * case queue will be null). * - * Note that this event can be generated on a normal queue cm_id - * and/or a device bound listener cm_id (where in this case - * queue will be null). - * - * we claim ownership on destroying the cm_id. For queues we move - * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port + * We registered an ib_client to handle device removal for queues, + * so we only need to handle the listening port cm_ids. In this case * we nullify the priv to prevent double cm_id destruction and destroying * the cm_id implicitely by returning a non-zero rc to the callout. */ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - unsigned long flags; - - if (!queue) { - struct nvmet_port *port = cm_id->context; + struct nvmet_port *port; + if (queue) { /* - * This is a listener cm_id. Make sure that - * future remove_port won't invoke a double - * cm_id destroy. use atomic xchg to make sure - * we don't compete with remove_port. - */ - if (xchg(&port->priv, NULL) != cm_id) - return 0; - } else { - /* - * This is a queue cm_id. Make sure that - * release queue will not destroy the cm_id - * and schedule all ctrl queues removal (only - * if the queue is not disconnecting already). + * This is a queue cm_id. we have registered + * an ib_client to handle queues removal + * so don't interfear and just return. */ - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state != NVMET_RDMA_Q_DISCONNECTING) - queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; - spin_unlock_irqrestore(&queue->state_lock, flags); - nvmet_rdma_queue_disconnect(queue); - flush_scheduled_work(); + return 0; } + port = cm_id->context; + + /* + * This is a listener cm_id. Make sure that + * future remove_port won't invoke a double + * cm_id destroy. use atomic xchg to make sure + * we don't compete with remove_port. + */ + if (xchg(&port->priv, NULL) != cm_id) + return 0; + /* * We need to return 1 so that the core will destroy * it's own ID. What a great API design.. @@ -1519,9 +1510,51 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { .delete_ctrl = nvmet_rdma_delete_ctrl, }; +static void nvmet_rdma_add_one(struct ib_device *ib_device) +{ +} + +static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvmet_rdma_queue *queue; + + /* Device is being removed, delete all queues using this device */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->dev->device != ib_device) + continue; + + pr_info("Removing queue %d\n", queue->idx); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); +} + +static struct ib_client nvmet_rdma_ib_client = { + .name = "nvmet_rdma", + .add = nvmet_rdma_add_one, + .remove = nvmet_rdma_remove_one +}; + static int __init nvmet_rdma_init(void) { - return nvmet_register_transport(&nvmet_rdma_ops); + int ret; + + ret = ib_register_client(&nvmet_rdma_ib_client); + if (ret) + return ret; + + ret = nvmet_register_transport(&nvmet_rdma_ops); + if (ret) + goto err_ib_client; + + return 0; + +err_ib_client: + ib_unregister_client(&nvmet_rdma_ib_client); + return ret; } static void __exit nvmet_rdma_exit(void) @@ -1544,6 +1577,7 @@ static void __exit nvmet_rdma_exit(void) mutex_unlock(&nvmet_rdma_queue_mutex); flush_scheduled_work(); + ib_unregister_client(&nvmet_rdma_ib_client); ida_destroy(&nvmet_rdma_queue_ida); } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 6fb3fd5..b7cbd5d 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2672,7 +2672,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) */ if (basedev->state < DASD_STATE_READY) { while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); return; } @@ -2692,7 +2692,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "Rejecting write request %p", req); blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); continue; } if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) && @@ -2702,7 +2702,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "Rejecting failfast request %p", req); blk_start_request(req); - __blk_end_request_all(req, -ETIMEDOUT); + __blk_end_request_all(req, BLK_STS_TIMEOUT); continue; } cqr = basedev->discipline->build_cp(basedev, block, req); @@ -2734,7 +2734,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "on request %p", PTR_ERR(cqr), req); blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); continue; } /* @@ -2755,21 +2755,29 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) { struct request *req; int status; - int error = 0; + blk_status_t error = BLK_STS_OK; req = (struct request *) cqr->callback_data; dasd_profile_end(cqr->block, cqr, req); + status = cqr->block->base->discipline->free_cp(cqr, req); if (status < 0) - error = status; + error = errno_to_blk_status(status); else if (status == 0) { - if (cqr->intrc == -EPERM) - error = -EBADE; - else if (cqr->intrc == -ENOLINK || - cqr->intrc == -ETIMEDOUT) - error = cqr->intrc; - else - error = -EIO; + switch (cqr->intrc) { + case -EPERM: + error = BLK_STS_NEXUS; + break; + case -ENOLINK: + error = BLK_STS_TRANSPORT; + break; + case -ETIMEDOUT: + error = BLK_STS_TIMEOUT; + break; + default: + error = BLK_STS_IOERR; + break; + } } __blk_end_request_all(req, error); } @@ -3190,7 +3198,7 @@ static void dasd_flush_request_queue(struct dasd_block *block) spin_lock_irq(&block->request_queue_lock); while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); spin_unlock_irq(&block->request_queue_lock); } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 36e5280..06eb1de 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -845,7 +845,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) unsigned long source_addr; unsigned long bytes_done; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); bytes_done = 0; dev_info = bio->bi_bdev->bd_disk->private_data; diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 152de68..3c2c84b 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -231,7 +231,7 @@ static inline void scm_request_init(struct scm_blk_dev *bdev, aob->request.data = (u64) aobrq; scmrq->bdev = bdev; scmrq->retries = 4; - scmrq->error = 0; + scmrq->error = BLK_STS_OK; /* We don't use all msbs - place aidaws at the end of the aob page. */ scmrq->next_aidaw = (void *) &aob->msb[nr_requests_per_io]; scm_request_cluster_init(scmrq); @@ -364,7 +364,7 @@ static void __scmrq_log_error(struct scm_request *scmrq) { struct aob *aob = scmrq->aob; - if (scmrq->error == -ETIMEDOUT) + if (scmrq->error == BLK_STS_TIMEOUT) SCM_LOG(1, "Request timeout"); else { SCM_LOG(1, "Request error"); @@ -377,7 +377,7 @@ static void __scmrq_log_error(struct scm_request *scmrq) scmrq->error); } -void scm_blk_irq(struct scm_device *scmdev, void *data, int error) +void scm_blk_irq(struct scm_device *scmdev, void *data, blk_status_t error) { struct scm_request *scmrq = data; struct scm_blk_dev *bdev = scmrq->bdev; @@ -397,7 +397,7 @@ static void scm_blk_handle_error(struct scm_request *scmrq) struct scm_blk_dev *bdev = scmrq->bdev; unsigned long flags; - if (scmrq->error != -EIO) + if (scmrq->error != BLK_STS_IOERR) goto restart; /* For -EIO the response block is valid. */ diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index 09218cd..cd598d1 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -35,7 +35,7 @@ struct scm_request { struct aob *aob; struct list_head list; u8 retries; - int error; + blk_status_t error; #ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE struct { enum {CLUSTER_NONE, CLUSTER_READ, CLUSTER_WRITE} state; @@ -50,7 +50,7 @@ struct scm_request { int scm_blk_dev_setup(struct scm_blk_dev *, struct scm_device *); void scm_blk_dev_cleanup(struct scm_blk_dev *); void scm_blk_set_available(struct scm_blk_dev *); -void scm_blk_irq(struct scm_device *, void *, int); +void scm_blk_irq(struct scm_device *, void *, blk_status_t); void scm_request_finish(struct scm_request *); void scm_request_requeue(struct scm_request *); diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index b9d7e75..a48f0d4 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -190,7 +190,7 @@ static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio) unsigned long page_addr; unsigned long bytes; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) diff --git a/drivers/s390/cio/eadm_sch.c b/drivers/s390/cio/eadm_sch.c index b3f44bc..0f11f3b 100644 --- a/drivers/s390/cio/eadm_sch.c +++ b/drivers/s390/cio/eadm_sch.c @@ -135,7 +135,7 @@ static void eadm_subchannel_irq(struct subchannel *sch) struct eadm_private *private = get_eadm_private(sch); struct eadm_scsw *scsw = &sch->schib.scsw.eadm; struct irb *irb = this_cpu_ptr(&cio_irb); - int error = 0; + blk_status_t error = BLK_STS_OK; EADM_LOG(6, "irq"); EADM_LOG_HEX(6, irb, sizeof(*irb)); @@ -144,10 +144,10 @@ static void eadm_subchannel_irq(struct subchannel *sch) if ((scsw->stctl & (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND)) && scsw->eswf == 1 && irb->esw.eadm.erw.r) - error = -EIO; + error = BLK_STS_IOERR; if (scsw->fctl & SCSW_FCTL_CLEAR_FUNC) - error = -ETIMEDOUT; + error = BLK_STS_TIMEOUT; eadm_subchannel_set_timeout(sch, 0); diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c index 15268ed..1fa53ec 100644 --- a/drivers/s390/cio/scm.c +++ b/drivers/s390/cio/scm.c @@ -71,7 +71,7 @@ void scm_driver_unregister(struct scm_driver *scmdrv) } EXPORT_SYMBOL_GPL(scm_driver_unregister); -void scm_irq_handler(struct aob *aob, int error) +void scm_irq_handler(struct aob *aob, blk_status_t error) { struct aob_rq_header *aobrq = (void *) aob->request.data; struct scm_device *scmdev = aobrq->scmdev; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 62fed9d..14f377a 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -214,7 +214,7 @@ static void jsfd_request(void) struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; size_t len = blk_rq_cur_bytes(req); - int err = -EIO; + blk_status_t err = BLK_STS_IOERR; if ((offset + len) > jdp->dsize) goto end; @@ -230,7 +230,7 @@ static void jsfd_request(void) } jsfd_read(bio_data(req->bio), jdp->dbase + offset, len); - err = 0; + err = BLK_STS_OK; end: if (!__blk_end_request_cur(req, err)) req = jsfd_next_request(); @@ -592,6 +592,7 @@ static int jsfd_init(void) put_disk(disk); goto out; } + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); jsfd_disk[i] = disk; } diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 8a1b948..a4f28b7 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -446,7 +446,7 @@ static void _put_request(struct request *rq) * code paths. */ if (unlikely(rq->bio)) - blk_end_request(rq, -ENOMEM, blk_rq_bytes(rq)); + blk_end_request(rq, BLK_STS_IOERR, blk_rq_bytes(rq)); else blk_put_request(rq); } @@ -474,10 +474,10 @@ void osd_end_request(struct osd_request *or) EXPORT_SYMBOL(osd_end_request); static void _set_error_resid(struct osd_request *or, struct request *req, - int error) + blk_status_t error) { or->async_error = error; - or->req_errors = scsi_req(req)->result ? : error; + or->req_errors = scsi_req(req)->result; or->sense_len = scsi_req(req)->sense_len; if (or->sense_len) memcpy(or->sense, scsi_req(req)->sense, or->sense_len); @@ -489,17 +489,19 @@ static void _set_error_resid(struct osd_request *or, struct request *req, int osd_execute_request(struct osd_request *or) { - int error; - blk_execute_rq(or->request->q, NULL, or->request, 0); - error = scsi_req(or->request)->result ? -EIO : 0; - _set_error_resid(or, or->request, error); - return error; + if (scsi_req(or->request)->result) { + _set_error_resid(or, or->request, BLK_STS_IOERR); + return -EIO; + } + + _set_error_resid(or, or->request, BLK_STS_OK); + return 0; } EXPORT_SYMBOL(osd_execute_request); -static void osd_request_async_done(struct request *req, int error) +static void osd_request_async_done(struct request *req, blk_status_t error) { struct osd_request *or = req->end_io_data; @@ -1572,13 +1574,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write, flags); if (IS_ERR(req)) return req; - scsi_req_init(req); for_each_bio(bio) { - struct bio *bounce_bio = bio; - - blk_queue_bounce(req->q, &bounce_bio); - ret = blk_rq_append_bio(req, bounce_bio); + ret = blk_rq_append_bio(req, bio); if (ret) return ERR_PTR(ret); } @@ -1617,7 +1615,6 @@ static int _init_blk_request(struct osd_request *or, ret = PTR_ERR(req); goto out; } - scsi_req_init(req); or->in.req = or->request->next_rq = req; } } else if (has_in) @@ -1914,7 +1911,7 @@ analyze: /* scsi sense is Empty, the request was never issued to target * linux return code might tell us what happened. */ - if (or->async_error == -ENOMEM) + if (or->async_error == BLK_STS_RESOURCE) osi->osd_err_pri = OSD_ERR_PRI_RESOURCE; else osi->osd_err_pri = OSD_ERR_PRI_UNREACHABLE; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 67cbed9..929ee7e 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -320,7 +320,7 @@ static int osst_chk_result(struct osst_tape * STp, struct osst_request * SRpnt) /* Wakeup from interrupt */ -static void osst_end_async(struct request *req, int update) +static void osst_end_async(struct request *req, blk_status_t status) { struct scsi_request *rq = scsi_req(req); struct osst_request *SRpnt = req->end_io_data; @@ -373,7 +373,6 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, return DRIVER_ERROR << 24; rq = scsi_req(req); - scsi_req_init(req); req->rq_flags |= RQF_QUIET; SRpnt->bio = NULL; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index ecc07da..304a715 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1874,7 +1874,7 @@ int scsi_decide_disposition(struct scsi_cmnd *scmd) } } -static void eh_lock_door_done(struct request *req, int uptodate) +static void eh_lock_door_done(struct request *req, blk_status_t status) { __blk_put_request(req->q, req); } @@ -1903,7 +1903,6 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) if (IS_ERR(req)) return; rq = scsi_req(req); - scsi_req_init(req); rq->cmd[0] = ALLOW_MEDIUM_REMOVAL; rq->cmd[1] = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 99e16ac..550e29f 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -250,7 +250,6 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, if (IS_ERR(req)) return ret; rq = scsi_req(req); - scsi_req_init(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, buffer, bufflen, __GFP_RECLAIM)) @@ -635,7 +634,7 @@ static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) cmd->request->next_rq->special = NULL; } -static bool scsi_end_request(struct request *req, int error, +static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes, unsigned int bidi_bytes) { struct scsi_cmnd *cmd = req->special; @@ -694,45 +693,28 @@ static bool scsi_end_request(struct request *req, int error, * @cmd: SCSI command (unused) * @result: scsi error code * - * Translate SCSI error code into standard UNIX errno. - * Return values: - * -ENOLINK temporary transport failure - * -EREMOTEIO permanent target failure, do not retry - * -EBADE permanent nexus failure, retry on other path - * -ENOSPC No write space available - * -ENODATA Medium error - * -EIO unspecified I/O error + * Translate SCSI error code into block errors. */ -static int __scsi_error_from_host_byte(struct scsi_cmnd *cmd, int result) +static blk_status_t __scsi_error_from_host_byte(struct scsi_cmnd *cmd, + int result) { - int error = 0; - - switch(host_byte(result)) { + switch (host_byte(result)) { case DID_TRANSPORT_FAILFAST: - error = -ENOLINK; - break; + return BLK_STS_TRANSPORT; case DID_TARGET_FAILURE: set_host_byte(cmd, DID_OK); - error = -EREMOTEIO; - break; + return BLK_STS_TARGET; case DID_NEXUS_FAILURE: - set_host_byte(cmd, DID_OK); - error = -EBADE; - break; + return BLK_STS_NEXUS; case DID_ALLOC_FAILURE: set_host_byte(cmd, DID_OK); - error = -ENOSPC; - break; + return BLK_STS_NOSPC; case DID_MEDIUM_ERROR: set_host_byte(cmd, DID_OK); - error = -ENODATA; - break; + return BLK_STS_MEDIUM; default: - error = -EIO; - break; + return BLK_STS_IOERR; } - - return error; } /* @@ -769,7 +751,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) int result = cmd->result; struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; - int error = 0; + blk_status_t error = BLK_STS_OK; struct scsi_sense_hdr sshdr; bool sense_valid = false; int sense_deferred = 0, level = 0; @@ -808,7 +790,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) * both sides at once. */ scsi_req(req->next_rq)->resid_len = scsi_in(cmd)->resid; - if (scsi_end_request(req, 0, blk_rq_bytes(req), + if (scsi_end_request(req, BLK_STS_OK, blk_rq_bytes(req), blk_rq_bytes(req->next_rq))) BUG(); return; @@ -850,7 +832,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_print_sense(cmd); result = 0; /* for passthrough error may be set */ - error = 0; + error = BLK_STS_OK; } /* @@ -922,18 +904,18 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) action = ACTION_REPREP; } else if (sshdr.asc == 0x10) /* DIX */ { action = ACTION_FAIL; - error = -EILSEQ; + error = BLK_STS_PROTECTION; /* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */ } else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) { action = ACTION_FAIL; - error = -EREMOTEIO; + error = BLK_STS_TARGET; } else action = ACTION_FAIL; break; case ABORTED_COMMAND: action = ACTION_FAIL; if (sshdr.asc == 0x10) /* DIF */ - error = -EILSEQ; + error = BLK_STS_PROTECTION; break; case NOT_READY: /* If the device is in the process of becoming @@ -1134,6 +1116,20 @@ err_exit: } EXPORT_SYMBOL(scsi_init_io); +/** + * scsi_initialize_rq - initialize struct scsi_cmnd.req + * + * Called from inside blk_get_request(). + */ +void scsi_initialize_rq(struct request *rq) +{ + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); + + scsi_req_init(&cmd->req); +} +EXPORT_SYMBOL(scsi_initialize_rq); + +/* Called after a request has been started. */ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) { void *buf = cmd->sense_buffer; @@ -1829,15 +1825,15 @@ out_delay: blk_delay_queue(q, SCSI_QUEUE_DELAY); } -static inline int prep_to_mq(int ret) +static inline blk_status_t prep_to_mq(int ret) { switch (ret) { case BLKPREP_OK: - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; case BLKPREP_DEFER: - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; default: - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } } @@ -1909,7 +1905,7 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *req = bd->rq; @@ -1917,14 +1913,14 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - int ret; + blk_status_t ret; int reason; ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret != BLK_STS_OK) goto out; - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; if (!get_device(&sdev->sdev_gendev)) goto out; @@ -1937,7 +1933,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { @@ -1955,11 +1951,11 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; goto out_dec_host_busy; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_dec_host_busy: atomic_dec(&shost->host_busy); @@ -1972,12 +1968,14 @@ out_put_device: put_device(&sdev->sdev_gendev); out: switch (ret) { - case BLK_MQ_RQ_QUEUE_BUSY: + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev)) blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); break; - case BLK_MQ_RQ_QUEUE_ERROR: + default: /* * Make sure to release all allocated ressources when * we hit an error, as we will never see this command @@ -1986,8 +1984,6 @@ out: if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); break; - default: - break; } return ret; } @@ -2057,6 +2053,8 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) { struct device *dev = shost->dma_dev; + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); + /* * this limit is imposed by hardware restrictions */ @@ -2139,6 +2137,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) q->request_fn = scsi_request_fn; q->init_rq_fn = scsi_init_rq; q->exit_rq_fn = scsi_exit_rq; + q->initialize_rq_fn = scsi_initialize_rq; if (blk_init_allocated_queue(q) < 0) { blk_cleanup_queue(q); @@ -2163,6 +2162,7 @@ static const struct blk_mq_ops scsi_mq_ops = { #endif .init_request = scsi_init_request, .exit_request = scsi_exit_request, + .initialize_rq_fn = scsi_initialize_rq, .map_queues = scsi_map_queues, }; @@ -2977,7 +2977,7 @@ scsi_internal_device_block(struct scsi_device *sdev, bool wait) if (wait) blk_mq_quiesce_queue(q); else - blk_mq_stop_hw_queues(q); + blk_mq_quiesce_queue_nowait(q); } else { spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); @@ -3031,7 +3031,7 @@ scsi_internal_device_unblock(struct scsi_device *sdev, return -EINVAL; if (q->mq_ops) { - blk_mq_start_stopped_hw_queues(q, false); + blk_mq_unquiesce_queue(q); } else { spin_lock_irqsave(q->queue_lock, flags); blk_start_queue(q); diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 0ebe2f1..5006a65 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -33,6 +33,7 @@ #include <linux/bsg.h> #include <scsi/scsi.h> +#include <scsi/scsi_cmnd.h> #include <scsi/scsi_request.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> @@ -172,7 +173,7 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost, struct sas_rphy *rphy) { struct request *req; - int ret; + blk_status_t ret; int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *); while ((req = blk_fetch_request(q)) != NULL) { @@ -230,6 +231,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) q = blk_alloc_queue(GFP_KERNEL); if (!q) return -ENOMEM; + q->initialize_rq_fn = scsi_initialize_rq; q->cmd_size = sizeof(struct scsi_request); if (rphy) { @@ -249,6 +251,11 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) if (error) goto out_cleanup_queue; + /* + * by default assume old behaviour and bounce for any highmem page + */ + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); + error = bsg_register_queue(q, dev, name, release); if (error) goto out_cleanup_queue; @@ -264,6 +271,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) q->queuedata = shost; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); return 0; out_cleanup_queue: diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82c33a6..21225d6 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static void sg_rq_end_io(struct request *rq, int uptodate); +static void sg_rq_end_io(struct request *rq, blk_status_t status); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -808,7 +808,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, if (atomic_read(&sdp->detaching)) { if (srp->bio) { scsi_req_free_cmd(scsi_req(srp->rq)); - blk_end_request_all(srp->rq, -EIO); + blk_end_request_all(srp->rq, BLK_STS_IOERR); srp->rq = NULL; } @@ -1300,7 +1300,7 @@ sg_rq_end_io_usercontext(struct work_struct *work) * level when a command is completed (or has failed). */ static void -sg_rq_end_io(struct request *rq, int uptodate) +sg_rq_end_io(struct request *rq, blk_status_t status) { struct sg_request *srp = rq->end_io_data; struct scsi_request *req = scsi_req(rq); @@ -1732,8 +1732,6 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } req = scsi_req(rq); - scsi_req_init(rq); - if (hp->cmd_len > BLK_MAX_CDB) req->cmd = long_cmdp; memcpy(req->cmd, cmd, hp->cmd_len); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 1ea34d6..8e5013d 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -511,7 +511,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) atomic64_dec(&STp->stats->in_flight); } -static void st_scsi_execute_end(struct request *req, int uptodate) +static void st_scsi_execute_end(struct request *req, blk_status_t status) { struct st_request *SRpnt = req->end_io_data; struct scsi_request *rq = scsi_req(req); @@ -549,7 +549,6 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, if (IS_ERR(req)) return DRIVER_ERROR << 24; rq = scsi_req(req); - scsi_req_init(req); req->rq_flags |= RQF_QUIET; mdata->null_mapped = 1; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index bb069eb..c05d3801 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -93,7 +93,7 @@ static int iblock_configure_device(struct se_device *dev) return -EINVAL; } - ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0); + ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!ib_dev->ibd_bio_set) { pr_err("IBLOCK: Unable to create bioset\n"); goto out; @@ -296,8 +296,8 @@ static void iblock_bio_done(struct bio *bio) struct se_cmd *cmd = bio->bi_private; struct iblock_req *ibr = cmd->priv; - if (bio->bi_error) { - pr_err("bio error: %p, err: %d\n", bio, bio->bi_error); + if (bio->bi_status) { + pr_err("bio error: %p, err: %d\n", bio, bio->bi_status); /* * Bump the ib_bio_err_cnt and release bio. */ @@ -354,11 +354,11 @@ static void iblock_end_io_flush(struct bio *bio) { struct se_cmd *cmd = bio->bi_private; - if (bio->bi_error) - pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_error); + if (bio->bi_status) + pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_status); if (cmd) { - if (bio->bi_error) + if (bio->bi_status) target_complete_cmd(cmd, SAM_STAT_CHECK_CONDITION); else target_complete_cmd(cmd, SAM_STAT_GOOD); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 3e4abb1..ceec021 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -55,7 +55,7 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static void pscsi_req_done(struct request *, int); +static void pscsi_req_done(struct request *, blk_status_t); /* pscsi_attach_hba(): * @@ -992,8 +992,6 @@ pscsi_execute_cmd(struct se_cmd *cmd) goto fail; } - scsi_req_init(req); - if (sgl) { ret = pscsi_map_sg(cmd, sgl, sgl_nents, req); if (ret) @@ -1045,7 +1043,7 @@ static sector_t pscsi_get_blocks(struct se_device *dev) return 0; } -static void pscsi_req_done(struct request *req, int uptodate) +static void pscsi_req_done(struct request *req, blk_status_t status) { struct se_cmd *cmd = req->end_io_data; struct pscsi_plugin_task *pt = cmd->priv; @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_pos = iocb->aio_offset; req->common.ki_complete = aio_complete; req->common.ki_flags = iocb_flags(req->common.ki_filp); + req->common.ki_hint = file_write_hint(file); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 0a7404e..a7df151 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_init(&bio, vecs, nr_pages); bio.bi_bdev = bdev; bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -262,8 +263,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (vecs != inline_vecs) kfree(vecs); - if (unlikely(bio.bi_error)) - ret = bio.bi_error; + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); bio_uninit(&bio); @@ -291,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio) bool should_dirty = dio->should_dirty; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_error && !dio->bio.bi_error) - dio->bio.bi_error = bio->bi_error; + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; } else { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; - ssize_t ret = dio->bio.bi_error; + ssize_t ret; - if (likely(!ret)) { + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); @@ -337,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -361,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) for (;;) { bio->bi_bdev = bdev; bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { - bio->bi_error = ret; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } @@ -415,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); - ret = dio->bio.bi_error; + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; @@ -439,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); if (!blkdev_dio_pool) return -ENOMEM; return 0; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b8622e4..d87ac27 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -310,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e..4ded1c3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2129,7 +2129,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2143,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 10e6b28..a2fad39 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -155,7 +155,7 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -268,7 +268,7 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -287,7 +287,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -320,7 +320,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -335,13 +335,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; + return BLK_STS_RESOURCE; refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -358,7 +358,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); if (!bio) { kfree(cb); - return -ENOMEM; + return BLK_STS_RESOURCE; } bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; @@ -368,17 +368,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, /* create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -400,7 +400,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -434,7 +434,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -569,7 +569,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -586,7 +586,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -600,7 +600,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -638,7 +638,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, __GFP_HIGHMEM); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; - ret = -ENOMEM; + ret = BLK_STS_RESOURCE; goto fail2; } } @@ -659,19 +659,19 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -726,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 39ec43a..680d426 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -48,12 +48,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4f8f75d..a0d0c79 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3078,8 +3078,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3094,7 +3094,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5f678dc..6036d15 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,7 +87,7 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; @@ -131,7 +131,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -799,7 +799,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -836,19 +836,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; - end_io_wq->error = 0; + end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -868,14 +868,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -898,8 +898,8 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } @@ -916,18 +916,17 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, + struct inode *inode, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; async->inode = inode; async->bio = bio; @@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -986,11 +985,11 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -998,7 +997,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1015,13 +1014,13 @@ static int check_async_write(unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, +static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(bio_flags); - int ret; + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1054,7 +1053,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1820,7 +1819,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -3497,11 +3496,11 @@ static void btrfs_end_empty_barrier(struct bio *bio) * any device where the flush fails with eopnotsupp are flagged as not-barrier * capable */ -static int write_dev_flush(struct btrfs_device *device, int wait) +static blk_status_t write_dev_flush(struct btrfs_device *device, int wait) { struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio; - int ret = 0; + blk_status_t ret = 0; if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return 0; @@ -3513,8 +3512,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio->bi_error) { - ret = bio->bi_error; + if (bio->bi_status) { + ret = bio->bi_status; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); } @@ -3533,7 +3532,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = NULL; bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) - return -ENOMEM; + return BLK_STS_RESOURCE; bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; @@ -3558,7 +3557,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) struct btrfs_device *dev; int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 21f1ceb..c581927 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -118,13 +118,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done); +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, + struct inode *inode, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d3619e0..d1cd601 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -174,7 +174,8 @@ int __init extent_io_init(void) goto free_state_cache; btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio)); + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS); if (!btrfs_bioset) goto free_buffer_cache; @@ -2399,6 +2400,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2431,11 +2433,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { + if (status) { free_io_failure(BTRFS_I(inode), failrec); bio_put(bio); + ret = blk_status_to_errno(status); } return ret; @@ -2474,6 +2477,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; @@ -2503,7 +2507,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2536,7 +2540,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2556,7 +2560,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2615,7 +2619,7 @@ static void end_bio_extent_readpage(struct bio *bio) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = !bio->bi_error; + uptodate = !bio->bi_status; offset += len; continue; } @@ -2673,7 +2677,7 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } @@ -2743,7 +2747,7 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2761,7 +2765,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -2826,6 +2830,7 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio->bi_write_hint = page->mapping->host->i_write_hint; bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); @@ -3707,7 +3712,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1eafa2f..487ca02 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -92,9 +92,9 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset); +typedef blk_status_t (extent_submit_bio_hook_t)(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { /* * The following callbacks must be allways defined, the function diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31..5b1c709 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,7 +160,7 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +191,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -303,12 +303,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,7 +433,7 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -452,7 +452,7 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da1096e..59e2dcc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1875,12 +1875,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ssize_t num_written = 0; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; - loff_t pos; - size_t count; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + /* Don't sleep on inode rwsem */ + if (!inode_trylock(inode)) + return -EAGAIN; + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } else + inode_lock(inode); + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); @@ -1914,8 +1931,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { @@ -3071,13 +3086,19 @@ out: return offset; } +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_AIO_NOWAIT; + return generic_file_open(inode, filp); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, - .open = generic_file_open, + .open = btrfs_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef3c98c..556c930 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -842,13 +842,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { + async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1901,11 +1900,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btrfs_submit_bio_start(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret = 0; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1920,16 +1919,16 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btrfs_submit_bio_done(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1939,14 +1938,14 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, +static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -1991,8 +1990,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -8037,7 +8036,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct bio_vec *bvec; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); @@ -8116,7 +8115,7 @@ static void btrfs_retry_endio(struct bio *bio) int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; @@ -8141,8 +8140,8 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; struct bio_vec *bvec; @@ -8184,7 +8183,7 @@ try_again: io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8211,8 +8210,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8232,7 +8231,7 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -8243,11 +8242,11 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } @@ -8299,20 +8298,20 @@ static void btrfs_endio_direct_write(struct bio *bio) struct bio *dio_bio = dip->dio_bio; __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_error); + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8321,7 +8320,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -8351,7 +8350,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: @@ -8368,14 +8367,14 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return bio; } -static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8406,7 +8405,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8649,7 +8648,7 @@ free_ordered: * callbacks - they require an allocated dip and a clone of dio_bio. */ if (io_bio && dip) { - io_bio->bi_error = -EIO; + io_bio->bi_status = BLK_STS_IOERR; bio_endio(io_bio); /* * The end io callbacks free our dip, do the final put on io_bio @@ -8668,12 +8667,12 @@ free_ordered: unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. */ - dio_end_io(dio_bio, ret); + dio_end_io(dio_bio); } if (io_bio) bio_put(io_bio); @@ -8755,6 +8754,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; } ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb..f3d30d9e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -871,7 +871,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -884,7 +884,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -897,7 +897,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -914,7 +914,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -1092,7 +1092,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1448,7 +1448,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1991,7 +1991,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2530,7 +2530,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c7b45eb..ba5595d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -95,7 +95,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -1668,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1693,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1705,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1937,7 +1937,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1992,7 +1992,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2007,7 +2007,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2341,7 +2341,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2377,7 +2377,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2588,7 +2588,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2601,7 +2601,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -3004,7 +3004,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67d..84a4959 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6042,9 +6042,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -6082,13 +6083,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6199,7 +6200,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } diff --git a/fs/buffer.c b/fs/buffer.c index 161be58..5c2cba8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1829,7 +1829,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1883,7 +1884,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -3038,7 +3040,7 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } @@ -3091,7 +3093,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc) + enum rw_hint write_hint, struct writeback_control *wbc) { struct bio *bio; @@ -3120,6 +3122,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3142,7 +3145,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, NULL); + return submit_bh_wbc(op, op_flags, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a409a84..6181e95 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, goto errout; } err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) + if (err == 0 && bio->bi_status) err = -EIO; bio_put(bio); if (err) diff --git a/fs/direct-io.c b/fs/direct-io.c index a04ebea..08cf278 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work) dio_complete(dio, 0, true); } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. @@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio) /** * dio_end_io - handle the end io action for the given bio * @bio: The direct io bio thats being completed - * @error: Error if there was one * * This is meant to be called by any filesystem that uses their own dio_submit_t * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. */ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; @@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, else bio->bi_end_io = dio_bio_end_io; + bio->bi_write_hint = dio->iocb->ki_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. */ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; unsigned i; - int err; + blk_status_t err = bio->bi_status; - if (bio->bi_error) - dio->io_error = -EIO; + if (err) { + if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) + dio->io_error = -EAGAIN; + else + dio->io_error = -EIO; + } if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { - err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { @@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); put_page(page); } - err = bio->bi_error; bio_put(bio); } return err; @@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } @@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; dio->op_flags = REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_NOWAIT) + dio->op_flags |= REQ_NOWAIT; } else { dio->op = REQ_OP_READ; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 02ce7e7..58e2eea 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock_shared(inode); + if (!inode_trylock_shared(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock_shared(inode); + } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore @@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_dax_write_iter(iocb, from); #endif - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -435,6 +454,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } + + /* Set the flags to support nowait AIO */ + filp->f_mode |= FMODE_AIO_NOWAIT; + return dquot_file_open(inode, filp); } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138..c2fce44 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio) bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -349,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0; + io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -396,6 +398,7 @@ submit_and_retry: ret = io_submit_init_bio(io, bh); if (ret) return ret; + io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829..40a5497 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c0f6bd..36fe820 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } #endif if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { if (!PageUptodate(page)) SetPageUptodate(page); } else { @@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio) unlock_page(page); mempool_free(page, sbi->write_io_dummy); - if (unlikely(bio->bi_error)) + if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true); continue; } fscrypt_pullback_bio_page(&page, true); - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9684585..ea9f455 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -749,7 +749,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - dc->error = bio->bi_error; + dc->error = blk_status_to_errno(bio->bi_status); dc->state = D_DONE; complete(&dc->wait); bio_put(bio); @@ -243,6 +243,67 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif +static bool rw_hint_valid(enum rw_hint hint) +{ + switch (hint) { + case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NONE: + case RWH_WRITE_LIFE_SHORT: + case RWH_WRITE_LIFE_MEDIUM: + case RWH_WRITE_LIFE_LONG: + case RWH_WRITE_LIFE_EXTREME: + return true; + default: + return false; + } +} + +static long fcntl_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 *argp = (u64 __user *)arg; + enum rw_hint hint; + u64 h; + + switch (cmd) { + case F_GET_FILE_RW_HINT: + h = file_write_hint(file); + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_FILE_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + spin_lock(&file->f_lock); + file->f_write_hint = hint; + spin_unlock(&file->f_lock); + return 0; + case F_GET_RW_HINT: + h = inode->i_write_hint; + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + inode_lock(inode); + inode->i_write_hint = hint; + inode_unlock(inode); + return 0; + default: + return -EINVAL; + } +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -337,6 +398,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GET_SEALS: err = shmem_fcntl(filp, cmd, arg); break; + case F_GET_RW_HINT: + case F_SET_RW_HINT: + case F_GET_FILE_RW_HINT: + case F_SET_FILE_RW_HINT: + err = fcntl_rw_hint(filp, cmd, arg); + break; default: break; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b7cf65d..aa3d445 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -815,7 +815,6 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; - int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index b1f9144..885d36e 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -170,7 +170,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) */ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, - int error) + blk_status_t error) { struct buffer_head *bh, *next; struct page *page = bvec->bv_page; @@ -209,15 +209,13 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_error) { - sdp->sd_log_error = bio->bi_error; - fs_err(sdp, "Error %d writing to log\n", bio->bi_error); - } + if (bio->bi_status) + fs_err(sdp, "Error %d writing to log\n", bio->bi_status); bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); else mempool_free(page, gfs2_page_pool); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 663ffc1..fabe1614f 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio) do { struct buffer_head *next = bh->b_this_page; len -= bh->b_size; - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bh = next; } while (bh && len); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b92135c..e76058d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", bio->bi_error); + pr_warn("error %d reading superblock\n", bio->bi_status); unlock_page(page); } @@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; @@ -672,8 +672,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_error) - iomap_dio_set_error(dio, bio->bi_error); + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (atomic_dec_and_test(&dio->ref)) { if (is_sync_kiocb(dio->iocb)) { @@ -793,6 +793,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_bdev = iomap->bdev; bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -881,6 +882,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, start, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + flags |= IOMAP_NOWAIT; + } + ret = filemap_write_and_wait_range(mapping, start, end); if (ret) goto out_free_dio; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bb1da1f..a21f0e9 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio) bp->l_flag |= lbmDONE; - if (bio->bi_error) { + if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1..ce93db3 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio) BUG_ON(!PagePrivate(page)); - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } @@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); @@ -614,6 +615,7 @@ alloc_new: goto confused; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0ca370d..d8863a8 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio) struct parallel_io *par = bio->bi_private; struct nfs_pgio_header *header = par->data; - if (bio->bi_error) { + if (bio->bi_status) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index fb5213a..c862c24 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -219,6 +219,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, u8 *buf, *d, type, assoc; int error; + if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) + return -EINVAL; + buf = kzalloc(bufflen, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -229,7 +232,6 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, goto out_free_buf; } req = scsi_req(rq); - scsi_req_init(rq); error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); if (error) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6f87b2a..e73c86d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0da0332..ffe0039 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (bio->bi_error) { - mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); - wc->wc_error = bio->bi_error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); @@ -759,6 +759,7 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/read_write.c b/fs/read_write.c index 19d4d88..d591eee 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -678,16 +678,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) - return -EOPNOTSUPP; - init_sync_kiocb(&kiocb, filp); - if (flags & RWF_HIPRI) - kiocb.ki_flags |= IOCB_HIPRI; - if (flags & RWF_DSYNC) - kiocb.ki_flags |= IOCB_DSYNC; - if (flags & RWF_SYNC) - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + ret = kiocb_set_rw_flags(&kiocb, flags); + if (ret) + return ret; kiocb.ki_pos = *ppos; if (type == READ) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3b91faa..d20c29b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,7 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - int error = ioend->io_bio->bi_error; + int error; /* * Just clean up the in-memory strutures if the fs has been shut down. @@ -289,6 +289,7 @@ xfs_end_io( /* * Clean up any COW blocks on an I/O error. */ + error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { switch (ioend->io_type) { case XFS_IO_COW: @@ -332,7 +333,7 @@ xfs_end_bio( else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else - xfs_destroy_ioend(ioend, bio->bi_error); + xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } STATIC int @@ -500,11 +501,12 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_bio->bi_error = status; + ioend->io_bio->bi_status = errno_to_blk_status(status); bio_endio(ioend->io_bio); return status; } + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); return 0; } @@ -564,6 +566,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); ioend->io_bio = new; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 16d6a57..438505f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1227,8 +1227,11 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) - cmpxchg(&bp->b_io_error, 0, bio->bi_error); + if (bio->bi_status) { + int error = blk_status_to_errno(bio->bi_status); + + cmpxchg(&bp->b_io_error, 0, error); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5fb5a09..17f27a2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -237,7 +237,11 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -541,7 +545,11 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -553,9 +561,15 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + if (unaligned_io) { + /* If we are going to wait for other DIO to finish, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) { + if (atomic_read(&inode->i_dio_count)) + return -EAGAIN; + } else { + inode_dio_wait(inode); + } + } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -585,7 +599,12 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -892,6 +911,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; + file->f_mode |= FMODE_AIO_NOWAIT; return 0; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 94e5bdf..05dc87e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -995,6 +995,11 @@ xfs_file_iomap_begin( lockmode = xfs_ilock_data_map_shared(ip); } + if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = -EAGAIN; + goto out_unlock; + } + ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) length = mp->m_super->s_maxbytes - offset; @@ -1016,6 +1021,15 @@ xfs_file_iomap_begin( if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { if (flags & IOMAP_DIRECT) { + /* + * A reflinked inode will result in CoW alloc. + * FIXME: It could still overwrite on unshared extents + * and not need allocation. + */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1033,6 +1047,14 @@ xfs_file_iomap_begin( if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* + * If nowait is set bail since we are going to make + * allocations. + */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } + /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric * with the work writeback does. This is a completely arbitrary diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 455a575..97df4db 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1766,7 +1766,8 @@ STATIC int __init xfs_init_zones(void) { xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, - offsetof(struct xfs_ioend, io_inline_bio)); + offsetof(struct xfs_ioend, io_inline_bio), + BIOSET_NEED_BVECS); if (!xfs_ioend_bioset) goto out; diff --git a/include/linux/bio.h b/include/linux/bio.h index a7e29fa..664a27d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -118,7 +118,6 @@ static inline void *bio_data(struct bio *bio) /* * will die */ -#define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) /* @@ -373,8 +372,11 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, return bio_split(bio, sectors, gfp, bs); } -extern struct bio_set *bioset_create(unsigned int, unsigned int); -extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); +extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags); +enum { + BIOSET_NEED_BVECS = BIT(0), + BIOSET_NEED_RESCUER = BIT(1), +}; extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); @@ -392,11 +394,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); } -static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, fs_bio_set); -} - static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); @@ -414,7 +411,13 @@ extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); +} + +static inline void bio_wouldblock_error(struct bio *bio) +{ + bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fcd6410..23d32ff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -39,8 +39,6 @@ struct blk_mq_hw_ctx { struct blk_mq_tags *tags; struct blk_mq_tags *sched_tags; - struct srcu_struct queue_rq_srcu; - unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 @@ -62,6 +60,9 @@ struct blk_mq_hw_ctx { struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; #endif + + /* Must be the last member - see also blk_mq_hw_ctx_size(). */ + struct srcu_struct queue_rq_srcu[0]; }; struct blk_mq_tag_set { @@ -87,7 +88,8 @@ struct blk_mq_queue_data { bool last; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, + const struct blk_mq_queue_data *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -142,6 +144,8 @@ struct blk_mq_ops { init_request_fn *init_request; exit_request_fn *exit_request; reinit_request_fn *reinit_request; + /* Called from inside blk_get_request() */ + void (*initialize_rq_fn)(struct request *rq); map_queues_fn *map_queues; @@ -155,10 +159,6 @@ struct blk_mq_ops { }; enum { - BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ - BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ - BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ - BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, @@ -204,10 +204,10 @@ enum { BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, unsigned int flags); -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, - unsigned int flags, unsigned int hctx_idx); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, + unsigned int op, unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { @@ -230,8 +230,8 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); -void blk_mq_end_request(struct request *rq, int error); -void __blk_mq_end_request(struct request *rq, int error); +void blk_mq_end_request(struct request *rq, blk_status_t error); +void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, @@ -247,6 +247,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); @@ -264,6 +266,8 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); +void blk_mq_quiesce_queue_nowait(struct request_queue *q); + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 61339bc..d2eb87c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,27 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +/* + * Block error status values. See block/blk-core:blk_errors for the details. + */ +typedef u8 __bitwise blk_status_t; +#define BLK_STS_OK 0 +#define BLK_STS_NOTSUPP ((__force blk_status_t)1) +#define BLK_STS_TIMEOUT ((__force blk_status_t)2) +#define BLK_STS_NOSPC ((__force blk_status_t)3) +#define BLK_STS_TRANSPORT ((__force blk_status_t)4) +#define BLK_STS_TARGET ((__force blk_status_t)5) +#define BLK_STS_NEXUS ((__force blk_status_t)6) +#define BLK_STS_MEDIUM ((__force blk_status_t)7) +#define BLK_STS_PROTECTION ((__force blk_status_t)8) +#define BLK_STS_RESOURCE ((__force blk_status_t)9) +#define BLK_STS_IOERR ((__force blk_status_t)10) + +/* hack for device mapper, don't use elsewhere: */ +#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) + +#define BLK_STS_AGAIN ((__force blk_status_t)12) + struct blk_issue_stat { u64 stat; }; @@ -28,13 +49,14 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - int bi_error; + blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. */ unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; + unsigned short bi_write_hint; struct bvec_iter bi_iter; @@ -205,6 +227,7 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -223,6 +246,7 @@ enum req_flag_bits { #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ddd36b..25f6a0c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -55,7 +55,7 @@ struct blk_stat_callback; */ #define BLKCG_MAX_POLS 3 -typedef void (rq_end_io_fn)(struct request *, int); +typedef void (rq_end_io_fn)(struct request *, blk_status_t); #define BLK_RL_SYNCFULL (1U << 0) #define BLK_RL_ASYNCFULL (1U << 1) @@ -225,6 +225,8 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ + unsigned short write_hint; + unsigned long deadline; struct list_head timeout_list; @@ -412,8 +414,12 @@ struct request_queue { rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; + /* Called just after a request is allocated */ init_rq_fn *init_rq_fn; + /* Called just before a request is freed */ exit_rq_fn *exit_rq_fn; + /* Called from inside blk_get_request() */ + void (*initialize_rq_fn)(struct request *rq); const struct blk_mq_ops *mq_ops; @@ -590,6 +596,9 @@ struct request_queue { void *rq_alloc_data; struct work_struct release_work; + +#define BLK_MAX_WRITE_HINTS 5 + u64 write_hints[BLK_MAX_WRITE_HINTS]; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ @@ -622,6 +631,8 @@ struct request_queue { #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -633,6 +644,13 @@ struct request_queue { (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) +/* + * @q->queue_lock is set while a queue is being initialized. Since we know + * that no other threads access the queue object before @q->queue_lock has + * been set, it is safe to manipulate queue flags without holding the + * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and + * blk_init_allocated_queue(). + */ static inline void queue_lockdep_assert_held(struct request_queue *q) { if (q->queue_lock) @@ -712,10 +730,13 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) +#define blk_queue_scsi_passthrough(q) \ + test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) +#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) static inline bool blk_account_rq(struct request *rq) { @@ -814,7 +835,8 @@ static inline bool rq_mergeable(struct request *rq) static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { - if (bio_data(a) == bio_data(b)) + if (bio_page(a) == bio_page(b) && + bio_offset(a) == bio_offset(b)) return true; return false; @@ -862,19 +884,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) -#ifdef CONFIG_BOUNCE -extern int init_emergency_isa_pool(void); -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else -static inline int init_emergency_isa_pool(void) -{ - return 0; -} -static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) -{ -} -#endif /* CONFIG_MMU */ - struct rq_map_data { struct page **pages; int page_order; @@ -933,7 +942,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern struct request *blk_get_request(struct request_queue *, int, gfp_t); +extern struct request *blk_get_request(struct request_queue *, unsigned int op, + gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, @@ -941,12 +951,11 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); extern void blk_rq_unprep_clone(struct request *rq); -extern int blk_insert_cloned_request(struct request_queue *q, +extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_delay_queue(struct request_queue *, unsigned long); -extern void blk_queue_split(struct request_queue *, struct bio **, - struct bio_set *); +extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, @@ -967,7 +976,6 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); -extern void blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); @@ -981,6 +989,9 @@ extern void blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +int blk_status_to_errno(blk_status_t status); +blk_status_t errno_to_blk_status(int errno); + bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -1113,16 +1124,16 @@ extern struct request *blk_fetch_request(struct request_queue *q); * blk_end_request() for parts of the original function. * This prevents code duplication in drivers. */ -extern bool blk_update_request(struct request *rq, int error, +extern bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_finish_request(struct request *rq, int error); -extern bool blk_end_request(struct request *rq, int error, +extern void blk_finish_request(struct request *rq, blk_status_t error); +extern bool blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_end_request_all(struct request *rq, int error); -extern bool __blk_end_request(struct request *rq, int error, +extern void blk_end_request_all(struct request *rq, blk_status_t error); +extern bool __blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, int error); -extern bool __blk_end_request_cur(struct request *rq, int error); +extern void __blk_end_request_all(struct request *rq, blk_status_t error); +extern bool __blk_end_request_cur(struct request *rq, blk_status_t error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); @@ -1374,11 +1385,6 @@ enum blk_default_limits { #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) -static inline unsigned long queue_bounce_pfn(struct request_queue *q) -{ - return q->limits.bounce_pfn; -} - static inline unsigned long queue_segment_boundary(struct request_queue *q) { return q->limits.seg_boundary_mask; @@ -1780,7 +1786,7 @@ struct blk_integrity_iter { const char *disk_name; }; -typedef int (integrity_processing_fn) (struct blk_integrity_iter *); +typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); struct blk_integrity_profile { integrity_processing_fn *generate_fn; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index f4c639c..456da50 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -72,9 +72,9 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone); * 2 : The target wants to push back the io */ typedef int (*dm_endio_fn) (struct dm_target *ti, - struct bio *bio, int error); + struct bio *bio, blk_status_t *error); typedef int (*dm_request_endio_fn) (struct dm_target *ti, - struct request *clone, int error, + struct request *clone, blk_status_t error, union map_info *map_context); typedef void (*dm_presuspend_fn) (struct dm_target *ti); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 0e306c5..5bc8f86 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -104,8 +104,9 @@ struct elevator_mq_ops { int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); - struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); - void (*put_request)(struct request *); + void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); + void (*prepare_request)(struct request *, struct bio *bio); + void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); @@ -114,8 +115,6 @@ struct elevator_mq_ops { void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); - int (*get_rq_priv)(struct request_queue *, struct request *, struct bio *); - void (*put_rq_priv)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e68cab..65adbdd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -20,6 +20,7 @@ #include <linux/rwsem.h> #include <linux/capability.h> #include <linux/semaphore.h> +#include <linux/fcntl.h> #include <linux/fiemap.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> @@ -143,6 +144,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +/* File is capable of returning -EAGAIN if AIO will block */ +#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are @@ -262,6 +266,18 @@ struct page; struct address_space; struct writeback_control; +/* + * Write life time hint values. + */ +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +}; + #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) @@ -269,6 +285,7 @@ struct writeback_control; #define IOCB_DSYNC (1 << 4) #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) +#define IOCB_NOWAIT (1 << 7) struct kiocb { struct file *ki_filp; @@ -276,6 +293,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; + enum rw_hint ki_hint; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -283,16 +301,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) return kiocb->ki_complete == NULL; } -static inline int iocb_flags(struct file *file); - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_filp = filp, - .ki_flags = iocb_flags(filp), - }; -} - /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet @@ -593,6 +601,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED @@ -847,6 +856,7 @@ struct file { * Must not be taken from IRQ context. */ spinlock_t f_lock; + enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -1022,8 +1032,6 @@ struct file_lock_context { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -#include <linux/fcntl.h> - extern void send_sigio(struct fown_struct *fown, int fd, int band); /* @@ -1874,6 +1882,25 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode) return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); } +static inline enum rw_hint file_write_hint(struct file *file) +{ + if (file->f_write_hint != WRITE_LIFE_NOT_SET) + return file->f_write_hint; + + return file_inode(file)->i_write_hint; +} + +static inline int iocb_flags(struct file *file); + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + .ki_flags = iocb_flags(filp), + .ki_hint = file_write_hint(filp), + }; +} + /* * Inode state bits. Protected by inode->i_lock * @@ -2518,6 +2545,8 @@ extern int filemap_fdatawait(struct address_space *); extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +extern bool filemap_range_has_page(struct address_space *, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); @@ -2844,7 +2873,7 @@ enum { DIO_SKIP_DIO_COUNT = 0x08, }; -void dio_end_io(struct bio *bio, int error); +void dio_end_io(struct bio *bio); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, @@ -3057,6 +3086,25 @@ static inline int iocb_flags(struct file *file) return res; } +static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) +{ + if (unlikely(flags & ~RWF_SUPPORTED)) + return -EOPNOTSUPP; + + if (flags & RWF_NOWAIT) { + if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT)) + return -EOPNOTSUPP; + ki->ki_flags |= IOCB_NOWAIT; + } + if (flags & RWF_HIPRI) + ki->ki_flags |= IOCB_HIPRI; + if (flags & RWF_DSYNC) + ki->ki_flags |= IOCB_DSYNC; + if (flags & RWF_SYNC) + ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + return 0; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; diff --git a/include/linux/ide.h b/include/linux/ide.h index 6980ca3..dc152e4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -671,7 +671,7 @@ struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); void (*set_dma_mode)(struct hwif_s *, ide_drive_t *); - int (*reset_poll)(ide_drive_t *); + blk_status_t (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); void (*maskproc)(ide_drive_t *, int); @@ -1092,7 +1092,7 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l extern int ide_vlb_clk; extern int ide_pci_clk; -int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int); +int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int); void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); @@ -1123,7 +1123,7 @@ extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); -int ide_complete_rq(ide_drive_t *, int, unsigned int); +int ide_complete_rq(ide_drive_t *, blk_status_t, unsigned int); void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd); void ide_tf_dump(const char *, struct ide_cmd *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f753e78..69f4e94 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -52,6 +52,7 @@ struct iomap { #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ +#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */ struct iomap_ops { /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e400a69..6b8ee9e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,7 +87,7 @@ enum { NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; -#define NVMF_AQ_DEPTH 32 +#define NVME_AQ_DEPTH 32 enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ @@ -102,6 +102,7 @@ enum { NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) @@ -208,9 +209,15 @@ struct nvme_id_ctrl { __u8 tnvmcap[16]; __u8 unvmcap[16]; __le32 rpmbs; - __u8 rsvd316[4]; + __le16 edstt; + __u8 dsto; + __u8 fwug; __le16 kas; - __u8 rsvd322[190]; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __u8 rsvd332[180]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -246,6 +253,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, }; @@ -275,7 +283,7 @@ struct nvme_id_ns { __le16 nabsn; __le16 nabo; __le16 nabspf; - __u16 rsvd46; + __le16 noiob; __u8 nvmcap[16]; __u8 rsvd64[40]; __u8 nguid[16]; @@ -289,6 +297,7 @@ enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -296,6 +305,19 @@ enum { }; enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + +enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, NVME_NS_FLBAS_META_EXT = 0x10, @@ -315,6 +337,22 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -536,6 +574,7 @@ enum { NVME_RW_PRINFO_PRCHK_APP = 1 << 11, NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, }; struct nvme_dsm_cmd { @@ -587,6 +626,11 @@ struct nvme_feat_auto_pst { __le64 entries[32]; }; +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -605,6 +649,8 @@ enum nvme_admin_opcode { nvme_admin_download_fw = 0x11, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -659,6 +705,8 @@ struct nvme_identify { __u32 rsvd11[5]; }; +#define NVME_IDENTIFY_DATA_SIZE 4096 + struct nvme_features { __u8 opcode; __u8 flags; @@ -668,7 +716,16 @@ struct nvme_features { union nvme_data_ptr dptr; __le32 fid; __le32 dword11; - __u32 rsvd12[4]; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; }; struct nvme_create_cq { @@ -757,6 +814,24 @@ struct nvme_get_log_page_command { __u32 rsvd14[2]; }; +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + /* * Fabrics subcommands. */ @@ -887,6 +962,18 @@ struct nvme_dbbuf { __u32 rsvd12[6]; }; +struct streams_directive_params { + __u16 msl; + __u16 nssa; + __u16 nsso; + __u8 rsvd[10]; + __u32 sws; + __u16 sgs; + __u16 nsa; + __u16 nso; + __u8 rsvd2[6]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -907,6 +994,7 @@ struct nvme_command { struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; }; }; @@ -1051,4 +1139,8 @@ struct nvme_completion { #define NVME_VS(major, minor, tertiary) \ (((major) << 16) | ((minor) << 8) | (tertiary)) +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + #endif /* _LINUX_NVME_H */ diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index cb3c8fe..4b3286a 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -278,6 +278,8 @@ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); +size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, + size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index a09cca8..a29d308 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -157,7 +157,7 @@ struct osd_request { osd_req_done_fn *async_done; void *async_private; - int async_error; + blk_status_t async_error; int req_errors; }; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index b379f93..da9bf2b 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -166,6 +166,7 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, extern void scsi_kunmap_atomic_sg(void *virt); extern int scsi_init_io(struct scsi_cmnd *cmd); +extern void scsi_initialize_rq(struct request *rq); extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index f0c76f9..e0afa44 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -27,6 +27,6 @@ static inline void scsi_req_free_cmd(struct scsi_request *req) kfree(req->cmd); } -void scsi_req_init(struct request *); +void scsi_req_init(struct scsi_request *req); #endif /* _SCSI_SCSI_REQUEST_H */ diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index bb2554f..a2d4a8a 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -79,7 +79,7 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. */ __u64 aio_data; /* data to be returned in event's data */ - __u32 PADDED(aio_key, aio_reserved1); + __u32 PADDED(aio_key, aio_rw_flags); /* the kernel sets aio_key to the req # */ /* common fields */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 4bf9f1e..2f6c77a 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 35 +#define DM_VERSION_MINOR 36 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2016-06-23)" +#define DM_VERSION_EXTRA "-ioctl (2017-06-09)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 813afd6..ec69d55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -43,6 +43,27 @@ /* (1U << 31) is reserved for signed error codes */ /* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + +/* * Types of directory notifications that may be requested. */ #define DN_ACCESS 0x00000001 /* File accessed */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 24e61a5..27d8c36 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -360,5 +360,9 @@ struct fscrypt_key { #define RWF_HIPRI 0x00000001 /* high priority request, poll if possible */ #define RWF_DSYNC 0x00000002 /* per-IO O_DSYNC */ #define RWF_SYNC 0x00000004 /* per-IO O_SYNC */ +#define RWF_NOWAIT 0x00000008 /* per-IO, return -EAGAIN if operation would block */ + +#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC |\ + RWF_NOWAIT) #endif /* _UAPI_LINUX_FS_H */ diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec..a3960f9 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -22,6 +22,7 @@ enum { LO_FLAGS_AUTOCLEAR = 4, LO_FLAGS_PARTSCAN = 8, LO_FLAGS_DIRECT_IO = 16, + LO_FLAGS_BLOCKSIZE = 32, }; #include <asm/posix_types.h> /* for __kernel_old_dev_t */ @@ -59,6 +60,8 @@ struct loop_info64 { __u64 lo_init[2]; }; +#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0] + /* * Loop filter types */ diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 155e33f..a50527e 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -41,10 +41,14 @@ enum { #define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ +#define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ /* there is a gap here to match userspace */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ +/* values for cmd flags in the upper 16 bits of request type */ +#define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ + /* These are client behavior specific flags. */ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on disconnect. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f80fd33..57d2257 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; - int error; + blk_status_t error; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); - hb->error = 0; + hb->error = BLK_STS_OK; } static void hib_end_io(struct bio *bio) @@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio) struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), @@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); - if (bio->bi_error && !hb->error) - hb->error = bio->bi_error; + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); @@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static int hib_wait_io(struct hib_bio_batch *hb) +static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { wait_event(hb->wait, atomic_read(&hb->count) == 0); - return hb->error; + return blk_status_to_errno(hb->error); } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 193c5f5..bc364f8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu); } } @@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r); } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c6cf822..be7b4dd 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -751,3 +751,38 @@ size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, return sg_copy_buffer(sgl, nents, buf, buflen, skip, true); } EXPORT_SYMBOL(sg_pcopy_to_buffer); + +/** + * sg_zero_buffer - Zero-out a part of a SG list + * @sgl: The SG list + * @nents: Number of SG entries + * @buflen: The number of bytes to zero out + * @skip: Number of bytes to skip before zeroing + * + * Returns the number of bytes zeroed. + **/ +size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, + size_t buflen, off_t skip) +{ + unsigned int offset = 0; + struct sg_mapping_iter miter; + unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG; + + sg_miter_start(&miter, sgl, nents, sg_flags); + + if (!sg_miter_skip(&miter, skip)) + return false; + + while (offset < buflen && sg_miter_next(&miter)) { + unsigned int len; + + len = min(miter.length, buflen - offset); + memset(miter.addr, 0, len); + + offset += len; + } + + sg_miter_stop(&miter); + return offset; +} +EXPORT_SYMBOL(sg_zero_buffer); diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be57..742034e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -376,6 +376,38 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/** + * filemap_range_has_page - check if a page exists in range. + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. + */ +bool filemap_range_has_page(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; + struct pagevec pvec; + bool ret; + + if (end_byte < start_byte) + return false; + + if (mapping->nrpages == 0) + return false; + + pagevec_init(&pvec, 0); + if (!pagevec_lookup(&pvec, mapping, index, 1)) + return false; + ret = (pvec.pages[0]->index <= end); + pagevec_release(&pvec); + return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); + static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { @@ -2038,10 +2070,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + retval = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (retval < 0) + goto out; + } file_accessed(file); @@ -2642,6 +2681,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + if (limit != RLIM_INFINITY) { if (iocb->ki_pos >= limit) { send_sig(SIGXFSZ, current, 0); @@ -2710,9 +2752,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(inode->i_mapping, pos, + pos + iov_iter_count(from))) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } /* * After a write we want buffered reads to be sure to go to disk to get diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d..2da71e6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); /* * We failed to write the page out to swap-space. @@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", |