diff options
129 files changed, 11613 insertions, 1279 deletions
diff --git a/Documentation/ioctl/cdrom.txt b/Documentation/ioctl/cdrom.txt index 59df81c..a4d62a9 100644 --- a/Documentation/ioctl/cdrom.txt +++ b/Documentation/ioctl/cdrom.txt @@ -340,7 +340,8 @@ CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl) EINVAL format not CDROM_MSF or CDROM_LBA notes: - Format is converted to CDROM_MSF on return + Format is converted to CDROM_MSF or CDROM_LBA + as per user request on return diff --git a/MAINTAINERS b/MAINTAINERS index 9987f84..8312586 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8184,6 +8184,13 @@ S: Supported F: drivers/nvme/host/ F: include/linux/nvme.h +NVM EXPRESS TARGET DRIVER +M: Christoph Hellwig <hch@lst.de> +M: Sagi Grimberg <sagi@grimberg.me> +L: linux-nvme@lists.infradead.org +S: Supported +F: drivers/nvme/target/ + NVMEM FRAMEWORK M: Srinivas Kandagatla <srinivas.kandagatla@linaro.org> M: Maxime Ripard <maxime.ripard@free-electrons.com> diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index ff75d70..f9af646 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -223,7 +223,6 @@ static int axon_ram_probe(struct platform_device *device) bank->disk->first_minor = azfs_minor; bank->disk->fops = &axon_ram_devops; bank->disk->private_data = bank; - bank->disk->driverfs_dev = &device->dev; sprintf(bank->disk->disk_name, "%s%d", AXON_RAM_DEVICE_NAME, axon_ram_bank_id); @@ -238,7 +237,7 @@ static int axon_ram_probe(struct platform_device *device) set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT); blk_queue_make_request(bank->disk->queue, axon_ram_make_request); blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); - add_disk(bank->disk); + device_add_disk(&device->dev, bank->disk); bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0); if (bank->irq_id == NO_IRQ) { diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index ef6b4d9..f354027 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -801,6 +801,7 @@ static void ubd_device_release(struct device *dev) static int ubd_disk_register(int major, u64 size, int unit, struct gendisk **disk_out) { + struct device *parent = NULL; struct gendisk *disk; disk = alloc_disk(1 << UBD_SHIFT); @@ -823,12 +824,12 @@ static int ubd_disk_register(int major, u64 size, int unit, ubd_devs[unit].pdev.dev.release = ubd_device_release; dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]); platform_device_register(&ubd_devs[unit].pdev); - disk->driverfs_dev = &ubd_devs[unit].pdev.dev; + parent = &ubd_devs[unit].pdev.dev; } disk->private_data = &ubd_devs[unit]; disk->queue = ubd_devs[unit].queue; - add_disk(disk); + device_add_disk(parent, disk); *disk_out = disk; return 0; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 15d37b1..f70cc3b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -54,7 +54,6 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, { struct bio_integrity_payload *bip; struct bio_set *bs = bio->bi_pool; - unsigned long idx = BIO_POOL_NONE; unsigned inline_vecs; if (!bs || !bs->bio_integrity_pool) { @@ -72,17 +71,19 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { + unsigned long idx = 0; + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, bs->bvec_integrity_pool); if (!bip->bip_vec) goto err; bip->bip_max_vcnt = bvec_nr_vecs(idx); + bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; } - bip->bip_slab = idx; bip->bip_bio = bio; bio->bi_integrity = bip; bio->bi_rw |= REQ_INTEGRITY; @@ -111,9 +112,7 @@ void bio_integrity_free(struct bio *bio) bip->bip_vec->bv_offset); if (bs && bs->bio_integrity_pool) { - if (bip->bip_slab != BIO_POOL_NONE) - bvec_free(bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab); mempool_free(bip, bs->bio_integrity_pool); } else { diff --git a/block/bio.c b/block/bio.c index 848cd35..54ee384 100644 --- a/block/bio.c +++ b/block/bio.c @@ -43,7 +43,7 @@ * unsigned short */ #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { +static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), }; #undef BV @@ -160,11 +160,15 @@ unsigned int bvec_nr_vecs(unsigned short idx) void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) { - BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); + if (!idx) + return; + idx--; + + BIO_BUG_ON(idx >= BVEC_POOL_NR); - if (idx == BIOVEC_MAX_IDX) + if (idx == BVEC_POOL_MAX) { mempool_free(bv, pool); - else { + } else { struct biovec_slab *bvs = bvec_slabs + idx; kmem_cache_free(bvs->slab, bv); @@ -206,7 +210,7 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, * idx now points to the pool we want to allocate from. only the * 1-vec entry pool is mempool backed. */ - if (*idx == BIOVEC_MAX_IDX) { + if (*idx == BVEC_POOL_MAX) { fallback: bvl = mempool_alloc(pool, gfp_mask); } else { @@ -226,11 +230,12 @@ fallback: */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BIOVEC_MAX_IDX; + *idx = BVEC_POOL_MAX; goto fallback; } } + (*idx)++; return bvl; } @@ -250,8 +255,7 @@ static void bio_free(struct bio *bio) __bio_free(bio); if (bs) { - if (bio_flagged(bio, BIO_OWNS_VEC)) - bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); + bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -420,7 +424,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; - unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; struct bio *bio; void *p; @@ -480,6 +483,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (nr_iovecs > inline_vecs) { + unsigned long idx = 0; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); @@ -490,13 +495,12 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (unlikely(!bvl)) goto err_free; - bio_set_flag(bio, BIO_OWNS_VEC); + bio->bi_flags |= idx << BVEC_POOL_OFFSET; } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } bio->bi_pool = bs; - bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; return bio; @@ -568,7 +572,7 @@ EXPORT_SYMBOL(bio_phys_segments); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); + BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* * most users will be overriding ->bi_bdev with a new target, @@ -1097,7 +1101,6 @@ int bio_uncopy_user(struct bio *bio) bio_put(bio); return ret; } -EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio @@ -1392,7 +1395,6 @@ void bio_unmap_user(struct bio *bio) __bio_unmap_user(bio); bio_put(bio); } -EXPORT_SYMBOL(bio_unmap_user); static void bio_map_kern_endio(struct bio *bio) { @@ -1538,7 +1540,6 @@ cleanup: bio_put(bio); return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL(bio_copy_kern); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions @@ -1832,7 +1833,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ mempool_t *biovec_create_pool(int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; + struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; return mempool_create_slab_pool(pool_entries, bp->slab); } @@ -2009,7 +2010,7 @@ static void __init biovec_init_slabs(void) { int i; - for (i = 0; i < BIOVEC_NR_POOLS; i++) { + for (i = 0; i < BVEC_POOL_NR; i++) { int size; struct biovec_slab *bvs = bvec_slabs + i; diff --git a/block/blk-core.c b/block/blk-core.c index 3cfd67d..a687e9c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1294,10 +1294,15 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, spin_lock_irq(q->queue_lock); rq = get_request(q, rw, 0, NULL, gfp_mask); - if (IS_ERR(rq)) + if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); - /* q->queue_lock is unlocked at this point */ + return rq; + } + /* q->queue_lock is unlocked at this point */ + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; return rq; } @@ -1313,63 +1318,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) EXPORT_SYMBOL(blk_get_request); /** - * blk_make_request - given a bio, allocate a corresponding struct request. - * @q: target request queue - * @bio: The bio describing the memory mappings that will be submitted for IO. - * It may be a chained-bio properly constructed by block/bio layer. - * @gfp_mask: gfp flags to be used for memory allocation - * - * blk_make_request is the parallel of generic_make_request for BLOCK_PC - * type commands. Where the struct request needs to be farther initialized by - * the caller. It is passed a &struct bio, which describes the memory info of - * the I/O transfer. - * - * The caller of blk_make_request must make sure that bi_io_vec - * are set to describe the memory buffers. That bio_data_dir() will return - * the needed direction of the request. (And all bio's in the passed bio-chain - * are properly set accordingly) - * - * If called under none-sleepable conditions, mapped bio buffers must not - * need bouncing, by calling the appropriate masked or flagged allocator, - * suitable for the target device. Otherwise the call to blk_queue_bounce will - * BUG. - * - * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use - * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise - * you risk waiting for IO completion of a bio that hasn't been submitted yet, - * thus resulting in a deadlock. Alternatively bios should be allocated using - * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. - * If possible a big IO should be split into smaller parts when allocation - * fails. Partial allocation should not be an error, or you risk a live-lock. - */ -struct request *blk_make_request(struct request_queue *q, struct bio *bio, - gfp_t gfp_mask) -{ - struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); - - if (IS_ERR(rq)) - return rq; - - blk_rq_set_block_pc(rq); - - for_each_bio(bio) { - struct bio *bounce_bio = bio; - int ret; - - blk_queue_bounce(q, &bounce_bio); - ret = blk_rq_append_bio(q, rq, bounce_bio); - if (unlikely(ret)) { - blk_put_request(rq); - return ERR_PTR(ret); - } - } - - return rq; -} -EXPORT_SYMBOL(blk_make_request); - -/** * blk_rq_set_block_pc - initialize a request to type BLOCK_PC * @rq: request to be initialized * @@ -1377,9 +1325,6 @@ EXPORT_SYMBOL(blk_make_request); void blk_rq_set_block_pc(struct request *rq) { rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->__data_len = 0; - rq->__sector = (sector_t) -1; - rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); } EXPORT_SYMBOL(blk_rq_set_block_pc); @@ -1982,16 +1927,21 @@ generic_make_request_checks(struct bio *bio) } } - if ((bio_op(bio) == REQ_OP_DISCARD) && - (!blk_queue_discard(q) || - ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { - err = -EOPNOTSUPP; - goto end_io; - } - - if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { - err = -EOPNOTSUPP; - goto end_io; + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + if (!blk_queue_discard(q)) + goto not_supported; + break; + case REQ_OP_SECURE_ERASE: + if (!blk_queue_secure_erase(q)) + goto not_supported; + break; + case REQ_OP_WRITE_SAME: + if (!bdev_write_same(bio->bi_bdev)) + goto not_supported; + break; + default: + break; } /* @@ -2008,6 +1958,8 @@ generic_make_request_checks(struct bio *bio) trace_block_bio_queue(q, bio); return true; +not_supported: + err = -EOPNOTSUPP; end_io: bio->bi_error = err; bio_endio(bio); @@ -3383,6 +3335,7 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) return false; } +EXPORT_SYMBOL_GPL(blk_poll); #ifdef CONFIG_PM /** diff --git a/block/blk-lib.c b/block/blk-lib.c index 9031d2a..083e56f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -23,20 +23,32 @@ static struct bio *next_bio(struct bio *bio, unsigned int nr_pages, } int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int op_flags, + sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; + enum req_op op; int alignment; if (!q) return -ENXIO; - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - if ((op_flags & REQ_SECURE) && !blk_queue_secdiscard(q)) - return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (flags & BLKDEV_DISCARD_ZERO) + return -EOPNOTSUPP; + if (!blk_queue_secure_erase(q)) + return -EOPNOTSUPP; + op = REQ_OP_SECURE_ERASE; + } else { + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if ((flags & BLKDEV_DISCARD_ZERO) && + !q->limits.discard_zeroes_data) + return -EOPNOTSUPP; + op = REQ_OP_DISCARD; + } /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); @@ -66,7 +78,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_DISCARD, op_flags); + bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; nr_sects -= req_sects; @@ -100,20 +112,16 @@ EXPORT_SYMBOL(__blkdev_issue_discard); int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { - int op_flags = 0; struct bio *bio = NULL; struct blk_plug plug; int ret; - if (flags & BLKDEV_DISCARD_SECURE) - op_flags |= REQ_SECURE; - blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, op_flags, + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); - if (ret == -EOPNOTSUPP) + if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO)) ret = 0; bio_put(bio); } @@ -173,7 +181,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, ret = submit_bio_wait(bio); bio_put(bio); } - return ret != -EOPNOTSUPP ? ret : 0; + return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -244,11 +252,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard) { - struct request_queue *q = bdev_get_queue(bdev); - - if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data && - blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0) - return 0; + if (discard) { + if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, + BLKDEV_DISCARD_ZERO)) + return 0; + } if (bdev_write_same(bdev) && blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, diff --git a/block/blk-map.c b/block/blk-map.c index 61733a6..b8657fa 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,21 +9,26 @@ #include "blk.h" -int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio) +/* + * Append a bio to a passthrough request. Only works can be merged into + * the request based on the driver constraints. + */ +int blk_rq_append_bio(struct request *rq, struct bio *bio) { - if (!rq->bio) - blk_rq_bio_prep(q, rq, bio); - else if (!ll_back_merge_fn(q, rq, bio)) - return -EINVAL; - else { + if (!rq->bio) { + blk_rq_bio_prep(rq->q, rq, bio); + } else { + if (!ll_back_merge_fn(rq->q, rq, bio)) + return -EINVAL; + rq->biotail->bi_next = bio; rq->biotail = bio; - rq->__data_len += bio->bi_iter.bi_size; } + return 0; } +EXPORT_SYMBOL(blk_rq_append_bio); static int __blk_rq_unmap_user(struct bio *bio) { @@ -71,7 +76,7 @@ static int __blk_rq_map_user_iov(struct request *rq, */ bio_get(bio); - ret = blk_rq_append_bio(q, rq, bio); + ret = blk_rq_append_bio(rq, bio); if (ret) { bio_endio(bio); __blk_rq_unmap_user(orig_bio); @@ -229,7 +234,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (do_copy) rq->cmd_flags |= REQ_COPY_USER; - ret = blk_rq_append_bio(q, rq, bio); + ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { /* request is too big */ bio_put(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index 5e4d93e..41cbd48 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -649,8 +649,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; - if (!blk_check_merge_flags(req->cmd_flags, req_op(req), next->cmd_flags, - req_op(next))) + if (req_op(req) != req_op(next)) return 0; /* @@ -758,8 +757,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; - if (!blk_check_merge_flags(rq->cmd_flags, req_op(rq), bio->bi_rw, - bio_op(bio))) + if (req_op(rq) != bio_op(bio)) return false; /* different data direction or already started, don't merge */ diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 56a0c37..729bac3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -485,6 +485,32 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) +{ + int i, j, ret = 0; + + if (!set->ops->reinit_request) + goto out; + + for (i = 0; i < set->nr_hw_queues; i++) { + struct blk_mq_tags *tags = set->tags[i]; + + for (j = 0; j < tags->nr_tags; j++) { + if (!tags->rqs[j]) + continue; + + ret = set->ops->reinit_request(set->driver_data, + tags->rqs[j]); + if (ret) + goto out; + } + } + +out: + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); + void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 2a1920c..576e711 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -263,10 +263,53 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } + + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; return rq; } EXPORT_SYMBOL(blk_mq_alloc_request); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, + unsigned int flags, unsigned int hctx_idx) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + struct blk_mq_alloc_data alloc_data; + int ret; + + /* + * If the tag allocator sleeps we could get an allocation for a + * different hardware context. No need to complicate the low level + * allocator for this for the rare use case of a command tied to + * a specific queue. + */ + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) + return ERR_PTR(-EINVAL); + + if (hctx_idx >= q->nr_hw_queues) + return ERR_PTR(-EIO); + + ret = blk_queue_enter(q, true); + if (ret) + return ERR_PTR(ret); + + hctx = q->queue_hw_ctx[hctx_idx]; + ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); + + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + if (!rq) { + blk_queue_exit(q); + return ERR_PTR(-EWOULDBLOCK); + } + + return rq; +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); + static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) { diff --git a/block/blk.h b/block/blk.h index 70e4aee..c37492f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -64,8 +64,6 @@ void blk_exit_rl(struct request_list *rl); void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); -int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); diff --git a/block/genhd.c b/block/genhd.c index f06d7f3..3c9dede 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -506,7 +506,7 @@ static int exact_lock(dev_t devt, void *data) return 0; } -static void register_disk(struct gendisk *disk) +static void register_disk(struct device *parent, struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); struct block_device *bdev; @@ -514,7 +514,7 @@ static void register_disk(struct gendisk *disk) struct hd_struct *part; int err; - ddev->parent = disk->driverfs_dev; + ddev->parent = parent; dev_set_name(ddev, "%s", disk->disk_name); @@ -573,7 +573,8 @@ exit: } /** - * add_disk - add partitioning information to kernel list + * device_add_disk - add partitioning information to kernel list + * @parent: parent device for the disk * @disk: per-device partitioning information * * This function registers the partitioning information in @disk @@ -581,7 +582,7 @@ exit: * * FIXME: error handling */ -void add_disk(struct gendisk *disk) +void device_add_disk(struct device *parent, struct gendisk *disk) { struct backing_dev_info *bdi; dev_t devt; @@ -617,7 +618,7 @@ void add_disk(struct gendisk *disk) blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); - register_disk(disk); + register_disk(parent, disk); blk_register_queue(disk); /* @@ -633,7 +634,7 @@ void add_disk(struct gendisk *disk) disk_add_events(disk); blk_integrity_add(disk); } -EXPORT_SYMBOL(add_disk); +EXPORT_SYMBOL(device_add_disk); void del_gendisk(struct gendisk *disk) { @@ -799,10 +800,9 @@ void __init printk_all_partitions(void) , disk_name(disk, part->partno, name_buf), part->info ? part->info->uuid : ""); if (is_part0) { - if (disk->driverfs_dev != NULL && - disk->driverfs_dev->driver != NULL) + if (dev->parent && dev->parent->driver) printk(" driver: %s\n", - disk->driverfs_dev->driver->name); + dev->parent->driver->name); else printk(" (driver?)\n"); } else diff --git a/drivers/block/brd.c b/drivers/block/brd.c index dd96a93..ba5145d 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -347,9 +347,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) goto out; } - rw = bio_rw(bio); - if (rw == READA) - rw = READ; + rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 63c2064..db9d6bb 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1951,7 +1951,6 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, if (cciss_create_ld_sysfs_entry(h, drv_index)) goto cleanup_queue; disk->private_data = h->drv[drv_index]; - disk->driverfs_dev = &h->drv[drv_index]->dev; /* Set up queue information */ blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); @@ -1973,7 +1972,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, /* allows the interrupt handler to start the queue */ wmb(); h->drv[drv_index]->queue = disk->queue; - add_disk(disk); + device_add_disk(&h->drv[drv_index]->dev, disk); return 0; cleanup_queue: diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index d524973..0a1aaf8 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -258,7 +258,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - D_ASSERT(device, (unsigned)(last - first) <= 1); + D_ASSERT(device, first <= last); D_ASSERT(device, atomic_read(&device->local_cnt) > 0); /* FIXME figure out a fast path for bios crossing AL extent boundaries */ @@ -341,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact i = 0; + drbd_bm_reset_al_hints(device); + /* Even though no one can start to change this list * once we set the LC_LOCKED -- from drbd_al_begin_io(), * lc_try_lock_for_transaction() --, someone may still @@ -770,10 +772,18 @@ static bool lazy_bitmap_update_due(struct drbd_device *device) static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) { - if (rs_done) - set_bit(RS_DONE, &device->flags); - /* and also set RS_PROGRESS below */ - else if (!lazy_bitmap_update_due(device)) + if (rs_done) { + struct drbd_connection *connection = first_peer_device(device)->connection; + if (connection->agreed_pro_version <= 95 || + is_sync_target_state(device->state.conn)) + set_bit(RS_DONE, &device->flags); + /* and also set RS_PROGRESS below */ + + /* Else: rather wait for explicit notification via receive_state, + * to avoid uuids-rotated-too-fast causing full resync + * in next handshake, in case the replication link breaks + * at the most unfortunate time... */ + } else if (!lazy_bitmap_update_due(device)) return; drbd_device_post_work(device, RS_PROGRESS); @@ -832,6 +842,13 @@ static int update_sync_bits(struct drbd_device *device, return count; } +static bool plausible_request_size(int size) +{ + return size > 0 + && size <= DRBD_MAX_BATCH_BIO_SIZE + && IS_ALIGNED(size, 512); +} + /* clear the bit corresponding to the piece of storage in question: * size byte of data starting from sector. Only clear a bits of the affected * one ore more _aligned_ BM_BLOCK_SIZE blocks. @@ -851,7 +868,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, if ((mode == SET_OUT_OF_SYNC) && size == 0) return 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { + if (!plausible_request_size(size)) { drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", drbd_change_sync_fname[mode], (unsigned long long)sector, size); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e5d89f6..ab62b81 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -96,6 +96,13 @@ struct drbd_bitmap { struct page **bm_pages; spinlock_t bm_lock; + /* exclusively to be used by __al_write_transaction(), + * drbd_bm_mark_for_writeout() and + * and drbd_bm_write_hinted() -> bm_rw() called from there. + */ + unsigned int n_bitmap_hints; + unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION]; + /* see LIMITATIONS: above */ unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ @@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page) set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); } +void drbd_bm_reset_al_hints(struct drbd_device *device) +{ + device->bitmap->n_bitmap_hints = 0; +} + /** * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout * @device: DRBD device. @@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page) */ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) { + struct drbd_bitmap *b = device->bitmap; struct page *page; if (page_nr >= device->bitmap->bm_number_of_pages) { drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", @@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) return; } page = device->bitmap->bm_pages[page_nr]; - set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); + BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints)); + if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page))) + b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr; } static int bm_test_page_unchanged(struct page *page) @@ -427,8 +442,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) } /* - * called on driver init only. TODO call when a device is created. - * allocates the drbd_bitmap, and stores it in device->bitmap. + * allocates the drbd_bitmap and stores it in device->bitmap. */ int drbd_bm_init(struct drbd_device *device) { @@ -633,7 +647,8 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi unsigned long bits, words, owords, obits; unsigned long want, have, onpages; /* number of pages */ struct page **npages, **opages = NULL; - int err = 0, growing; + int err = 0; + bool growing; if (!expect(b)) return -ENOMEM; @@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned { struct drbd_bm_aio_ctx *ctx; struct drbd_bitmap *b = device->bitmap; - int num_pages, i, count = 0; + unsigned int num_pages, i, count = 0; unsigned long now; char ppb[10]; int err = 0; @@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned now = jiffies; /* let the layers below us try to merge these bios... */ - for (i = 0; i < num_pages; i++) { - /* ignore completely unchanged pages */ - if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) - break; - if (!(flags & BM_AIO_READ)) { - if ((flags & BM_AIO_WRITE_HINTED) && - !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, - &page_private(b->bm_pages[i]))) - continue; + if (flags & BM_AIO_READ) { + for (i = 0; i < num_pages; i++) { + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); + } + } else if (flags & BM_AIO_WRITE_HINTED) { + /* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */ + unsigned int hint; + for (hint = 0; hint < b->n_bitmap_hints; hint++) { + i = b->al_bitmap_hints[hint]; + if (i >= num_pages) /* == -1U: no hint here. */ + continue; + /* Several AL-extents may point to the same page. */ + if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, + &page_private(b->bm_pages[i]))) + continue; + /* Has it even changed? */ + if (bm_test_page_unchanged(b->bm_pages[i])) + continue; + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + } + } else { + for (i = 0; i < num_pages; i++) { + /* ignore completely unchanged pages */ + if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) + break; if (!(flags & BM_AIO_WRITE_ALL_PAGES) && bm_test_page_unchanged(b->bm_pages[i])) { dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); @@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); continue; } + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); } - atomic_inc(&ctx->in_flight); - bm_page_io_async(ctx, i); - ++count; - cond_resched(); } /* @@ -1121,10 +1157,14 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); /* summary for global bitmap IO */ - if (flags == 0) - drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", - (flags & BM_AIO_READ) ? "READ" : "WRITE", - count, jiffies - now); + if (flags == 0) { + unsigned int ms = jiffies_to_msecs(jiffies - now); + if (ms > 5) { + drbd_info(device, "bitmap %s of %u pages took %u ms\n", + (flags & BM_AIO_READ) ? "READ" : "WRITE", + count, ms); + } + } if (ctx->error) { drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n"); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 4de95bb..be91a8d 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); - if (f & EE_IS_TRIM) { - seq_putc(m, sep); - sep = '|'; - if (f & EE_IS_TRIM_USE_ZEROOUT) - seq_puts(m, "zero-out"); - else - seq_puts(m, "trim"); - } + if (f & EE_IS_TRIM) + __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim"); + seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same"); seq_putc(m, '\n'); } @@ -908,7 +903,7 @@ static int drbd_version_open(struct inode *inode, struct file *file) return single_open(file, drbd_version_show, NULL); } -static struct file_operations drbd_version_fops = { +static const struct file_operations drbd_version_fops = { .owner = THIS_MODULE, .open = drbd_version_open, .llseek = seq_lseek, diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index a64c645..7b54354 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -468,9 +468,15 @@ enum { /* this is/was a write request */ __EE_WRITE, + /* this is/was a write same request */ + __EE_WRITE_SAME, + /* this originates from application on peer * (not some resync or verify or other DRBD internal request) */ __EE_APPLICATION, + + /* If it contains only 0 bytes, send back P_RS_DEALLOCATED */ + __EE_RS_THIN_REQ, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) @@ -484,7 +490,9 @@ enum { #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) #define EE_SUBMITTED (1<<__EE_SUBMITTED) #define EE_WRITE (1<<__EE_WRITE) +#define EE_WRITE_SAME (1<<__EE_WRITE_SAME) #define EE_APPLICATION (1<<__EE_APPLICATION) +#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ) /* flag bits per device */ enum { @@ -1123,6 +1131,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); +extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *); extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -1342,11 +1351,11 @@ struct bm_extent { #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ -/* For now, don't allow more than one activity log extent worth of data - * to be discarded in one go. We may need to rework drbd_al_begin_io() - * to allow for even larger discard ranges */ -#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE -#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) +/* For now, don't allow more than half of what we can "activate" in one + * activity log transaction to be discarded in one go. We may need to rework + * drbd_al_begin_io() to allow for even larger discard ranges */ +#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE) +#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9) extern int drbd_bm_init(struct drbd_device *device); extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); @@ -1369,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); +extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); @@ -1483,12 +1493,14 @@ enum determine_dev_size { extern enum determine_dev_size drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); extern void resync_after_online_grow(struct drbd_device *); -extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); +extern void drbd_reconsider_queue_parameters(struct drbd_device *device, + struct drbd_backing_dev *bdev, struct o_qlim *o); extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force); extern bool conn_try_outdate_peer(struct drbd_connection *connection); extern void conn_try_outdate_peer_async(struct drbd_connection *connection); +extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd); extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ @@ -1548,6 +1560,8 @@ extern void start_resync_timer_fn(unsigned long data); extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ +extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, + sector_t start, unsigned int nr_sectors, bool discard); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); extern void drbd_send_ping_wf(struct work_struct *ws); @@ -1561,7 +1575,7 @@ extern int drbd_submit_peer_request(struct drbd_device *, extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, - bool, + unsigned int, gfp_t) __must_hold(local); extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, int); @@ -1635,8 +1649,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin /* drbd_proc.c */ extern struct proc_dir_entry *drbd_proc; extern const struct file_operations drbd_proc_fops; -extern const char *drbd_conn_str(enum drbd_conns s); -extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); @@ -2095,13 +2107,22 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); } +static inline bool is_sync_target_state(enum drbd_conns connection_state) +{ + return connection_state == C_SYNC_TARGET || + connection_state == C_PAUSED_SYNC_T; +} + +static inline bool is_sync_source_state(enum drbd_conns connection_state) +{ + return connection_state == C_SYNC_SOURCE || + connection_state == C_PAUSED_SYNC_S; +} + static inline bool is_sync_state(enum drbd_conns connection_state) { - return - (connection_state == C_SYNC_SOURCE - || connection_state == C_SYNC_TARGET - || connection_state == C_PAUSED_SYNC_S - || connection_state == C_PAUSED_SYNC_T); + return is_sync_source_state(connection_state) || + is_sync_target_state(connection_state); } /** diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h index f210543..23c5a94 100644 --- a/drivers/block/drbd/drbd_interval.h +++ b/drivers/block/drbd/drbd_interval.h @@ -6,13 +6,13 @@ struct drbd_interval { struct rb_node rb; - sector_t sector; /* start sector of the interval */ - unsigned int size; /* size in bytes */ - sector_t end; /* highest interval end in subtree */ - int local:1 /* local or remote request? */; - int waiting:1; /* someone is waiting for this to complete */ - int completed:1; /* this has been completed already; - * ignore for conflict detection */ + sector_t sector; /* start sector of the interval */ + unsigned int size; /* size in bytes */ + sector_t end; /* highest interval end in subtree */ + unsigned int local:1 /* local or remote request? */; + unsigned int waiting:1; /* someone is waiting for completion */ + unsigned int completed:1; /* this has been completed already; + * ignore for conflict detection */ }; static inline void drbd_clear_interval(struct drbd_interval *i) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2b37744..0501ae0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -31,7 +31,7 @@ #include <linux/module.h> #include <linux/jiffies.h> #include <linux/drbd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/types.h> #include <net/sock.h> #include <linux/ctype.h> @@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) } } +/* communicated if (agreed_features & DRBD_FF_WSAME) */ +void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q) +{ + if (q) { + p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); + p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); + p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q)); + p->qlim->io_min = cpu_to_be32(queue_io_min(q)); + p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); + p->qlim->discard_enabled = blk_queue_discard(q); + p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q); + p->qlim->write_same_capable = !!q->limits.max_write_same_sectors; + } else { + q = device->rq_queue; + p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); + p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); + p->qlim->alignment_offset = 0; + p->qlim->io_min = cpu_to_be32(queue_io_min(q)); + p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); + p->qlim->discard_enabled = 0; + p->qlim->discard_zeroes_data = 0; + p->qlim->write_same_capable = 0; + } +} + int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) { struct drbd_device *device = peer_device->device; @@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu sector_t d_size, u_size; int q_order_type; unsigned int max_bio_size; + unsigned int packet_size; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + packet_size = sizeof(*p); + if (peer_device->connection->agreed_features & DRBD_FF_WSAME) + packet_size += sizeof(p->qlim[0]); + + memset(p, 0, packet_size); if (get_ldev_if_state(device, D_NEGOTIATING)) { - D_ASSERT(device, device->ldev->backing_bdev); + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); d_size = drbd_get_max_capacity(device->ldev); rcu_read_lock(); u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); q_order_type = drbd_queue_order_type(device); - max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = queue_max_hw_sectors(q) << 9; max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); + assign_p_sizes_qlim(device, p, q); put_ldev(device); } else { d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ + assign_p_sizes_qlim(device, p, NULL); } - sock = &peer_device->connection->data; - p = drbd_prepare_command(peer_device, sock); - if (!p) - return -EIO; - if (peer_device->connection->agreed_pro_version <= 94) max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); else if (peer_device->connection->agreed_pro_version < 100) @@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu p->max_bio_size = cpu_to_be32(max_bio_size); p->queue_order_type = cpu_to_be16(q_order_type); p->dds_flags = cpu_to_be16(flags); - return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0); + + return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0); } /** @@ -1377,6 +1411,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd, cpu_to_be64(block_id)); } +int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device, + struct drbd_peer_request *peer_req) +{ + struct drbd_socket *sock; + struct p_block_desc *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(peer_req->i.sector); + p->blksize = cpu_to_be32(peer_req->i.size); + p->pad = 0; + return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0); +} + int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, sector_t sector, int size, u64 block_id) { @@ -1561,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio) ? 0 : MSG_MORE); if (err) return err; + /* REQ_OP_WRITE_SAME has only one segment */ + if (bio_op(bio) == REQ_OP_WRITE_SAME) + break; } return 0; } @@ -1579,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b bio_iter_last(bvec, iter) ? 0 : MSG_MORE); if (err) return err; + /* REQ_OP_WRITE_SAME has only one segment */ + if (bio_op(bio) == REQ_OP_WRITE_SAME) + break; } return 0; } @@ -1610,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | (bio->bi_rw & REQ_FUA ? DP_FUA : 0) | (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) | + (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) | (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0); else return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; @@ -1623,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * struct drbd_device *device = peer_device->device; struct drbd_socket *sock; struct p_data *p; + struct p_wsame *wsame = NULL; + void *digest_out; unsigned int dp_flags = 0; int digest_size; int err; @@ -1658,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); goto out; } + if (dp_flags & DP_WSAME) { + /* this will only work if DRBD_FF_WSAME is set AND the + * handshake agreed that all nodes and backend devices are + * WRITE_SAME capable and agree on logical_block_size */ + wsame = (struct p_wsame*)p; + digest_out = wsame + 1; + wsame->size = cpu_to_be32(req->i.size); + } else + digest_out = p + 1; /* our digest is still only over the payload. * TRIM does not carry any payload. */ if (digest_size) - drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); - err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out); + if (wsame) { + err = + __send_command(peer_device->connection, device->vnr, sock, P_WSAME, + sizeof(*wsame) + digest_size, NULL, + bio_iovec(req->master_bio).bv_len); + } else + err = + __send_command(peer_device->connection, device->vnr, sock, P_DATA, + sizeof(*p) + digest_size, NULL, req->i.size); if (!err) { /* For protocol A, we have to memcpy the payload into * socket buffers, as we may complete right away @@ -3507,7 +3583,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused) struct bm_io_work *work = &device->bm_io_work; int rv = -EIO; - D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); + if (work->flags != BM_LOCKED_CHANGE_ALLOWED) { + int cnt = atomic_read(&device->ap_bio_cnt); + if (cnt) + drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n", + cnt, work->why); + } if (get_ldev(device)) { drbd_bm_lock(device, work->why, work->flags); @@ -3587,18 +3668,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device, int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), char *why, enum bm_flag flags) { + /* Only suspend io, if some operation is supposed to be locked out */ + const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST); int rv; D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); - if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + if (do_suspend_io) drbd_suspend_io(device); drbd_bm_lock(device, why, flags); rv = io_fn(device); drbd_bm_unlock(device); - if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + if (do_suspend_io) drbd_resume_io(device); return rv; @@ -3637,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd) * one PRO_VERSION */ static const char *cmdnames[] = { [P_DATA] = "Data", + [P_WSAME] = "WriteSame", + [P_TRIM] = "Trim", [P_DATA_REPLY] = "DataReply", [P_RS_DATA_REPLY] = "RSDataReply", [P_BARRIER] = "Barrier", @@ -3681,6 +3766,8 @@ const char *cmdname(enum drbd_packet cmd) [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", [P_RETRY_WRITE] = "retry_write", [P_PROTOCOL_UPDATE] = "protocol_update", + [P_RS_THIN_REQ] = "rs_thin_req", + [P_RS_DEALLOCATED] = "rs_deallocated", /* enum drbd_packet, but not commands - obsoleted flags: * P_MAY_IGNORE diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 0bac9c8..f35db29 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -343,7 +343,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) (char[20]) { }, /* address family */ (char[60]) { }, /* address */ NULL }; - char mb[12]; + char mb[14]; char *argv[] = {usermode_helper, cmd, mb, NULL }; struct drbd_connection *connection = first_peer_device(device)->connection; struct sib_info sib; @@ -352,7 +352,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) if (current == connection->worker.task) set_bit(CALLBACK_PENDING, &connection->flags); - snprintf(mb, 12, "minor-%d", device_to_minor(device)); + snprintf(mb, 14, "minor-%d", device_to_minor(device)); setup_khelper_env(connection, envp); /* The helper may take some time. @@ -387,7 +387,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) return ret; } -static int conn_khelper(struct drbd_connection *connection, char *cmd) +enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd) { char *envp[] = { "HOME=/", "TERM=linux", @@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec } rcu_read_unlock(); - if (fp == FP_NOT_AVAIL) { - /* IO Suspending works on the whole resource. - Do it only for one device. */ - vnr = 0; - peer_device = idr_get_next(&connection->peer_devices, &vnr); - drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); - } - return fp; } +static bool resource_is_supended(struct drbd_resource *resource) +{ + return resource->susp || resource->susp_fen || resource->susp_nod; +} + bool conn_try_outdate_peer(struct drbd_connection *connection) { + struct drbd_resource * const resource = connection->resource; unsigned int connect_cnt; union drbd_state mask = { }; union drbd_state val = { }; @@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) char *ex_to_string; int r; - spin_lock_irq(&connection->resource->req_lock); + spin_lock_irq(&resource->req_lock); if (connection->cstate >= C_WF_REPORT_PARAMS) { drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&resource->req_lock); return false; } connect_cnt = connection->connect_cnt; - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&resource->req_lock); fp = highest_fencing_policy(connection); switch (fp) { case FP_NOT_AVAIL: drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n"); - goto out; + spin_lock_irq(&resource->req_lock); + if (connection->cstate < C_WF_REPORT_PARAMS) { + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE | CS_HARD | CS_DC_SUSP); + /* We are no longer suspended due to the fencing policy. + * We may still be suspended due to the on-no-data-accessible policy. + * If that was OND_IO_ERROR, fail pending requests. */ + if (!resource_is_supended(resource)) + _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); + } + /* Else: in case we raced with a connection handshake, + * let the handshake figure out if we maybe can RESEND, + * and do not resume/fail pending requests here. + * Worst case is we stay suspended for now, which may be + * resolved by either re-establishing the replication link, or + * the next link failure, or eventually the administrator. */ + spin_unlock_irq(&resource->req_lock); + return false; + case FP_DONT_CARE: return true; default: ; @@ -485,17 +503,17 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) r = conn_khelper(connection, "fence-peer"); switch ((r>>8) & 0xff) { - case 3: /* peer is inconsistent */ + case P_INCONSISTENT: /* peer is inconsistent */ ex_to_string = "peer is inconsistent or worse"; mask.pdsk = D_MASK; val.pdsk = D_INCONSISTENT; break; - case 4: /* peer got outdated, or was already outdated */ + case P_OUTDATED: /* peer got outdated, or was already outdated */ ex_to_string = "peer was fenced"; mask.pdsk = D_MASK; val.pdsk = D_OUTDATED; break; - case 5: /* peer was down */ + case P_DOWN: /* peer was down */ if (conn_highest_disk(connection) == D_UP_TO_DATE) { /* we will(have) create(d) a new UUID anyways... */ ex_to_string = "peer is unreachable, assumed to be dead"; @@ -505,7 +523,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; } break; - case 6: /* Peer is primary, voluntarily outdate myself. + case P_PRIMARY: /* Peer is primary, voluntarily outdate myself. * This is useful when an unconnected R_SECONDARY is asked to * become R_PRIMARY, but finds the other peer being active. */ ex_to_string = "peer is active"; @@ -513,7 +531,9 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) mask.disk = D_MASK; val.disk = D_OUTDATED; break; - case 7: + case P_FENCING: + /* THINK: do we need to handle this + * like case 4, or more like case 5? */ if (fp != FP_STONITH) drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n"); ex_to_string = "peer was stonithed"; @@ -529,13 +549,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) drbd_info(connection, "fence-peer helper returned %d (%s)\n", (r>>8) & 0xff, ex_to_string); - out: - /* Not using conn_request_state(connection, mask, val, CS_VERBOSE); here, because we might were able to re-establish the connection in the meantime. */ - spin_lock_irq(&connection->resource->req_lock); + spin_lock_irq(&resource->req_lock); if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { if (connection->connect_cnt != connect_cnt) /* In case the connection was established and droped @@ -544,7 +562,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) else _conn_request_state(connection, mask, val, CS_VERBOSE); } - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&resource->req_lock); return conn_highest_pdsk(connection) <= D_OUTDATED; } @@ -1154,51 +1172,160 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) return 0; } +static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) +{ + q->limits.discard_granularity = granularity; +} + +static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) +{ + /* when we introduced REQ_WRITE_SAME support, we also bumped + * our maximum supported batch bio size used for discards. */ + if (connection->agreed_features & DRBD_FF_WSAME) + return DRBD_MAX_BBIO_SECTORS; + /* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */ + return AL_EXTENT_SIZE >> 9; +} + +static void decide_on_discard_support(struct drbd_device *device, + struct request_queue *q, + struct request_queue *b, + bool discard_zeroes_if_aligned) +{ + /* q = drbd device queue (device->rq_queue) + * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue), + * or NULL if diskless + */ + struct drbd_connection *connection = first_peer_device(device)->connection; + bool can_do = b ? blk_queue_discard(b) : true; + + if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) { + can_do = false; + drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n"); + } + if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { + can_do = false; + drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); + } + if (can_do) { + /* We don't care for the granularity, really. + * Stacking limits below should fix it for the local + * device. Whether or not it is a suitable granularity + * on the remote device is not our problem, really. If + * you care, you need to use devices with similar + * topology on all peers. */ + blk_queue_discard_granularity(q, 512); + q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + } else { + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + blk_queue_discard_granularity(q, 0); + q->limits.max_discard_sectors = 0; + } +} + +static void fixup_discard_if_not_supported(struct request_queue *q) +{ + /* To avoid confusion, if this queue does not support discard, clear + * max_discard_sectors, which is what lsblk -D reports to the user. + * Older kernels got this wrong in "stack limits". + * */ + if (!blk_queue_discard(q)) { + blk_queue_max_discard_sectors(q, 0); + blk_queue_discard_granularity(q, 0); + } +} + +static void decide_on_write_same_support(struct drbd_device *device, + struct request_queue *q, + struct request_queue *b, struct o_qlim *o) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + bool can_do = b ? b->limits.max_write_same_sectors : true; + + if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) { + can_do = false; + drbd_info(peer_device, "peer does not support WRITE_SAME\n"); + } + + if (o) { + /* logical block size; queue_logical_block_size(NULL) is 512 */ + unsigned int peer_lbs = be32_to_cpu(o->logical_block_size); + unsigned int me_lbs_b = queue_logical_block_size(b); + unsigned int me_lbs = queue_logical_block_size(q); + + if (me_lbs_b != me_lbs) { + drbd_warn(device, + "logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n", + me_lbs, me_lbs_b); + /* rather disable write same than trigger some BUG_ON later in the scsi layer. */ + can_do = false; + } + if (me_lbs_b != peer_lbs) { + drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n", + me_lbs, peer_lbs); + if (can_do) { + drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n"); + can_do = false; + } + me_lbs = max(me_lbs, me_lbs_b); + /* We cannot change the logical block size of an in-use queue. + * We can only hope that access happens to be properly aligned. + * If not, the peer will likely produce an IO error, and detach. */ + if (peer_lbs > me_lbs) { + if (device->state.role != R_PRIMARY) { + blk_queue_logical_block_size(q, peer_lbs); + drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs); + } else { + drbd_warn(peer_device, + "current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n", + me_lbs, peer_lbs); + } + } + } + if (can_do && !o->write_same_capable) { + /* If we introduce an open-coded write-same loop on the receiving side, + * the peer would present itself as "capable". */ + drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n"); + can_do = false; + } + } + + blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0); +} + static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, - unsigned int max_bio_size) + unsigned int max_bio_size, struct o_qlim *o) { struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; struct request_queue *b = NULL; + struct disk_conf *dc; + bool discard_zeroes_if_aligned = true; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); - max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + dc = rcu_dereference(device->ldev->disk_conf); + max_segments = dc->max_bio_bvecs; + discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); - blk_queue_max_write_same_sectors(q, 0); } - blk_queue_logical_block_size(q, 512); blk_queue_max_hw_sectors(q, max_hw_sectors); /* This is the workaround for "bio would need to, but cannot, be split" */ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_SIZE-1); + decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); + decide_on_write_same_support(device, q, b, o); if (b) { - struct drbd_connection *connection = first_peer_device(device)->connection; - - blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - - if (blk_queue_discard(b) && - (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { - /* We don't care, stacking below should fix it for the local device. - * Whether or not it is a suitable granularity on the remote device - * is not our problem, really. If you care, you need to - * use devices with similar topology on all peers. */ - q->limits.discard_granularity = 512; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - } else { - blk_queue_max_discard_sectors(q, 0); - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - q->limits.discard_granularity = 0; - } - blk_queue_stack_limits(q, b); if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { @@ -1208,15 +1335,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } } - /* To avoid confusion, if this queue does not support discard, clear - * max_discard_sectors, which is what lsblk -D reports to the user. */ - if (!blk_queue_discard(q)) { - blk_queue_max_discard_sectors(q, 0); - q->limits.discard_granularity = 0; - } + fixup_discard_if_not_supported(q); } -void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) +void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) { unsigned int now, new, local, peer; @@ -1259,7 +1381,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backin if (new != now) drbd_info(device, "max BIO size = %u\n", new); - drbd_setup_queue_param(device, bdev, new); + drbd_setup_queue_param(device, bdev, new, o); } /* Starts the worker thread */ @@ -1348,6 +1470,43 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) a->disk_drain != b->disk_drain; } +static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf, + struct drbd_backing_dev *nbc) +{ + struct request_queue * const q = nbc->backing_bdev->bd_disk->queue; + + if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (disk_conf->al_extents > drbd_al_extents_max(nbc)) + disk_conf->al_extents = drbd_al_extents_max(nbc); + + if (!blk_queue_discard(q) + || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) { + if (disk_conf->rs_discard_granularity) { + disk_conf->rs_discard_granularity = 0; /* disable feature */ + drbd_info(device, "rs_discard_granularity feature disabled\n"); + } + } + + if (disk_conf->rs_discard_granularity) { + int orig_value = disk_conf->rs_discard_granularity; + int remainder; + + if (q->limits.discard_granularity > disk_conf->rs_discard_granularity) + disk_conf->rs_discard_granularity = q->limits.discard_granularity; + + remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity; + disk_conf->rs_discard_granularity += remainder; + + if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9) + disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9; + + if (disk_conf->rs_discard_granularity != orig_value) + drbd_info(device, "rs_discard_granularity changed to %d\n", + disk_conf->rs_discard_granularity); + } +} + int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1395,10 +1554,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (!expect(new_disk_conf->resync_rate >= 1)) new_disk_conf->resync_rate = 1; - if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) - new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; - if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev)) - new_disk_conf->al_extents = drbd_al_extents_max(device->ldev); + sanitize_disk_conf(device, new_disk_conf, device->ldev); if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; @@ -1457,6 +1613,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (write_ordering_changed(old_disk_conf, new_disk_conf)) drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); + if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned) + drbd_reconsider_queue_parameters(device, device->ldev, NULL); + drbd_md_sync(device); if (device->state.conn >= C_CONNECTED) { @@ -1693,10 +1852,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (retcode != NO_ERROR) goto fail; - if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) - new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; - if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) - new_disk_conf->al_extents = drbd_al_extents_max(nbc); + sanitize_disk_conf(device, new_disk_conf, nbc); if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { drbd_err(device, "max capacity %llu smaller than disk size %llu\n", @@ -1838,7 +1994,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device->read_cnt = 0; device->writ_cnt = 0; - drbd_reconsider_max_bio_size(device, device->ldev); + drbd_reconsider_queue_parameters(device, device->ldev, NULL); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 6537b25..be2b93f 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -25,7 +25,7 @@ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/proc_fs.h> @@ -122,18 +122,18 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se x = res/50; y = 20-x; - seq_printf(seq, "\t["); + seq_puts(seq, "\t["); for (i = 1; i < x; i++) - seq_printf(seq, "="); - seq_printf(seq, ">"); + seq_putc(seq, '='); + seq_putc(seq, '>'); for (i = 0; i < y; i++) seq_printf(seq, "."); - seq_printf(seq, "] "); + seq_puts(seq, "] "); if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) - seq_printf(seq, "verified:"); + seq_puts(seq, "verified:"); else - seq_printf(seq, "sync'ed:"); + seq_puts(seq, "sync'ed:"); seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); /* if more than a few GB, display in MB */ @@ -146,7 +146,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se (unsigned long) Bit2KB(rs_left), (unsigned long) Bit2KB(rs_total)); - seq_printf(seq, "\n\t"); + seq_puts(seq, "\n\t"); /* see drivers/md/md.c * We do not want to overflow, so the order of operands and @@ -175,9 +175,9 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se rt / 3600, (rt % 3600) / 60, rt % 60); dbdt = Bit2KB(db/dt); - seq_printf(seq, " speed: "); + seq_puts(seq, " speed: "); seq_printf_with_thousands_grouping(seq, dbdt); - seq_printf(seq, " ("); + seq_puts(seq, " ("); /* ------------------------- ~3s average ------------------------ */ if (proc_details >= 1) { /* this is what drbd_rs_should_slow_down() uses */ @@ -188,7 +188,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se db = device->rs_mark_left[i] - rs_left; dbdt = Bit2KB(db/dt); seq_printf_with_thousands_grouping(seq, dbdt); - seq_printf(seq, " -- "); + seq_puts(seq, " -- "); } /* --------------------- long term average ---------------------- */ @@ -200,11 +200,11 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se db = rs_total - rs_left; dbdt = Bit2KB(db/dt); seq_printf_with_thousands_grouping(seq, dbdt); - seq_printf(seq, ")"); + seq_putc(seq, ')'); if (state.conn == C_SYNC_TARGET || state.conn == C_VERIFY_S) { - seq_printf(seq, " want: "); + seq_puts(seq, " want: "); seq_printf_with_thousands_grouping(seq, device->c_sync_rate); } seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); @@ -231,7 +231,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se (unsigned long long)bm_bits * BM_SECT_PER_BIT); if (stop_sector != 0 && stop_sector != ULLONG_MAX) seq_printf(seq, " stop sector: %llu", stop_sector); - seq_printf(seq, "\n"); + seq_putc(seq, '\n'); } } @@ -276,7 +276,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); idr_for_each_entry(&drbd_devices, device, i) { if (prev_i != i - 1) - seq_printf(seq, "\n"); + seq_putc(seq, '\n'); prev_i = i; state = device->state; diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 129f8c7..4d29680 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -60,6 +60,15 @@ enum drbd_packet { * which is why I chose TRIM here, to disambiguate. */ P_TRIM = 0x31, + /* Only use these two if both support FF_THIN_RESYNC */ + P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */ + P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */ + + /* REQ_WRITE_SAME. + * On a receiving side without REQ_WRITE_SAME, + * we may fall back to an opencoded loop instead. */ + P_WSAME = 0x34, + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -106,8 +115,11 @@ struct p_header100 { u32 pad; } __packed; -/* these defines must not be changed without changing the protocol version */ -#define DP_HARDBARRIER 1 /* depricated */ +/* These defines must not be changed without changing the protocol version. + * New defines may only be introduced together with protocol version bump or + * new protocol feature flags. + */ +#define DP_HARDBARRIER 1 /* no longer used */ #define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 #define DP_UNPLUG 8 /* not used anymore */ @@ -116,6 +128,7 @@ struct p_header100 { #define DP_DISCARD 64 /* equals REQ_DISCARD */ #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ +#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */ struct p_data { u64 sector; /* 64 bits sector number */ @@ -129,6 +142,11 @@ struct p_trim { u32 size; /* == bio->bi_size */ } __packed; +struct p_wsame { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + /* * commands which share a struct: * p_block_ack: @@ -160,7 +178,23 @@ struct p_block_req { * ReportParams */ -#define FF_TRIM 1 +/* supports TRIM/DISCARD on the "wire" protocol */ +#define DRBD_FF_TRIM 1 + +/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks + * instead of fully allocate a supposedly thin volume on initial resync */ +#define DRBD_FF_THIN_RESYNC 2 + +/* supports REQ_WRITE_SAME on the "wire" protocol. + * Note: this flag is overloaded, + * its presence also + * - indicates support for 128 MiB "batch bios", + * max discard size of 128 MiB + * instead of 4M before that. + * - indicates that we exchange additional settings in p_sizes + * drbd_send_sizes()/receive_sizes() + */ +#define DRBD_FF_WSAME 4 struct p_connection_features { u32 protocol_min; @@ -235,6 +269,40 @@ struct p_rs_uuid { u64 uuid; } __packed; +/* optional queue_limits if (agreed_features & DRBD_FF_WSAME) + * see also struct queue_limits, as of late 2015 */ +struct o_qlim { + /* we don't need it yet, but we may as well communicate it now */ + u32 physical_block_size; + + /* so the original in struct queue_limits is unsigned short, + * but I'd have to put in padding anyways. */ + u32 logical_block_size; + + /* One incoming bio becomes one DRBD request, + * which may be translated to several bio on the receiving side. + * We don't need to communicate chunk/boundary/segment ... limits. + */ + + /* various IO hints may be useful with "diskless client" setups */ + u32 alignment_offset; + u32 io_min; + u32 io_opt; + + /* We may need to communicate integrity stuff at some point, + * but let's not get ahead of ourselves. */ + + /* Backend discard capabilities. + * Receiving side uses "blkdev_issue_discard()", no need to communicate + * more specifics. If the backend cannot do discards, the DRBD peer + * may fall back to blkdev_issue_zeroout(). + */ + u8 discard_enabled; + u8 discard_zeroes_data; + u8 write_same_capable; + u8 _pad; +} __packed; + struct p_sizes { u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ @@ -242,6 +310,9 @@ struct p_sizes { u32 max_bio_size; /* Maximal size of a BIO */ u16 queue_order_type; /* not yet implemented in DRBD*/ u16 dds_flags; /* use enum dds_flags here. */ + + /* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */ + struct o_qlim qlim[0]; } __packed; struct p_state { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1ee0023..df45713 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -25,7 +25,7 @@ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/sock.h> #include <linux/drbd.h> @@ -48,7 +48,7 @@ #include "drbd_req.h" #include "drbd_vli.h" -#define PRO_FEATURES (FF_TRIM) +#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) struct packet_info { enum drbd_packet cmd; @@ -361,14 +361,17 @@ You must not have the req_lock: drbd_wait_ee_list_empty() */ +/* normal: payload_size == request size (bi_size) + * w_same: payload_size == logical_block_size + * trim: payload_size == 0 */ struct drbd_peer_request * drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) + unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; @@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } - if (has_payload && data_size) { + if (nr_pages) { page = drbd_alloc_pages(peer_device, nr_pages, gfpflags_allow_blocking(gfp_mask)); if (!page) @@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto memset(peer_req, 0, sizeof(*peer_req)); INIT_LIST_HEAD(&peer_req->w.list); drbd_clear_interval(&peer_req->i); - peer_req->i.size = data_size; + peer_req->i.size = request_size; peer_req->i.sector = sector; peer_req->submit_jif = jiffies; peer_req->peer_device = peer_device; @@ -1204,13 +1207,84 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in return err; } -static void drbd_flush(struct drbd_connection *connection) +/* This is blkdev_issue_flush, but asynchronous. + * We want to submit to all component volumes in parallel, + * then wait for all completions. + */ +struct issue_flush_context { + atomic_t pending; + int error; + struct completion done; +}; +struct one_flush_context { + struct drbd_device *device; + struct issue_flush_context *ctx; +}; + +void one_flush_endio(struct bio *bio) { - int rv; - struct drbd_peer_device *peer_device; - int vnr; + struct one_flush_context *octx = bio->bi_private; + struct drbd_device *device = octx->device; + struct issue_flush_context *ctx = octx->ctx; + if (bio->bi_error) { + ctx->error = bio->bi_error; + drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error); + } + kfree(octx); + bio_put(bio); + + clear_bit(FLUSH_PENDING, &device->flags); + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + + if (atomic_dec_and_test(&ctx->pending)) + complete(&ctx->done); +} + +static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) +{ + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); + if (!bio || !octx) { + drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); + /* FIXME: what else can I do now? disconnecting or detaching + * really does not help to improve the state of the world, either. + */ + kfree(octx); + if (bio) + bio_put(bio); + + ctx->error = -ENOMEM; + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + return; + } + + octx->device = device; + octx->ctx = ctx; + bio->bi_bdev = device->ldev->backing_bdev; + bio->bi_private = octx; + bio->bi_end_io = one_flush_endio; + bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH); + + device->flush_jif = jiffies; + set_bit(FLUSH_PENDING, &device->flags); + atomic_inc(&ctx->pending); + submit_bio(bio); +} + +static void drbd_flush(struct drbd_connection *connection) +{ if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { + struct drbd_peer_device *peer_device; + struct issue_flush_context ctx; + int vnr; + + atomic_set(&ctx.pending, 1); + ctx.error = 0; + init_completion(&ctx.done); + rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1220,31 +1294,24 @@ static void drbd_flush(struct drbd_connection *connection) kref_get(&device->kref); rcu_read_unlock(); - /* Right now, we have only this one synchronous code path - * for flushes between request epochs. - * We may want to make those asynchronous, - * or at least parallelize the flushes to the volume devices. - */ - device->flush_jif = jiffies; - set_bit(FLUSH_PENDING, &device->flags); - rv = blkdev_issue_flush(device->ldev->backing_bdev, - GFP_NOIO, NULL); - clear_bit(FLUSH_PENDING, &device->flags); - if (rv) { - drbd_info(device, "local disk flush failed with status %d\n", rv); - /* would rather check on EOPNOTSUPP, but that is not reliable. - * don't try again for ANY return value != 0 - * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); - } - put_ldev(device); - kref_put(&device->kref, drbd_destroy_device); + submit_one_flush(device, &ctx); rcu_read_lock(); - if (rv) - break; } rcu_read_unlock(); + + /* Do we want to add a timeout, + * if disk-timeout is set? */ + if (!atomic_dec_and_test(&ctx.pending)) + wait_for_completion(&ctx.done); + + if (ctx.error) { + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + /* Any error is already reported by bio_endio callback. */ + drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); + } } } @@ -1379,6 +1446,120 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } +/* + * We *may* ignore the discard-zeroes-data setting, if so configured. + * + * Assumption is that it "discard_zeroes_data=0" is only because the backend + * may ignore partial unaligned discards. + * + * LVM/DM thin as of at least + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) + * Library version: 1.02.93-RHEL7 (2015-01-28) + * Driver version: 4.29.0 + * still behaves this way. + * + * For unaligned (wrt. alignment and granularity) or too small discards, + * we zero-out the initial (and/or) trailing unaligned partial chunks, + * but discard all the aligned full chunks. + * + * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1". + */ +int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard) +{ + struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + sector_t tmp, nr; + unsigned int max_discard_sectors, granularity; + int alignment; + int err = 0; + + if (!discard) + goto zero_out; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) + goto zero_out; + + if (nr_sectors < granularity) + goto zero_out; + + tmp = start; + if (sector_div(tmp, granularity) != alignment) { + if (nr_sectors < 2*granularity) + goto zero_out; + /* start + gran - (start + gran - align) % gran */ + tmp = start + granularity - alignment; + tmp = start + granularity - sector_div(tmp, granularity); + + nr = tmp - start; + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start = tmp; + } + while (nr_sectors >= granularity) { + nr = min_t(sector_t, nr_sectors, max_discard_sectors); + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start += nr; + } + zero_out: + if (nr_sectors) { + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0); + } + return err != 0; +} + +static bool can_do_reliable_discards(struct drbd_device *device) +{ + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct disk_conf *dc; + bool can_do; + + if (!blk_queue_discard(q)) + return false; + + if (q->limits.discard_zeroes_data) + return true; + + rcu_read_lock(); + dc = rcu_dereference(device->ldev->disk_conf); + can_do = dc->discard_zeroes_if_aligned; + rcu_read_unlock(); + return can_do; +} + +static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + /* If the backend cannot discard, or does not guarantee + * read-back zeroes in discarded ranges, we fall back to + * zero-out. Unless configuration specifically requested + * otherwise. */ + if (!can_do_reliable_discards(device)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, + peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT))) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); +} + +static void drbd_issue_peer_wsame(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct block_device *bdev = device->ldev->backing_bdev; + sector_t s = peer_req->i.sector; + sector_t nr = peer_req->i.size >> 9; + if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); +} + + /** * drbd_submit_peer_request() * @device: DRBD device. @@ -1410,7 +1591,13 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; - if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* TRIM/DISCARD: for now, always use the helper function + * blkdev_issue_zeroout(..., discard=true). + * It's synchronous, but it does the right thing wrt. bio splitting. + * Correctness first, performance later. Next step is to code an + * asynchronous variant of the same. + */ + if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { /* wait for all pending IO completions, before we start * zeroing things out. */ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1418,22 +1605,22 @@ int drbd_submit_peer_request(struct drbd_device *device, * so we can find it to present it in debugfs */ peer_req->submit_jif = jiffies; peer_req->flags |= EE_SUBMITTED; - spin_lock_irq(&device->resource->req_lock); - list_add_tail(&peer_req->w.list, &device->active_ee); - spin_unlock_irq(&device->resource->req_lock); - if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, data_size >> 9, GFP_NOIO, false)) - peer_req->flags |= EE_WAS_ERROR; - drbd_endio_write_sec_final(peer_req); + + /* If this was a resync request from receive_rs_deallocated(), + * it is already on the sync_ee list */ + if (list_empty(&peer_req->w.list)) { + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + } + + if (peer_req->flags & EE_IS_TRIM) + drbd_issue_peer_discard(device, peer_req); + else /* EE_WRITE_SAME */ + drbd_issue_peer_wsame(device, peer_req); return 0; } - /* Discards don't have any payload. - * But the scsi layer still expects a bio_vec it can use internally, - * see sd_setup_discard_cmnd() and blk_add_request_payload(). */ - if (peer_req->flags & EE_IS_TRIM) - nr_pages = 1; - /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the @@ -1459,11 +1646,6 @@ next_bio: bios = bio; ++n_bios; - if (op == REQ_OP_DISCARD) { - bio->bi_iter.bi_size = data_size; - goto submit; - } - page_chain_for_each(page) { unsigned len = min_t(unsigned, data_size, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { @@ -1485,7 +1667,6 @@ next_bio: --nr_pages; } D_ASSERT(device, data_size == 0); -submit: D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); @@ -1609,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf return 0; } +/* quick wrapper in case payload size != request_size (write same) */ +static void drbd_csum_ee_size(struct crypto_ahash *h, + struct drbd_peer_request *r, void *d, + unsigned int payload_size) +{ + unsigned int tmp = r->i.size; + r->i.size = payload_size; + drbd_csum_ee(h, r, d); + r->i.size = tmp; +} + /* used from receive_RSDataReply (recv_resync_read) - * and from receive_Data */ + * and from receive_Data. + * data_size: actual payload ("data in") + * for normal writes that is bi_size. + * for discards, that is zero. + * for write same, it is logical_block_size. + * both trim and write same have the bi_size ("data len to be affected") + * as extra argument in the packet header. + */ static struct drbd_peer_request * read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, struct packet_info *pi) __must_hold(local) @@ -1625,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; digest_size = 0; if (!trim && peer_device->connection->peer_integrity_tfm) { @@ -1639,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= digest_size; } + /* assume request_size == data_size, but special case trim and wsame. */ + ds = data_size; if (trim) { - D_ASSERT(peer_device, data_size == 0); - data_size = be32_to_cpu(trim->size); + if (!expect(data_size == 0)) + return NULL; + ds = be32_to_cpu(trim->size); + } else if (wsame) { + if (data_size != queue_logical_block_size(device->rq_queue)) { + drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", + data_size, queue_logical_block_size(device->rq_queue)); + return NULL; + } + if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) { + drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n", + data_size, bdev_logical_block_size(device->ldev->backing_bdev)); + return NULL; + } + ds = be32_to_cpu(wsame->size); } - if (!expect(IS_ALIGNED(data_size, 512))) + if (!expect(IS_ALIGNED(ds, 512))) return NULL; - /* prepare for larger trim requests. */ - if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) + if (trim || wsame) { + if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) + return NULL; + } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, * we sometimes have to double check. */ - if (sector + (data_size>>9) > capacity) { + if (sector + (ds>>9) > capacity) { drbd_err(device, "request from peer beyond end of local disk: " "capacity: %llus < sector: %llus + size: %u\n", (unsigned long long)capacity, - (unsigned long long)sector, data_size); + (unsigned long long)sector, ds); return NULL; } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO); if (!peer_req) return NULL; peer_req->flags |= EE_WRITE; - if (trim) + if (trim) { + peer_req->flags |= EE_IS_TRIM; return peer_req; + } + if (wsame) + peer_req->flags |= EE_WRITE_SAME; + /* receive payload size bytes into page chain */ ds = data_size; page = peer_req->pages; page_chain_for_each(page) { @@ -1690,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, } if (digest_size) { - drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); + drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size); if (memcmp(dig_in, dig_vv, digest_size)) { drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", (unsigned long long)sector, data_size); @@ -2067,13 +2289,13 @@ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) { struct drbd_peer_request *rs_req; - bool rv = 0; + bool rv = false; spin_lock_irq(&device->resource->req_lock); list_for_each_entry(rs_req, &device->sync_ee, w.list) { if (overlaps(peer_req->i.sector, peer_req->i.size, rs_req->i.sector, rs_req->i.size)) { - rv = 1; + rv = true; break; } } @@ -2354,10 +2576,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * op = wire_flags_to_bio_op(dp_flags); op_flags = wire_flags_to_bio_flags(dp_flags); if (pi->cmd == P_TRIM) { - struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); - peer_req->flags |= EE_IS_TRIM; - if (!blk_queue_discard(q)) - peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; D_ASSERT(peer_device, peer_req->i.size > 0); D_ASSERT(peer_device, op == REQ_OP_DISCARD); D_ASSERT(peer_device, peer_req->pages == NULL); @@ -2424,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - /* if we use the zeroout fallback code, we process synchronously - * and we wait for all pending requests, respectively wait for + /* TRIM and WRITE_SAME are processed synchronously, + * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. */ - if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -2460,7 +2678,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * } out_interrupted: - drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); + drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP); put_ldev(device); drbd_free_peer_req(device, peer_req); return err; @@ -2585,6 +2803,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet case P_DATA_REQUEST: drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); break; + case P_RS_THIN_REQ: case P_RS_DATA_REQUEST: case P_CSUM_RS_REQUEST: case P_OV_REQUEST: @@ -2610,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, - true /* has real payload */, GFP_NOIO); + size, GFP_NOIO); if (!peer_req) { put_ldev(device); return -ENOMEM; @@ -2624,6 +2843,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet peer_req->flags |= EE_APPLICATION; goto submit; + case P_RS_THIN_REQ: + /* If at some point in the future we have a smart way to + find out if this data block is completely deallocated, + then we would do something smarter here than reading + the block... */ + peer_req->flags |= EE_RS_THIN_REQ; case P_RS_DATA_REQUEST: peer_req->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; @@ -2969,7 +3194,8 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, -1091 requires proto 91 -1096 requires proto 96 */ -static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) + +static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local) { struct drbd_peer_device *const peer_device = first_peer_device(device); struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; @@ -3049,8 +3275,39 @@ static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __m * next bit (weight 2) is set when peer was primary */ *rule_nr = 40; + /* Neither has the "crashed primary" flag set, + * only a replication link hickup. */ + if (rct == 0) + return 0; + + /* Current UUID equal and no bitmap uuid; does not necessarily + * mean this was a "simultaneous hard crash", maybe IO was + * frozen, so no UUID-bump happened. + * This is a protocol change, overload DRBD_FF_WSAME as flag + * for "new-enough" peer DRBD version. */ + if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) { + *rule_nr = 41; + if (!(connection->agreed_features & DRBD_FF_WSAME)) { + drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n"); + return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8)); + } + if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) { + /* At least one has the "crashed primary" bit set, + * both are primary now, but neither has rotated its UUIDs? + * "Can not happen." */ + drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n"); + return -100; + } + if (device->state.role == R_PRIMARY) + return 1; + return -1; + } + + /* Both are secondary. + * Really looks like recovery from simultaneous hard crash. + * Check which had been primary before, and arbitrate. */ switch (rct) { - case 0: /* !self_pri && !peer_pri */ return 0; + case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */ case 1: /* self_pri && !peer_pri */ return 1; case 2: /* !self_pri && peer_pri */ return -1; case 3: /* self_pri && peer_pri */ @@ -3177,7 +3434,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); - hg = drbd_uuid_compare(device, &rule_nr); + hg = drbd_uuid_compare(device, peer_role, &rule_nr); spin_unlock_irq(&device->ldev->md.uuid_lock); drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); @@ -3186,6 +3443,15 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, drbd_alert(device, "Unrelated data, aborting!\n"); return C_MASK; } + if (hg < -0x10000) { + int proto, fflags; + hg = -hg; + proto = hg & 0xff; + fflags = (hg >> 8) & 0xff; + drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n", + proto, fflags); + return C_MASK; + } if (hg < -1000) { drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); return C_MASK; @@ -3415,7 +3681,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in */ peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (!peer_integrity_tfm) { + if (IS_ERR(peer_integrity_tfm)) { + peer_integrity_tfm = NULL; drbd_err(connection, "peer data-integrity-alg %s not supported\n", integrity_alg); goto disconnect; @@ -3766,6 +4033,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info struct drbd_peer_device *peer_device; struct drbd_device *device; struct p_sizes *p = pi->data; + struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL; enum determine_dev_size dd = DS_UNCHANGED; sector_t p_size, p_usize, p_csize, my_usize; int ldsc = 0; /* local disk size changed */ @@ -3785,6 +4053,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info device->p_size = p_size; if (get_ldev(device)) { + sector_t new_size, cur_size; rcu_read_lock(); my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); @@ -3801,11 +4070,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info /* Never shrink a device with usable data during connect. But allow online shrinking if we are connected. */ - if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < - drbd_get_capacity(device->this_bdev) && + new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); + cur_size = drbd_get_capacity(device->this_bdev); + if (new_size < cur_size && device->state.disk >= D_OUTDATED && device->state.conn < C_CONNECTED) { - drbd_err(device, "The peer's disk size is too small!\n"); + drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n", + (unsigned long long)new_size, (unsigned long long)cur_size); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); put_ldev(device); return -EIO; @@ -3839,14 +4110,14 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info } device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size(). In case we cleared the QUEUE_FLAG_DISCARD from our queue in - drbd_reconsider_max_bio_size(), we can be sure that after + drbd_reconsider_queue_parameters(), we can be sure that after drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { - drbd_reconsider_max_bio_size(device, device->ldev); + drbd_reconsider_queue_parameters(device, device->ldev, o); dd = drbd_determine_dev_size(device, ddsf, NULL); put_ldev(device); if (dd == DS_ERROR) @@ -3866,7 +4137,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info * However, if he sends a zero current size, * take his (user-capped or) backing disk size anyways. */ - drbd_reconsider_max_bio_size(device, NULL); + drbd_reconsider_queue_parameters(device, NULL, o); drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); } @@ -4599,9 +4870,75 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet return 0; } +static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct p_block_desc *p = pi->data; + struct drbd_device *device; + sector_t sector; + int size, err = 0; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + dec_rs_pending(device); + + if (get_ldev(device)) { + struct drbd_peer_request *peer_req; + const int op = REQ_OP_DISCARD; + + peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, + size, 0, GFP_NOIO); + if (!peer_req) { + put_ldev(device); + return -ENOMEM; + } + + peer_req->w.cb = e_end_resync_block; + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_IS_TRIM; + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->sync_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(pi->size >> 9, &device->rs_sect_ev); + err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); + + if (err) { + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); + put_ldev(device); + err = 0; + goto fail; + } + + inc_unacked(device); + + /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(), + as well as drbd_rs_complete_io() */ + } else { + fail: + drbd_rs_complete_io(device, sector); + drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER); + } + + atomic_add(size >> 9, &device->rs_sect_in); + + return err; +} + struct data_cmd { int expect_payload; - size_t pkt_size; + unsigned int pkt_size; int (*fn)(struct drbd_connection *, struct packet_info *); }; @@ -4626,11 +4963,14 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest }, [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, + [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, + [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4640,7 +4980,7 @@ static void drbdd(struct drbd_connection *connection) int err; while (get_t_state(&connection->receiver) == RUNNING) { - struct data_cmd *cmd; + struct data_cmd const *cmd; drbd_thread_current_set_cpu(&connection->receiver); update_receiver_timing_details(connection, drbd_recv_header); @@ -4655,11 +4995,18 @@ static void drbdd(struct drbd_connection *connection) } shs = cmd->pkt_size; + if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME) + shs += sizeof(struct o_qlim); if (pi.size > shs && !cmd->expect_payload) { drbd_err(connection, "No payload expected %s l:%d\n", cmdname(pi.cmd), pi.size); goto err_out; } + if (pi.size < shs) { + drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n", + cmdname(pi.cmd), (int)shs, pi.size); + goto err_out; + } if (shs) { update_receiver_timing_details(connection, drbd_recv_all_warn); @@ -4795,9 +5142,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) drbd_md_sync(device); - /* serialize with bitmap writeout triggered by the state change, - * if any. */ - wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + if (get_ldev(device)) { + drbd_bitmap_io(device, &drbd_bm_write_copy_pages, + "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(device); + } /* tcp_close and release of sendpage pages can be deferred. I don't * want to use SO_LINGER, because apparently it can be deferred for @@ -4904,8 +5253,12 @@ static int drbd_do_features(struct drbd_connection *connection) drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); - drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", - connection->agreed_features & FF_TRIM ? " " : " not "); + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", + connection->agreed_features, + connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", + connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : + connection->agreed_features ? "" : " none"); return 1; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index eef6e95..66b8e4b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r &device->vdisk->part0, req->start_jif); } -static struct drbd_request *drbd_req_new(struct drbd_device *device, - struct bio *bio_src) +static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src) { struct drbd_request *req; @@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, memset(req, 0, sizeof(*req)); drbd_req_make_private_bio(req, bio_src); - req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; - req->device = device; - req->master_bio = bio_src; - req->epoch = 0; + req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) + | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) + | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); + req->device = device; + req->master_bio = bio_src; + req->epoch = 0; drbd_clear_interval(&req->i); req->i.sector = bio_src->bi_iter.bi_sector; @@ -218,7 +219,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) { const unsigned s = req->rq_state; struct drbd_device *device = req->device; - int rw; int error, ok; /* we must not complete the master bio, while it is @@ -242,8 +242,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) return; } - rw = bio_rw(req->master_bio); - /* * figure out whether to report success or failure. * @@ -267,7 +265,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) * epoch number. If they match, increase the current_tle_nr, * and reset the transfer log epoch write_cnt. */ - if (rw == WRITE && + if (op_is_write(bio_op(req->master_bio)) && req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) start_new_tl_epoch(first_peer_device(device)->connection); @@ -284,11 +282,14 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) * because no path was available, in which case * it was not even added to the transfer_log. * - * READA may fail, and will not be retried. + * read-ahead may fail, and will not be retried. * * WRITE should have used all available paths already. */ - if (!ok && rw == READ && !list_empty(&req->tl_requests)) + if (!ok && + bio_op(req->master_bio) == REQ_OP_READ && + !(req->master_bio->bi_rw & REQ_RAHEAD) && + !list_empty(&req->tl_requests)) req->rq_state |= RQ_POSTPONED; if (!(req->rq_state & RQ_POSTPONED)) { @@ -644,7 +645,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(device, DRBD_READ_ERROR); /* fall through. */ case READ_AHEAD_COMPLETED_WITH_ERROR: - /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ + /* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); break; @@ -656,7 +657,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case QUEUE_FOR_NET_READ: - /* READ or READA, and + /* READ, and * no local disk, * or target area marked as invalid, * or just got an io-error. */ @@ -977,16 +978,20 @@ static void complete_conflicting_writes(struct drbd_request *req) sector_t sector = req->i.sector; int size = req->i.size; - i = drbd_find_overlap(&device->write_requests, sector, size); - if (!i) - return; - for (;;) { - prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); - i = drbd_find_overlap(&device->write_requests, sector, size); - if (!i) + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + /* Ignore, if already completed to upper layers. */ + if (i->completed) + continue; + /* Handle the first found overlap. After the schedule + * we have to restart the tree walk. */ + break; + } + if (!i) /* if any */ break; + /* Indicate to wake up device->misc_wait on progress. */ + prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); i->waiting = true; spin_unlock_irq(&device->resource->req_lock); schedule(); @@ -995,7 +1000,7 @@ static void complete_conflicting_writes(struct drbd_request *req) finish_wait(&device->misc_wait, &wait); } -/* called within req_lock and rcu_read_lock() */ +/* called within req_lock */ static void maybe_pull_ahead(struct drbd_device *device) { struct drbd_connection *connection = first_peer_device(device)->connection; @@ -1152,12 +1157,29 @@ static int drbd_process_write_request(struct drbd_request *req) return remote; } +static void drbd_process_discard_req(struct drbd_request *req) +{ + int err = drbd_issue_discard_or_zero_out(req->device, + req->i.sector, req->i.size >> 9, true); + + if (err) + req->private_bio->bi_error = -EIO; + bio_endio(req->private_bio); +} + static void drbd_submit_req_private_bio(struct drbd_request *req) { struct drbd_device *device = req->device; struct bio *bio = req->private_bio; - const int rw = bio_rw(bio); + unsigned int type; + + if (bio_op(bio) != REQ_OP_READ) + type = DRBD_FAULT_DT_WR; + else if (bio->bi_rw & REQ_RAHEAD) + type = DRBD_FAULT_DT_RA; + else + type = DRBD_FAULT_DT_RD; bio->bi_bdev = device->ldev->backing_bdev; @@ -1167,11 +1189,10 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { - if (drbd_insert_fault(device, - rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) + if (drbd_insert_fault(device, type)) bio_io_error(bio); + else if (bio_op(bio) == REQ_OP_DISCARD) + drbd_process_discard_req(req); else generic_make_request(bio); put_ldev(device); @@ -1223,24 +1244,45 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long /* Update disk stats */ _drbd_start_io_acct(device, req); + /* process discards always from our submitter thread */ + if (bio_op(bio) & REQ_OP_DISCARD) + goto queue_for_submitter_thread; + if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &device->flags)) { - if (!drbd_al_begin_io_fastpath(device, &req->i)) { - atomic_inc(&device->ap_actlog_cnt); - drbd_queue_write(device, req); - return NULL; - } + if (!drbd_al_begin_io_fastpath(device, &req->i)) + goto queue_for_submitter_thread; req->rq_state |= RQ_IN_ACT_LOG; req->in_actlog_jif = jiffies; } - return req; + + queue_for_submitter_thread: + atomic_inc(&device->ap_actlog_cnt); + drbd_queue_write(device, req); + return NULL; +} + +/* Require at least one path to current data. + * We don't want to allow writes on C_STANDALONE D_INCONSISTENT: + * We would not allow to read what was written, + * we would not have bumped the data generation uuids, + * we would cause data divergence for all the wrong reasons. + * + * If we don't see at least one D_UP_TO_DATE, we will fail this request, + * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO, + * and queues for retry later. + */ +static bool may_do_writes(struct drbd_device *device) +{ + const union drbd_dev_state s = device->state; + return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE; } static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) { struct drbd_resource *resource = device->resource; - const int rw = bio_rw(req->master_bio); + const int rw = bio_data_dir(req->master_bio); struct bio_and_error m = { NULL, }; bool no_remote = false; bool submit_private_bio = false; @@ -1270,7 +1312,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request goto out; } - /* We fail READ/READA early, if we can not serve it. + /* We fail READ early, if we can not serve it. * We must do this before req is registered on any lists. * Otherwise, drbd_req_complete() will queue failed READ for retry. */ if (rw != WRITE) { @@ -1291,6 +1333,12 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request } if (rw == WRITE) { + if (req->private_bio && !may_do_writes(device)) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + goto nodata; + } if (!drbd_process_write_request(req)) no_remote = true; } else { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index bb2ef78..eb49e7f 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -206,6 +206,8 @@ enum drbd_req_state_bits { /* Set when this is a write, clear for a read */ __RQ_WRITE, + __RQ_WSAME, + __RQ_UNMAP, /* Should call drbd_al_complete_io() for this request... */ __RQ_IN_ACT_LOG, @@ -241,10 +243,11 @@ enum drbd_req_state_bits { #define RQ_NET_OK (1UL << __RQ_NET_OK) #define RQ_NET_SIS (1UL << __RQ_NET_SIS) -/* 0x1f8 */ #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) #define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_WSAME (1UL << __RQ_WSAME) +#define RQ_UNMAP (1UL << __RQ_UNMAP) #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) #define RQ_POSTPONED (1UL << __RQ_POSTPONED) #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 5a7ef78..eea0c4a 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -814,7 +814,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) } if (rv <= 0) - /* already found a reason to abort */; + goto out; /* already found a reason to abort */ else if (ns.role == R_SECONDARY && device->open_cnt) rv = SS_DEVICE_IN_USE; @@ -862,6 +862,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) rv = SS_CONNECTED_OUTDATES; +out: rcu_read_unlock(); return rv; @@ -906,6 +907,15 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) rv = SS_IN_TRANSIENT_STATE; + /* Do not promote during resync handshake triggered by "force primary". + * This is a hack. It should really be rejected by the peer during the + * cluster wide state change request. */ + if (os.role != R_PRIMARY && ns.role == R_PRIMARY + && ns.pdsk == D_UP_TO_DATE + && ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS + && (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn)) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -1628,6 +1638,26 @@ static void broadcast_state_change(struct drbd_state_change *state_change) #undef REMEMBER_STATE_CHANGE } +/* takes old and new peer disk state */ +static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns) +{ + if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED) + && (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED)) + return true; + + /* Scenario, starting with normal operation + * Connected Primary/Secondary UpToDate/UpToDate + * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen) + * ... + * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!) + */ + if (os == D_UNKNOWN + && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED)) + return true; + + return false; +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device. @@ -1675,7 +1705,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, what = RESEND; if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - conn_lowest_disk(connection) > D_NEGOTIATING) + conn_lowest_disk(connection) == D_UP_TO_DATE) what = RESTART_FROZEN_DISK_IO; if (resource->susp_nod && what != NOTHING) { @@ -1699,6 +1729,13 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, idr_for_each_entry(&connection->peer_devices, peer_device, vnr) clear_bit(NEW_CUR_UUID, &peer_device->device->flags); rcu_read_unlock(); + + /* We should actively create a new uuid, _before_ + * we resume/resent, if the peer is diskless + * (recovery from a multiple error scenario). + * Currently, this happens with a slight delay + * below when checking lost_contact_to_peer_data() ... + */ _tl_restart(connection, RESEND); _conn_request_state(connection, (union drbd_state) { { .susp_fen = 1 } }, @@ -1742,12 +1779,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, BM_LOCKED_TEST_ALLOWED); /* Lost contact to peer's copy of the data */ - if ((os.pdsk >= D_INCONSISTENT && - os.pdsk != D_UNKNOWN && - os.pdsk != D_OUTDATED) - && (ns.pdsk < D_INCONSISTENT || - ns.pdsk == D_UNKNOWN || - ns.pdsk == D_OUTDATED)) { + if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) { if (get_ldev(device)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { @@ -1934,12 +1966,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk - * failure, or because of connection loss. + * failure, or on transition from resync back to AHEAD/BEHIND. + * + * Connection loss is handled in drbd_disconnected() by the receiver. + * * For resync aborted because of local disk failure, we cannot do * any bitmap writeout anymore. + * * No harm done if some bits change during this phase. */ - if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { + if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) && + (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) { drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(device); @@ -2160,9 +2197,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union ns.disk = os.disk; rv = _drbd_set_state(device, ns, flags, NULL); - if (rv < SS_SUCCESS) - BUG(); - + BUG_ON(rv < SS_SUCCESS); ns.i = device->state.i; ns_max.role = max_role(ns.role, ns_max.role); ns_max.peer = max_role(ns.peer, ns_max.peer); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index bd98953..6c9d5d4 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -140,7 +140,7 @@ extern void drbd_resume_al(struct drbd_device *device); extern bool conn_all_vols_unconf(struct drbd_connection *connection); /** - * drbd_request_state() - Reqest a state change + * drbd_request_state() - Request a state change * @device: DRBD device. * @mask: mask of state bits to change. * @val: value of new state bits. diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 80b0f63..0eeab14 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -26,7 +26,7 @@ #include <linux/drbd.h> #include "drbd_strings.h" -static const char *drbd_conn_s_names[] = { +static const char * const drbd_conn_s_names[] = { [C_STANDALONE] = "StandAlone", [C_DISCONNECTING] = "Disconnecting", [C_UNCONNECTED] = "Unconnected", @@ -53,13 +53,13 @@ static const char *drbd_conn_s_names[] = { [C_BEHIND] = "Behind", }; -static const char *drbd_role_s_names[] = { +static const char * const drbd_role_s_names[] = { [R_PRIMARY] = "Primary", [R_SECONDARY] = "Secondary", [R_UNKNOWN] = "Unknown" }; -static const char *drbd_disk_s_names[] = { +static const char * const drbd_disk_s_names[] = { [D_DISKLESS] = "Diskless", [D_ATTACHING] = "Attaching", [D_FAILED] = "Failed", @@ -71,7 +71,7 @@ static const char *drbd_disk_s_names[] = { [D_UP_TO_DATE] = "UpToDate", }; -static const char *drbd_state_sw_errors[] = { +static const char * const drbd_state_sw_errors[] = { [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 51fab97..35dbb3d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -173,8 +173,8 @@ void drbd_peer_request_endio(struct bio *bio) { struct drbd_peer_request *peer_req = bio->bi_private; struct drbd_device *device = peer_req->peer_device->device; - int is_write = bio_data_dir(bio) == WRITE; - int is_discard = !!(bio_op(bio) == REQ_OP_DISCARD); + bool is_write = bio_data_dir(bio) == WRITE; + bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD); if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", @@ -248,18 +248,26 @@ void drbd_request_endio(struct bio *bio) /* to avoid recursion in __req_mod */ if (unlikely(bio->bi_error)) { - if (bio_op(bio) == REQ_OP_DISCARD) - what = (bio->bi_error == -EOPNOTSUPP) - ? DISCARD_COMPLETED_NOTSUPP - : DISCARD_COMPLETED_WITH_ERROR; - else - what = (bio_data_dir(bio) == WRITE) - ? WRITE_COMPLETED_WITH_ERROR - : (bio_rw(bio) == READ) - ? READ_COMPLETED_WITH_ERROR - : READ_AHEAD_COMPLETED_WITH_ERROR; - } else + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + if (bio->bi_error == -EOPNOTSUPP) + what = DISCARD_COMPLETED_NOTSUPP; + else + what = DISCARD_COMPLETED_WITH_ERROR; + break; + case REQ_OP_READ: + if (bio->bi_rw & REQ_RAHEAD) + what = READ_AHEAD_COMPLETED_WITH_ERROR; + else + what = READ_COMPLETED_WITH_ERROR; + break; + default: + what = WRITE_COMPLETED_WITH_ERROR; + break; + } + } else { what = COMPLETED_OK; + } bio_put(req->private_bio); req->private_bio = ERR_PTR(bio->bi_error); @@ -320,6 +328,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest) sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); ahash_request_set_crypt(req, &sg, NULL, sg.length); crypto_ahash_update(req); + /* REQ_OP_WRITE_SAME has only one segment, + * checksum the payload only once. */ + if (bio_op(bio) == REQ_OP_WRITE_SAME) + break; } ahash_request_set_crypt(req, NULL, digest, 0); crypto_ahash_final(req); @@ -387,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, - size, true /* has real payload */, GFP_TRY); + size, size, GFP_TRY); if (!peer_req) goto defer; @@ -583,6 +595,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel) int number, rollback_i, size; int align, requeue = 0; int i = 0; + int discard_granularity = 0; if (unlikely(cancel)) return 0; @@ -602,6 +615,12 @@ static int make_resync_request(struct drbd_device *const device, int cancel) return 0; } + if (connection->agreed_features & DRBD_FF_THIN_RESYNC) { + rcu_read_lock(); + discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity; + rcu_read_unlock(); + } + max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; number = drbd_rs_number_requests(device); if (number <= 0) @@ -666,6 +685,9 @@ next_sector: if (sector & ((1<<(align+3))-1)) break; + if (discard_granularity && size == discard_granularity) + break; + /* do not cross extent boundaries */ if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) break; @@ -712,7 +734,8 @@ next_sector: int err; inc_rs_pending(device); - err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, + err = drbd_send_drequest(peer_device, + size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST, sector, size, ID_SYNCER); if (err) { drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); @@ -829,6 +852,7 @@ static void ping_peer(struct drbd_device *device) int drbd_resync_finished(struct drbd_device *device) { + struct drbd_connection *connection = first_peer_device(device)->connection; unsigned long db, dt, dbdt; unsigned long n_oos; union drbd_state os, ns; @@ -850,8 +874,7 @@ int drbd_resync_finished(struct drbd_device *device) if (dw) { dw->w.cb = w_resync_finished; dw->device = device; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, - &dw->w); + drbd_queue_work(&connection->sender_work, &dw->w); return 1; } drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); @@ -964,6 +987,30 @@ int drbd_resync_finished(struct drbd_device *device) _drbd_set_state(device, ns, CS_VERBOSE, NULL); out_unlock: spin_unlock_irq(&device->resource->req_lock); + + /* If we have been sync source, and have an effective fencing-policy, + * once *all* volumes are back in sync, call "unfence". */ + if (os.conn == C_SYNC_SOURCE) { + enum drbd_disk_state disk_state = D_MASK; + enum drbd_disk_state pdsk_state = D_MASK; + enum drbd_fencing_p fp = FP_DONT_CARE; + + rcu_read_lock(); + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + if (fp != FP_DONT_CARE) { + struct drbd_peer_device *peer_device; + int vnr; + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); + pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk); + } + } + rcu_read_unlock(); + if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE) + conn_khelper(connection, "unfence-peer"); + } + put_ldev(device); out: device->rs_total = 0; @@ -1000,7 +1047,6 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_ /** * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST - * @device: DRBD device. * @w: work object. * @cancel: The connection will be closed anyways */ @@ -1036,6 +1082,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) return err; } +static bool all_zero(struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + unsigned int len = peer_req->i.size; + + page_chain_for_each(page) { + unsigned int l = min_t(unsigned int, len, PAGE_SIZE); + unsigned int i, words = l / sizeof(long); + unsigned long *d; + + d = kmap_atomic(page); + for (i = 0; i < words; i++) { + if (d[i]) { + kunmap_atomic(d); + return false; + } + } + kunmap_atomic(d); + len -= l; + } + + return true; +} + /** * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST * @w: work object. @@ -1064,7 +1134,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { if (likely(device->state.pdsk >= D_INCONSISTENT)) { inc_rs_pending(device); - err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req)) + err = drbd_send_rs_deallocated(peer_device, peer_req); + else + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); } else { if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "Not sending RSDataReply, " @@ -1634,7 +1707,7 @@ static bool use_checksum_based_resync(struct drbd_connection *connection, struct rcu_read_unlock(); return connection->agreed_pro_version >= 89 && /* supported? */ connection->csums_tfm && /* configured? */ - (csums_after_crash_only == 0 /* use for each resync? */ + (csums_after_crash_only == false /* use for each resync? */ || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ } @@ -1769,7 +1842,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->bm_resync_fo = 0; device->use_csums = use_checksum_based_resync(connection, device); } else { - device->use_csums = 0; + device->use_csums = false; } /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index f9bfecd..c557057 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4350,8 +4350,7 @@ static int __init do_floppy_init(void) /* to be cleaned up... */ disks[drive]->private_data = (void *)(long)drive; disks[drive]->flags |= GENHD_FL_REMOVABLE; - disks[drive]->driverfs_dev = &floppy_device[drive].dev; - add_disk(disks[drive]); + device_add_disk(&floppy_device[drive].dev, disks[drive]); } return 0; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 364d491..075377e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1765,6 +1765,7 @@ static int loop_add(struct loop_device **l, int i) */ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); + err = -ENOMEM; disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 145ce2a..e937fcf7 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -687,15 +687,13 @@ static unsigned int mg_issue_req(struct request *req, unsigned int sect_num, unsigned int sect_cnt) { - switch (rq_data_dir(req)) { - case READ: + if (rq_data_dir(req) == READ) { if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) != MG_ERR_NONE) { mg_bad_rw_intr(host); return host->error; } - break; - case WRITE: + } else { /* TODO : handler */ outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) @@ -714,7 +712,6 @@ static unsigned int mg_issue_req(struct request *req, mod_timer(&host->timer, jiffies + 3 * HZ); outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - break; } return MG_ERR_NONE; } @@ -1018,7 +1015,7 @@ probe_err_7: probe_err_6: blk_cleanup_queue(host->breq); probe_err_5: - unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); + unregister_blkdev(host->major, MG_DISK_NAME); probe_err_4: if (!prv_data->use_polling) free_irq(host->irq, host); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 8e3e708..2aca98e 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3956,7 +3956,6 @@ static int mtip_block_initialize(struct driver_data *dd) if (rv) goto disk_index_error; - dd->disk->driverfs_dev = &dd->pdev->dev; dd->disk->major = dd->major; dd->disk->first_minor = index * MTIP_MAX_MINORS; dd->disk->minors = MTIP_MAX_MINORS; @@ -4008,7 +4007,7 @@ skip_create_disk: /* * if rebuild pending, start the service thread, and delay the block - * queue creation and add_disk() + * queue creation and device_add_disk() */ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) goto start_service_thread; @@ -4042,7 +4041,7 @@ skip_create_disk: set_capacity(dd->disk, capacity); /* Enable the block device and add it to /dev */ - add_disk(dd->disk); + device_add_disk(&dd->pdev->dev, dd->disk); dd->bdev = bdget_disk(dd->disk, 0); /* diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index cab9759..75a7f88 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -448,7 +448,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) struct request *rq; struct bio *bio = rqd->bio; - rq = blk_mq_alloc_request(q, bio_rw(bio), 0); + rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index acb4452..76f33c8 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -487,7 +487,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) gendisk->fops = &ps3disk_fops; gendisk->queue = queue; gendisk->private_data = dev; - gendisk->driverfs_dev = &dev->sbd.core; snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, devidx+'a'); priv->blocking_factor = dev->blk_size >> 9; @@ -499,7 +498,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) gendisk->disk_name, priv->model, priv->raw_capacity >> 11, get_capacity(gendisk) >> 11); - add_disk(gendisk); + device_add_disk(&dev->sbd.core, gendisk); return 0; fail_cleanup_queue: diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 56847fc..456b4fe 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -773,14 +773,13 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) gendisk->fops = &ps3vram_fops; gendisk->queue = queue; gendisk->private_data = dev; - gendisk->driverfs_dev = &dev->core; strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", gendisk->disk_name, get_capacity(gendisk) >> 11); - add_disk(gendisk); + device_add_disk(&dev->core, gendisk); return 0; fail_cleanup_queue: diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index e1b8b70..f81d70b 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -230,8 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card) set_capacity(card->gendisk, card->size8 >> 9); else set_capacity(card->gendisk, 0); - add_disk(card->gendisk); - + device_add_disk(CARD_TO_DEV(card), card->gendisk); card->bdev_attached = 1; } @@ -308,7 +307,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), "rsxx%d", card->disk_id); - card->gendisk->driverfs_dev = &card->dev->dev; card->gendisk->major = card->major; card->gendisk->first_minor = 0; card->gendisk->fops = &rsxx_fops; diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 5c07a23..3822eae 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4690,10 +4690,10 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) return -EIO; } -static int skd_bdev_attach(struct skd_device *skdev) +static int skd_bdev_attach(struct device *parent, struct skd_device *skdev) { pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); - add_disk(skdev->disk); + device_add_disk(parent, skdev->disk); return 0; } @@ -4812,8 +4812,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, skdev); - skdev->disk->driverfs_dev = &pdev->dev; - for (i = 0; i < SKD_MAX_BARS; i++) { skdev->mem_phys[i] = pci_resource_start(pdev, i); skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); @@ -4851,7 +4849,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) (SKD_START_WAIT_SECONDS * HZ)); if (skdev->gendisk_on > 0) { /* device came on-line after reset */ - skd_bdev_attach(skdev); + skd_bdev_attach(&pdev->dev, skdev); rc = 0; } else { /* we timed out, something is wrong with the device, diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 4b911ed..cab1573 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -804,7 +804,6 @@ static int probe_disk(struct vdc_port *port) g->fops = &vdc_fops; g->queue = q; g->private_data = port; - g->driverfs_dev = &port->vio.vdev->dev; set_capacity(g, port->vdisk_size); @@ -835,7 +834,7 @@ static int probe_disk(struct vdc_port *port) port->vdisk_size, (port->vdisk_size >> (20 - 9)), port->vio.ver.major, port->vio.ver.minor); - add_disk(g); + device_add_disk(&port->vio.vdev->dev, g); return 0; } diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 4b3ba74..d0a3e6d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -344,7 +344,6 @@ static int add_bio(struct cardinfo *card) int offset; struct bio *bio; struct bio_vec vec; - int rw; bio = card->currentbio; if (!bio && card->bio) { @@ -359,7 +358,6 @@ static int add_bio(struct cardinfo *card) if (!bio) return 0; - rw = bio_rw(bio); if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) return 0; @@ -369,7 +367,7 @@ static int add_bio(struct cardinfo *card) vec.bv_page, vec.bv_offset, vec.bv_len, - (rw == READ) ? + bio_op(bio) == REQ_OP_READ ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); p = &card->mm_pages[card->Ready]; @@ -398,7 +396,7 @@ static int add_bio(struct cardinfo *card) DMASCR_CHAIN_EN | DMASCR_SEM_EN | pci_cmds); - if (rw == WRITE) + if (bio_op(bio) == REQ_OP_WRITE) desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); desc->sem_control_bits = desc->control_bits; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 18e4069..1523e05 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -236,25 +236,22 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, static int virtblk_get_id(struct gendisk *disk, char *id_str) { struct virtio_blk *vblk = disk->private_data; + struct request_queue *q = vblk->disk->queue; struct request *req; - struct bio *bio; int err; - bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, - GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL); - if (IS_ERR(req)) { - bio_put(bio); + req = blk_get_request(q, READ, GFP_KERNEL); + if (IS_ERR(req)) return PTR_ERR(req); - } - req->cmd_type = REQ_TYPE_DRV_PRIV; + + err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); + if (err) + goto out; + err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); +out: blk_put_request(req); - return err; } @@ -656,7 +653,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->first_minor = index_to_minor(index); vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; - vblk->disk->driverfs_dev = &vdev->dev; vblk->disk->flags |= GENHD_FL_EXT_DEVT; vblk->index = index; @@ -733,7 +729,7 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_device_ready(vdev); - add_disk(vblk->disk); + device_add_disk(&vdev->dev, vblk->disk); err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); if (err) goto out_del_disk; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3355f1c..2994cfa 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; - if (q && blk_queue_secdiscard(q)) + if (q && blk_queue_secure_erase(q)) vbd->discard_secure = true; pr_debug("Successful creation of handle=%04x (dom=%u)\n", diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index da05d3f..0b6682a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -548,7 +548,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf ring_req->u.discard.nr_sectors = blk_rq_sectors(req); ring_req->u.discard.id = id; ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); - if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard) ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; else ring_req->u.discard.flag = 0; @@ -844,7 +844,7 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r return 1; if (unlikely(req_op(req) == REQ_OP_DISCARD || - req->cmd_flags & REQ_SECURE)) + req_op(req) == REQ_OP_SECURE_ERASE)) return blkif_queue_discard_req(req, rinfo); else return blkif_queue_rw_req(req, rinfo); @@ -952,7 +952,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, rq->limits.discard_granularity = info->discard_granularity; rq->limits.discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); + queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -1134,7 +1134,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->first_minor = minor; gd->fops = &xlvbd_block_fops; gd->private_data = info; - gd->driverfs_dev = &(info->xbdev->dev); set_capacity(gd, capacity); if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, @@ -1592,7 +1591,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); - queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECERASE, rq); } blk_mq_complete_request(req, error); break; @@ -2106,11 +2105,14 @@ static int blkfront_resume(struct xenbus_device *dev) */ if (req_op(shadow[i].request) == REQ_OP_FLUSH || req_op(shadow[i].request) == REQ_OP_DISCARD || - shadow[j].request->cmd_flags & (REQ_FUA | REQ_SECURE)) { - + req_op(shadow[i].request) == REQ_OP_SECURE_ERASE || + shadow[j].request->cmd_flags & REQ_FUA) { /* * Flush operations don't contain bios, so * we need to requeue the whole request + * + * XXX: but this doesn't make any sense for a + * write with the FUA flag set.. */ list_add(&shadow[j].request->queuelist, &info->requests); continue; @@ -2445,7 +2447,7 @@ static void blkfront_connect(struct blkfront_info *info) for (i = 0; i < info->nr_rings; i++) kick_pending_request_queues(&info->rinfo[i]); - add_disk(info->gd); + device_add_disk(&info->xbdev->dev, info->gd); info->is_ready = 1; } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 1b257ea..5d475b3 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2032,7 +2032,7 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi, init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); cgc.cmd[0] = GPCMD_READ_SUBCHANNEL; - cgc.cmd[1] = 2; /* MSF addressing */ + cgc.cmd[1] = subchnl->cdsc_format;/* MSF or LBA addressing */ cgc.cmd[2] = 0x40; /* request subQ data */ cgc.cmd[3] = mcn ? 2 : 1; cgc.cmd[8] = 16; @@ -2041,17 +2041,27 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi, return ret; subchnl->cdsc_audiostatus = cgc.buffer[1]; - subchnl->cdsc_format = CDROM_MSF; subchnl->cdsc_ctrl = cgc.buffer[5] & 0xf; subchnl->cdsc_trk = cgc.buffer[6]; subchnl->cdsc_ind = cgc.buffer[7]; - subchnl->cdsc_reladdr.msf.minute = cgc.buffer[13]; - subchnl->cdsc_reladdr.msf.second = cgc.buffer[14]; - subchnl->cdsc_reladdr.msf.frame = cgc.buffer[15]; - subchnl->cdsc_absaddr.msf.minute = cgc.buffer[9]; - subchnl->cdsc_absaddr.msf.second = cgc.buffer[10]; - subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11]; + if (subchnl->cdsc_format == CDROM_LBA) { + subchnl->cdsc_absaddr.lba = ((cgc.buffer[8] << 24) | + (cgc.buffer[9] << 16) | + (cgc.buffer[10] << 8) | + (cgc.buffer[11])); + subchnl->cdsc_reladdr.lba = ((cgc.buffer[12] << 24) | + (cgc.buffer[13] << 16) | + (cgc.buffer[14] << 8) | + (cgc.buffer[15])); + } else { + subchnl->cdsc_reladdr.msf.minute = cgc.buffer[13]; + subchnl->cdsc_reladdr.msf.second = cgc.buffer[14]; + subchnl->cdsc_reladdr.msf.frame = cgc.buffer[15]; + subchnl->cdsc_absaddr.msf.minute = cgc.buffer[9]; + subchnl->cdsc_absaddr.msf.second = cgc.buffer[10]; + subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11]; + } return 0; } @@ -3022,7 +3032,7 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi, if (!((requested == CDROM_MSF) || (requested == CDROM_LBA))) return -EINVAL; - q.cdsc_format = CDROM_MSF; + ret = cdrom_read_subchannel(cdi, &q, 0); if (ret) return ret; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index ef907fd..bf9a2ad 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1770,7 +1770,6 @@ static int ide_cd_probe(ide_drive_t *drive) drive->driver_data = info; g->minors = 1; - g->driverfs_dev = &drive->gendev; g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE; if (ide_cdrom_setup(drive)) { put_device(&info->dev); @@ -1780,7 +1779,7 @@ static int ide_cd_probe(ide_drive_t *drive) ide_cd_read_toc(drive, &sense); g->fops = &idecd_ops; g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; - add_disk(g); + device_add_disk(&drive->gendev, g); return 0; out_free_disk: diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index 838996a..e823394 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -412,12 +412,11 @@ static int ide_gd_probe(ide_drive_t *drive) set_capacity(g, ide_gd_capacity(drive)); g->minors = IDE_DISK_MINORS; - g->driverfs_dev = &drive->gendev; g->flags |= GENHD_FL_EXT_DEVT; if (drive->dev_flags & IDE_DFLAG_REMOVABLE) g->flags = GENHD_FL_REMOVABLE; g->fops = &ide_gd_ops; - add_disk(g); + device_add_disk(&drive->gendev, g); return 0; out_free_disk: diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 85a3390..61c68a1 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -27,11 +27,13 @@ config NVM_DEBUG It is required to create/remove targets without IOCTLs. config NVM_GENNVM - tristate "Generic NVM manager for Open-Channel SSDs" + tristate "General Non-Volatile Memory Manager for Open-Channel SSDs" ---help--- - NVM media manager for Open-Channel SSDs that offload management - functionality to device, while keeping data placement and garbage - collection decisions on the host. + Non-volatile memory media manager for Open-Channel SSDs that implements + physical media metadata management and block provisioning API. + + This is the standard media manager for using Open-Channel SSDs, and + required for targets to be instantiated. config NVM_RRPC tristate "Round-robin Hybrid Open-Channel SSD target" diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 160c1a6..9ebd2cf 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -18,8 +18,6 @@ * */ -#include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/list.h> #include <linux/types.h> #include <linux/sem.h> @@ -28,46 +26,42 @@ #include <linux/miscdevice.h> #include <linux/lightnvm.h> #include <linux/sched/sysctl.h> -#include <uapi/linux/lightnvm.h> static LIST_HEAD(nvm_tgt_types); +static DECLARE_RWSEM(nvm_tgtt_lock); static LIST_HEAD(nvm_mgrs); static LIST_HEAD(nvm_devices); -static LIST_HEAD(nvm_targets); static DECLARE_RWSEM(nvm_lock); -static struct nvm_target *nvm_find_target(const char *name) +struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) { - struct nvm_target *tgt; + struct nvm_tgt_type *tmp, *tt = NULL; - list_for_each_entry(tgt, &nvm_targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; + if (lock) + down_write(&nvm_tgtt_lock); - return NULL; -} - -static struct nvm_tgt_type *nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - list_for_each_entry(tt, &nvm_tgt_types, list) - if (!strcmp(name, tt->name)) - return tt; + list_for_each_entry(tmp, &nvm_tgt_types, list) + if (!strcmp(name, tmp->name)) { + tt = tmp; + break; + } - return NULL; + if (lock) + up_write(&nvm_tgtt_lock); + return tt; } +EXPORT_SYMBOL(nvm_find_target_type); int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; - down_write(&nvm_lock); - if (nvm_find_target_type(tt->name)) + down_write(&nvm_tgtt_lock); + if (nvm_find_target_type(tt->name, 0)) ret = -EEXIST; else list_add(&tt->list, &nvm_tgt_types); - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); return ret; } @@ -110,7 +104,7 @@ static struct nvmm_type *nvm_find_mgr_type(const char *name) return NULL; } -struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) +static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) { struct nvmm_type *mt; int ret; @@ -182,20 +176,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } -struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun, - unsigned long flags) -{ - return dev->mt->get_blk_unlocked(dev, lun, flags); -} -EXPORT_SYMBOL(nvm_get_blk_unlocked); - -/* Assumes that all valid pages have already been moved on release to bm */ -void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) -{ - return dev->mt->put_blk_unlocked(dev, blk); -} -EXPORT_SYMBOL(nvm_put_blk_unlocked); - struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun, unsigned long flags) { @@ -210,6 +190,12 @@ void nvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) } EXPORT_SYMBOL(nvm_put_blk); +void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) +{ + return dev->mt->mark_blk(dev, ppa, type); +} +EXPORT_SYMBOL(nvm_mark_blk); + int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { return dev->mt->submit_io(dev, rqd); @@ -251,9 +237,10 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) EXPORT_SYMBOL(nvm_generic_to_addr_mode); int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, - struct ppa_addr *ppas, int nr_ppas, int vblk) + const struct ppa_addr *ppas, int nr_ppas, int vblk) { int i, plane_cnt, pl_idx; + struct ppa_addr ppa; if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { rqd->nr_ppas = nr_ppas; @@ -278,8 +265,9 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, for (i = 0; i < nr_ppas; i++) { for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppas[i].g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; } } } @@ -337,7 +325,7 @@ static void nvm_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, +static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, int flags, void *buf, int len) { DECLARE_COMPLETION_ONSTACK(wait); @@ -367,7 +355,9 @@ int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; if (hang_check) - while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); + while (!wait_for_completion_io_timeout(&wait, + hang_check * (HZ/2))) + ; else wait_for_completion_io(&wait); @@ -510,7 +500,8 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) /* The lower page table encoding consists of a list of bytes, where each * has a lower and an upper half. The first half byte maintains the * increment value and every value after is an offset added to the - * previous incrementation value */ + * previous incrementation value + */ dev->lptbl[0] = mlc->pairs[0] & 0xF; for (i = 1; i < dev->lps_per_blk; i++) { p = mlc->pairs[i >> 1]; @@ -596,42 +587,11 @@ err_fmtype: return ret; } -static void nvm_remove_target(struct nvm_target *t) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - struct request_queue *q = tdisk->queue; - - lockdep_assert_held(&nvm_lock); - - del_gendisk(tdisk); - blk_cleanup_queue(q); - - if (tt->exit) - tt->exit(tdisk->private_data); - - put_disk(tdisk); - - list_del(&t->list); - kfree(t); -} - static void nvm_free_mgr(struct nvm_dev *dev) { - struct nvm_target *tgt, *tmp; - if (!dev->mt) return; - down_write(&nvm_lock); - list_for_each_entry_safe(tgt, tmp, &nvm_targets, list) { - if (tgt->dev != dev) - continue; - - nvm_remove_target(tgt); - } - up_write(&nvm_lock); - dev->mt->unregister_mgr(dev); dev->mt = NULL; } @@ -778,91 +738,6 @@ void nvm_unregister(char *disk_name) } EXPORT_SYMBOL(nvm_unregister); -static const struct block_device_operations nvm_fops = { - .owner = THIS_MODULE, -}; - -static int nvm_create_target(struct nvm_dev *dev, - struct nvm_ioctl_create *create) -{ - struct nvm_ioctl_create_simple *s = &create->conf.s; - struct request_queue *tqueue; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - void *targetdata; - - if (!dev->mt) { - pr_info("nvm: device has no media manager registered.\n"); - return -ENODEV; - } - - down_write(&nvm_lock); - tt = nvm_find_target_type(create->tgttype); - if (!tt) { - pr_err("nvm: target type %s not found\n", create->tgttype); - up_write(&nvm_lock); - return -EINVAL; - } - - t = nvm_find_target(create->tgtname); - if (t) { - pr_err("nvm: target name already exists.\n"); - up_write(&nvm_lock); - return -EINVAL; - } - up_write(&nvm_lock); - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) - return -ENOMEM; - - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); - if (!tqueue) - goto err_t; - blk_queue_make_request(tqueue, tt->make_rq); - - tdisk = alloc_disk(0); - if (!tdisk) - goto err_queue; - - sprintf(tdisk->disk_name, "%s", create->tgtname); - tdisk->flags = GENHD_FL_EXT_DEVT; - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = &nvm_fops; - tdisk->queue = tqueue; - - targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end); - if (IS_ERR(targetdata)) - goto err_init; - - tdisk->private_data = targetdata; - tqueue->queuedata = targetdata; - - blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - t->type = tt; - t->disk = tdisk; - t->dev = dev; - - down_write(&nvm_lock); - list_add_tail(&t->list, &nvm_targets); - up_write(&nvm_lock); - - return 0; -err_init: - put_disk(tdisk); -err_queue: - blk_cleanup_queue(tqueue); -err_t: - kfree(t); - return -ENOMEM; -} - static int __nvm_configure_create(struct nvm_ioctl_create *create) { struct nvm_dev *dev; @@ -871,11 +746,17 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) down_write(&nvm_lock); dev = nvm_find_nvm_dev(create->dev); up_write(&nvm_lock); + if (!dev) { pr_err("nvm: device not found\n"); return -EINVAL; } + if (!dev->mt) { + pr_info("nvm: device has no media manager registered.\n"); + return -ENODEV; + } + if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { pr_err("nvm: config type not valid\n"); return -EINVAL; @@ -888,25 +769,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - return nvm_create_target(dev, create); -} - -static int __nvm_configure_remove(struct nvm_ioctl_remove *remove) -{ - struct nvm_target *t; - - down_write(&nvm_lock); - t = nvm_find_target(remove->tgtname); - if (!t) { - pr_err("nvm: target \"%s\" doesn't exist.\n", remove->tgtname); - up_write(&nvm_lock); - return -EINVAL; - } - - nvm_remove_target(t); - up_write(&nvm_lock); - - return 0; + return dev->mt->create_tgt(dev, create); } #ifdef CONFIG_NVM_DEBUG @@ -941,8 +804,9 @@ static int nvm_configure_show(const char *val) static int nvm_configure_remove(const char *val) { struct nvm_ioctl_remove remove; + struct nvm_dev *dev; char opcode; - int ret; + int ret = 0; ret = sscanf(val, "%c %256s", &opcode, remove.tgtname); if (ret != 2) { @@ -952,7 +816,13 @@ static int nvm_configure_remove(const char *val) remove.flags = 0; - return __nvm_configure_remove(&remove); + list_for_each_entry(dev, &nvm_devices, devices) { + ret = dev->mt->remove_tgt(dev, &remove); + if (!ret) + break; + } + + return ret; } static int nvm_configure_create(const char *val) @@ -1149,6 +1019,8 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) { struct nvm_ioctl_remove remove; + struct nvm_dev *dev; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1163,7 +1035,13 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return -EINVAL; } - return __nvm_configure_remove(&remove); + list_for_each_entry(dev, &nvm_devices, devices) { + ret = dev->mt->remove_tgt(dev, &remove); + if (!ret) + break; + } + + return ret; } static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ec9fb68..b74174c 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -15,22 +15,160 @@ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, * USA. * - * Implementation of a generic nvm manager for Open-Channel SSDs. + * Implementation of a general nvm manager for Open-Channel SSDs. */ #include "gennvm.h" -static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) +static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name) { - struct gen_nvm *gn = dev->mp; - struct gennvm_area *area, *prev, *next; + struct nvm_target *tgt; + + list_for_each_entry(tgt, &gn->targets, list) + if (!strcmp(name, tgt->disk->disk_name)) + return tgt; + + return NULL; +} + +static const struct block_device_operations gen_fops = { + .owner = THIS_MODULE, +}; + +static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) +{ + struct gen_dev *gn = dev->mp; + struct nvm_ioctl_create_simple *s = &create->conf.s; + struct request_queue *tqueue; + struct gendisk *tdisk; + struct nvm_tgt_type *tt; + struct nvm_target *t; + void *targetdata; + + tt = nvm_find_target_type(create->tgttype, 1); + if (!tt) { + pr_err("nvm: target type %s not found\n", create->tgttype); + return -EINVAL; + } + + mutex_lock(&gn->lock); + t = gen_find_target(gn, create->tgtname); + if (t) { + pr_err("nvm: target name already exists.\n"); + mutex_unlock(&gn->lock); + return -EINVAL; + } + mutex_unlock(&gn->lock); + + t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); + if (!t) + return -ENOMEM; + + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + if (!tqueue) + goto err_t; + blk_queue_make_request(tqueue, tt->make_rq); + + tdisk = alloc_disk(0); + if (!tdisk) + goto err_queue; + + sprintf(tdisk->disk_name, "%s", create->tgtname); + tdisk->flags = GENHD_FL_EXT_DEVT; + tdisk->major = 0; + tdisk->first_minor = 0; + tdisk->fops = &gen_fops; + tdisk->queue = tqueue; + + targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end); + if (IS_ERR(targetdata)) + goto err_init; + + tdisk->private_data = targetdata; + tqueue->queuedata = targetdata; + + blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); + + set_capacity(tdisk, tt->capacity(targetdata)); + add_disk(tdisk); + + t->type = tt; + t->disk = tdisk; + t->dev = dev; + + mutex_lock(&gn->lock); + list_add_tail(&t->list, &gn->targets); + mutex_unlock(&gn->lock); + + return 0; +err_init: + put_disk(tdisk); +err_queue: + blk_cleanup_queue(tqueue); +err_t: + kfree(t); + return -ENOMEM; +} + +static void __gen_remove_target(struct nvm_target *t) +{ + struct nvm_tgt_type *tt = t->type; + struct gendisk *tdisk = t->disk; + struct request_queue *q = tdisk->queue; + + del_gendisk(tdisk); + blk_cleanup_queue(q); + + if (tt->exit) + tt->exit(tdisk->private_data); + + put_disk(tdisk); + + list_del(&t->list); + kfree(t); +} + +/** + * gen_remove_tgt - Removes a target from the media manager + * @dev: device + * @remove: ioctl structure with target name to remove. + * + * Returns: + * 0: on success + * 1: on not found + * <0: on error + */ +static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) +{ + struct gen_dev *gn = dev->mp; + struct nvm_target *t; + + if (!gn) + return 1; + + mutex_lock(&gn->lock); + t = gen_find_target(gn, remove->tgtname); + if (!t) { + mutex_unlock(&gn->lock); + return 1; + } + __gen_remove_target(t); + mutex_unlock(&gn->lock); + + return 0; +} + +static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) +{ + struct gen_dev *gn = dev->mp; + struct gen_area *area, *prev, *next; sector_t begin = 0; sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9; if (len > max_sectors) return -EINVAL; - area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL); + area = kmalloc(sizeof(struct gen_area), GFP_KERNEL); if (!area) return -ENOMEM; @@ -64,10 +202,10 @@ static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) return 0; } -static void gennvm_put_area(struct nvm_dev *dev, sector_t begin) +static void gen_put_area(struct nvm_dev *dev, sector_t begin) { - struct gen_nvm *gn = dev->mp; - struct gennvm_area *area; + struct gen_dev *gn = dev->mp; + struct gen_area *area; spin_lock(&dev->lock); list_for_each_entry(area, &gn->area_list, list) { @@ -82,27 +220,27 @@ static void gennvm_put_area(struct nvm_dev *dev, sector_t begin) spin_unlock(&dev->lock); } -static void gennvm_blocks_free(struct nvm_dev *dev) +static void gen_blocks_free(struct nvm_dev *dev) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; struct gen_lun *lun; int i; - gennvm_for_each_lun(gn, lun, i) { + gen_for_each_lun(gn, lun, i) { if (!lun->vlun.blocks) break; vfree(lun->vlun.blocks); } } -static void gennvm_luns_free(struct nvm_dev *dev) +static void gen_luns_free(struct nvm_dev *dev) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; kfree(gn->luns); } -static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) +static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn) { struct gen_lun *lun; int i; @@ -111,7 +249,7 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) if (!gn->luns) return -ENOMEM; - gennvm_for_each_lun(gn, lun, i) { + gen_for_each_lun(gn, lun, i) { spin_lock_init(&lun->vlun.lock); INIT_LIST_HEAD(&lun->free_list); INIT_LIST_HEAD(&lun->used_list); @@ -122,14 +260,11 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; - lun->vlun.nr_open_blocks = 0; - lun->vlun.nr_closed_blocks = 0; - lun->vlun.nr_bad_blocks = 0; } return 0; } -static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, +static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa, u8 *blks, int nr_blks) { struct nvm_dev *dev = gn->dev; @@ -149,17 +284,16 @@ static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, blk = &lun->vlun.blocks[i]; list_move_tail(&blk->list, &lun->bb_list); - lun->vlun.nr_bad_blocks++; lun->vlun.nr_free_blocks--; } return 0; } -static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) +static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) { struct nvm_dev *dev = private; - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; u64 elba = slba + nlb; struct gen_lun *lun; struct nvm_block *blk; @@ -167,7 +301,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) int lun_id; if (unlikely(elba > dev->total_secs)) { - pr_err("gennvm: L2P data from device is out of bounds!\n"); + pr_err("gen: L2P data from device is out of bounds!\n"); return -EINVAL; } @@ -175,7 +309,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) u64 pba = le64_to_cpu(entries[i]); if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { - pr_err("gennvm: L2P data entry is out of bounds!\n"); + pr_err("gen: L2P data entry is out of bounds!\n"); return -EINVAL; } @@ -200,16 +334,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) * block state. The block is assumed to be open. */ list_move_tail(&blk->list, &lun->used_list); - blk->state = NVM_BLK_ST_OPEN; + blk->state = NVM_BLK_ST_TGT; lun->vlun.nr_free_blocks--; - lun->vlun.nr_open_blocks++; } } return 0; } -static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) +static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn) { struct gen_lun *lun; struct nvm_block *block; @@ -222,7 +355,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) if (!blks) return -ENOMEM; - gennvm_for_each_lun(gn, lun, lun_iter) { + gen_for_each_lun(gn, lun, lun_iter) { lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) * dev->blks_per_lun); if (!lun->vlun.blocks) { @@ -256,20 +389,20 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ret = nvm_get_bb_tbl(dev, ppa, blks); if (ret) - pr_err("gennvm: could not get BB table\n"); + pr_err("gen: could not get BB table\n"); - ret = gennvm_block_bb(gn, ppa, blks, nr_blks); + ret = gen_block_bb(gn, ppa, blks, nr_blks); if (ret) - pr_err("gennvm: BB table map failed\n"); + pr_err("gen: BB table map failed\n"); } } if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) { ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, - gennvm_block_map, dev); + gen_block_map, dev); if (ret) { - pr_err("gennvm: could not read L2P table.\n"); - pr_warn("gennvm: default block initialization"); + pr_err("gen: could not read L2P table.\n"); + pr_warn("gen: default block initialization"); } } @@ -277,67 +410,79 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static void gennvm_free(struct nvm_dev *dev) +static void gen_free(struct nvm_dev *dev) { - gennvm_blocks_free(dev); - gennvm_luns_free(dev); + gen_blocks_free(dev); + gen_luns_free(dev); kfree(dev->mp); dev->mp = NULL; } -static int gennvm_register(struct nvm_dev *dev) +static int gen_register(struct nvm_dev *dev) { - struct gen_nvm *gn; + struct gen_dev *gn; int ret; if (!try_module_get(THIS_MODULE)) return -ENODEV; - gn = kzalloc(sizeof(struct gen_nvm), GFP_KERNEL); + gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL); if (!gn) return -ENOMEM; gn->dev = dev; gn->nr_luns = dev->nr_luns; INIT_LIST_HEAD(&gn->area_list); + mutex_init(&gn->lock); + INIT_LIST_HEAD(&gn->targets); dev->mp = gn; - ret = gennvm_luns_init(dev, gn); + ret = gen_luns_init(dev, gn); if (ret) { - pr_err("gennvm: could not initialize luns\n"); + pr_err("gen: could not initialize luns\n"); goto err; } - ret = gennvm_blocks_init(dev, gn); + ret = gen_blocks_init(dev, gn); if (ret) { - pr_err("gennvm: could not initialize blocks\n"); + pr_err("gen: could not initialize blocks\n"); goto err; } return 1; err: - gennvm_free(dev); + gen_free(dev); module_put(THIS_MODULE); return ret; } -static void gennvm_unregister(struct nvm_dev *dev) +static void gen_unregister(struct nvm_dev *dev) { - gennvm_free(dev); + struct gen_dev *gn = dev->mp; + struct nvm_target *t, *tmp; + + mutex_lock(&gn->lock); + list_for_each_entry_safe(t, tmp, &gn->targets, list) { + if (t->dev != dev) + continue; + __gen_remove_target(t); + } + mutex_unlock(&gn->lock); + + gen_free(dev); module_put(THIS_MODULE); } -static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, +static struct nvm_block *gen_get_blk(struct nvm_dev *dev, struct nvm_lun *vlun, unsigned long flags) { struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); struct nvm_block *blk = NULL; int is_gc = flags & NVM_IOTYPE_GC; - assert_spin_locked(&vlun->lock); - + spin_lock(&vlun->lock); if (list_empty(&lun->free_list)) { - pr_err_ratelimited("gennvm: lun %u have no free pages available", + pr_err_ratelimited("gen: lun %u have no free pages available", lun->vlun.id); goto out; } @@ -346,88 +491,58 @@ static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, goto out; blk = list_first_entry(&lun->free_list, struct nvm_block, list); - list_move_tail(&blk->list, &lun->used_list); - blk->state = NVM_BLK_ST_OPEN; + list_move_tail(&blk->list, &lun->used_list); + blk->state = NVM_BLK_ST_TGT; lun->vlun.nr_free_blocks--; - lun->vlun.nr_open_blocks++; - out: - return blk; -} - -static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, - struct nvm_lun *vlun, unsigned long flags) -{ - struct nvm_block *blk; - - spin_lock(&vlun->lock); - blk = gennvm_get_blk_unlocked(dev, vlun, flags); spin_unlock(&vlun->lock); return blk; } -static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) +static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk) { struct nvm_lun *vlun = blk->lun; struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); - assert_spin_locked(&vlun->lock); - - if (blk->state & NVM_BLK_ST_OPEN) { - list_move_tail(&blk->list, &lun->free_list); - lun->vlun.nr_open_blocks--; - lun->vlun.nr_free_blocks++; - blk->state = NVM_BLK_ST_FREE; - } else if (blk->state & NVM_BLK_ST_CLOSED) { + spin_lock(&vlun->lock); + if (blk->state & NVM_BLK_ST_TGT) { list_move_tail(&blk->list, &lun->free_list); - lun->vlun.nr_closed_blocks--; lun->vlun.nr_free_blocks++; blk->state = NVM_BLK_ST_FREE; } else if (blk->state & NVM_BLK_ST_BAD) { list_move_tail(&blk->list, &lun->bb_list); - lun->vlun.nr_bad_blocks++; blk->state = NVM_BLK_ST_BAD; } else { WARN_ON_ONCE(1); - pr_err("gennvm: erroneous block type (%lu -> %u)\n", + pr_err("gen: erroneous block type (%lu -> %u)\n", blk->id, blk->state); list_move_tail(&blk->list, &lun->bb_list); - lun->vlun.nr_bad_blocks++; - blk->state = NVM_BLK_ST_BAD; } -} - -static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) -{ - struct nvm_lun *vlun = blk->lun; - - spin_lock(&vlun->lock); - gennvm_put_blk_unlocked(dev, blk); spin_unlock(&vlun->lock); } -static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) +static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; struct gen_lun *lun; struct nvm_block *blk; - pr_debug("gennvm: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n", + pr_debug("gen: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n", ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type); if (unlikely(ppa.g.ch > dev->nr_chnls || ppa.g.lun > dev->luns_per_chnl || ppa.g.blk > dev->blks_per_lun)) { WARN_ON_ONCE(1); - pr_err("gennvm: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u", + pr_err("gen: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u", ppa.g.ch, dev->nr_chnls, ppa.g.lun, dev->luns_per_chnl, ppa.g.blk, dev->blks_per_lun); return; } - lun = &gn->luns[ppa.g.lun * ppa.g.ch]; + lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun]; blk = &lun->vlun.blocks[ppa.g.blk]; /* will be moved to bb list on put_blk from target */ @@ -435,9 +550,9 @@ static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) } /* - * mark block bad in gennvm. It is expected that the target recovers separately + * mark block bad in gen. It is expected that the target recovers separately */ -static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) +static void gen_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) { int bit = -1; int max_secs = dev->ops->max_phys_sect; @@ -447,25 +562,25 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) /* look up blocks and mark them as bad */ if (rqd->nr_ppas == 1) { - gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD); + gen_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD); return; } while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs) - gennvm_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD); + gen_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD); } -static void gennvm_end_io(struct nvm_rq *rqd) +static void gen_end_io(struct nvm_rq *rqd) { struct nvm_tgt_instance *ins = rqd->ins; if (rqd->error == NVM_RSP_ERR_FAILWRITE) - gennvm_mark_blk_bad(rqd->dev, rqd); + gen_mark_blk_bad(rqd->dev, rqd); ins->tt->end_io(rqd); } -static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { if (!dev->ops->submit_io) return -ENODEV; @@ -474,11 +589,11 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) nvm_generic_to_addr_mode(dev, rqd); rqd->dev = dev; - rqd->end_io = gennvm_end_io; + rqd->end_io = gen_end_io; return dev->ops->submit_io(dev, rqd); } -static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, +static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, unsigned long flags) { struct ppa_addr addr = block_to_ppa(dev, blk); @@ -486,19 +601,19 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, return nvm_erase_ppa(dev, &addr, 1); } -static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid) +static int gen_reserve_lun(struct nvm_dev *dev, int lunid) { return test_and_set_bit(lunid, dev->lun_map); } -static void gennvm_release_lun(struct nvm_dev *dev, int lunid) +static void gen_release_lun(struct nvm_dev *dev, int lunid) { WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); } -static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) +static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; if (unlikely(lunid >= dev->nr_luns)) return NULL; @@ -506,66 +621,62 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) return &gn->luns[lunid].vlun; } -static void gennvm_lun_info_print(struct nvm_dev *dev) +static void gen_lun_info_print(struct nvm_dev *dev) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; struct gen_lun *lun; unsigned int i; - gennvm_for_each_lun(gn, lun, i) { + gen_for_each_lun(gn, lun, i) { spin_lock(&lun->vlun.lock); - pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n", - dev->name, i, - lun->vlun.nr_free_blocks, - lun->vlun.nr_open_blocks, - lun->vlun.nr_closed_blocks, - lun->vlun.nr_bad_blocks); + pr_info("%s: lun%8u\t%u\n", dev->name, i, + lun->vlun.nr_free_blocks); spin_unlock(&lun->vlun.lock); } } -static struct nvmm_type gennvm = { +static struct nvmm_type gen = { .name = "gennvm", .version = {0, 1, 0}, - .register_mgr = gennvm_register, - .unregister_mgr = gennvm_unregister, + .register_mgr = gen_register, + .unregister_mgr = gen_unregister, - .get_blk_unlocked = gennvm_get_blk_unlocked, - .put_blk_unlocked = gennvm_put_blk_unlocked, + .create_tgt = gen_create_tgt, + .remove_tgt = gen_remove_tgt, - .get_blk = gennvm_get_blk, - .put_blk = gennvm_put_blk, + .get_blk = gen_get_blk, + .put_blk = gen_put_blk, - .submit_io = gennvm_submit_io, - .erase_blk = gennvm_erase_blk, + .submit_io = gen_submit_io, + .erase_blk = gen_erase_blk, - .mark_blk = gennvm_mark_blk, + .mark_blk = gen_mark_blk, - .get_lun = gennvm_get_lun, - .reserve_lun = gennvm_reserve_lun, - .release_lun = gennvm_release_lun, - .lun_info_print = gennvm_lun_info_print, + .get_lun = gen_get_lun, + .reserve_lun = gen_reserve_lun, + .release_lun = gen_release_lun, + .lun_info_print = gen_lun_info_print, - .get_area = gennvm_get_area, - .put_area = gennvm_put_area, + .get_area = gen_get_area, + .put_area = gen_put_area, }; -static int __init gennvm_module_init(void) +static int __init gen_module_init(void) { - return nvm_register_mgr(&gennvm); + return nvm_register_mgr(&gen); } -static void gennvm_module_exit(void) +static void gen_module_exit(void) { - nvm_unregister_mgr(&gennvm); + nvm_unregister_mgr(&gen); } -module_init(gennvm_module_init); -module_exit(gennvm_module_exit); +module_init(gen_module_init); +module_exit(gen_module_exit); MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Generic media manager for Open-Channel SSDs"); +MODULE_DESCRIPTION("General media manager for Open-Channel SSDs"); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h index 04d7c23..8ecfa81 100644 --- a/drivers/lightnvm/gennvm.h +++ b/drivers/lightnvm/gennvm.h @@ -34,20 +34,24 @@ struct gen_lun { */ }; -struct gen_nvm { +struct gen_dev { struct nvm_dev *dev; int nr_luns; struct gen_lun *luns; struct list_head area_list; + + struct mutex lock; + struct list_head targets; }; -struct gennvm_area { +struct gen_area { struct list_head list; sector_t begin; sector_t end; /* end is excluded */ }; -#define gennvm_for_each_lun(bm, lun, i) \ + +#define gen_for_each_lun(bm, lun, i) \ for ((i) = 0, lun = &(bm)->luns[0]; \ (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)]) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index de86d72..37fcaad 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -48,7 +48,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a) } static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba, - unsigned len) + unsigned int len) { sector_t i; @@ -96,10 +96,13 @@ static void rrpc_discard(struct rrpc *rrpc, struct bio *bio) sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; struct nvm_rq *rqd; - do { + while (1) { rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len); + if (rqd) + break; + schedule(); - } while (!rqd); + } if (IS_ERR(rqd)) { pr_err("rrpc: unable to acquire inflight IO\n"); @@ -172,39 +175,32 @@ static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr) } /* requires lun->lock taken */ -static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk) +static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk, + struct rrpc_block **cur_rblk) { struct rrpc *rrpc = rlun->rrpc; - BUG_ON(!rblk); - - if (rlun->cur) { - spin_lock(&rlun->cur->lock); - WARN_ON(!block_is_full(rrpc, rlun->cur)); - spin_unlock(&rlun->cur->lock); + if (*cur_rblk) { + spin_lock(&(*cur_rblk)->lock); + WARN_ON(!block_is_full(rrpc, *cur_rblk)); + spin_unlock(&(*cur_rblk)->lock); } - rlun->cur = rblk; + *cur_rblk = new_rblk; } static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, unsigned long flags) { - struct nvm_lun *lun = rlun->parent; struct nvm_block *blk; struct rrpc_block *rblk; - spin_lock(&lun->lock); - blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags); + blk = nvm_get_blk(rrpc->dev, rlun->parent, flags); if (!blk) { pr_err("nvm: rrpc: cannot get new block from media manager\n"); - spin_unlock(&lun->lock); return NULL; } rblk = rrpc_get_rblk(rlun, blk->id); - list_add_tail(&rblk->list, &rlun->open_list); - spin_unlock(&lun->lock); - blk->priv = rblk; bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk); rblk->next_page = 0; @@ -216,13 +212,7 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) { - struct rrpc_lun *rlun = rblk->rlun; - struct nvm_lun *lun = rlun->parent; - - spin_lock(&lun->lock); - nvm_put_blk_unlocked(rrpc->dev, rblk->parent); - list_del(&rblk->list); - spin_unlock(&lun->lock); + nvm_put_blk(rrpc->dev, rblk->parent); } static void rrpc_put_blks(struct rrpc *rrpc) @@ -508,21 +498,11 @@ static void rrpc_gc_queue(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct rrpc_lun *rlun = rblk->rlun; - struct nvm_lun *lun = rblk->parent->lun; - struct nvm_block *blk = rblk->parent; spin_lock(&rlun->lock); list_add_tail(&rblk->prio, &rlun->prio_list); spin_unlock(&rlun->lock); - spin_lock(&lun->lock); - lun->nr_open_blocks--; - lun->nr_closed_blocks++; - blk->state &= ~NVM_BLK_ST_OPEN; - blk->state |= NVM_BLK_ST_CLOSED; - list_move_tail(&rblk->list, &rlun->closed_list); - spin_unlock(&lun->lock); - mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' is full, allow GC (sched)\n", rblk->parent->id); @@ -596,21 +576,20 @@ out: return addr; } -/* Simple round-robin Logical to physical address translation. - * - * Retrieve the mapping using the active append point. Then update the ap for - * the next write to the disk. +/* Map logical address to a physical page. The mapping implements a round robin + * approach and allocates a page from the next lun available. * - * Returns rrpc_addr with the physical address and block. Remember to return to - * rrpc->addr_cache when request is finished. + * Returns rrpc_addr with the physical address and block. Returns NULL if no + * blocks in the next rlun are available. */ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr, int is_gc) { struct rrpc_lun *rlun; - struct rrpc_block *rblk; + struct rrpc_block *rblk, **cur_rblk; struct nvm_lun *lun; u64 paddr; + int gc_force = 0; rlun = rrpc_get_lun_rr(rrpc, is_gc); lun = rlun->parent; @@ -618,41 +597,65 @@ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr, if (!is_gc && lun->nr_free_blocks < rrpc->nr_luns * 4) return NULL; - spin_lock(&rlun->lock); + /* + * page allocation steps: + * 1. Try to allocate new page from current rblk + * 2a. If succeed, proceed to map it in and return + * 2b. If fail, first try to allocate a new block from media manger, + * and then retry step 1. Retry until the normal block pool is + * exhausted. + * 3. If exhausted, and garbage collector is requesting the block, + * go to the reserved block and retry step 1. + * In the case that this fails as well, or it is not GC + * requesting, report not able to retrieve a block and let the + * caller handle further processing. + */ + spin_lock(&rlun->lock); + cur_rblk = &rlun->cur; rblk = rlun->cur; retry: paddr = rrpc_alloc_addr(rrpc, rblk); - if (paddr == ADDR_EMPTY) { - rblk = rrpc_get_blk(rrpc, rlun, 0); - if (rblk) { - rrpc_set_lun_cur(rlun, rblk); - goto retry; - } + if (paddr != ADDR_EMPTY) + goto done; - if (is_gc) { - /* retry from emergency gc block */ - paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur); - if (paddr == ADDR_EMPTY) { - rblk = rrpc_get_blk(rrpc, rlun, 1); - if (!rblk) { - pr_err("rrpc: no more blocks"); - goto err; - } - - rlun->gc_cur = rblk; - paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur); - } - rblk = rlun->gc_cur; - } + if (!list_empty(&rlun->wblk_list)) { +new_blk: + rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block, + prio); + rrpc_set_lun_cur(rlun, rblk, cur_rblk); + list_del(&rblk->prio); + goto retry; + } + spin_unlock(&rlun->lock); + + rblk = rrpc_get_blk(rrpc, rlun, gc_force); + if (rblk) { + spin_lock(&rlun->lock); + list_add_tail(&rblk->prio, &rlun->wblk_list); + /* + * another thread might already have added a new block, + * Therefore, make sure that one is used, instead of the + * one just added. + */ + goto new_blk; } + if (unlikely(is_gc) && !gc_force) { + /* retry from emergency gc block */ + cur_rblk = &rlun->gc_cur; + rblk = rlun->gc_cur; + gc_force = 1; + spin_lock(&rlun->lock); + goto retry; + } + + pr_err("rrpc: failed to allocate new block\n"); + return NULL; +done: spin_unlock(&rlun->lock); return rrpc_update_map(rrpc, laddr, rblk, paddr); -err: - spin_unlock(&rlun->lock); - return NULL; } static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk) @@ -850,14 +853,14 @@ static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio, return NVM_IO_ERR; } - if (bio_rw(bio) == WRITE) + if (bio_op(bio) == REQ_OP_WRITE) return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags, npages); return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages); } - if (bio_rw(bio) == WRITE) + if (bio_op(bio) == REQ_OP_WRITE) return rrpc_write_rq(rrpc, bio, rqd, flags); return rrpc_read_rq(rrpc, bio, rqd, flags); @@ -1196,8 +1199,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) rlun->rrpc = rrpc; INIT_LIST_HEAD(&rlun->prio_list); - INIT_LIST_HEAD(&rlun->open_list); - INIT_LIST_HEAD(&rlun->closed_list); + INIT_LIST_HEAD(&rlun->wblk_list); INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); spin_lock_init(&rlun->lock); @@ -1338,14 +1340,13 @@ static int rrpc_luns_configure(struct rrpc *rrpc) rblk = rrpc_get_blk(rrpc, rlun, 0); if (!rblk) goto err; - - rrpc_set_lun_cur(rlun, rblk); + rrpc_set_lun_cur(rlun, rblk, &rlun->cur); /* Emergency gc block */ rblk = rrpc_get_blk(rrpc, rlun, 1); if (!rblk) goto err; - rlun->gc_cur = rblk; + rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur); } return 0; diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index 87e84b5..5e87d52 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -56,7 +56,6 @@ struct rrpc_block { struct nvm_block *parent; struct rrpc_lun *rlun; struct list_head prio; - struct list_head list; #define MAX_INVALID_PAGES_STORAGE 8 /* Bitmap for invalid page intries */ @@ -77,13 +76,7 @@ struct rrpc_lun { struct rrpc_block *blocks; /* Reference to block allocation */ struct list_head prio_list; /* Blocks that may be GC'ed */ - struct list_head open_list; /* In-use open blocks. These are blocks - * that can be both written to and read - * from - */ - struct list_head closed_list; /* In-use closed blocks. These are - * blocks that can _only_ be read from - */ + struct list_head wblk_list; /* Queued blocks to be written to */ struct work_struct ws_gc; @@ -188,7 +181,7 @@ static inline int request_intersects(struct rrpc_inflight_rq *r, } static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, - unsigned pages, struct rrpc_inflight_rq *r) + unsigned int pages, struct rrpc_inflight_rq *r) { sector_t laddr_end = laddr + pages - 1; struct rrpc_inflight_rq *rtmp; @@ -213,7 +206,7 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, } static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, - unsigned pages, + unsigned int pages, struct rrpc_inflight_rq *r) { BUG_ON((laddr + pages) > rrpc->nr_sects); diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index 994697a..a75bd28 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -39,7 +39,8 @@ static inline int scan_ppa_idx(int row, int blkid) return (row * MAX_BLKS_PR_SYSBLK) + blkid; } -void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) +static void nvm_sysblk_to_cpu(struct nvm_sb_info *info, + struct nvm_system_block *sb) { info->seqnr = be32_to_cpu(sb->seqnr); info->erase_cnt = be32_to_cpu(sb->erase_cnt); @@ -48,7 +49,8 @@ void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); } -void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) +static void nvm_cpu_to_sysblk(struct nvm_system_block *sb, + struct nvm_sb_info *info) { sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); sb->seqnr = cpu_to_be32(info->seqnr); @@ -86,7 +88,7 @@ static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) return nr_rows; } -void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, +static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, struct ppa_addr *sysblk_ppas) { memset(s, 0, sizeof(struct sysblk_scan)); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9eaf1d6..864e673 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -112,7 +112,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) EXPORT_SYMBOL(closure_wait); /** - * closure_sync - sleep until a closure a closure has nothing left to wait on + * closure_sync - sleep until a closure has nothing left to wait on * * Sleeps until the refcount hits 1 - the thread that's running the closure owns * the last refcount. diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 782cc2c..9b2fe2d 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -31,7 +31,8 @@ * passing it, as you might expect, the function to run when nothing is pending * and the workqueue to run that function out of. * - * continue_at() also, critically, is a macro that returns the calling function. + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. * There's good reason for this. * * To use safely closures asynchronously, they must always have a refcount while diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fd885cc..e97b0ac 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -25,7 +25,6 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bio *bio = &b->bio; bio_init(bio); - bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; bio->bi_max_vecs = bucket_pages(c); bio->bi_io_vec = bio->bi_inline_vecs; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c944daf7..88ef6d1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -134,7 +134,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, case BCACHE_SB_VERSION_CDEV: case BCACHE_SB_VERSION_CDEV_WITH_UUID: sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->block_size = le16_to_cpu(s->block_size); sb->bucket_size = le16_to_cpu(s->bucket_size); sb->nr_in_set = le16_to_cpu(s->nr_in_set); @@ -1520,7 +1519,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || - !(c->moving_gc_wq = create_workqueue("bcache_gc")) || + !(c->moving_gc_wq = alloc_workqueue("bcache_gc", + WQ_MEM_RECLAIM, 0)) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || bch_open_buckets_alloc(c) || @@ -1805,7 +1805,7 @@ void bch_cache_release(struct kobject *kobj) module_put(THIS_MODULE); } -static int cache_alloc(struct cache_sb *sb, struct cache *ca) +static int cache_alloc(struct cache *ca) { size_t free; struct bucket *b; @@ -1860,7 +1860,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); - ret = cache_alloc(sb, ca); + ret = cache_alloc(ca); if (ret != 0) goto err; @@ -2099,7 +2099,7 @@ static int __init bcache_init(void) return bcache_major; } - if (!(bcache_wq = create_workqueue("bcache")) || + if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || sysfs_create_files(bcache_kobj, files) || bch_request_init() || diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9f5f460..dac55b2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -528,7 +528,7 @@ static void read_callback(unsigned long error, void *context) DMWARN_LIMIT("Read failure on mirror device %s. " "Trying alternative device.", m->dev->name); - queue_bio(m->ms, bio, bio_rw(bio)); + queue_bio(m->ms, bio, bio_data_dir(bio)); return; } @@ -1193,7 +1193,7 @@ static void mirror_dtr(struct dm_target *ti) */ static int mirror_map(struct dm_target *ti, struct bio *bio) { - int r, rw = bio_rw(bio); + int r, rw = bio_data_dir(bio); struct mirror *m; struct mirror_set *ms = ti->private; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); @@ -1217,7 +1217,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) * If region is not in-sync queue the bio. */ if (!r || (r == -EWOULDBLOCK)) { - if (rw == READA) + if (bio->bi_rw & REQ_RAHEAD) return -EWOULDBLOCK; queue_bio(ms, bio, rw); @@ -1242,7 +1242,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) { - int rw = bio_rw(bio); + int rw = bio_data_dir(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror *m = NULL; struct dm_bio_details *bd = NULL; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 69ab1ff..cc2f14b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1696,7 +1696,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) * to copy an exception */ down_write(&s->lock); - if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_rw(bio) == WRITE)) { + if (!s->valid || (unlikely(s->snapshot_overflowed) && + bio_data_dir(bio) == WRITE)) { r = -EIO; goto out_unlock; } @@ -1713,7 +1714,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) * flags so we should only get this if we are * writeable. */ - if (bio_rw(bio) == WRITE) { + if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { up_write(&s->lock); @@ -1819,7 +1820,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) e = dm_lookup_exception(&s->complete, chunk); if (e) { /* Queue writes overlapping with chunks being merged */ - if (bio_rw(bio) == WRITE && + if (bio_data_dir(bio) == WRITE && chunk >= s->first_merging_chunk && chunk < (s->first_merging_chunk + s->num_merging_chunks)) { @@ -1831,7 +1832,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) remap_exception(s, e, bio, chunk); - if (bio_rw(bio) == WRITE) + if (bio_data_dir(bio) == WRITE) track_chunk(s, bio, chunk); goto out_unlock; } @@ -1839,7 +1840,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) redirect_to_origin: bio->bi_bdev = s->origin->bdev; - if (bio_rw(bio) == WRITE) { + if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); return do_origin(s->origin, bio); } @@ -2288,7 +2289,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio->bi_rw & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; - if (bio_rw(bio) != WRITE) + if (bio_data_dir(bio) != WRITE) return DM_MAPIO_REMAPPED; available_sectors = o->split_boundary - diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 766bc93..618b875 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -35,16 +35,19 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) */ static int zero_map(struct dm_target *ti, struct bio *bio) { - switch(bio_rw(bio)) { - case READ: + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bio->bi_rw & REQ_RAHEAD) { + /* readahead of null bytes only wastes buffer cache */ + return -EIO; + } zero_fill_bio(bio); break; - case READA: - /* readahead of null bytes only wastes buffer cache */ - return -EIO; - case WRITE: + case REQ_OP_WRITE: /* writes get silently dropped */ break; + default: + return -EIO; } bio_endio(bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index aba7ed9..812fd59 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1833,7 +1833,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { dm_put_live_table(md, srcu_idx); - if (bio_rw(bio) != READA) + if (!(bio->bi_rw & REQ_RAHEAD)) queue_io(md, bio); else bio_io_error(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 10e53cd..4e6da44 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1058,7 +1058,6 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)); - const unsigned long do_sec = (bio->bi_rw & REQ_SECURE); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1106,7 +1105,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) bitmap = mddev->bitmap; /* - * make_request() can abort the operation when READA is being + * make_request() can abort the operation when read-ahead is being * used and no empty request is available. * */ @@ -1376,7 +1375,7 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - bio_set_op_attrs(mbio, op, do_flush_fua | do_sync | do_sec); + bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 245640b..26ae74f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1062,7 +1062,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); - const unsigned long do_sec = (bio->bi_rw & REQ_SECURE); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1362,7 +1361,7 @@ retry_write: rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec); + bio_set_op_attrs(mbio, op, do_sync | do_fua); mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1404,7 +1403,7 @@ retry_write: r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec); + bio_set_op_attrs(mbio, op, do_sync | do_fua); mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7aacf5b..6953d78 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5233,7 +5233,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) (unsigned long long)logical_sector); sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_rw&RWA_MASK), 0); + (bi->bi_rw & REQ_RAHEAD), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 3cd6815..40bb8ae 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2002,8 +2002,7 @@ static int msb_bd_getgeo(struct block_device *bdev, static int msb_prepare_req(struct request_queue *q, struct request *req) { - if (req->cmd_type != REQ_TYPE_FS && - req->cmd_type != REQ_TYPE_BLOCK_PC) { + if (req->cmd_type != REQ_TYPE_FS) { blk_dump_rq_flags(req, "MS unsupported request"); return BLKPREP_KILL; } @@ -2146,7 +2145,6 @@ static int msb_init_disk(struct memstick_dev *card) msb->disk->fops = &msb_bdops; msb->disk->private_data = msb; msb->disk->queue = msb->queue; - msb->disk->driverfs_dev = &card->dev; msb->disk->flags |= GENHD_FL_EXT_DEVT; capacity = msb->pages_in_block * msb->logical_block_count; @@ -2163,7 +2161,7 @@ static int msb_init_disk(struct memstick_dev *card) set_disk_ro(msb->disk, 1); msb_start(card); - add_disk(msb->disk); + device_add_disk(&card->dev, msb->disk); dbg("Disk added"); return 0; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 0fb27d3..c147227 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -829,8 +829,7 @@ static void mspro_block_start(struct memstick_dev *card) static int mspro_block_prepare_req(struct request_queue *q, struct request *req) { - if (req->cmd_type != REQ_TYPE_FS && - req->cmd_type != REQ_TYPE_BLOCK_PC) { + if (req->cmd_type != REQ_TYPE_FS) { blk_dump_rq_flags(req, "MSPro unsupported request"); return BLKPREP_KILL; } @@ -1243,7 +1242,6 @@ static int mspro_block_init_disk(struct memstick_dev *card) msb->usage_count = 1; msb->disk->private_data = msb; msb->disk->queue = msb->queue; - msb->disk->driverfs_dev = &card->dev; sprintf(msb->disk->disk_name, "mspblk%d", disk_id); @@ -1255,7 +1253,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) set_capacity(msb->disk, capacity); dev_dbg(&card->dev, "capacity set %ld\n", capacity); - add_disk(msb->disk); + device_add_disk(&card->dev, msb->disk); msb->active = 1; return 0; diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 11ee414..10b5537 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -93,6 +93,7 @@ static DEFINE_SPINLOCK(mmc_blk_lock); */ struct mmc_blk_data { spinlock_t lock; + struct device *parent; struct gendisk *disk; struct mmc_queue queue; struct list_head part; @@ -2169,10 +2170,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) /* complete ongoing async transfer before issuing discard */ if (card->host->areq) mmc_blk_issue_rw_rq(mq, NULL); - if (req->cmd_flags & REQ_SECURE) - ret = mmc_blk_issue_secdiscard_rq(mq, req); - else - ret = mmc_blk_issue_discard_rq(mq, req); + ret = mmc_blk_issue_discard_rq(mq, req); + } else if (req && req_op(req) == REQ_OP_SECURE_ERASE) { + /* complete ongoing async transfer before issuing secure erase*/ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); + ret = mmc_blk_issue_secdiscard_rq(mq, req); } else if (req && req_op(req) == REQ_OP_FLUSH) { /* complete ongoing async transfer before issuing flush */ if (card->host->areq) @@ -2270,7 +2273,7 @@ again: md->disk->fops = &mmc_bdops; md->disk->private_data = md; md->disk->queue = md->queue.queue; - md->disk->driverfs_dev = parent; + md->parent = parent; set_disk_ro(md->disk, md->read_only || default_ro); md->disk->flags = GENHD_FL_EXT_DEVT; if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT)) @@ -2458,7 +2461,7 @@ static int mmc_add_disk(struct mmc_blk_data *md) int ret; struct mmc_card *card = md->queue.card; - add_disk(md->disk); + device_add_disk(md->parent, md->disk); md->force_ro.show = force_ro_show; md->force_ro.store = force_ro_store; sysfs_attr_init(&md->force_ro.attr); diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index c2d5f6f..bf14642 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -171,7 +171,7 @@ static void mmc_queue_setup_discard(struct request_queue *q, if (card->pref_erase > max_discard) q->limits.discard_granularity = 0; if (mmc_can_secure_erase_trim(card)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q); } /** diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 78b3eb4..8d58acf 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -431,12 +431,10 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto error4; INIT_WORK(&new->work, mtd_blktrans_work); - gd->driverfs_dev = &new->mtd->dev; - if (new->readonly) set_disk_ro(gd, 1); - add_disk(gd); + device_add_disk(&new->mtd->dev, gd); if (new->disk_attributes) { ret = sysfs_create_group(&disk_to_dev(gd)->kobj, diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 495e06d9..7e262ef 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -287,14 +287,13 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) return -ENOMEM; } - disk->driverfs_dev = dev; disk->first_minor = 0; disk->fops = &nd_blk_fops; disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name); set_capacity(disk, 0); - add_disk(disk); + device_add_disk(dev, disk); if (nsblk_meta_size(nsblk)) { int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk)); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 68a7c3c..9dce03f 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1243,7 +1243,6 @@ static int btt_blk_init(struct btt *btt) } nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); - btt->btt_disk->driverfs_dev = &btt->nd_btt->dev; btt->btt_disk->first_minor = 0; btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; @@ -1258,7 +1257,7 @@ static int btt_blk_init(struct btt *btt) btt->btt_queue->queuedata = btt; set_capacity(btt->btt_disk, 0); - add_disk(btt->btt_disk); + device_add_disk(&btt->nd_btt->dev, btt->btt_disk); if (btt_meta_size(btt)) { int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index f085f8b..5e4e5c7 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -312,7 +312,7 @@ EXPORT_SYMBOL(__nd_driver_register); int nvdimm_revalidate_disk(struct gendisk *disk) { - struct device *dev = disk->driverfs_dev; + struct device *dev = disk_to_dev(disk)->parent; struct nd_region *nd_region = to_nd_region(dev->parent); const char *pol = nd_region->ro ? "only" : "write"; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 53b701b..36cb390 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -298,14 +298,13 @@ static int pmem_attach_disk(struct device *dev, disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; nvdimm_namespace_disk_name(ndns, disk->disk_name); - disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) / 512); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res); disk->bb = &pmem->bb; - add_disk(disk); + device_add_disk(dev, disk); revalidate_disk(disk); return 0; diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index a39d943..b7c78a5 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig @@ -1 +1,2 @@ source "drivers/nvme/host/Kconfig" +source "drivers/nvme/target/Kconfig" diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile index 9421e82..0096a7f 100644 --- a/drivers/nvme/Makefile +++ b/drivers/nvme/Makefile @@ -1,2 +1,3 @@ obj-y += host/ +obj-y += target/ diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index d296fc3..db39d53 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -24,3 +24,22 @@ config BLK_DEV_NVME_SCSI to say N here, unless you run a distro that abuses the SCSI emulation to provide stable device names for mount by id, like some OpenSuSE and SLES versions. + +config NVME_FABRICS + tristate + +config NVME_RDMA + tristate "NVM Express over Fabrics RDMA host driver" + depends on INFINIBAND + depends on BLK_DEV_NVME + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the RDMA (Infiniband, RoCE, iWarp) transport. This allows you + to use remote block devices exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 9a3ca89..47abcec 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,8 +1,14 @@ obj-$(CONFIG_NVME_CORE) += nvme-core.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o +obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o nvme-core-y := core.o nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o + +nvme-fabrics-y += fabrics.o + +nvme-rdma-y += rdma.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1c5a032..7ff2e82 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -30,6 +30,7 @@ #include <asm/unaligned.h> #include "nvme.h" +#include "fabrics.h" #define NVME_MINORS (1U << MINORBITS) @@ -47,8 +48,10 @@ unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_major; -module_param(nvme_major, int, 0); +unsigned int nvme_max_retries = 5; +module_param_named(max_retries, nvme_max_retries, uint, 0644); +MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); +EXPORT_SYMBOL_GPL(nvme_max_retries); static int nvme_char_major; module_param(nvme_char_major, int, 0); @@ -58,6 +61,23 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +void nvme_cancel_request(struct request *req, void *data, bool reserved) +{ + int status; + + if (!blk_mq_request_started(req)) + return; + + dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, + "Cancelling I/O %d", req->tag); + + status = NVME_SC_ABORT_REQ; + if (blk_queue_dying(req->q)) + status |= NVME_SC_DNR; + blk_mq_complete_request(req, status); +} +EXPORT_SYMBOL_GPL(nvme_cancel_request); + bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state) { @@ -68,7 +88,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_LIVE: switch (old_state) { + case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -79,6 +101,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RECONNECTING: + switch (old_state) { + case NVME_CTRL_LIVE: changed = true; /* FALLTHRU */ default: @@ -89,6 +121,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -174,21 +207,21 @@ void nvme_requeue_req(struct request *req) EXPORT_SYMBOL_GPL(nvme_requeue_req); struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags) + struct nvme_command *cmd, unsigned int flags, int qid) { - bool write = cmd->common.opcode & 1; struct request *req; - req = blk_mq_alloc_request(q, write, flags); + if (qid == NVME_QID_ANY) { + req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags); + } else { + req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags, + qid ? qid - 1 : 0); + } if (IS_ERR(req)) return req; req->cmd_type = REQ_TYPE_DRV_PRIV; req->cmd_flags |= REQ_FAILFAST_DRIVER; - req->__data_len = 0; - req->__sector = (sector_t) -1; - req->bio = req->biotail = NULL; - req->cmd = (unsigned char *)cmd; req->cmd_len = sizeof(struct nvme_command); @@ -307,12 +340,12 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct nvme_completion *cqe, void *buffer, unsigned bufflen, - unsigned timeout) + unsigned timeout, int qid, int at_head, int flags) { struct request *req; int ret; - req = nvme_alloc_request(q, cmd, 0); + req = nvme_alloc_request(q, cmd, flags, qid); if (IS_ERR(req)) return PTR_ERR(req); @@ -325,17 +358,19 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - blk_execute_rq(req->q, NULL, req, 0); + blk_execute_rq(req->q, NULL, req, at_head); ret = req->errors; out: blk_mq_free_request(req); return ret; } +EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { - return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0); + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ANY, 0, 0); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -344,7 +379,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout) { - bool write = cmd->common.opcode & 1; + bool write = nvme_is_write(cmd); struct nvme_completion cqe; struct nvme_ns *ns = q->queuedata; struct gendisk *disk = ns ? ns->disk : NULL; @@ -353,7 +388,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void *meta = NULL; int ret; - req = nvme_alloc_request(q, cmd, 0); + req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); if (IS_ERR(req)) return PTR_ERR(req); @@ -439,6 +474,74 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, result, timeout); } +static void nvme_keep_alive_end_io(struct request *rq, int error) +{ + struct nvme_ctrl *ctrl = rq->end_io_data; + + blk_mq_free_request(rq); + + if (error) { + dev_err(ctrl->device, + "failed nvme_keep_alive_end_io error=%d\n", error); + return; + } + + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static int nvme_keep_alive(struct nvme_ctrl *ctrl) +{ + struct nvme_command c; + struct request *rq; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_keep_alive; + + rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED, + NVME_QID_ANY); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + rq->timeout = ctrl->kato * HZ; + rq->end_io_data = ctrl; + + blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); + + return 0; +} + +static void nvme_keep_alive_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, ka_work); + + if (nvme_keep_alive(ctrl)) { + /* allocation failure, reset the controller */ + dev_err(ctrl->device, "keep-alive failed\n"); + ctrl->ops->reset_ctrl(ctrl); + return; + } +} + +void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} +EXPORT_SYMBOL_GPL(nvme_start_keep_alive); + +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + cancel_delayed_work_sync(&ctrl->ka_work); +} +EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); + int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; @@ -500,10 +603,11 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); + c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); if (ret >= 0) *result = le32_to_cpu(cqe.result); return ret; @@ -518,11 +622,12 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); + c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); if (ret >= 0) *result = le32_to_cpu(cqe.result); return ret; @@ -558,11 +663,22 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); - if (status) + if (status < 0) return status; - nr_io_queues = min(result & 0xffff, result >> 16) + 1; - *count = min(*count, nr_io_queues); + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be only way to fix them up. + */ + if (status > 0) { + dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); + *count = 0; + } else { + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + } + return 0; } EXPORT_SYMBOL_GPL(nvme_set_queue_count); @@ -726,6 +842,7 @@ static void nvme_init_integrity(struct nvme_ns *ns) { struct blk_integrity integrity; + memset(&integrity, 0, sizeof(integrity)); switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; @@ -764,7 +881,7 @@ static void nvme_config_discard(struct nvme_ns *ns) ns->queue->limits.discard_alignment = logical_block_size; ns->queue->limits.discard_granularity = logical_block_size; - blk_queue_max_discard_sectors(ns->queue, 0xffffffff); + blk_queue_max_discard_sectors(ns->queue, UINT_MAX); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } @@ -991,6 +1108,15 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; + + /* Checking for ctrl->tagset is a trick to avoid sleeping on module + * load, since we only need the quirk on reset_controller. Notice + * that the HGST device needs this delay only in firmware activation + * procedure; unfortunately we have no (easy) way to verify this. + */ + if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset) + msleep(NVME_QUIRK_DELAY_AMOUNT); + return nvme_wait_ready(ctrl, cap, false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); @@ -1088,6 +1214,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) struct nvme_id_ctrl *id; u64 cap; int ret, page_shift; + u32 max_hw_sectors; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1120,9 +1247,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) memcpy(ctrl->model, id->mn, sizeof(id->mn)); memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); if (id->mdts) - ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); + max_hw_sectors = 1 << (id->mdts + page_shift - 9); else - ctrl->max_hw_sectors = UINT_MAX; + max_hw_sectors = UINT_MAX; + ctrl->max_hw_sectors = + min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { unsigned int max_hw_sectors; @@ -1138,9 +1267,33 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } nvme_set_queue_limits(ctrl, ctrl->admin_q); + ctrl->sgls = le32_to_cpu(id->sgls); + ctrl->kas = le16_to_cpu(id->kas); + + if (ctrl->ops->is_fabrics) { + ctrl->icdoff = le16_to_cpu(id->icdoff); + ctrl->ioccsz = le32_to_cpu(id->ioccsz); + ctrl->iorcsz = le32_to_cpu(id->iorcsz); + ctrl->maxcmd = le16_to_cpu(id->maxcmd); + + /* + * In fabrics we need to verify the cntlid matches the + * admin connect + */ + if (ctrl->cntlid != le16_to_cpu(id->cntlid)) + ret = -EINVAL; + + if (!ctrl->opts->discovery_nqn && !ctrl->kas) { + dev_err(ctrl->dev, + "keep-alive support is mandatory for fabrics\n"); + ret = -EINVAL; + } + } else { + ctrl->cntlid = le16_to_cpu(id->cntlid); + } kfree(id); - return 0; + return ret; } EXPORT_SYMBOL_GPL(nvme_init_identify); @@ -1322,7 +1475,7 @@ static struct attribute *nvme_ns_attrs[] = { NULL, }; -static umode_t nvme_attrs_are_visible(struct kobject *kobj, +static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); @@ -1341,7 +1494,7 @@ static umode_t nvme_attrs_are_visible(struct kobject *kobj, static const struct attribute_group nvme_ns_attr_group = { .attrs = nvme_ns_attrs, - .is_visible = nvme_attrs_are_visible, + .is_visible = nvme_ns_attrs_are_visible, }; #define nvme_show_str_function(field) \ @@ -1367,6 +1520,49 @@ nvme_show_str_function(serial); nvme_show_str_function(firmware_rev); nvme_show_int_function(cntlid); +static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (device_remove_file_self(dev, attr)) + ctrl->ops->delete_ctrl(ctrl); + return count; +} +static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + +static ssize_t nvme_sysfs_show_transport(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); +} +static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); + +static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", + ctrl->ops->get_subsysnqn(ctrl)); +} +static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); + +static ssize_t nvme_sysfs_show_address(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); +} +static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -1374,11 +1570,38 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_serial.attr, &dev_attr_firmware_rev.attr, &dev_attr_cntlid.attr, + &dev_attr_delete_controller.attr, + &dev_attr_transport.attr, + &dev_attr_subsysnqn.attr, + &dev_attr_address.attr, NULL }; +#define CHECK_ATTR(ctrl, a, name) \ + if ((a) == &dev_attr_##name.attr && \ + !(ctrl)->ops->get_##name) \ + return 0 + +static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_delete_controller.attr) { + if (!ctrl->ops->delete_ctrl) + return 0; + } + + CHECK_ATTR(ctrl, a, subsysnqn); + CHECK_ATTR(ctrl, a, address); + + return a->mode; +} + static struct attribute_group nvme_dev_attrs_group = { - .attrs = nvme_dev_attrs, + .attrs = nvme_dev_attrs, + .is_visible = nvme_dev_attrs_are_visible, }; static const struct attribute_group *nvme_dev_attr_groups[] = { @@ -1446,12 +1669,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - disk->major = nvme_major; - disk->first_minor = 0; disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->driverfs_dev = ctrl->device; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); @@ -1466,7 +1686,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->type == NVME_NS_LIGHTNVM) return; - add_disk(ns->disk); + device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_attr_group)) pr_warn("%s: failed to create sysfs group for identification\n", @@ -1517,6 +1737,17 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_alloc_ns(ctrl, nsid); } +static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, + unsigned nsid) +{ + struct nvme_ns *ns, *next; + + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->ns_id > nsid) + nvme_ns_remove(ns); + } +} + static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) { struct nvme_ns *ns; @@ -1531,7 +1762,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) for (i = 0; i < num_lists; i++) { ret = nvme_identify_ns_list(ctrl, prev, ns_list); if (ret) - goto out; + goto free; for (j = 0; j < min(nn, 1024U); j++) { nsid = le32_to_cpu(ns_list[j]); @@ -1551,22 +1782,20 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) nn -= j; } out: + nvme_remove_invalid_namespaces(ctrl, prev); + free: kfree(ns_list); return ret; } static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) { - struct nvme_ns *ns, *next; unsigned i; for (i = 1; i <= nn; i++) nvme_validate_ns(ctrl, i); - list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->ns_id > nn) - nvme_ns_remove(ns); - } + nvme_remove_invalid_namespaces(ctrl, nn); } static void nvme_scan_work(struct work_struct *work) @@ -1852,16 +2081,10 @@ int __init nvme_core_init(void) { int result; - result = register_blkdev(nvme_major, "nvme"); - if (result < 0) - return result; - else if (result > 0) - nvme_major = result; - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", &nvme_dev_fops); if (result < 0) - goto unregister_blkdev; + return result; else if (result > 0) nvme_char_major = result; @@ -1875,8 +2098,6 @@ int __init nvme_core_init(void) unregister_chrdev: __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); return result; } @@ -1884,7 +2105,6 @@ void nvme_core_exit(void) { class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev(nvme_major, "nvme"); } MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c new file mode 100644 index 0000000..dc99676 --- /dev/null +++ b/drivers/nvme/host/fabrics.c @@ -0,0 +1,952 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "nvme.h" +#include "fabrics.h" + +static LIST_HEAD(nvmf_transports); +static DEFINE_MUTEX(nvmf_transports_mutex); + +static LIST_HEAD(nvmf_hosts); +static DEFINE_MUTEX(nvmf_hosts_mutex); + +static struct nvmf_host *nvmf_default_host; + +static struct nvmf_host *__nvmf_host_find(const char *hostnqn) +{ + struct nvmf_host *host; + + list_for_each_entry(host, &nvmf_hosts, list) { + if (!strcmp(host->nqn, hostnqn)) + return host; + } + + return NULL; +} + +static struct nvmf_host *nvmf_host_add(const char *hostnqn) +{ + struct nvmf_host *host; + + mutex_lock(&nvmf_hosts_mutex); + host = __nvmf_host_find(hostnqn); + if (host) + goto out_unlock; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + goto out_unlock; + + kref_init(&host->ref); + memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + uuid_le_gen(&host->id); + + list_add_tail(&host->list, &nvmf_hosts); +out_unlock: + mutex_unlock(&nvmf_hosts_mutex); + return host; +} + +static struct nvmf_host *nvmf_host_default(void) +{ + struct nvmf_host *host; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return NULL; + + kref_init(&host->ref); + uuid_le_gen(&host->id); + snprintf(host->nqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUl", &host->id); + + mutex_lock(&nvmf_hosts_mutex); + list_add_tail(&host->list, &nvmf_hosts); + mutex_unlock(&nvmf_hosts_mutex); + + return host; +} + +static void nvmf_host_destroy(struct kref *ref) +{ + struct nvmf_host *host = container_of(ref, struct nvmf_host, ref); + + mutex_lock(&nvmf_hosts_mutex); + list_del(&host->list); + mutex_unlock(&nvmf_hosts_mutex); + + kfree(host); +} + +static void nvmf_host_put(struct nvmf_host *host) +{ + if (host) + kref_put(&host->ref, nvmf_host_destroy); +} + +/** + * nvmf_get_address() - Get address/port + * @ctrl: Host NVMe controller instance which we got the address + * @buf: OUTPUT parameter that will contain the address/port + * @size: buffer size + */ +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", + ctrl->opts->traddr, ctrl->opts->trsvcid); +} +EXPORT_SYMBOL_GPL(nvmf_get_address); + +/** + * nvmf_get_subsysnqn() - Get subsystem NQN + * @ctrl: Host NVMe controller instance which we got the NQN + */ +const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->subsysnqn; +} +EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn); + +/** + * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 32-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(cqe.result64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read32); + +/** + * nvmf_reg_read64() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 64-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.attrib = 1; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(cqe.result64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read64); + +/** + * nvmf_reg_write32() - NVMe Fabrics "Property Write" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: Input parameter that contains the value to be + * written to the property. + * + * Used by the NVMe host system to write a 32-bit capsule property value + * to an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful write + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + struct nvme_command cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_set.opcode = nvme_fabrics_command; + cmd.prop_set.fctype = nvme_fabrics_type_property_set; + cmd.prop_set.attrib = 0; + cmd.prop_set.offset = cpu_to_le32(off); + cmd.prop_set.value = cpu_to_le64(val); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (unlikely(ret)) + dev_err(ctrl->device, + "Property Set error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_write32); + +/** + * nvmf_log_connect_error() - Error-parsing-diagnostic print + * out function for connect() errors. + * + * @ctrl: the specific /dev/nvmeX device that had the error. + * + * @errval: Error code to be decoded in a more human-friendly + * printout. + * + * @offset: For use with the NVMe error code NVME_SC_CONNECT_INVALID_PARAM. + * + * @cmd: This is the SQE portion of a submission capsule. + * + * @data: This is the "Data" portion of a submission capsule. + */ +static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, + int errval, int offset, struct nvme_command *cmd, + struct nvmf_connect_data *data) +{ + int err_sctype = errval & (~NVME_SC_DNR); + + switch (err_sctype) { + + case (NVME_SC_CONNECT_INVALID_PARAM): + if (offset >> 16) { + char *inv_data = "Connect Invalid Data Parameter"; + + switch (offset & 0xffff) { + case (offsetof(struct nvmf_connect_data, cntlid)): + dev_err(ctrl->device, + "%s, cntlid: %d\n", + inv_data, data->cntlid); + break; + case (offsetof(struct nvmf_connect_data, hostnqn)): + dev_err(ctrl->device, + "%s, hostnqn \"%s\"\n", + inv_data, data->hostnqn); + break; + case (offsetof(struct nvmf_connect_data, subsysnqn)): + dev_err(ctrl->device, + "%s, subsysnqn \"%s\"\n", + inv_data, data->subsysnqn); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_data, offset & 0xffff); + break; + } + } else { + char *inv_sqe = "Connect Invalid SQE Parameter"; + + switch (offset) { + case (offsetof(struct nvmf_connect_command, qid)): + dev_err(ctrl->device, + "%s, qid %d\n", + inv_sqe, cmd->connect.qid); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_sqe, offset); + } + } + break; + default: + dev_err(ctrl->device, + "Connect command failed, error wo/DNR bit: %d\n", + err_sctype); + break; + } /* switch (err_sctype) */ +} + +/** + * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to request + * a new NVMe controller allocation on the target + * system and establish an NVMe Admin connection to + * that controller. + * + * This function enables an NVMe host device to request a new allocation of + * an NVMe controller resource on a target system as well establish a + * fabrics-protocol connection of the NVMe Admin queue between the + * host system device and the allocated NVMe controller on the + * target system via a NVMe Fabrics "Connect" command. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + * + */ +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + struct nvmf_connect_data *data; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = 0; + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + /* + * Set keep-alive timeout in seconds granularity (ms * 1000) + * and add a grace period for controller kato enforcement + */ + cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : + cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le)); + data->cntlid = cpu_to_le16(0xffff); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, + data, sizeof(*data), 0, NVME_QID_ANY, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result), + &cmd, data); + goto out_free_data; + } + + ctrl->cntlid = le16_to_cpu(cqe.result16); + +out_free_data: + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); + +/** + * nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to establish an + * NVMe I/O queue connection to the already allocated NVMe + * controller on the target system. + * @qid: NVMe I/O queue number for the new I/O connection between + * host and target (note qid == 0 is illegal as this is + * the Admin queue, per NVMe standard). + * + * This function issues a fabrics-protocol connection + * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) + * between the host system device and the allocated NVMe controller + * on the target system. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_command cmd; + struct nvmf_connect_data *data; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = cpu_to_le16(qid); + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le)); + data->cntlid = cpu_to_le16(ctrl->cntlid); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &cqe, + data, sizeof(*data), 0, qid, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result), + &cmd, data); + } + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); + +/** + * nvmf_register_transport() - NVMe Fabrics Library registration function. + * @ops: Transport ops instance to be registered to the + * common fabrics library. + * + * API function that registers the type of specific transport fabric + * being implemented to the common NVMe fabrics library. Part of + * the overall init sequence of starting up a fabrics driver. + */ +void nvmf_register_transport(struct nvmf_transport_ops *ops) +{ + mutex_lock(&nvmf_transports_mutex); + list_add_tail(&ops->entry, &nvmf_transports); + mutex_unlock(&nvmf_transports_mutex); +} +EXPORT_SYMBOL_GPL(nvmf_register_transport); + +/** + * nvmf_unregister_transport() - NVMe Fabrics Library unregistration function. + * @ops: Transport ops instance to be unregistered from the + * common fabrics library. + * + * Fabrics API function that unregisters the type of specific transport + * fabric being implemented from the common NVMe fabrics library. + * Part of the overall exit sequence of unloading the implemented driver. + */ +void nvmf_unregister_transport(struct nvmf_transport_ops *ops) +{ + mutex_lock(&nvmf_transports_mutex); + list_del(&ops->entry); + mutex_unlock(&nvmf_transports_mutex); +} +EXPORT_SYMBOL_GPL(nvmf_unregister_transport); + +static struct nvmf_transport_ops *nvmf_lookup_transport( + struct nvmf_ctrl_options *opts) +{ + struct nvmf_transport_ops *ops; + + lockdep_assert_held(&nvmf_transports_mutex); + + list_for_each_entry(ops, &nvmf_transports, entry) { + if (strcmp(ops->name, opts->transport) == 0) + return ops; + } + + return NULL; +} + +static const match_table_t opt_tokens = { + { NVMF_OPT_TRANSPORT, "transport=%s" }, + { NVMF_OPT_TRADDR, "traddr=%s" }, + { NVMF_OPT_TRSVCID, "trsvcid=%s" }, + { NVMF_OPT_NQN, "nqn=%s" }, + { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" }, + { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" }, + { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, + { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, + { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_ERR, NULL } +}; + +static int nvmf_parse_options(struct nvmf_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + size_t nqnlen = 0; + + /* Set defaults */ + opts->queue_size = NVMF_DEF_QUEUE_SIZE; + opts->nr_io_queues = num_online_cpus(); + opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_TRANSPORT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->transport = p; + break; + case NVMF_OPT_NQN: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->subsysnqn = p; + nqnlen = strlen(opts->subsysnqn); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + opts->subsysnqn, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->discovery_nqn = + !(strcmp(opts->subsysnqn, + NVME_DISC_SUBSYS_NAME)); + if (opts->discovery_nqn) + opts->nr_io_queues = 0; + break; + case NVMF_OPT_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->traddr = p; + break; + case NVMF_OPT_TRSVCID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->trsvcid = p; + break; + case NVMF_OPT_QUEUE_SIZE: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < NVMF_MIN_QUEUE_SIZE || + token > NVMF_MAX_QUEUE_SIZE) { + pr_err("Invalid queue_size %d\n", token); + ret = -EINVAL; + goto out; + } + opts->queue_size = token; + break; + case NVMF_OPT_NR_IO_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid number of IOQs %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_io_queues = min_t(unsigned int, + num_online_cpus(), token); + break; + case NVMF_OPT_KATO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (opts->discovery_nqn) { + pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n"); + ret = -EINVAL; + goto out; + } + + if (token < 0) { + pr_err("Invalid keep_alive_tmo %d\n", token); + ret = -EINVAL; + goto out; + } else if (token == 0) { + /* Allowed for debug */ + pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); + } + opts->kato = token; + break; + case NVMF_OPT_HOSTNQN: + if (opts->host) { + pr_err("hostnqn already user-assigned: %s\n", + opts->host->nqn); + ret = -EADDRINUSE; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + nqnlen = strlen(p); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + p, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->host = nvmf_host_add(p); + if (!opts->host) { + ret = -ENOMEM; + goto out; + } + break; + case NVMF_OPT_RECONNECT_DELAY: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid reconnect_delay %d\n", token); + ret = -EINVAL; + goto out; + } + opts->reconnect_delay = token; + break; + default: + pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", + p); + ret = -EINVAL; + goto out; + } + } + + if (!opts->host) { + kref_get(&nvmf_default_host->ref); + opts->host = nvmf_default_host; + } + +out: + if (!opts->discovery_nqn && !opts->kato) + opts->kato = NVME_DEFAULT_KATO; + kfree(options); + return ret; +} + +static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, + unsigned int required_opts) +{ + if ((opts->mask & required_opts) != required_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & required_opts) && + !(opt_tokens[i].token & opts->mask)) { + pr_warn("missing parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, + unsigned int allowed_opts) +{ + if (opts->mask & ~allowed_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if (opt_tokens[i].token & ~allowed_opts) { + pr_warn("invalid parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +void nvmf_free_options(struct nvmf_ctrl_options *opts) +{ + nvmf_host_put(opts->host); + kfree(opts->transport); + kfree(opts->traddr); + kfree(opts->trsvcid); + kfree(opts->subsysnqn); + kfree(opts); +} +EXPORT_SYMBOL_GPL(nvmf_free_options); + +#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) +#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN) + +static struct nvme_ctrl * +nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) +{ + struct nvmf_ctrl_options *opts; + struct nvmf_transport_ops *ops; + struct nvme_ctrl *ctrl; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return ERR_PTR(-ENOMEM); + + ret = nvmf_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* + * Check the generic options first as we need a valid transport for + * the lookup below. Then clear the generic flags so that transport + * drivers don't have to care about them. + */ + ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS); + if (ret) + goto out_free_opts; + opts->mask &= ~NVMF_REQUIRED_OPTS; + + mutex_lock(&nvmf_transports_mutex); + ops = nvmf_lookup_transport(opts); + if (!ops) { + pr_info("no handler found for transport %s.\n", + opts->transport); + ret = -EINVAL; + goto out_unlock; + } + + ret = nvmf_check_required_opts(opts, ops->required_opts); + if (ret) + goto out_unlock; + ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | + ops->allowed_opts | ops->required_opts); + if (ret) + goto out_unlock; + + ctrl = ops->create_ctrl(dev, opts); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + mutex_unlock(&nvmf_transports_mutex); + return ctrl; + +out_unlock: + mutex_unlock(&nvmf_transports_mutex); +out_free_opts: + nvmf_host_put(opts->host); + kfree(opts); + return ERR_PTR(ret); +} + +static struct class *nvmf_class; +static struct device *nvmf_device; +static DEFINE_MUTEX(nvmf_dev_mutex); + +static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *pos) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl; + const char *buf; + int ret = 0; + + if (count > PAGE_SIZE) + return -ENOMEM; + + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + mutex_lock(&nvmf_dev_mutex); + if (seq_file->private) { + ret = -EINVAL; + goto out_unlock; + } + + ctrl = nvmf_create_ctrl(nvmf_device, buf, count); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + seq_file->private = ctrl; + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + kfree(buf); + return ret ? ret : count; +} + +static int nvmf_dev_show(struct seq_file *seq_file, void *private) +{ + struct nvme_ctrl *ctrl; + int ret = 0; + + mutex_lock(&nvmf_dev_mutex); + ctrl = seq_file->private; + if (!ctrl) { + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(seq_file, "instance=%d,cntlid=%d\n", + ctrl->instance, ctrl->cntlid); + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + return ret; +} + +static int nvmf_dev_open(struct inode *inode, struct file *file) +{ + /* + * The miscdevice code initializes file->private_data, but doesn't + * make use of it later. + */ + file->private_data = NULL; + return single_open(file, nvmf_dev_show, NULL); +} + +static int nvmf_dev_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl = seq_file->private; + + if (ctrl) + nvme_put_ctrl(ctrl); + return single_release(inode, file); +} + +static const struct file_operations nvmf_dev_fops = { + .owner = THIS_MODULE, + .write = nvmf_dev_write, + .read = seq_read, + .open = nvmf_dev_open, + .release = nvmf_dev_release, +}; + +static struct miscdevice nvmf_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "nvme-fabrics", + .fops = &nvmf_dev_fops, +}; + +static int __init nvmf_init(void) +{ + int ret; + + nvmf_default_host = nvmf_host_default(); + if (!nvmf_default_host) + return -ENOMEM; + + nvmf_class = class_create(THIS_MODULE, "nvme-fabrics"); + if (IS_ERR(nvmf_class)) { + pr_err("couldn't register class nvme-fabrics\n"); + ret = PTR_ERR(nvmf_class); + goto out_free_host; + } + + nvmf_device = + device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(nvmf_device)) { + pr_err("couldn't create nvme-fabris device!\n"); + ret = PTR_ERR(nvmf_device); + goto out_destroy_class; + } + + ret = misc_register(&nvmf_misc); + if (ret) { + pr_err("couldn't register misc device: %d\n", ret); + goto out_destroy_device; + } + + return 0; + +out_destroy_device: + device_destroy(nvmf_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(nvmf_class); +out_free_host: + nvmf_host_put(nvmf_default_host); + return ret; +} + +static void __exit nvmf_exit(void) +{ + misc_deregister(&nvmf_misc); + device_destroy(nvmf_class, MKDEV(0, 0)); + class_destroy(nvmf_class); + nvmf_host_put(nvmf_default_host); + + BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024); +} + +MODULE_LICENSE("GPL v2"); + +module_init(nvmf_init); +module_exit(nvmf_exit); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h new file mode 100644 index 0000000..89df52c --- /dev/null +++ b/drivers/nvme/host/fabrics.h @@ -0,0 +1,132 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _NVME_FABRICS_H +#define _NVME_FABRICS_H 1 + +#include <linux/in.h> +#include <linux/inet.h> + +#define NVMF_MIN_QUEUE_SIZE 16 +#define NVMF_MAX_QUEUE_SIZE 1024 +#define NVMF_DEF_QUEUE_SIZE 128 +#define NVMF_DEF_RECONNECT_DELAY 10 + +/* + * Define a host as seen by the target. We allocate one at boot, but also + * allow the override it when creating controllers. This is both to provide + * persistence of the Host NQN over multiple boots, and to allow using + * multiple ones, for example in a container scenario. Because we must not + * use different Host NQNs with the same Host ID we generate a Host ID and + * use this structure to keep track of the relation between the two. + */ +struct nvmf_host { + struct kref ref; + struct list_head list; + char nqn[NVMF_NQN_SIZE]; + uuid_le id; +}; + +/** + * enum nvmf_parsing_opts - used to define the sysfs parsing options used. + */ +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_TRANSPORT = 1 << 0, + NVMF_OPT_NQN = 1 << 1, + NVMF_OPT_TRADDR = 1 << 2, + NVMF_OPT_TRSVCID = 1 << 3, + NVMF_OPT_QUEUE_SIZE = 1 << 4, + NVMF_OPT_NR_IO_QUEUES = 1 << 5, + NVMF_OPT_TL_RETRY_COUNT = 1 << 6, + NVMF_OPT_KATO = 1 << 7, + NVMF_OPT_HOSTNQN = 1 << 8, + NVMF_OPT_RECONNECT_DELAY = 1 << 9, +}; + +/** + * struct nvmf_ctrl_options - Used to hold the options specified + * with the parsing opts enum. + * @mask: Used by the fabrics library to parse through sysfs options + * on adding a NVMe controller. + * @transport: Holds the fabric transport "technology name" (for a lack of + * better description) that will be used by an NVMe controller + * being added. + * @subsysnqn: Hold the fully qualified NQN subystem name (format defined + * in the NVMe specification, "NVMe Qualified Names"). + * @traddr: network address that will be used by the host to communicate + * to the added NVMe controller. + * @trsvcid: network port used for host-controller communication. + * @queue_size: Number of IO queue elements. + * @nr_io_queues: Number of controller IO queues that will be established. + * @reconnect_delay: Time between two consecutive reconnect attempts. + * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. + * @kato: Keep-alive timeout. + * @host: Virtual NVMe host, contains the NQN and Host ID. + */ +struct nvmf_ctrl_options { + unsigned mask; + char *transport; + char *subsysnqn; + char *traddr; + char *trsvcid; + size_t queue_size; + unsigned int nr_io_queues; + unsigned int reconnect_delay; + bool discovery_nqn; + unsigned int kato; + struct nvmf_host *host; +}; + +/* + * struct nvmf_transport_ops - used to register a specific + * fabric implementation of NVMe fabrics. + * @entry: Used by the fabrics library to add the new + * registration entry to its linked-list internal tree. + * @name: Name of the NVMe fabric driver implementation. + * @required_opts: sysfs command-line options that must be specified + * when adding a new NVMe controller. + * @allowed_opts: sysfs command-line options that can be specified + * when adding a new NVMe controller. + * @create_ctrl(): function pointer that points to a non-NVMe + * implementation-specific fabric technology + * that would go into starting up that fabric + * for the purpose of conneciton to an NVMe controller + * using that fabric technology. + * + * Notes: + * 1. At minimum, 'required_opts' and 'allowed_opts' should + * be set to the same enum parsing options defined earlier. + * 2. create_ctrl() must be defined (even if it does nothing) + */ +struct nvmf_transport_ops { + struct list_head entry; + const char *name; + int required_opts; + int allowed_opts; + struct nvme_ctrl *(*create_ctrl)(struct device *dev, + struct nvmf_ctrl_options *opts); +}; + +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +void nvmf_register_transport(struct nvmf_transport_ops *ops); +void nvmf_unregister_transport(struct nvmf_transport_ops *ops); +void nvmf_free_options(struct nvmf_ctrl_options *opts); +const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); + +#endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index a0af055..63f483d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -156,7 +156,7 @@ struct nvme_nvm_completion { #define NVME_NVM_LP_MLC_PAIRS 886 struct nvme_nvm_lp_mlc { - __u16 num_pairs; + __le16 num_pairs; __u8 pairs[NVME_NVM_LP_MLC_PAIRS]; }; @@ -500,7 +500,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) struct bio *bio = rqd->bio; struct nvme_nvm_command *cmd; - rq = blk_mq_alloc_request(q, bio_rw(bio), 0); + rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4d196d2..ab18b78 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -38,6 +38,11 @@ extern unsigned char admin_timeout; extern unsigned char shutdown_timeout; #define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) +#define NVME_DEFAULT_KATO 5 +#define NVME_KATO_GRACE 10 + +extern unsigned int nvme_max_retries; + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -65,12 +70,26 @@ enum nvme_quirks { * logical blocks. */ NVME_QUIRK_DISCARD_ZEROES = (1 << 2), + + /* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ + NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), }; +/* The below value is the specific amount of delay needed before checking + * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the + * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was + * found empirically. + */ +#define NVME_QUIRK_DELAY_AMOUNT 2000 + enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, NVME_CTRL_RESETTING, + NVME_CTRL_RECONNECTING, NVME_CTRL_DELETING, NVME_CTRL_DEAD, }; @@ -80,6 +99,7 @@ struct nvme_ctrl { spinlock_t lock; const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; + struct request_queue *connect_q; struct device *dev; struct kref kref; int instance; @@ -107,10 +127,22 @@ struct nvme_ctrl { u8 event_limit; u8 vwc; u32 vs; + u32 sgls; + u16 kas; + unsigned int kato; bool subsystem; unsigned long quirks; struct work_struct scan_work; struct work_struct async_event_work; + struct delayed_work ka_work; + + /* Fabrics only */ + u16 sqsize; + u32 ioccsz; + u32 iorcsz; + u16 icdoff; + u16 maxcmd; + struct nvmf_ctrl_options *opts; }; /* @@ -144,7 +176,9 @@ struct nvme_ns { }; struct nvme_ctrl_ops { + const char *name; struct module *module; + bool is_fabrics; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -152,6 +186,9 @@ struct nvme_ctrl_ops { void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*post_scan)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); + int (*delete_ctrl)(struct nvme_ctrl *ctrl); + const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl); + int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -204,9 +241,11 @@ static inline int nvme_error_status(u16 status) static inline bool nvme_req_needs_retry(struct request *req, u16 status) { return !(status & NVME_SC_DNR || blk_noretry_request(req)) && - (jiffies - req->start_time) < req->timeout; + (jiffies - req->start_time) < req->timeout && + req->retries < nvme_max_retries; } +void nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); @@ -230,8 +269,9 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); +#define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags); + struct nvme_command *cmd, unsigned int flags, int qid); void nvme_requeue_req(struct request *req); int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); @@ -239,7 +279,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct nvme_completion *cqe, void *buffer, unsigned bufflen, - unsigned timeout); + unsigned timeout, int qid, int at_head, int flags); int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout); @@ -256,6 +296,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); +void nvme_start_keep_alive(struct nvme_ctrl *ctrl); +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); struct sg_io_hdr; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index befac5b..4cb9b15 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -310,6 +310,11 @@ static int nvme_init_iod(struct request *rq, unsigned size, iod->npages = -1; iod->nents = 0; iod->length = size; + + if (!(rq->cmd_flags & REQ_DONTPREP)) { + rq->retries = 0; + rq->cmd_flags |= REQ_DONTPREP; + } return 0; } @@ -520,8 +525,8 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_unmap; } - cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); return BLK_MQ_RQ_QUEUE_OK; @@ -623,6 +628,7 @@ static void nvme_complete_rq(struct request *req) if (unlikely(req->errors)) { if (nvme_req_needs_retry(req, req->errors)) { + req->retries++; nvme_requeue_req(req); return; } @@ -901,7 +907,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; @@ -919,22 +925,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } -static void nvme_cancel_io(struct request *req, void *data, bool reserved) -{ - int status; - - if (!blk_mq_request_started(req)) - return; - - dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device, - "Cancelling I/O %d", req->tag); - - status = NVME_SC_ABORT_REQ; - if (blk_queue_dying(req->q)) - status |= NVME_SC_DNR; - blk_mq_complete_request(req, status); -} - static void nvme_free_queue(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), @@ -1399,16 +1389,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < 0) return result; - /* - * Degraded controllers might return an error when setting the queue - * count. We still want to be able to bring them online and offer - * access to the admin queue, as that might be only way to fix them up. - */ - if (result > 0) { - dev_err(dev->ctrl.device, - "Could not set queue count (%d)\n", result); + if (nr_io_queues == 0) return 0; - } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -1536,7 +1518,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) cmd.delete_queue.opcode = opcode; cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(req)) return PTR_ERR(req); @@ -1727,8 +1709,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_pci_disable(dev); - blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev); - blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev); + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); mutex_unlock(&dev->shutdown_lock); } @@ -1902,6 +1884,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) } static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .name = "pcie", .module = THIS_MODULE, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, @@ -1940,7 +1923,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) - set_dev_node(&pdev->dev, 0); + set_dev_node(&pdev->dev, first_memory_node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) @@ -2037,6 +2020,24 @@ static void nvme_remove(struct pci_dev *pdev) nvme_put_ctrl(&dev->ctrl); } +static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs) +{ + int ret = 0; + + if (numvfs == 0) { + if (pci_vfs_assigned(pdev)) { + dev_warn(&pdev->dev, + "Cannot disable SR-IOV VFs while assigned\n"); + return -EPERM; + } + pci_disable_sriov(pdev); + return 0; + } + + ret = pci_enable_sriov(pdev, numvfs); + return ret ? ret : numvfs; +} + #ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { @@ -2122,6 +2123,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_DISCARD_ZEROES, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } @@ -2137,6 +2140,7 @@ static struct pci_driver nvme_driver = { .driver = { .pm = &nvme_dev_pm_ops, }, + .sriov_configure = nvme_pci_sriov_configure, .err_handler = &nvme_err_handler, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c new file mode 100644 index 0000000..3e3ce2b --- /dev/null +++ b/drivers/nvme/host/rdma.c @@ -0,0 +1,2018 @@ +/* + * NVMe over Fabrics RDMA host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/jiffies.h> +#include <linux/atomic.h> +#include <linux/blk-mq.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> +#include <linux/nvme.h> +#include <linux/t10-pi.h> +#include <asm/unaligned.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/ib_cm.h> +#include <linux/nvme-rdma.h> + +#include "nvme.h" +#include "fabrics.h" + + +#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */ + +#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */ + +#define NVME_RDMA_MAX_SEGMENTS 256 + +#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 + +#define NVME_RDMA_MAX_PAGES_PER_MR 512 + +#define NVME_RDMA_DEF_RECONNECT_DELAY 20 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_RDMA_NR_AEN_COMMANDS 1 +#define NVME_RDMA_AQ_BLKMQ_DEPTH \ + (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) + +struct nvme_rdma_device { + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct kref ref; + struct list_head entry; +}; + +struct nvme_rdma_qe { + struct ib_cqe cqe; + void *data; + u64 dma; +}; + +struct nvme_rdma_queue; +struct nvme_rdma_request { + struct ib_mr *mr; + struct nvme_rdma_qe sqe; + struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; + u32 num_sge; + int nents; + bool inline_data; + bool need_inval; + struct ib_reg_wr reg_wr; + struct ib_cqe reg_cqe; + struct nvme_rdma_queue *queue; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +enum nvme_rdma_queue_flags { + NVME_RDMA_Q_CONNECTED = (1 << 0), +}; + +struct nvme_rdma_queue { + struct nvme_rdma_qe *rsp_ring; + u8 sig_count; + int queue_size; + size_t cmnd_capsule_len; + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *device; + struct ib_cq *ib_cq; + struct ib_qp *qp; + + unsigned long flags; + struct rdma_cm_id *cm_id; + int cm_error; + struct completion cm_done; +}; + +struct nvme_rdma_ctrl { + /* read and written in the hot path */ + spinlock_t lock; + + /* read only in the hot path */ + struct nvme_rdma_queue *queues; + u32 queue_count; + + /* other member variables */ + struct blk_mq_tag_set tag_set; + struct work_struct delete_work; + struct work_struct reset_work; + struct work_struct err_work; + + struct nvme_rdma_qe async_event_sqe; + + int reconnect_delay; + struct delayed_work reconnect_work; + + struct list_head list; + + struct blk_mq_tag_set admin_tag_set; + struct nvme_rdma_device *device; + + u64 cap; + u32 max_fr_pages; + + union { + struct sockaddr addr; + struct sockaddr_in addr_in; + }; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); +} + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static LIST_HEAD(nvme_rdma_ctrl_list); +static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); + +static struct workqueue_struct *nvme_rdma_wq; + +/* + * Disabling this option makes small I/O goes faster, but is fundamentally + * unsafe. With it turned off we will have to register a global rkey that + * allows read and write access to all physical memory. + */ +static bool register_always = true; +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl); + +/* XXX: really should move to a generic header sooner or later.. */ +static inline void put_unaligned_le24(u32 val, u8 *p) +{ + *p++ = val; + *p++ = val >> 8; + *p++ = val >> 16; +} + +static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); + kfree(qe->data); +} + +static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + qe->data = kzalloc(capsule_size, GFP_KERNEL); + if (!qe->data) + return -ENOMEM; + + qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); + if (ib_dma_mapping_error(ibdev, qe->dma)) { + kfree(qe->data); + return -ENOMEM; + } + + return 0; +} + +static void nvme_rdma_free_ring(struct ib_device *ibdev, + struct nvme_rdma_qe *ring, size_t ib_queue_size, + size_t capsule_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ib_queue_size; i++) + nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); + kfree(ring); +} + +static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, + size_t ib_queue_size, size_t capsule_size, + enum dma_data_direction dir) +{ + struct nvme_rdma_qe *ring; + int i; + + ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); + if (!ring) + return NULL; + + for (i = 0; i < ib_queue_size; i++) { + if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) + goto out_free_ring; + } + + return ring; + +out_free_ring: + nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); + return NULL; +} + +static void nvme_rdma_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %d\n", event->event); +} + +static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) +{ + wait_for_completion_interruptible_timeout(&queue->cm_done, + msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1); + return queue->cm_error; +} + +static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_qp_init_attr init_attr; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.event_handler = nvme_rdma_qp_event; + /* +1 for drain */ + init_attr.cap.max_send_wr = factor * queue->queue_size + 1; + /* +1 for drain */ + init_attr.cap.max_recv_wr = queue->queue_size + 1; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = queue->ib_cq; + init_attr.recv_cq = queue->ib_cq; + + ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); + + queue->qp = queue->cm_id->qp; + return ret; +} + +static int nvme_rdma_reinit_request(void *data, struct request *rq) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_device *dev = ctrl->device; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int ret = 0; + + if (!req->need_inval) + goto out; + + ib_dereg_mr(req->mr); + + req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, + ctrl->max_fr_pages); + if (IS_ERR(req->mr)) { + ret = PTR_ERR(req->mr); + req->mr = NULL; + } + + req->need_inval = false; + +out: + return ret; +} + +static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, + struct request *rq, unsigned int queue_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + + if (req->mr) + ib_dereg_mr(req->mr); + + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); +} + +static void nvme_rdma_exit_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx) +{ + return __nvme_rdma_exit_request(data, rq, hctx_idx + 1); +} + +static void nvme_rdma_exit_admin_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx) +{ + return __nvme_rdma_exit_request(data, rq, 0); +} + +static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, + struct request *rq, unsigned int queue_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int ret; + + BUG_ON(queue_idx >= ctrl->queue_count); + + ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (ret) + return ret; + + req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, + ctrl->max_fr_pages); + if (IS_ERR(req->mr)) { + ret = PTR_ERR(req->mr); + goto out_free_qe; + } + + req->queue = queue; + + return 0; + +out_free_qe: + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return -ENOMEM; +} + +static int nvme_rdma_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return __nvme_rdma_init_request(data, rq, hctx_idx + 1); +} + +static int nvme_rdma_init_admin_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return __nvme_rdma_init_request(data, rq, 0); +} + +static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static void nvme_rdma_free_dev(struct kref *ref) +{ + struct nvme_rdma_device *ndev = + container_of(ref, struct nvme_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + if (!register_always) + ib_dereg_mr(ndev->mr); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) +{ + kref_put(&dev->ref, nvme_rdma_free_dev); +} + +static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) +{ + return kref_get_unless_zero(&dev->ref); +} + +static struct nvme_rdma_device * +nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvme_rdma_device *ndev; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev->node_guid == cm_id->device->node_guid && + nvme_rdma_dev_get(ndev)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->dev = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->dev); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (!register_always) { + ndev->mr = ib_get_dma_mr(ndev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(ndev->mr)) + goto out_free_pd; + } + + if (!(ndev->dev->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS)) { + dev_err(&ndev->dev->dev, + "Memory registrations not supported.\n"); + goto out_free_mr; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + return ndev; + +out_free_mr: + if (!register_always) + ib_dereg_mr(ndev->mr); +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->ib_cq); + + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + + nvme_rdma_dev_put(dev); +} + +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, + struct nvme_rdma_device *dev) +{ + struct ib_device *ibdev = dev->dev; + const int send_wr_factor = 3; /* MR, SEND, INV */ + const int cq_factor = send_wr_factor + 1; /* + RECV */ + int comp_vector, idx = nvme_rdma_queue_idx(queue); + + int ret; + + queue->device = dev; + + /* + * The admin queue is barely used once the controller is live, so don't + * bother to spread it out. + */ + if (idx == 0) + comp_vector = 0; + else + comp_vector = idx % ibdev->num_comp_vectors; + + + /* +1 for ib_stop_cq */ + queue->ib_cq = ib_alloc_cq(dev->dev, queue, + cq_factor * queue->queue_size + 1, comp_vector, + IB_POLL_SOFTIRQ); + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + goto out; + } + + ret = nvme_rdma_create_qp(queue, send_wr_factor); + if (ret) + goto out_destroy_ib_cq; + + queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + if (!queue->rsp_ring) { + ret = -ENOMEM; + goto out_destroy_qp; + } + + return 0; + +out_destroy_qp: + ib_destroy_qp(queue->qp); +out_destroy_ib_cq: + ib_free_cq(queue->ib_cq); +out: + return ret; +} + +static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, + int idx, size_t queue_size) +{ + struct nvme_rdma_queue *queue; + int ret; + + queue = &ctrl->queues[idx]; + queue->ctrl = ctrl; + init_completion(&queue->cm_done); + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + queue->queue_size = queue_size; + + queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(queue->cm_id)) { + dev_info(ctrl->ctrl.device, + "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); + return PTR_ERR(queue->cm_id); + } + + queue->cm_error = -ETIMEDOUT; + ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr, + NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + ret = nvme_rdma_wait_for_cm(queue); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr wait failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); + + return 0; + +out_destroy_cm_id: + rdma_destroy_id(queue->cm_id); + return ret; +} + +static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + +static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) +{ + nvme_rdma_destroy_queue_ib(queue); + rdma_destroy_id(queue->cm_id); +} + +static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue) +{ + if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) + return; + nvme_rdma_stop_queue(queue); + nvme_rdma_free_queue(queue); +} + +static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); +} + +static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + break; + } + + return ret; +} + +static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.sqsize); + if (ret) { + dev_info(ctrl->ctrl.device, + "failed to initialize i/o queue: %d\n", ret); + goto out_free_queues; + } + } + + return 0; + +out_free_queues: + for (; i >= 1; i--) + nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + + return ret; +} + +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvme_rdma_dev_put(ctrl->device); +} + +static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + if (ctrl->ctrl.tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_rdma_dev_put(ctrl->device); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + bool changed; + int ret; + + if (ctrl->queue_count > 1) { + nvme_rdma_free_io_queues(ctrl); + + ret = blk_mq_reinit_tagset(&ctrl->tag_set); + if (ret) + goto requeue; + } + + nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + + ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set); + if (ret) + goto requeue; + + ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + if (ret) + goto requeue; + + blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + if (ret) + goto stop_admin_q; + + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (ret) + goto stop_admin_q; + + nvme_start_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + goto stop_admin_q; + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto stop_admin_q; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + if (ctrl->queue_count > 1) + nvme_start_queues(&ctrl->ctrl); + + dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + + return; + +stop_admin_q: + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); +requeue: + /* Make sure we are not resetting/deleting */ + if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) { + dev_info(ctrl->ctrl.device, + "Failed reconnect attempt, requeueing...\n"); + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->reconnect_delay * HZ); + } +} + +static void nvme_rdma_error_recovery_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, err_work); + + nvme_stop_keep_alive(&ctrl->ctrl); + if (ctrl->queue_count > 1) + nvme_stop_queues(&ctrl->ctrl); + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + + /* We must take care of fastfail/requeue all our inflight requests */ + if (ctrl->queue_count > 1) + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + + dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n", + ctrl->reconnect_delay); + + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->reconnect_delay * HZ); +} + +static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) + return; + + queue_work(nvme_rdma_wq, &ctrl->err_work); +} + +static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, + const char *op) +{ + struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + dev_info(ctrl->ctrl.device, + "%s for CQE 0x%p failed with status %s (%d)\n", + op, wc->wr_cqe, + ib_wc_status_msg(wc->status), wc->status); + nvme_rdma_error_recovery(ctrl); +} + +static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "MEMREG"); +} + +static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); +} + +static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = req->mr->rkey, + }; + + req->reg_cqe.done = nvme_rdma_inv_rkey_done; + wr.wr_cqe = &req->reg_cqe; + + return ib_post_send(queue->qp, &wr, &bad_wr); +} + +static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, + struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int res; + + if (!blk_rq_bytes(rq)) + return; + + if (req->need_inval) { + res = nvme_rdma_inv_rkey(queue, req); + if (res < 0) { + dev_err(ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, res); + nvme_rdma_error_recovery(queue->ctrl); + } + } + + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + nvme_cleanup_cmd(rq); + sg_free_table_chained(&req->sg_table, true); +} + +static int nvme_rdma_set_sg_null(struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = 0; + put_unaligned_le24(0, sg->length); + put_unaligned_le32(0, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + req->sge[1].addr = sg_dma_address(req->sg_table.sgl); + req->sge[1].length = sg_dma_len(req->sg_table.sgl); + req->sge[1].lkey = queue->device->pd->local_dma_lkey; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; + + req->inline_data = true; + req->num_sge++; + return 0; +} + +static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl)); + put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length); + put_unaligned_le32(queue->device->mr->rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE); + if (nr < count) { + if (nr < 0) + return nr; + return -EINVAL; + } + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_memreg_done; + memset(&req->reg_wr, 0, sizeof(req->reg_wr)); + req->reg_wr.wr.opcode = IB_WR_REG_MR; + req->reg_wr.wr.wr_cqe = &req->reg_cqe; + req->reg_wr.wr.num_sge = 0; + req->reg_wr.mr = req->mr; + req->reg_wr.key = req->mr->rkey; + req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + req->need_inval = true; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_INVALIDATE; + + return 0; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, unsigned int map_len, + struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int nents, count; + int ret; + + req->num_sge = 1; + req->inline_data = false; + req->need_inval = false; + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_bytes(rq)) + return nvme_rdma_set_sg_null(c); + + req->sg_table.sgl = req->first_sgl; + ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments, + req->sg_table.sgl); + if (ret) + return -ENOMEM; + + nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); + BUG_ON(nents > rq->nr_phys_segments); + req->nents = nents; + + count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents, + rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (unlikely(count <= 0)) { + sg_free_table_chained(&req->sg_table, true); + return -EIO; + } + + if (count == 1) { + if (rq_data_dir(rq) == WRITE && + map_len <= nvme_rdma_inline_data_size(queue) && + nvme_rdma_queue_idx(queue)) + return nvme_rdma_map_sg_inline(queue, req, c); + + if (!register_always) + return nvme_rdma_map_sg_single(queue, req, c); + } + + return nvme_rdma_map_sg_fr(queue, req, c, count); +} + +static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "SEND"); +} + +static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, + struct ib_send_wr *first, bool flush) +{ + struct ib_send_wr wr, *bad_wr; + int ret; + + sge->addr = qe->dma; + sge->length = sizeof(struct nvme_command), + sge->lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_send_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = sge; + wr.num_sge = num_sge; + wr.opcode = IB_WR_SEND; + wr.send_flags = 0; + + /* + * Unsignalled send completions are another giant desaster in the + * IB Verbs spec: If we don't regularly post signalled sends + * the send queue will fill up and only a QP reset will rescue us. + * Would have been way to obvious to handle this in hardware or + * at least the RDMA stack.. + * + * This messy and racy code sniplet is copy and pasted from the iSER + * initiator, and the magic '32' comes from there as well. + * + * Always signal the flushes. The magic request used for the flush + * sequencer is not allocated in our driver's tagset and it's + * triggered to be freed by blk_cleanup_queue(). So we need to + * always mark it as signaled to ensure that the "wr_cqe", which is + * embeded in request's payload, is not freed when __ib_process_cq() + * calls wr_cqe->done(). + */ + if ((++queue->sig_count % 32) == 0 || flush) + wr.send_flags |= IB_SEND_SIGNALED; + + if (first) + first->next = ≀ + else + first = ≀ + + ret = ib_post_send(queue->qp, first, &bad_wr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe) +{ + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + int ret; + + list.addr = qe->dma; + list.length = sizeof(struct nvme_completion); + list.lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_recv_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + ret = ib_post_recv(queue->qp, &wr, &bad_wr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) +{ + u32 queue_idx = nvme_rdma_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + struct ib_device *dev = queue->device->dev; + struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; + struct nvme_command *cmd = sqe->data; + struct ib_sge sge; + int ret; + + if (WARN_ON_ONCE(aer_idx != 0)) + return; + + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); + + memset(cmd, 0, sizeof(*cmd)); + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_rdma_set_sg_null(cmd); + + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), + DMA_TO_DEVICE); + + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); + WARN_ON_ONCE(ret); +} + +static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc, int tag) +{ + u16 status = le16_to_cpu(cqe->status); + struct request *rq; + struct nvme_rdma_request *req; + int ret = 0; + + status >>= 1; + + rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "tag 0x%x on QP %#x not found\n", + cqe->command_id, queue->qp->qp_num); + nvme_rdma_error_recovery(queue->ctrl); + return ret; + } + req = blk_mq_rq_to_pdu(rq); + + if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special) + memcpy(rq->special, cqe, sizeof(*cqe)); + + if (rq->tag == tag) + ret = 1; + + if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && + wc->ex.invalidate_rkey == req->mr->rkey) + req->need_inval = false; + + blk_mq_complete_request(rq, status); + + return ret; +} + +static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_queue *queue = cq->cq_context; + struct ib_device *ibdev = queue->device->dev; + struct nvme_completion *cqe = qe->data; + const size_t len = sizeof(struct nvme_completion); + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "RECV"); + return 0; + } + + ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_rdma_queue_idx(queue) == 0 && + cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe); + else + ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); + + nvme_rdma_post_recv(queue, qe); + return ret; +} + +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + __nvme_rdma_recv_done(cq, wc, -1); +} + +static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) +{ + int ret, i; + + for (i = 0; i < queue->queue_size; i++) { + ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); + if (ret) + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, + struct rdma_cm_event *ev) +{ + if (ev->param.conn.private_data_len) { + struct nvme_rdma_cm_rej *rej = + (struct nvme_rdma_cm_rej *)ev->param.conn.private_data; + + dev_err(queue->ctrl->ctrl.device, + "Connect rejected, status %d.", le16_to_cpu(rej->sts)); + /* XXX: Think of something clever to do here... */ + } else { + dev_err(queue->ctrl->ctrl.device, + "Connect rejected, no private data.\n"); + } + + return -ECONNRESET; +} + +static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev; + int ret; + + dev = nvme_rdma_find_get_device(queue->cm_id); + if (!dev) { + dev_err(queue->cm_id->device->dma_device, + "no client data found!\n"); + return -ECONNREFUSED; + } + + ret = nvme_rdma_create_queue_ib(queue, dev); + if (ret) { + nvme_rdma_dev_put(dev); + goto out; + } + + ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "rdma_resolve_route failed (%d).\n", + queue->cm_error); + goto out_destroy_queue; + } + + return 0; + +out_destroy_queue: + nvme_rdma_destroy_queue_ib(queue); +out: + return ret; +} + +static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_req priv; + int ret; + + param.qp_num = queue->qp->qp_num; + param.flow_control = 1; + + param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; + /* maximum retry count */ + param.retry_count = 7; + param.rnr_retry_count = 7; + param.private_data = &priv; + param.private_data_len = sizeof(priv); + + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); + priv.hrqsize = cpu_to_le16(queue->queue_size); + priv.hsqsize = cpu_to_le16(queue->queue_size); + + ret = rdma_connect(queue->cm_id, ¶m); + if (ret) { + dev_err(ctrl->ctrl.device, + "rdma_connect failed (%d).\n", ret); + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +/** + * nvme_rdma_device_unplug() - Handle RDMA device unplug + * @queue: Queue that owns the cm_id that caught the event + * + * DEVICE_REMOVAL event notifies us that the RDMA device is about + * to unplug so we should take care of destroying our RDMA resources. + * This event will be generated for each allocated cm_id. + * + * In our case, the RDMA resources are managed per controller and not + * only per queue. So the way we handle this is we trigger an implicit + * controller deletion upon the first DEVICE_REMOVAL event we see, and + * hold the event inflight until the controller deletion is completed. + * + * One exception that we need to handle is the destruction of the cm_id + * that caught the event. Since we hold the callout until the controller + * deletion is completed, we'll deadlock if the controller deletion will + * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership + * of destroying this queue before-hand, destroy the queue resources + * after the controller deletion completed with the exception of destroying + * the cm_id implicitely by returning a non-zero rc to the callout. + */ +static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + int ret, ctrl_deleted = 0; + + /* First disable the queue so ctrl delete won't free it */ + if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) + goto out; + + /* delete the controller */ + ret = __nvme_rdma_del_ctrl(ctrl); + if (!ret) { + dev_warn(ctrl->ctrl.device, + "Got rdma device removal event, deleting ctrl\n"); + flush_work(&ctrl->delete_work); + + /* Return non-zero so the cm_id will destroy implicitly */ + ctrl_deleted = 1; + + /* Free this queue ourselves */ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); + nvme_rdma_destroy_queue_ib(queue); + } + +out: + return ctrl_deleted; +} + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct nvme_rdma_queue *queue = cm_id->context; + int cm_error = 0; + + dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", + rdma_event_msg(ev->event), ev->event, + ev->status, cm_id); + + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cm_error = nvme_rdma_addr_resolved(queue); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cm_error = nvme_rdma_route_resolved(queue); + break; + case RDMA_CM_EVENT_ESTABLISHED: + queue->cm_error = nvme_rdma_conn_established(queue); + /* complete cm_done regardless of success/failure */ + complete(&queue->cm_done); + return 0; + case RDMA_CM_EVENT_REJECTED: + cm_error = nvme_rdma_conn_rejected(queue, ev); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + dev_dbg(queue->ctrl->ctrl.device, + "CM error event %d\n", ev->event); + cm_error = -ECONNRESET; + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + dev_dbg(queue->ctrl->ctrl.device, + "disconnect received - connection closed\n"); + nvme_rdma_error_recovery(queue->ctrl); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* return 1 means impliciy CM ID destroy */ + return nvme_rdma_device_unplug(queue); + default: + dev_err(queue->ctrl->ctrl.device, + "Unexpected RDMA CM event (%d)\n", ev->event); + nvme_rdma_error_recovery(queue->ctrl); + break; + } + + if (cm_error) { + queue->cm_error = cm_error; + complete(&queue->cm_done); + } + + return 0; +} + +static enum blk_eh_timer_return +nvme_rdma_timeout(struct request *rq, bool reserved) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + nvme_rdma_error_recovery(req->queue->ctrl); + + /* fail with DNR on cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_rdma_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_qe *sqe = &req->sqe; + struct nvme_command *c = sqe->data; + bool flush = false; + struct ib_device *dev; + unsigned int map_len; + int ret; + + WARN_ON_ONCE(rq->tag < 0); + + dev = queue->device->dev; + ib_dma_sync_single_for_cpu(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + ret = nvme_setup_cmd(ns, rq, c); + if (ret) + return ret; + + c->common.command_id = rq->tag; + blk_mq_start_request(rq); + + map_len = nvme_map_len(rq); + ret = nvme_rdma_map_data(queue, rq, map_len, c); + if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", ret); + nvme_cleanup_cmd(rq); + goto err; + } + + ib_dma_sync_single_for_device(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH) + flush = true; + ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + req->need_inval ? &req->reg_wr.wr : NULL, flush); + if (ret) { + nvme_rdma_unmap_data(queue, rq); + goto err; + } + + return BLK_MQ_RQ_QUEUE_OK; +err: + return (ret == -ENOMEM || ret == -EAGAIN) ? + BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; +} + +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_rdma_queue *queue = hctx->driver_data; + struct ib_cq *cq = queue->ib_cq; + struct ib_wc wc; + int found = 0; + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while (ib_poll_cq(cq, 1, &wc) > 0) { + struct ib_cqe *cqe = wc.wr_cqe; + + if (cqe) { + if (cqe->done == nvme_rdma_recv_done) + found |= __nvme_rdma_recv_done(cq, &wc, tag); + else + cqe->done(cq, &wc); + } + } + + return found; +} + +static void nvme_rdma_complete_rq(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + int error = 0; + + nvme_rdma_unmap_data(queue, rq); + + if (unlikely(rq->errors)) { + if (nvme_req_needs_retry(rq, rq->errors)) { + nvme_requeue_req(rq); + return; + } + + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + error = rq->errors; + else + error = nvme_error_status(rq->errors); + } + + blk_mq_end_request(rq, error); +} + +static struct blk_mq_ops nvme_rdma_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .reinit_request = nvme_rdma_reinit_request, + .init_hctx = nvme_rdma_init_hctx, + .poll = nvme_rdma_poll, + .timeout = nvme_rdma_timeout, +}; + +static struct blk_mq_ops nvme_rdma_admin_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_rdma_init_admin_request, + .exit_request = nvme_rdma_exit_admin_request, + .reinit_request = nvme_rdma_reinit_request, + .init_hctx = nvme_rdma_init_admin_hctx, + .timeout = nvme_rdma_timeout, +}; + +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + int error; + + error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + error = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_queue; + + ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ctrl->device->dev->attrs.max_fast_reg_page_list_len); + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_put_dev; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, + &ctrl->async_event_sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + /* disconnect and drain the queue before freeing the tagset */ + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) +{ + nvme_stop_keep_alive(&ctrl->ctrl); + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_free_io_queues(ctrl); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); +} + +static void nvme_rdma_del_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!queue_work(nvme_rdma_wq, &ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + int ret; + + ret = __nvme_rdma_del_ctrl(ctrl); + if (ret) + return ret; + + flush_work(&ctrl->delete_work); + + return 0; +} + +static void nvme_rdma_remove_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static void nvme_rdma_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, reset_work); + int ret; + bool changed; + + nvme_rdma_shutdown_ctrl(ctrl); + + ret = nvme_rdma_configure_admin_queue(ctrl); + if (ret) { + /* ctrl is already shutdown, just remove the ctrl */ + INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); + goto del_dead_ctrl; + } + + if (ctrl->queue_count > 1) { + ret = blk_mq_reinit_tagset(&ctrl->tag_set); + if (ret) + goto del_dead_ctrl; + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + goto del_dead_ctrl; + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto del_dead_ctrl; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + if (ctrl->queue_count > 1) { + nvme_start_queues(&ctrl->ctrl); + nvme_queue_scan(&ctrl->ctrl); + } + + return; + +del_dead_ctrl: + /* Deleting this dead controller... */ + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work)); +} + +static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!queue_work(nvme_rdma_wq, &ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { + .name = "rdma", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_rdma_reset_ctrl, + .free_ctrl = nvme_rdma_free_ctrl, + .submit_async_event = nvme_rdma_submit_async_event, + .delete_ctrl = nvme_rdma_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, + .get_address = nvmf_get_address, +}; + +static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = opts->nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", opts->nr_io_queues); + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + return ret; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + ret = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_io_queues; + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_rdma_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_put_dev; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + +static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p) +{ + u8 *addr = (u8 *)&in_addr->sin_addr.s_addr; + size_t buflen = strlen(p); + + /* XXX: handle IPv6 addresses */ + + if (buflen > INET_ADDRSTRLEN) + return -EINVAL; + if (in4_pton(p, buflen, addr, '\0', NULL) == 0) + return -EINVAL; + in_addr->sin_family = AF_INET; + return 0; +} + +static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + int ret; + bool changed; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr); + if (ret) { + pr_err("malformed IP address passed: %s\n", opts->traddr); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_TRSVCID) { + u16 port; + + ret = kstrtou16(opts->trsvcid, 0, &port); + if (ret) + goto out_free_ctrl; + + ctrl->addr_in.sin_port = cpu_to_be16(port); + } else { + ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT); + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_free_ctrl; + + ctrl->reconnect_delay = opts->reconnect_delay; + INIT_DELAYED_WORK(&ctrl->reconnect_work, + nvme_rdma_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); + INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work); + spin_lock_init(&ctrl->lock); + + ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.sqsize = opts->queue_size; + ctrl->ctrl.kato = opts->kato; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_rdma_configure_admin_queue(ctrl); + if (ret) + goto out_kfree_queues; + + /* sanity check icdoff */ + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto out_remove_admin_queue; + } + + /* sanity check keyed sgls */ + if (!(ctrl->ctrl.sgls & (1 << 20))) { + dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + goto out_remove_admin_queue; + } + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_rdma_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + + kref_get(&ctrl->ctrl.kref); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_stop_keep_alive(&ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); +out_kfree_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_rdma_transport = { + .name = "rdma", + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY, + .create_ctrl = nvme_rdma_create_ctrl, +}; + +static int __init nvme_rdma_init_module(void) +{ + nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); + if (!nvme_rdma_wq) + return -ENOMEM; + + nvmf_register_transport(&nvme_rdma_transport); + return 0; +} + +static void __exit nvme_rdma_cleanup_module(void) +{ + struct nvme_rdma_ctrl *ctrl; + + nvmf_unregister_transport(&nvme_rdma_transport); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) + __nvme_rdma_del_ctrl(ctrl); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + destroy_workqueue(nvme_rdma_wq); +} + +module_init(nvme_rdma_init_module); +module_exit(nvme_rdma_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig new file mode 100644 index 0000000..a5c31cb --- /dev/null +++ b/drivers/nvme/target/Kconfig @@ -0,0 +1,36 @@ + +config NVME_TARGET + tristate "NVMe Target support" + depends on BLOCK + depends on CONFIGFS_FS + help + This enabled target side support for the NVMe protocol, that is + it allows the Linux kernel to implement NVMe subsystems and + controllers and export Linux block devices as NVMe namespaces. + You need to select at least one of the transports below to make this + functionality useful. + + To configure the NVMe target you probably want to use the nvmetcli + tool from http://git.infradead.org/users/hch/nvmetcli.git. + +config NVME_TARGET_LOOP + tristate "NVMe loopback device support" + depends on BLK_DEV_NVME + depends on NVME_TARGET + select NVME_FABRICS + select SG_POOL + help + This enables the NVMe loopback device support, which can be useful + to test NVMe host and target side features. + + If unsure, say N. + +config NVME_TARGET_RDMA + tristate "NVMe over Fabrics RDMA target support" + depends on INFINIBAND + depends on NVME_TARGET + help + This enables the NVMe RDMA target support, which allows exporting NVMe + devices over RDMA. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile new file mode 100644 index 0000000..b7a0623 --- /dev/null +++ b/drivers/nvme/target/Makefile @@ -0,0 +1,9 @@ + +obj-$(CONFIG_NVME_TARGET) += nvmet.o +obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o +obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o + +nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \ + discovery.o +nvme-loop-y += loop.o +nvmet-rdma-y += rdma.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c new file mode 100644 index 0000000..2fac17a --- /dev/null +++ b/drivers/nvme/target/admin-cmd.c @@ -0,0 +1,465 @@ +/* + * NVMe admin command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/random.h> +#include <generated/utsrelease.h> +#include "nvmet.h" + +u32 nvmet_get_log_page_len(struct nvme_command *cmd) +{ + u32 len = le16_to_cpu(cmd->get_log_page.numdu); + + len <<= 16; + len += le16_to_cpu(cmd->get_log_page.numdl); + /* NUMD is a 0's based value */ + len += 1; + len *= sizeof(u32); + + return len; +} + +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + size_t data_len = nvmet_get_log_page_len(req->cmd); + void *buf; + u16 status = 0; + + buf = kzalloc(data_len, GFP_KERNEL); + if (!buf) { + status = NVME_SC_INTERNAL; + goto out; + } + + switch (req->cmd->get_log_page.lid) { + case 0x01: + /* + * We currently never set the More bit in the status field, + * so all error log entries are invalid and can be zeroed out. + * This is called a minum viable implementation (TM) of this + * mandatory log page. + */ + break; + case 0x02: + /* + * XXX: fill out actual smart log + * + * We might have a hard time coming up with useful values for + * many of the fields, and even when we have useful data + * available (e.g. units or commands read/written) those aren't + * persistent over power loss. + */ + break; + case 0x03: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + break; + default: + BUG(); + } + + status = nvmet_copy_to_sgl(req, 0, buf, data_len); + + kfree(buf); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u64 serial; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* XXX: figure out how to assign real vendors IDs. */ + id->vid = 0; + id->ssvid = 0; + + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&serial, sizeof(serial)); + memset(id->sn, ' ', sizeof(id->sn)); + snprintf(id->sn, sizeof(id->sn), "%llx", serial); + + memset(id->mn, ' ', sizeof(id->mn)); + strncpy((char *)id->mn, "Linux", sizeof(id->mn)); + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + id->rab = 6; + + /* + * XXX: figure out how we can assign a IEEE OUI, but until then + * the safest is to leave it as zeroes. + */ + + /* we support multiple ports and multiples hosts: */ + id->mic = (1 << 0) | (1 << 1); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* XXX: figure out what to do about RTD3R/RTD3 */ + id->oaes = cpu_to_le32(1 << 8); + id->ctratt = cpu_to_le32(1 << 0); + + id->oacs = 0; + + /* + * We don't really have a practical limit on the number of abort + * comands. But we don't do anything useful for abort either, so + * no point in allowing more abort commands than the spec requires. + */ + id->acl = 3; + + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* first slot is read-only, only one slot supported */ + id->frmw = (1 << 0) | (1 << 1); + id->lpa = (1 << 0) | (1 << 2); + id->elpe = NVMET_ERROR_LOG_SLOTS - 1; + id->npss = 0; + + /* We support keep-alive timeout in granularity of seconds */ + id->kas = cpu_to_le16(NVMET_KAS); + + id->sqes = (0x6 << 4) | 0x6; + id->cqes = (0x4 << 4) | 0x4; + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; + + /* + * We can't support atomic writes bigger than a LBA without support + * from the backend device. + */ + id->awun = 0; + id->awupf = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + /* Max command capsule size is sqe + single page of in-capsule data */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + ctrl->ops->sqe_inline_size) / 16); + /* Max response capsule size is cqe */ + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* + * Meh, we don't really support any power state. Fake up the same + * values that qemu does. + */ + id->psd[0].max_power = cpu_to_le16(0x9c4); + id->psd[0].entry_lat = cpu_to_le32(0x10); + id->psd[0].exit_lat = cpu_to_le32(0x4); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ns(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + struct nvme_id_ns *id; + u16 status = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out_put_ns; + } + + /* + * nuse = ncap = nsze isn't aways true, but we have no way to find + * that out from the underlying device. + */ + id->ncap = id->nuse = id->nsze = + cpu_to_le64(ns->size >> ns->blksize_shift); + + /* + * We just provide a single LBA format that matches what the + * underlying device reports. + */ + id->nlbaf = 0; + id->flbas = 0; + + /* + * Our namespace might always be shared. Not just with other + * controllers, but also with any other user of the block device. + */ + id->nmic = (1 << 0); + + memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); + + id->lbaf[0].ds = ns->blksize_shift; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_nslist(struct nvmet_req *req) +{ + static const int buf_size = 4096; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); + __le32 *list; + u16 status = 0; + int i = 0; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid <= min_nsid) + continue; + list[i++] = cpu_to_le32(ns->nsid); + if (i == buf_size / sizeof(__le32)) + break; + } + rcu_read_unlock(); + + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + + kfree(list); +out: + nvmet_req_complete(req, status); +} + +/* + * A "mimimum viable" abort implementation: the command is mandatory in the + * spec, but we are not required to do any useful work. We couldn't really + * do a useful abort, so don't bother even with waiting for the command + * to be exectuted and return immediately telling the command to abort + * wasn't found. + */ +static void nvmet_execute_abort(struct nvmet_req *req) +{ + nvmet_set_result(req, 1); + nvmet_req_complete(req, 0); +} + +static void nvmet_execute_set_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u64 val; + u32 val32; + u16 status = 0; + + switch (cdw10 & 0xf) { + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); + break; + case NVME_FEAT_KATO: + val = le64_to_cpu(req->cmd->prop_set.value); + val32 = val & 0xffff; + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_set_result(req, req->sq->ctrl->kato); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 status = 0; + + switch (cdw10 & 0xf) { + /* + * These features are mandatory in the spec, but we don't + * have a useful way to implement them. We'll eventually + * need to come up with some fake values for these. + */ +#if 0 + case NVME_FEAT_ARBITRATION: + break; + case NVME_FEAT_POWER_MGMT: + break; + case NVME_FEAT_TEMP_THRESH: + break; + case NVME_FEAT_ERR_RECOVERY: + break; + case NVME_FEAT_IRQ_COALESCE: + break; + case NVME_FEAT_IRQ_CONFIG: + break; + case NVME_FEAT_WRITE_ATOMIC: + break; + case NVME_FEAT_ASYNC_EVENT: + break; +#endif + case NVME_FEAT_VOLATILE_WC: + nvmet_set_result(req, 1); + break; + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); + break; + case NVME_FEAT_KATO: + nvmet_set_result(req, req->sq->ctrl->kato * 1000); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_async_event(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + return; + } + ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +static void nvmet_execute_keep_alive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + pr_debug("ctrl %d update keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvmet_req_complete(req, 0); +} + +int nvmet_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got admin cmd %d while CC.EN == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case 0x01: + case 0x02: + case 0x03: + req->execute = nvmet_execute_get_log_page; + return 0; + } + break; + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x00: + req->execute = nvmet_execute_identify_ns; + return 0; + case 0x01: + req->execute = nvmet_execute_identify_ctrl; + return 0; + case 0x02: + req->execute = nvmet_execute_identify_nslist; + return 0; + } + break; + case nvme_admin_abort_cmd: + req->execute = nvmet_execute_abort; + req->data_len = 0; + return 0; + case nvme_admin_set_features: + req->execute = nvmet_execute_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c new file mode 100644 index 0000000..af5e2dc --- /dev/null +++ b/drivers/nvme/target/configfs.c @@ -0,0 +1,917 @@ +/* + * Configfs interface for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/ctype.h> + +#include "nvmet.h" + +static struct config_item_type nvmet_host_type; +static struct config_item_type nvmet_subsys_type; + +/* + * nvmet_port Generic ConfigFS definitions. + * Used in any place in the ConfigFS tree that refers to an address. + */ +static ssize_t nvmet_addr_adrfam_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + return sprintf(page, "ipv4\n"); + case NVMF_ADDR_FAMILY_IP6: + return sprintf(page, "ipv6\n"); + case NVMF_ADDR_FAMILY_IB: + return sprintf(page, "ib\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_adrfam_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "ipv4")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4; + } else if (sysfs_streq(page, "ipv6")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6; + } else if (sysfs_streq(page, "ib")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB; + } else { + pr_err("Invalid value '%s' for adrfam\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_adrfam); + +static ssize_t nvmet_addr_portid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + le16_to_cpu(port->disc_addr.portid)); +} + +static ssize_t nvmet_addr_portid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 portid = 0; + + if (kstrtou16(page, 0, &portid)) { + pr_err("Invalid value '%s' for portid\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + port->disc_addr.portid = cpu_to_le16(portid); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_portid); + +static ssize_t nvmet_addr_traddr_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.traddr); +} + +static ssize_t nvmet_addr_traddr_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRADDR_SIZE) { + pr_err("Invalid value '%s' for traddr\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.traddr, + sizeof(port->disc_addr.traddr), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_traddr); + +static ssize_t nvmet_addr_treq_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.treq) { + case NVMF_TREQ_NOT_SPECIFIED: + return sprintf(page, "not specified\n"); + case NVMF_TREQ_REQUIRED: + return sprintf(page, "required\n"); + case NVMF_TREQ_NOT_REQUIRED: + return sprintf(page, "not required\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_treq_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "not specified")) { + port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + } else if (sysfs_streq(page, "required")) { + port->disc_addr.treq = NVMF_TREQ_REQUIRED; + } else if (sysfs_streq(page, "not required")) { + port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + } else { + pr_err("Invalid value '%s' for treq\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_treq); + +static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.trsvcid); +} + +static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRSVCID_SIZE) { + pr_err("Invalid value '%s' for trsvcid\n", page); + return -EINVAL; + } + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.trsvcid, + sizeof(port->disc_addr.trsvcid), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_trsvcid); + +static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.trtype) { + case NVMF_TRTYPE_RDMA: + return sprintf(page, "rdma\n"); + case NVMF_TRTYPE_LOOP: + return sprintf(page, "loop\n"); + default: + return sprintf(page, "\n"); + } +} + +static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_RDMA; + memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE); + port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED; + port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED; + port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; +} + +static void nvmet_port_init_tsas_loop(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_LOOP; + memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE); +} + +static ssize_t nvmet_addr_trtype_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "rdma")) { + nvmet_port_init_tsas_rdma(port); + } else if (sysfs_streq(page, "loop")) { + nvmet_port_init_tsas_loop(port); + } else { + pr_err("Invalid value '%s' for trtype\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trtype); + +/* + * Namespace structures & file operation functions below + */ +static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path); +} + +static ssize_t nvmet_ns_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret; + + mutex_lock(&subsys->lock); + ret = -EBUSY; + if (nvmet_ns_enabled(ns)) + goto out_unlock; + + kfree(ns->device_path); + + ret = -ENOMEM; + ns->device_path = kstrdup(page, GFP_KERNEL); + if (!ns->device_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + return count; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, device_path); + +static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); +} + +static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + u8 nguid[16]; + const char *p = page; + int i; + int ret = 0; + + mutex_lock(&subsys->lock); + if (nvmet_ns_enabled(ns)) { + ret = -EBUSY; + goto out_unlock; + } + + for (i = 0; i < 16; i++) { + if (p + 2 > page + count) { + ret = -EINVAL; + goto out_unlock; + } + if (!isxdigit(p[0]) || !isxdigit(p[1])) { + ret = -EINVAL; + goto out_unlock; + } + + nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]); + p += 2; + + if (*p == '-' || *p == ':') + p++; + } + + memcpy(&ns->nguid, nguid, sizeof(nguid)); +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_nguid); + +static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item))); +} + +static ssize_t nvmet_ns_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_ns_enable(ns); + else + nvmet_ns_disable(ns); + + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, enable); + +static struct configfs_attribute *nvmet_ns_attrs[] = { + &nvmet_ns_attr_device_path, + &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_enable, + NULL, +}; + +static void nvmet_ns_release(struct config_item *item) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + nvmet_ns_free(ns); +} + +static struct configfs_item_operations nvmet_ns_item_ops = { + .release = nvmet_ns_release, +}; + +static struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = nvmet_ns_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item); + struct nvmet_ns *ns; + int ret; + u32 nsid; + + ret = kstrtou32(name, 0, &nsid); + if (ret) + goto out; + + ret = -EINVAL; + if (nsid == 0 || nsid == 0xffffffff) + goto out; + + ret = -ENOMEM; + ns = nvmet_ns_alloc(subsys, nsid); + if (!ns) + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + + return &ns->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_namespaces_group_ops = { + .make_group = nvmet_ns_make, +}; + +static struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_port_subsys_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys; + struct nvmet_subsys_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_subsys_type) { + pr_err("can only link subsystems into the subsystems dir.!\n"); + return -EINVAL; + } + subsys = to_subsys(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->subsys = subsys; + + down_write(&nvmet_config_sem); + ret = -EEXIST; + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto out_free_link; + } + + if (list_empty(&port->subsystems)) { + ret = nvmet_enable_port(port); + if (ret) + goto out_free_link; + } + + list_add_tail(&link->entry, &port->subsystems); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; + +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); + struct nvmet_subsys_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + if (list_empty(&port->subsystems)) + nvmet_disable_port(port); + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_port_subsys_item_ops = { + .allow_link = nvmet_port_subsys_allow_link, + .drop_link = nvmet_port_subsys_drop_link, +}; + +static struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_allowed_hosts_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host; + struct nvmet_host_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_host_type) { + pr_err("can only link hosts into the allowed_hosts directory!\n"); + return -EINVAL; + } + + host = to_host(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->host = host; + + down_write(&nvmet_config_sem); + ret = -EINVAL; + if (subsys->allow_any_host) { + pr_err("can't add hosts when allow_any_host is set!\n"); + goto out_free_link; + } + + ret = -EEXIST; + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto out_free_link; + } + list_add_tail(&link->entry, &subsys->hosts); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); + struct nvmet_host_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { + .allow_link = nvmet_allowed_hosts_allow_link, + .drop_link = nvmet_allowed_hosts_drop_link, +}; + +static struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->allow_any_host); +} + +static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool allow_any_host; + int ret = 0; + + if (strtobool(page, &allow_any_host)) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (allow_any_host && !list_empty(&subsys->hosts)) { + pr_err("Can't set allow_any_host when explicit hosts are set!\n"); + ret = -EINVAL; + goto out_unlock; + } + + subsys->allow_any_host = allow_any_host; +out_unlock: + up_write(&nvmet_config_sem); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); + +static struct configfs_attribute *nvmet_subsys_attrs[] = { + &nvmet_subsys_attr_attr_allow_any_host, + NULL, +}; + +/* + * Subsystem structures & folder operation functions below + */ +static void nvmet_subsys_release(struct config_item *item) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + nvmet_subsys_put(subsys); +} + +static struct configfs_item_operations nvmet_subsys_item_ops = { + .release = nvmet_subsys_release, +}; + +static struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_subsys_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys; + + if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) { + pr_err("can't create discovery subsystem through configfs\n"); + return ERR_PTR(-EINVAL); + } + + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); + if (!subsys) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); + + config_group_init_type_name(&subsys->namespaces_group, + "namespaces", &nvmet_namespaces_type); + configfs_add_default_group(&subsys->namespaces_group, &subsys->group); + + config_group_init_type_name(&subsys->allowed_hosts_group, + "allowed_hosts", &nvmet_allowed_hosts_type); + configfs_add_default_group(&subsys->allowed_hosts_group, + &subsys->group); + + return &subsys->group; +} + +static struct configfs_group_operations nvmet_subsystems_group_ops = { + .make_group = nvmet_subsys_make, +}; + +static struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_referral_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled); +} + +static ssize_t nvmet_referral_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (strtobool(page, &enable)) + goto inval; + + if (enable) + nvmet_referral_enable(parent, port); + else + nvmet_referral_disable(port); + + return count; +inval: + pr_err("Invalid value '%s' for enable\n", page); + return -EINVAL; +} + +CONFIGFS_ATTR(nvmet_referral_, enable); + +/* + * Discovery Service subsystem definitions + */ +static struct configfs_attribute *nvmet_referral_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_portid, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_referral_attr_enable, + NULL, +}; + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + nvmet_referral_disable(port); + kfree(port); +} + +static struct configfs_item_operations nvmet_referral_item_ops = { + .release = nvmet_referral_release, +}; + +static struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +}; + +static struct config_group *nvmet_referral_make( + struct config_group *group, const char *name) +{ + struct nvmet_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + config_group_init_type_name(&port->group, name, &nvmet_referral_type); + + return &port->group; +} + +static struct configfs_group_operations nvmet_referral_group_ops = { + .make_group = nvmet_referral_make, +}; + +static struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, +}; + +/* + * Ports definitions. + */ +static void nvmet_port_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + kfree(port); +} + +static struct configfs_attribute *nvmet_port_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + NULL, +}; + +static struct configfs_item_operations nvmet_port_item_ops = { + .release = nvmet_port_release, +}; + +static struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ports_make(struct config_group *group, + const char *name) +{ + struct nvmet_port *port; + u16 portid; + + if (kstrtou16(name, 0, &portid)) + return ERR_PTR(-EINVAL); + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + INIT_LIST_HEAD(&port->subsystems); + INIT_LIST_HEAD(&port->referrals); + + port->disc_addr.portid = cpu_to_le16(portid); + config_group_init_type_name(&port->group, name, &nvmet_port_type); + + config_group_init_type_name(&port->subsys_group, + "subsystems", &nvmet_port_subsys_type); + configfs_add_default_group(&port->subsys_group, &port->group); + + config_group_init_type_name(&port->referrals_group, + "referrals", &nvmet_referrals_type); + configfs_add_default_group(&port->referrals_group, &port->group); + + return &port->group; +} + +static struct configfs_group_operations nvmet_ports_group_ops = { + .make_group = nvmet_ports_make, +}; + +static struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_subsystems_group; +static struct config_group nvmet_ports_group; + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); + + kfree(host); +} + +static struct configfs_item_operations nvmet_host_item_ops = { + .release = nvmet_host_release, +}; + +static struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_hosts_make_group(struct config_group *group, + const char *name) +{ + struct nvmet_host *host; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&host->group, name, &nvmet_host_type); + + return &host->group; +} + +static struct configfs_group_operations nvmet_hosts_group_ops = { + .make_group = nvmet_hosts_make_group, +}; + +static struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_hosts_group; + +static struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nvmet_configfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "nvmet", + .ci_type = &nvmet_root_type, + }, + }, +}; + +int __init nvmet_init_configfs(void) +{ + int ret; + + config_group_init(&nvmet_configfs_subsystem.su_group); + mutex_init(&nvmet_configfs_subsystem.su_mutex); + + config_group_init_type_name(&nvmet_subsystems_group, + "subsystems", &nvmet_subsystems_type); + configfs_add_default_group(&nvmet_subsystems_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_ports_group, + "ports", &nvmet_ports_type); + configfs_add_default_group(&nvmet_ports_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_hosts_group, + "hosts", &nvmet_hosts_type); + configfs_add_default_group(&nvmet_hosts_group, + &nvmet_configfs_subsystem.su_group); + + ret = configfs_register_subsystem(&nvmet_configfs_subsystem); + if (ret) { + pr_err("configfs_register_subsystem: %d\n", ret); + return ret; + } + + return 0; +} + +void __exit nvmet_exit_configfs(void) +{ + configfs_unregister_subsystem(&nvmet_configfs_subsystem); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c new file mode 100644 index 0000000..8a891ca --- /dev/null +++ b/drivers/nvme/target/core.c @@ -0,0 +1,964 @@ +/* + * Common code for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include "nvmet.h" + +static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; + +/* + * This read/write semaphore is used to synchronize access to configuration + * information on a target system that will result in discovery log page + * information change for at least one host. + * The full list of resources to protected by this semaphore is: + * + * - subsystems list + * - per-subsystem allowed hosts list + * - allow_any_host subsystem attribute + * - nvmet_genctr + * - the nvmet_transports array + * + * When updating any of those lists/structures write lock should be obtained, + * while when reading (popolating discovery log page or checking host-subsystem + * link) read lock is obtained to allow concurrent reads. + */ +DECLARE_RWSEM(nvmet_config_sem); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) +{ + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) +{ + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +static u32 nvmet_async_event_result(struct nvmet_async_event *aen) +{ + return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); +} + +static void nvmet_async_events_free(struct nvmet_ctrl *ctrl) +{ + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + if (!ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + } +} + +static void nvmet_async_event_work(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, async_event_work); + struct nvmet_async_event *aen; + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + aen = list_first_entry_or_null(&ctrl->async_events, + struct nvmet_async_event, entry); + if (!aen || !ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + nvmet_set_result(req, nvmet_async_event_result(aen)); + + list_del(&aen->entry); + kfree(aen); + + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, 0); + } +} + +static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page) +{ + struct nvmet_async_event *aen; + + aen = kmalloc(sizeof(*aen), GFP_KERNEL); + if (!aen) + return; + + aen->event_type = event_type; + aen->event_info = event_info; + aen->log_page = log_page; + + mutex_lock(&ctrl->lock); + list_add_tail(&aen->entry, &ctrl->async_events); + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops) +{ + int ret = 0; + + down_write(&nvmet_config_sem); + if (nvmet_transports[ops->type]) + ret = -EINVAL; + else + nvmet_transports[ops->type] = ops; + up_write(&nvmet_config_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_register_transport); + +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops) +{ + down_write(&nvmet_config_sem); + nvmet_transports[ops->type] = NULL; + up_write(&nvmet_config_sem); +} +EXPORT_SYMBOL_GPL(nvmet_unregister_transport); + +int nvmet_enable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + int ret; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + up_write(&nvmet_config_sem); + request_module("nvmet-transport-%d", port->disc_addr.trtype); + down_write(&nvmet_config_sem); + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + pr_err("transport type %d not supported\n", + port->disc_addr.trtype); + return -EINVAL; + } + } + + if (!try_module_get(ops->owner)) + return -EINVAL; + + ret = ops->add_port(port); + if (ret) { + module_put(ops->owner); + return ret; + } + + port->enabled = true; + return 0; +} + +void nvmet_disable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + + lockdep_assert_held(&nvmet_config_sem); + + port->enabled = false; + + ops = nvmet_transports[port->disc_addr.trtype]; + ops->remove_port(port); + module_put(ops->owner); +} + +static void nvmet_keep_alive_timer(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvmet_ctrl, ka_work); + + pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", + ctrl->cntlid, ctrl->kato); + + ctrl->ops->delete_ctrl(ctrl); +} + +static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d start keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); + + cancel_delayed_work_sync(&ctrl->ka_work); +} + +static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, + __le32 nsid) +{ + struct nvmet_ns *ns; + + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid == le32_to_cpu(nsid)) + return ns; + } + + return NULL; +} + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + struct nvmet_ns *ns; + + rcu_read_lock(); + ns = __nvmet_find_namespace(ctrl, nsid); + if (ns) + percpu_ref_get(&ns->ref); + rcu_read_unlock(); + + return ns; +} + +static void nvmet_destroy_namespace(struct percpu_ref *ref) +{ + struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref); + + complete(&ns->disable_done); +} + +void nvmet_put_namespace(struct nvmet_ns *ns) +{ + percpu_ref_put(&ns->ref); +} + +int nvmet_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + int ret = 0; + + mutex_lock(&subsys->lock); + if (!list_empty(&ns->dev_link)) + goto out_unlock; + + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, + NULL); + if (IS_ERR(ns->bdev)) { + pr_err("nvmet: failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + ret = PTR_ERR(ns->bdev); + ns->bdev = NULL; + goto out_unlock; + } + + ns->size = i_size_read(ns->bdev->bd_inode); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, + 0, GFP_KERNEL); + if (ret) + goto out_blkdev_put; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = ns->nsid; + + /* + * The namespaces list needs to be sorted to simplify the implementation + * of the Identify Namepace List subcommand. + */ + if (list_empty(&subsys->namespaces)) { + list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); + } else { + struct nvmet_ns *old; + + list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) { + BUG_ON(ns->nsid == old->nsid); + if (ns->nsid < old->nsid) + break; + } + + list_add_tail_rcu(&ns->dev_link, &old->dev_link); + } + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + ret = 0; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +out_blkdev_put: + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + ns->bdev = NULL; + goto out_unlock; +} + +void nvmet_ns_disable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + if (list_empty(&ns->dev_link)) { + mutex_unlock(&subsys->lock); + return; + } + list_del_init(&ns->dev_link); + mutex_unlock(&subsys->lock); + + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + if (ns->bdev) + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + nvmet_ns_disable(ns); + + kfree(ns->device_path); + kfree(ns); +} + +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ns *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + + INIT_LIST_HEAD(&ns->dev_link); + init_completion(&ns->disable_done); + + ns->nsid = nsid; + ns->subsys = subsys; + + return ns; +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (status) + nvmet_set_status(req, status); + + /* XXX: need to fill in something useful for sq_head */ + req->rsp->sq_head = 0; + if (likely(req->sq)) /* may happen during early failure */ + req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->rsp->command_id = req->cmd->common.command_id; + + if (req->ns) + nvmet_put_namespace(req->ns); + req->ops->queue_response(req); +} + +void nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + __nvmet_req_complete(req, status); + percpu_ref_put(&req->sq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_req_complete); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + cq->qid = qid; + cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 qid, u16 size) +{ + sq->qid = qid; + sq->size = size; + + ctrl->sqs[qid] = sq; +} + +void nvmet_sq_destroy(struct nvmet_sq *sq) +{ + /* + * If this is the admin queue, complete all AERs so that our + * queue doesn't have outstanding requests on it. + */ + if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq) + nvmet_async_events_free(sq->ctrl); + percpu_ref_kill(&sq->ref); + wait_for_completion(&sq->free_done); + percpu_ref_exit(&sq->ref); + + if (sq->ctrl) { + nvmet_ctrl_put(sq->ctrl); + sq->ctrl = NULL; /* allows reusing the queue later */ + } +} +EXPORT_SYMBOL_GPL(nvmet_sq_destroy); + +static void nvmet_sq_free(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->free_done); +} + +int nvmet_sq_init(struct nvmet_sq *sq) +{ + int ret; + + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&sq->free_done); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_sq_init); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops) +{ + u8 flags = req->cmd->common.flags; + u16 status; + + req->cq = cq; + req->sq = sq; + req->ops = ops; + req->sg = NULL; + req->sg_cnt = 0; + req->rsp->status = 0; + + /* no support for fused commands yet */ + if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + /* either variant of SGLs is fine, as we don't support metadata */ + if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF && + (flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METASEG)) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (unlikely(!req->sq->ctrl)) + /* will return an error for any Non-connect command: */ + status = nvmet_parse_connect_cmd(req); + else if (likely(req->sq->qid != 0)) + status = nvmet_parse_io_cmd(req); + else if (req->cmd->common.opcode == nvme_fabrics_command) + status = nvmet_parse_fabrics_cmd(req); + else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + status = nvmet_parse_discovery_cmd(req); + else + status = nvmet_parse_admin_cmd(req); + + if (status) + goto fail; + + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + return true; + +fail: + __nvmet_req_complete(req, status); + return false; +} +EXPORT_SYMBOL_GPL(nvmet_req_init); + +static inline bool nvmet_cc_en(u32 cc) +{ + return cc & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> 4) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> 7) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> 11) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> 14) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> 16) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + return (cc >> 20) & 0xf; +} + +static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES || + nvmet_cc_mps(ctrl->cc) != 0 || + nvmet_cc_ams(ctrl->cc) != 0 || + nvmet_cc_css(ctrl->cc) != 0) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + ctrl->csts = NVME_CSTS_RDY; +} + +static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* XXX: tear down queues? */ + ctrl->csts &= ~NVME_CSTS_RDY; + ctrl->cc = 0; +} + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) +{ + u32 old; + + mutex_lock(&ctrl->lock); + old = ctrl->cc; + ctrl->cc = new; + + if (nvmet_cc_en(new) && !nvmet_cc_en(old)) + nvmet_start_ctrl(ctrl); + if (!nvmet_cc_en(new) && nvmet_cc_en(old)) + nvmet_clear_ctrl(ctrl); + if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) { + nvmet_clear_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + if (!nvmet_cc_shn(new) && nvmet_cc_shn(old)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + mutex_unlock(&ctrl->lock); +} + +static void nvmet_init_cap(struct nvmet_ctrl *ctrl) +{ + /* command sets supported: NVMe command set: */ + ctrl->cap = (1ULL << 37); + /* CC.EN timeout in 500msec units: */ + ctrl->cap |= (15ULL << 24); + /* maximum queue entries supported: */ + ctrl->cap |= NVMET_QUEUE_SIZE - 1; +} + +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + u16 status = 0; + + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->cntlid == cntlid) { + if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) { + pr_warn("hostnqn mismatch.\n"); + continue; + } + if (!kref_get_unless_zero(&ctrl->ref)) + continue; + + *ret = ctrl; + goto out; + } + } + + pr_warn("could not find controller %d for subsys %s / host %s\n", + cntlid, subsysnqn, hostnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + +out: + mutex_unlock(&subsys->lock); + nvmet_subsys_put(subsys); + return status; +} + +static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, + const char *hostnqn) +{ + struct nvmet_host_link *p; + + if (subsys->allow_any_host) + return true; + + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), hostnqn)) + return true; + } + + return false; +} + +static bool nvmet_host_discovery_allowed(struct nvmet_req *req, + const char *hostnqn) +{ + struct nvmet_subsys_link *s; + + list_for_each_entry(s, &req->port->subsystems, entry) { + if (__nvmet_host_allowed(s->subsys, hostnqn)) + return true; + } + + return false; +} + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn) +{ + lockdep_assert_held(&nvmet_config_sem); + + if (subsys->type == NVME_NQN_DISC) + return nvmet_host_discovery_allowed(req, hostnqn); + else + return __nvmet_host_allowed(subsys, hostnqn); +} + +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int ret; + u16 status; + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + goto out; + } + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + down_read(&nvmet_config_sem); + if (!nvmet_host_allowed(req, subsys, hostnqn)) { + pr_info("connect by host %s for subsystem %s not allowed\n", + hostnqn, subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn); + up_read(&nvmet_config_sem); + goto out_put_subsystem; + } + up_read(&nvmet_config_sem); + + status = NVME_SC_INTERNAL; + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + goto out_put_subsystem; + mutex_init(&ctrl->lock); + + nvmet_init_cap(ctrl); + + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); + INIT_LIST_HEAD(&ctrl->async_events); + + memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + + kref_init(&ctrl->ref); + ctrl->subsys = subsys; + + ctrl->cqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_ctrl; + + ctrl->sqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_sq *), + GFP_KERNEL); + if (!ctrl->sqs) + goto out_free_cqs; + + ret = ida_simple_get(&subsys->cntlid_ida, + NVME_CNTLID_MIN, NVME_CNTLID_MAX, + GFP_KERNEL); + if (ret < 0) { + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + goto out_free_sqs; + } + ctrl->cntlid = ret; + + ctrl->ops = req->ops; + if (ctrl->subsys->type == NVME_NQN_DISC) { + /* Don't accept keep-alive timeout for discovery controllers */ + if (kato) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_free_sqs; + } + + /* + * Discovery controllers use some arbitrary high value in order + * to cleanup stale discovery sessions + * + * From the latest base diff RC: + * "The Keep Alive command is not supported by + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". + */ + ctrl->kato = NVMET_DISC_KATO; + } else { + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + } + nvmet_start_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + *ctrlp = ctrl; + return 0; + +out_free_sqs: + kfree(ctrl->sqs); +out_free_cqs: + kfree(ctrl->cqs); +out_free_ctrl: + kfree(ctrl); +out_put_subsystem: + nvmet_subsys_put(subsys); +out: + return status; +} + +static void nvmet_ctrl_free(struct kref *ref) +{ + struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); + struct nvmet_subsys *subsys = ctrl->subsys; + + nvmet_stop_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + + ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid); + nvmet_subsys_put(subsys); + + kfree(ctrl->sqs); + kfree(ctrl->cqs); + kfree(ctrl); +} + +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvmet_ctrl_free); +} + +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) +{ + ctrl->csts |= NVME_CSTS_CFS; + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + schedule_work(&ctrl->fatal_err_work); +} +EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn) +{ + struct nvmet_subsys_link *p; + + if (!port) + return NULL; + + if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) + return NULL; + return nvmet_disc_subsys; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (!strncmp(p->subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&p->subsys->ref)) + break; + up_read(&nvmet_config_sem); + return p->subsys; + } + } + up_read(&nvmet_config_sem); + return NULL; +} + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type) +{ + struct nvmet_subsys *subsys; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return NULL; + + subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */ + + switch (type) { + case NVME_NQN_NVME: + subsys->max_qid = NVMET_NR_QUEUES; + break; + case NVME_NQN_DISC: + subsys->max_qid = 0; + break; + default: + pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); + kfree(subsys); + return NULL; + } + subsys->type = type; + subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, + GFP_KERNEL); + if (!subsys->subsysnqn) { + kfree(subsys); + return NULL; + } + + kref_init(&subsys->ref); + + mutex_init(&subsys->lock); + INIT_LIST_HEAD(&subsys->namespaces); + INIT_LIST_HEAD(&subsys->ctrls); + + ida_init(&subsys->cntlid_ida); + + INIT_LIST_HEAD(&subsys->hosts); + + return subsys; +} + +static void nvmet_subsys_free(struct kref *ref) +{ + struct nvmet_subsys *subsys = + container_of(ref, struct nvmet_subsys, ref); + + WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + + ida_destroy(&subsys->cntlid_ida); + kfree(subsys->subsysnqn); + kfree(subsys); +} + +void nvmet_subsys_put(struct nvmet_subsys *subsys) +{ + kref_put(&subsys->ref, nvmet_subsys_free); +} + +static int __init nvmet_init(void) +{ + int error; + + error = nvmet_init_discovery(); + if (error) + goto out; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_discovery; + return 0; + +out_exit_discovery: + nvmet_exit_discovery(); +out: + return error; +} + +static void __exit nvmet_exit(void) +{ + nvmet_exit_configfs(); + nvmet_exit_discovery(); + + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); +} + +module_init(nvmet_init); +module_exit(nvmet_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c new file mode 100644 index 0000000..6f65646 --- /dev/null +++ b/drivers/nvme/target/discovery.c @@ -0,0 +1,221 @@ +/* + * Discovery service for the NVMe over Fabrics target. + * Copyright (C) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/slab.h> +#include <generated/utsrelease.h> +#include "nvmet.h" + +struct nvmet_subsys *nvmet_disc_subsys; + +u64 nvmet_genctr; + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (list_empty(&port->entry)) { + list_add_tail(&port->entry, &parent->referrals); + port->enabled = true; + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +void nvmet_referral_disable(struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (!list_empty(&port->entry)) { + port->enabled = false; + list_del_init(&port->entry); + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, + struct nvmet_port *port, char *subsys_nqn, u8 type, u32 numrec) +{ + struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec]; + + e->trtype = port->disc_addr.trtype; + e->adrfam = port->disc_addr.adrfam; + e->treq = port->disc_addr.treq; + e->portid = port->disc_addr.portid; + /* we support only dynamic controllers */ + e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); + e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->nqntype = type; + memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); + memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); + memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); +} + +static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) +{ + const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_disc_rsp_page_hdr *hdr; + size_t data_len = nvmet_get_log_page_len(req->cmd); + size_t alloc_len = max(data_len, sizeof(*hdr)); + int residual_len = data_len - sizeof(*hdr); + struct nvmet_subsys_link *p; + struct nvmet_port *r; + u32 numrec = 0; + u16 status = 0; + + /* + * Make sure we're passing at least a buffer of response header size. + * If host provided data len is less than the header size, only the + * number of bytes requested by host will be sent to host. + */ + hdr = kzalloc(alloc_len, GFP_KERNEL); + if (!hdr) { + status = NVME_SC_INTERNAL; + goto out; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + continue; + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, + NVME_NQN_NVME, numrec); + residual_len -= entry_size; + } + numrec++; + } + + list_for_each_entry(r, &req->port->referrals, entry) { + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + NVME_NQN_DISC, numrec); + residual_len -= entry_size; + } + numrec++; + } + + hdr->genctr = cpu_to_le64(nvmet_genctr); + hdr->numrec = cpu_to_le64(numrec); + hdr->recfmt = cpu_to_le16(0); + + up_read(&nvmet_config_sem); + + status = nvmet_copy_to_sgl(req, 0, hdr, data_len); + kfree(hdr); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + id->lpa = (1 << 2); + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +int nvmet_parse_discovery_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got cmd %d while not ready\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case NVME_LOG_DISC: + req->execute = nvmet_execute_get_disc_log_page; + return 0; + default: + pr_err("nvmet: unsupported get_log_page lid %d\n", + cmd->get_log_page.lid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x01: + req->execute = + nvmet_execute_identify_disc_ctrl; + return 0; + default: + pr_err("nvmet: unsupported identify cns %d\n", + le32_to_cpu(cmd->identify.cns)); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + default: + pr_err("nvmet: unsupported cmd %d\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} + +int __init nvmet_init_discovery(void) +{ + nvmet_disc_subsys = + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); + if (!nvmet_disc_subsys) + return -ENOMEM; + return 0; +} + +void nvmet_exit_discovery(void) +{ + nvmet_subsys_put(nvmet_disc_subsys); +} diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c new file mode 100644 index 0000000..9a97ae6 --- /dev/null +++ b/drivers/nvme/target/fabrics-cmd.c @@ -0,0 +1,240 @@ +/* + * NVMe Fabrics command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> +#include "nvmet.h" + +static void nvmet_execute_prop_set(struct nvmet_req *req) +{ + u16 status = 0; + + if (!(req->cmd->prop_set.attrib & 1)) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); + + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_prop_get(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; + u64 val = 0; + + if (req->cmd->prop_get.attrib & 1) { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_CAP: + val = ctrl->cap; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_VS: + val = ctrl->subsys->ver; + break; + case NVME_REG_CC: + val = ctrl->cc; + break; + case NVME_REG_CSTS: + val = ctrl->csts; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } + + req->rsp->result64 = cpu_to_le64(val); + nvmet_req_complete(req, status); +} + +int nvmet_parse_fabrics_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + switch (cmd->fabrics.fctype) { + case nvme_fabrics_type_property_set: + req->data_len = 0; + req->execute = nvmet_execute_prop_set; + break; + case nvme_fabrics_type_property_get: + req->data_len = 0; + req->execute = nvmet_execute_prop_get; + break; + default: + pr_err("received unknown capsule type 0x%x\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return 0; +} + +static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + u16 qid = le16_to_cpu(c->qid); + u16 sqsize = le16_to_cpu(c->sqsize); + struct nvmet_ctrl *old; + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + + nvmet_cq_setup(ctrl, req->cq, qid, sqsize); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + return 0; +} + +static void nvmet_execute_admin_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { + pr_warn("connect attempt for invalid controller ID %#x\n", + d->cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + goto out; + } + + status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, + le32_to_cpu(c->kato), &ctrl); + if (status) + goto out; + + status = nvmet_install_queue(ctrl, req); + if (status) { + nvmet_ctrl_put(ctrl); + goto out; + } + + pr_info("creating controller %d for NQN %s.\n", + ctrl->cntlid, ctrl->hostnqn); + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); +} + +static void nvmet_execute_io_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 qid = le16_to_cpu(c->qid); + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), + req, &ctrl); + if (status) + goto out; + + if (unlikely(qid > ctrl->subsys->max_qid)) { + pr_warn("invalid queue id (%d)\n", qid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_SQE(qid); + goto out_ctrl_put; + } + + status = nvmet_install_queue(ctrl, req); + if (status) { + /* pass back cntlid that had the issue of installing queue */ + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + goto out_ctrl_put; + } + + pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); + return; + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + goto out; +} + +int nvmet_parse_connect_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (req->cmd->common.opcode != nvme_fabrics_command) { + pr_err("invalid command 0x%x on unconnected queue.\n", + cmd->fabrics.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { + pr_err("invalid capsule type 0x%x on unconnected queue.\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + req->data_len = sizeof(struct nvmf_connect_data); + if (cmd->connect.qid == 0) + req->execute = nvmet_execute_admin_connect; + else + req->execute = nvmet_execute_io_connect; + return 0; +} diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c new file mode 100644 index 0000000..2cd069b6 --- /dev/null +++ b/drivers/nvme/target/io-cmd.c @@ -0,0 +1,215 @@ +/* + * NVMe I/O command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> +#include <linux/module.h> +#include "nvmet.h" + +static void nvmet_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + nvmet_req_complete(req, + bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + + if (bio != &req->inline_bio) + bio_put(bio); +} + +static inline u32 nvmet_rw_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} + +static void nvmet_inline_bio_init(struct nvmet_req *req) +{ + struct bio *bio = &req->inline_bio; + + bio_init(bio); + bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC; + bio->bi_io_vec = req->inline_bvec; +} + +static void nvmet_execute_rw(struct nvmet_req *req) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + struct bio *bio; + sector_t sector; + blk_qc_t cookie; + int op, op_flags = 0, i; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { + op = REQ_OP_WRITE; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op_flags |= REQ_FUA; + } else { + op = REQ_OP_READ; + } + + sector = le64_to_cpu(req->cmd->rw.slba); + sector <<= (req->ns->blksize_shift - 9); + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, op, op_flags); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + != sg->length) { + struct bio *prev = bio; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op_flags); + + bio_chain(bio, prev); + cookie = submit_bio(prev); + } + + sector += sg->length >> 9; + sg_cnt--; + } + + cookie = submit_bio(bio); + + blk_poll(bdev_get_queue(req->ns->bdev), cookie); +} + +static void nvmet_execute_flush(struct nvmet_req *req) +{ + struct bio *bio; + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + + bio->bi_bdev = req->ns->bdev; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + + submit_bio(bio); +} + +static u16 nvmet_discard_range(struct nvmet_ns *ns, + struct nvme_dsm_range *range, struct bio **bio) +{ + if (__blkdev_issue_discard(ns->bdev, + le64_to_cpu(range->slba) << (ns->blksize_shift - 9), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio)) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static void nvmet_execute_discard(struct nvmet_req *req) +{ + struct nvme_dsm_range range; + struct bio *bio = NULL; + int i; + u16 status; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + status = nvmet_discard_range(req->ns, &range, &bio); + if (status) + break; + } + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + if (status) { + bio->bi_error = -EIO; + bio_endio(bio); + } else { + submit_bio(bio); + } + } else { + nvmet_req_complete(req, status); + } +} + +static void nvmet_execute_dsm(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +int nvmet_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got io cmd %d while CC.EN == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); + if (!req->ns) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + switch (cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_execute_rw; + req->data_len = nvmet_rw_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_execute_flush; + req->data_len = 0; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_execute_dsm; + req->data_len = le32_to_cpu(cmd->dsm.nr) * + sizeof(struct nvme_dsm_range); + return 0; + default: + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c new file mode 100644 index 0000000..94e7829 --- /dev/null +++ b/drivers/nvme/target/loop.c @@ -0,0 +1,754 @@ +/* + * NVMe over Fabrics loopback device. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/scatterlist.h> +#include <linux/delay.h> +#include <linux/blk-mq.h> +#include <linux/nvme.h> +#include <linux/module.h> +#include <linux/parser.h> +#include <linux/t10-pi.h> +#include "nvmet.h" +#include "../host/nvme.h" +#include "../host/fabrics.h" + +#define NVME_LOOP_AQ_DEPTH 256 + +#define NVME_LOOP_MAX_SEGMENTS 256 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_LOOP_NR_AEN_COMMANDS 1 +#define NVME_LOOP_AQ_BLKMQ_DEPTH \ + (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + +struct nvme_loop_iod { + struct nvme_command cmd; + struct nvme_completion rsp; + struct nvmet_req req; + struct nvme_loop_queue *queue; + struct work_struct work; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +struct nvme_loop_ctrl { + spinlock_t lock; + struct nvme_loop_queue *queues; + u32 queue_count; + + struct blk_mq_tag_set admin_tag_set; + + struct list_head list; + u64 cap; + struct blk_mq_tag_set tag_set; + struct nvme_loop_iod async_event_iod; + struct nvme_ctrl ctrl; + + struct nvmet_ctrl *target_ctrl; + struct work_struct delete_work; + struct work_struct reset_work; +}; + +static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_loop_ctrl, ctrl); +} + +struct nvme_loop_queue { + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvme_loop_ctrl *ctrl; +}; + +static struct nvmet_port *nvmet_loop_port; + +static LIST_HEAD(nvme_loop_ctrl_list); +static DEFINE_MUTEX(nvme_loop_ctrl_mutex); + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req); +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl); + +static struct nvmet_fabrics_ops nvme_loop_ops; + +static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static void nvme_loop_complete_rq(struct request *req) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int error = 0; + + nvme_cleanup_cmd(req); + sg_free_table_chained(&iod->sg_table, true); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; + } + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + blk_mq_end_request(req, error); +} + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req) +{ + struct nvme_loop_iod *iod = + container_of(nvme_req, struct nvme_loop_iod, req); + struct nvme_completion *cqe = &iod->rsp; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 && + cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe); + } else { + struct request *req = blk_mq_rq_from_pdu(iod); + + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, cqe, sizeof(*cqe)); + blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1); + } +} + +static void nvme_loop_execute_work(struct work_struct *work) +{ + struct nvme_loop_iod *iod = + container_of(work, struct nvme_loop_iod, work); + + iod->req.execute(&iod->req); +} + +static enum blk_eh_timer_return +nvme_loop_timeout(struct request *rq, bool reserved) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + schedule_work(&iod->queue->ctrl->reset_work); + + /* fail with DNR on admin cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_loop_queue *queue = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int ret; + + ret = nvme_setup_cmd(ns, req, &iod->cmd); + if (ret) + return ret; + + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + iod->req.port = nvmet_loop_port; + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, + &queue->nvme_sq, &nvme_loop_ops)) { + nvme_cleanup_cmd(req); + blk_mq_start_request(req); + nvme_loop_queue_response(&iod->req); + return 0; + } + + if (blk_rq_bytes(req)) { + iod->sg_table.sgl = iod->first_sgl; + ret = sg_alloc_table_chained(&iod->sg_table, + req->nr_phys_segments, iod->sg_table.sgl); + if (ret) + return BLK_MQ_RQ_QUEUE_BUSY; + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + BUG_ON(iod->req.sg_cnt > req->nr_phys_segments); + } + + iod->cmd.common.command_id = req->tag; + blk_mq_start_request(req); + + schedule_work(&iod->work); + return 0; +} + +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); + struct nvme_loop_queue *queue = &ctrl->queues[0]; + struct nvme_loop_iod *iod = &ctrl->async_event_iod; + + memset(&iod->cmd, 0, sizeof(iod->cmd)); + iod->cmd.common.opcode = nvme_admin_async_event; + iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, + &nvme_loop_ops)) { + dev_err(ctrl->ctrl.device, "failed async event work\n"); + return; + } + + schedule_work(&iod->work); +} + +static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, + struct nvme_loop_iod *iod, unsigned int queue_idx) +{ + BUG_ON(queue_idx >= ctrl->queue_count); + + iod->req.cmd = &iod->cmd; + iod->req.rsp = &iod->rsp; + iod->queue = &ctrl->queues[queue_idx]; + INIT_WORK(&iod->work, nvme_loop_execute_work); + return 0; +} + +static int nvme_loop_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1); +} + +static int nvme_loop_init_admin_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0); +} + +static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static struct blk_mq_ops nvme_loop_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, + .timeout = nvme_loop_timeout, +}; + +static struct blk_mq_ops nvme_loop_admin_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_admin_request, + .init_hctx = nvme_loop_init_admin_hctx, + .timeout = nvme_loop_timeout, +}; + +static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); +} + +static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + int error; + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + ctrl->queues[0].ctrl = ctrl; + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + if (error) + return error; + ctrl->queue_count = 1; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_free_sq; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_sq: + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + return error; +} + +static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) +{ + int i; + + nvme_stop_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_loop_destroy_admin_queue(ctrl); +} + +static void nvme_loop_del_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!schedule_work(&ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + int ret; + + ret = __nvme_loop_del_ctrl(ctrl); + if (ret) + return ret; + + flush_work(&ctrl->delete_work); + + return 0; +} + +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { + if (ctrl->ctrl.cntlid == nctrl->cntlid) + __nvme_loop_del_ctrl(ctrl); + } + mutex_unlock(&nvme_loop_ctrl_mutex); +} + +static void nvme_loop_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, reset_work); + bool changed; + int i, ret; + + nvme_loop_shutdown_ctrl(ctrl); + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_disable; + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_free_queues; + + ctrl->queue_count++; + } + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_free_queues; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + + nvme_start_queues(&ctrl->ctrl); + + return; + +out_free_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + nvme_loop_destroy_admin_queue(ctrl); +out_disable: + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!schedule_work(&ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { + .name = "loop", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_loop_reset_ctrl, + .free_ctrl = nvme_loop_free_ctrl, + .submit_async_event = nvme_loop_submit_async_event, + .delete_ctrl = nvme_loop_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, +}; + +static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret, i; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret || !opts->nr_io_queues) + return ret; + + dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", + opts->nr_io_queues); + + for (i = 1; i <= opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_destroy_queues; + + ctrl->queue_count++; + } + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_loop_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tagset; + } + + for (i = 1; i <= opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_cleanup_connect_q; + } + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); +out_destroy_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + return ret; +} + +static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_loop_ctrl *ctrl; + bool changed; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_put_ctrl; + + spin_lock_init(&ctrl->lock); + + ret = -ENOMEM; + + ctrl->ctrl.sqsize = opts->queue_size; + ctrl->ctrl.kato = opts->kato; + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_free_queues; + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_loop_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0); + + dev_info(ctrl->ctrl.device, + "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); + + kref_get(&ctrl->ctrl.kref); + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_loop_destroy_admin_queue(ctrl); +out_free_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +} + +static int nvme_loop_add_port(struct nvmet_port *port) +{ + /* + * XXX: disalow adding more than one port so + * there is no connection rejections when a + * a subsystem is assigned to a port for which + * loop doesn't have a pointer. + * This scenario would be possible if we allowed + * more than one port to be added and a subsystem + * was assigned to a port other than nvmet_loop_port. + */ + + if (nvmet_loop_port) + return -EPERM; + + nvmet_loop_port = port; + return 0; +} + +static void nvme_loop_remove_port(struct nvmet_port *port) +{ + if (port == nvmet_loop_port) + nvmet_loop_port = NULL; +} + +static struct nvmet_fabrics_ops nvme_loop_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_LOOP, + .add_port = nvme_loop_add_port, + .remove_port = nvme_loop_remove_port, + .queue_response = nvme_loop_queue_response, + .delete_ctrl = nvme_loop_delete_ctrl, +}; + +static struct nvmf_transport_ops nvme_loop_transport = { + .name = "loop", + .create_ctrl = nvme_loop_create_ctrl, +}; + +static int __init nvme_loop_init_module(void) +{ + int ret; + + ret = nvmet_register_transport(&nvme_loop_ops); + if (ret) + return ret; + nvmf_register_transport(&nvme_loop_transport); + return 0; +} + +static void __exit nvme_loop_cleanup_module(void) +{ + struct nvme_loop_ctrl *ctrl, *next; + + nvmf_unregister_transport(&nvme_loop_transport); + nvmet_unregister_transport(&nvme_loop_ops); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) + __nvme_loop_del_ctrl(ctrl); + mutex_unlock(&nvme_loop_ctrl_mutex); + + flush_scheduled_work(); +} + +module_init(nvme_loop_init_module); +module_exit(nvme_loop_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h new file mode 100644 index 0000000..57dd6d8 --- /dev/null +++ b/drivers/nvme/target/nvmet.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVMET_H +#define _NVMET_H + +#include <linux/dma-mapping.h> +#include <linux/types.h> +#include <linux/device.h> +#include <linux/kref.h> +#include <linux/percpu-refcount.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/nvme.h> +#include <linux/configfs.h> +#include <linux/rcupdate.h> +#include <linux/blkdev.h> + +#define NVMET_ASYNC_EVENTS 4 +#define NVMET_ERROR_LOG_SLOTS 128 + +/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM + * The 16 bit shift is to set IATTR bit to 1, which means offending + * offset starts in the data section of connect() + */ +#define IPO_IATTR_CONNECT_DATA(x) \ + (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x)))) +#define IPO_IATTR_CONNECT_SQE(x) \ + (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) + +struct nvmet_ns { + struct list_head dev_link; + struct percpu_ref ref; + struct block_device *bdev; + u32 nsid; + u32 blksize_shift; + loff_t size; + u8 nguid[16]; + + struct nvmet_subsys *subsys; + const char *device_path; + + struct config_group device_group; + struct config_group group; + + struct completion disable_done; +}; + +static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ns, group); +} + +static inline bool nvmet_ns_enabled(struct nvmet_ns *ns) +{ + return !list_empty_careful(&ns->dev_link); +} + +struct nvmet_cq { + u16 qid; + u16 size; +}; + +struct nvmet_sq { + struct nvmet_ctrl *ctrl; + struct percpu_ref ref; + u16 qid; + u16 size; + struct completion free_done; +}; + +/** + * struct nvmet_port - Common structure to keep port + * information for the target. + * @entry: List head for holding a list of these elements. + * @disc_addr: Address information is stored in a format defined + * for a discovery log page entry. + * @group: ConfigFS group for this element's folder. + * @priv: Private data for the transport. + */ +struct nvmet_port { + struct list_head entry; + struct nvmf_disc_rsp_page_entry disc_addr; + struct config_group group; + struct config_group subsys_group; + struct list_head subsystems; + struct config_group referrals_group; + struct list_head referrals; + void *priv; + bool enabled; +}; + +static inline struct nvmet_port *to_nvmet_port(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + group); +} + +struct nvmet_ctrl { + struct nvmet_subsys *subsys; + struct nvmet_cq **cqs; + struct nvmet_sq **sqs; + + struct mutex lock; + u64 cap; + u32 cc; + u32 csts; + + u16 cntlid; + u32 kato; + + struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; + unsigned int nr_async_event_cmds; + struct list_head async_events; + struct work_struct async_event_work; + + struct list_head subsys_entry; + struct kref ref; + struct delayed_work ka_work; + struct work_struct fatal_err_work; + + struct nvmet_fabrics_ops *ops; + + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; +}; + +struct nvmet_subsys { + enum nvme_subsys_type type; + + struct mutex lock; + struct kref ref; + + struct list_head namespaces; + unsigned int max_nsid; + + struct list_head ctrls; + struct ida cntlid_ida; + + struct list_head hosts; + bool allow_any_host; + + u16 max_qid; + + u64 ver; + char *subsysnqn; + + struct config_group group; + + struct config_group namespaces_group; + struct config_group allowed_hosts_group; +}; + +static inline struct nvmet_subsys *to_subsys(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, group); +} + +static inline struct nvmet_subsys *namespaces_to_subsys( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, + namespaces_group); +} + +struct nvmet_host { + struct config_group group; +}; + +static inline struct nvmet_host *to_host(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_host, group); +} + +static inline char *nvmet_host_name(struct nvmet_host *host) +{ + return config_item_name(&host->group.cg_item); +} + +struct nvmet_host_link { + struct list_head entry; + struct nvmet_host *host; +}; + +struct nvmet_subsys_link { + struct list_head entry; + struct nvmet_subsys *subsys; +}; + +struct nvmet_req; +struct nvmet_fabrics_ops { + struct module *owner; + unsigned int type; + unsigned int sqe_inline_size; + unsigned int msdbd; + bool has_keyed_sgls : 1; + void (*queue_response)(struct nvmet_req *req); + int (*add_port)(struct nvmet_port *port); + void (*remove_port)(struct nvmet_port *port); + void (*delete_ctrl)(struct nvmet_ctrl *ctrl); +}; + +#define NVMET_MAX_INLINE_BIOVEC 8 + +struct nvmet_req { + struct nvme_command *cmd; + struct nvme_completion *rsp; + struct nvmet_sq *sq; + struct nvmet_cq *cq; + struct nvmet_ns *ns; + struct scatterlist *sg; + struct bio inline_bio; + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + int sg_cnt; + size_t data_len; + + struct nvmet_port *port; + + void (*execute)(struct nvmet_req *req); + struct nvmet_fabrics_ops *ops; +}; + +static inline void nvmet_set_status(struct nvmet_req *req, u16 status) +{ + req->rsp->status = cpu_to_le16(status << 1); +} + +static inline void nvmet_set_result(struct nvmet_req *req, u32 result) +{ + req->rsp->result = cpu_to_le32(result); +} + +/* + * NVMe command writes actually are DMA reads for us on the target side. + */ +static inline enum dma_data_direction +nvmet_data_dir(struct nvmet_req *req) +{ + return nvme_is_write(req->cmd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +struct nvmet_async_event { + struct list_head entry; + u8 event_type; + u8 event_info; + u8 log_page; +}; + +int nvmet_parse_connect_cmd(struct nvmet_req *req); +int nvmet_parse_io_cmd(struct nvmet_req *req); +int nvmet_parse_admin_cmd(struct nvmet_req *req); +int nvmet_parse_discovery_cmd(struct nvmet_req *req); +int nvmet_parse_fabrics_cmd(struct nvmet_req *req); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); +void nvmet_req_complete(struct nvmet_req *req, u16 status); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); +void nvmet_sq_destroy(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq); + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret); +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type); +void nvmet_subsys_put(struct nvmet_subsys *subsys); + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid); +void nvmet_put_namespace(struct nvmet_ns *ns); +int nvmet_ns_enable(struct nvmet_ns *ns); +void nvmet_ns_disable(struct nvmet_ns *ns); +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_ns_free(struct nvmet_ns *ns); + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops); +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops); + +int nvmet_enable_port(struct nvmet_port *port); +void nvmet_disable_port(struct nvmet_port *port); + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *port); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); + +u32 nvmet_get_log_page_len(struct nvme_command *cmd); + +#define NVMET_QUEUE_SIZE 1024 +#define NVMET_NR_QUEUES 64 +#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_KAS 10 +#define NVMET_DISC_KATO 120 + +int __init nvmet_init_configfs(void); +void __exit nvmet_exit_configfs(void); + +int __init nvmet_init_discovery(void); +void nvmet_exit_discovery(void); + +extern struct nvmet_subsys *nvmet_disc_subsys; +extern u64 nvmet_genctr; +extern struct rw_semaphore nvmet_config_sem; + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn); + +#endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c new file mode 100644 index 0000000..e06d504 --- /dev/null +++ b/drivers/nvme/target/rdma.c @@ -0,0 +1,1448 @@ +/* + * NVMe over Fabrics RDMA target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/atomic.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/nvme.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/wait.h> +#include <linux/inet.h> +#include <asm/unaligned.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/rw.h> + +#include <linux/nvme-rdma.h> +#include "nvmet.h" + +/* + * We allow up to a page of inline data to go with the SQE + */ +#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE + +struct nvmet_rdma_cmd { + struct ib_sge sge[2]; + struct ib_cqe cqe; + struct ib_recv_wr wr; + struct scatterlist inline_sg; + struct page *inline_page; + struct nvme_command *nvme_cmd; + struct nvmet_rdma_queue *queue; +}; + +enum { + NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), + NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), +}; + +struct nvmet_rdma_rsp { + struct ib_sge send_sge; + struct ib_cqe send_cqe; + struct ib_send_wr send_wr; + + struct nvmet_rdma_cmd *cmd; + struct nvmet_rdma_queue *queue; + + struct ib_cqe read_cqe; + struct rdma_rw_ctx rw; + + struct nvmet_req req; + + u8 n_rdma; + u32 flags; + u32 invalidate_rkey; + + struct list_head wait_list; + struct list_head free_list; +}; + +enum nvmet_rdma_queue_state { + NVMET_RDMA_Q_CONNECTING, + NVMET_RDMA_Q_LIVE, + NVMET_RDMA_Q_DISCONNECTING, +}; + +struct nvmet_rdma_queue { + struct rdma_cm_id *cm_id; + struct nvmet_port *port; + struct ib_cq *cq; + atomic_t sq_wr_avail; + struct nvmet_rdma_device *dev; + spinlock_t state_lock; + enum nvmet_rdma_queue_state state; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + struct nvmet_rdma_rsp *rsps; + struct list_head free_rsps; + spinlock_t rsps_lock; + struct nvmet_rdma_cmd *cmds; + + struct work_struct release_work; + struct list_head rsp_wait_list; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; + + int idx; + int host_qid; + int recv_queue_size; + int send_queue_size; + + struct list_head queue_list; +}; + +struct nvmet_rdma_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_srq *srq; + struct nvmet_rdma_cmd *srq_cmds; + size_t srq_size; + struct kref ref; + struct list_head entry; +}; + +static bool nvmet_rdma_use_srq; +module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); +MODULE_PARM_DESC(use_srq, "Use shared receive queue."); + +static DEFINE_IDA(nvmet_rdma_queue_ida); +static LIST_HEAD(nvmet_rdma_queue_list); +static DEFINE_MUTEX(nvmet_rdma_queue_mutex); + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); + +static struct nvmet_fabrics_ops nvmet_rdma_ops; + +/* XXX: really should move to a generic header sooner or later.. */ +static inline u32 get_unaligned_le24(const u8 *p) +{ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; +} + +static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) +{ + return nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) +{ + return !nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !rsp->req.rsp->status && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline struct nvmet_rdma_rsp * +nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_rsp *rsp; + unsigned long flags; + + spin_lock_irqsave(&queue->rsps_lock, flags); + rsp = list_first_entry(&queue->free_rsps, + struct nvmet_rdma_rsp, free_list); + list_del(&rsp->free_list); + spin_unlock_irqrestore(&queue->rsps_lock, flags); + + return rsp; +} + +static inline void +nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + + spin_lock_irqsave(&rsp->queue->rsps_lock, flags); + list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); + spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); +} + +static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) +{ + struct scatterlist *sg; + int count; + + if (!sgl || !nents) + return; + + for_each_sg(sgl, sg, nents, count) + __free_page(sg_page(sg)); + kfree(sgl); +} + +static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, + u32 length) +{ + struct scatterlist *sg; + struct page *page; + unsigned int nent; + int i = 0; + + nent = DIV_ROUND_UP(length, PAGE_SIZE); + sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); + if (!sg) + goto out; + + sg_init_table(sg, nent); + + while (length) { + u32 page_len = min_t(u32, length, PAGE_SIZE); + + page = alloc_page(GFP_KERNEL); + if (!page) + goto out_free_pages; + + sg_set_page(&sg[i], page, page_len, 0); + length -= page_len; + i++; + } + *sgl = sg; + *nents = nent; + return 0; + +out_free_pages: + while (i > 0) { + i--; + __free_page(sg_page(&sg[i])); + } + kfree(sg); +out: + return NVME_SC_INTERNAL; +} + +static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + /* NVMe command / RDMA RECV */ + c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); + if (!c->nvme_cmd) + goto out; + + c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) + goto out_free_cmd; + + c->sge[0].length = sizeof(*c->nvme_cmd); + c->sge[0].lkey = ndev->pd->local_dma_lkey; + + if (!admin) { + c->inline_page = alloc_pages(GFP_KERNEL, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + if (!c->inline_page) + goto out_unmap_cmd; + c->sge[1].addr = ib_dma_map_page(ndev->device, + c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) + goto out_free_inline_page; + c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; + c->sge[1].lkey = ndev->pd->local_dma_lkey; + } + + c->cqe.done = nvmet_rdma_recv_done; + + c->wr.wr_cqe = &c->cqe; + c->wr.sg_list = c->sge; + c->wr.num_sge = admin ? 1 : 2; + + return 0; + +out_free_inline_page: + if (!admin) { + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } +out_unmap_cmd: + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); +out_free_cmd: + kfree(c->nvme_cmd); + +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + if (!admin) { + ib_dma_unmap_page(ndev->device, c->sge[1].addr, + NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + kfree(c->nvme_cmd); +} + +static struct nvmet_rdma_cmd * +nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, + int nr_cmds, bool admin) +{ + struct nvmet_rdma_cmd *cmds; + int ret = -EINVAL, i; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); + if (ret) + goto out_free; + } + + return cmds; + +out_free: + while (--i >= 0) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +out: + return ERR_PTR(ret); +} + +static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) +{ + int i; + + for (i = 0; i < nr_cmds; i++) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +} + +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + /* NVMe CQE / RDMA SEND */ + r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); + if (!r->req.rsp) + goto out; + + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) + goto out_free_rsp; + + r->send_sge.length = sizeof(*r->req.rsp); + r->send_sge.lkey = ndev->pd->local_dma_lkey; + + r->send_cqe.done = nvmet_rdma_send_done; + + r->send_wr.wr_cqe = &r->send_cqe; + r->send_wr.sg_list = &r->send_sge; + r->send_wr.num_sge = 1; + r->send_wr.send_flags = IB_SEND_SIGNALED; + + /* Data In / RDMA READ */ + r->read_cqe.done = nvmet_rdma_read_data_done; + return 0; + +out_free_rsp: + kfree(r->req.rsp); +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + ib_dma_unmap_single(ndev->device, r->send_sge.addr, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + kfree(r->req.rsp); +} + +static int +nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int nr_rsps = queue->recv_queue_size * 2; + int ret = -EINVAL, i; + + queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + GFP_KERNEL); + if (!queue->rsps) + goto out; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + ret = nvmet_rdma_alloc_rsp(ndev, rsp); + if (ret) + goto out_free; + + list_add_tail(&rsp->free_list, &queue->free_rsps); + } + + return 0; + +out_free: + while (--i >= 0) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +out: + return ret; +} + +static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int i, nr_rsps = queue->recv_queue_size * 2; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +} + +static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmd) +{ + struct ib_recv_wr *bad_wr; + + if (ndev->srq) + return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); +} + +static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) +{ + spin_lock(&queue->rsp_wr_wait_lock); + while (!list_empty(&queue->rsp_wr_wait_list)) { + struct nvmet_rdma_rsp *rsp; + bool ret; + + rsp = list_entry(queue->rsp_wr_wait_list.next, + struct nvmet_rdma_rsp, wait_list); + list_del(&rsp->wait_list); + + spin_unlock(&queue->rsp_wr_wait_lock); + ret = nvmet_rdma_execute_command(rsp); + spin_lock(&queue->rsp_wr_wait_lock); + + if (!ret) { + list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); + break; + } + } + spin_unlock(&queue->rsp_wr_wait_lock); +} + + +static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + + if (rsp->n_rdma) { + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + } + + if (rsp->req.sg != &rsp->cmd->inline_sg) + nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); + + if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) + nvmet_rdma_process_wr_wait_list(queue); + + nvmet_rdma_put_rsp(rsp); +} + +static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) +{ + if (queue->nvme_sq.ctrl) { + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + } else { + /* + * we didn't setup the controller yet in case + * of admin connect error, just disconnect and + * cleanup the queue + */ + nvmet_rdma_queue_disconnect(queue); + } +} + +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + + nvmet_rdma_release_rsp(rsp); + + if (unlikely(wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR)) { + pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(rsp->queue); + } +} + +static void nvmet_rdma_queue_response(struct nvmet_req *req) +{ + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct ib_send_wr *first_wr, *bad_wr; + + if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; + rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; + } else { + rsp->send_wr.opcode = IB_WR_SEND; + } + + if (nvmet_rdma_need_data_out(rsp)) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + else + first_wr = &rsp->send_wr; + + nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); + if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, + u64 off) +{ + sg_init_table(&rsp->cmd->inline_sg, 1); + sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); + rsp->req.sg = &rsp->cmd->inline_sg; + rsp->req.sg_cnt = 1; +} + +static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; + u64 off = le64_to_cpu(sgl->addr); + u32 len = le32_to_cpu(sgl->length); + + if (!nvme_is_write(rsp->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + pr_err("invalid inline data offset!\n"); + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + } + + /* no data command? */ + if (!len) + return 0; + + nvmet_rdma_use_inline_sg(rsp, len, off); + rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + return 0; +} + +static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, + struct nvme_keyed_sgl_desc *sgl, bool invalidate) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u64 addr = le64_to_cpu(sgl->addr); + u32 len = get_unaligned_le24(sgl->length); + u32 key = get_unaligned_le32(sgl->key); + int ret; + u16 status; + + /* no data command? */ + if (!len) + return 0; + + /* use the already allocated data buffer if possible */ + if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) { + nvmet_rdma_use_inline_sg(rsp, len, 0); + } else { + status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, + len); + if (status) + return status; + } + + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, + nvmet_data_dir(&rsp->req)); + if (ret < 0) + return NVME_SC_INTERNAL; + rsp->n_rdma += ret; + + if (invalidate) { + rsp->invalidate_rkey = key; + rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; + } + + return 0; +} + +static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; + + switch (sgl->type >> 4) { + case NVME_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_OFFSET: + return nvmet_rdma_map_sgl_inline(rsp); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + case NVME_KEY_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); + case NVME_SGL_FMT_ADDRESS: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + default: + pr_err("invalid SGL type: %#x\n", sgl->type); + return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + } +} + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + if (unlikely(atomic_sub_return(1 + rsp->n_rdma, + &queue->sq_wr_avail) < 0)) { + pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", + 1 + rsp->n_rdma, queue->idx, + queue->nvme_sq.ctrl->cntlid); + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + return false; + } + + if (nvmet_rdma_need_data_in(rsp)) { + if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, &rsp->read_cqe, NULL)) + nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + } else { + rsp->req.execute(&rsp->req); + } + + return true; +} + +static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *cmd) +{ + u16 status; + + cmd->queue = queue; + cmd->n_rdma = 0; + cmd->req.port = queue->port; + + if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_rdma_ops)) + return; + + status = nvmet_rdma_map_sgl(cmd); + if (status) + goto out_err; + + if (unlikely(!nvmet_rdma_execute_command(cmd))) { + spin_lock(&queue->rsp_wr_wait_lock); + list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); + spin_unlock(&queue->rsp_wr_wait_lock); + } + + return; + +out_err: + nvmet_req_complete(&cmd->req, status); +} + +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_cmd *cmd = + container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_rsp *rsp; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), + wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { + pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); + nvmet_rdma_error_comp(queue); + return; + } + + cmd->queue = queue; + rsp = nvmet_rdma_get_rsp(queue); + rsp->cmd = cmd; + rsp->flags = 0; + rsp->req.cmd = cmd->nvme_cmd; + + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return; + } + + nvmet_rdma_handle_command(queue, rsp); +} + +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +{ + if (!ndev->srq) + return; + + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); + ib_destroy_srq(ndev->srq); +} + +static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + struct ib_srq *srq; + size_t srq_size; + int ret, i; + + srq_size = 4095; /* XXX: tune */ + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 2; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. + */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(ndev->srq_cmds)) { + ret = PTR_ERR(ndev->srq_cmds); + goto out_destroy_srq; + } + + ndev->srq = srq; + ndev->srq_size = srq_size; + + for (i = 0; i < srq_size; i++) + nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + + return 0; + +out_destroy_srq: + ib_destroy_srq(srq); + return ret; +} + +static void nvmet_rdma_free_dev(struct kref *ref) +{ + struct nvmet_rdma_device *ndev = + container_of(ref, struct nvmet_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + nvmet_rdma_destroy_srq(ndev); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static struct nvmet_rdma_device * +nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvmet_rdma_device *ndev; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device->node_guid == cm_id->device->node_guid && + kref_get_unless_zero(&ndev->ref)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->device = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->device); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (nvmet_rdma_use_srq) { + ret = nvmet_rdma_init_srq(ndev); + if (ret) + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + pr_debug("added %s.\n", ndev->device->name); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp_init_attr qp_attr; + struct nvmet_rdma_device *ndev = queue->dev; + int comp_vector, nr_cqe, ret, i; + + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + /* + * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. + */ + nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; + + queue->cq = ib_alloc_cq(ndev->device, queue, + nr_cqe + 1, comp_vector, + IB_POLL_WORKQUEUE); + if (IS_ERR(queue->cq)) { + ret = PTR_ERR(queue->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + nr_cqe + 1, ret); + goto out; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_context = queue; + qp_attr.event_handler = nvmet_rdma_qp_event; + qp_attr.send_cq = queue->cq; + qp_attr.recv_cq = queue->cq; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + /* +1 for drain */ + qp_attr.cap.max_send_wr = queue->send_queue_size + 1; + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, + ndev->device->attrs.max_sge); + + if (ndev->srq) { + qp_attr.srq = ndev->srq; + } else { + /* +1 for drain */ + qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; + qp_attr.cap.max_recv_sge = 2; + } + + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); + if (ret) { + pr_err("failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, + qp_attr.cap.max_send_wr, queue->cm_id); + + if (!ndev->srq) { + for (i = 0; i < queue->recv_queue_size; i++) { + queue->cmds[i].queue = queue; + nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + } + } + +out: + return ret; + +err_destroy_cq: + ib_free_cq(queue->cq); + goto out; +} + +static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) +{ + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->cq); +} + +static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) +{ + pr_info("freeing queue %d\n", queue->idx); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_rdma_destroy_queue_ib(queue); + if (!queue->dev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } + nvmet_rdma_free_rsps(queue); + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + kfree(queue); +} + +static void nvmet_rdma_release_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, release_work); + struct rdma_cm_id *cm_id = queue->cm_id; + struct nvmet_rdma_device *dev = queue->dev; + + nvmet_rdma_free_queue(queue); + rdma_destroy_id(cm_id); + kref_put(&dev->ref, nvmet_rdma_free_dev); +} + +static int +nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, + struct nvmet_rdma_queue *queue) +{ + struct nvme_rdma_cm_req *req; + + req = (struct nvme_rdma_cm_req *)conn->private_data; + if (!req || conn->private_data_len == 0) + return NVME_RDMA_CM_INVALID_LEN; + + if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) + return NVME_RDMA_CM_INVALID_RECFMT; + + queue->host_qid = le16_to_cpu(req->qid); + + /* + * req->hsqsize corresponds to our recv queue size + * req->hrqsize corresponds to our send queue size + */ + queue->recv_queue_size = le16_to_cpu(req->hsqsize); + queue->send_queue_size = le16_to_cpu(req->hrqsize); + + if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + return NVME_RDMA_CM_INVALID_HSQSIZE; + + /* XXX: Should we enforce some kind of max for IO queues? */ + + return 0; +} + +static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, + enum nvme_rdma_cm_status status) +{ + struct nvme_rdma_cm_rej rej; + + rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + rej.sts = cpu_to_le16(status); + + return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); +} + +static struct nvmet_rdma_queue * +nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, + struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_reject; + } + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_queue; + + ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); + if (ret) + goto out_destroy_sq; + + /* + * Schedules the actual release because calling rdma_destroy_id from + * inside a CM callback would trigger a deadlock. (great API design..) + */ + INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); + queue->dev = ndev; + queue->cm_id = cm_id; + + spin_lock_init(&queue->state_lock); + queue->state = NVMET_RDMA_Q_CONNECTING; + INIT_LIST_HEAD(&queue->rsp_wait_list); + INIT_LIST_HEAD(&queue->rsp_wr_wait_list); + spin_lock_init(&queue->rsp_wr_wait_lock); + INIT_LIST_HEAD(&queue->free_rsps); + spin_lock_init(&queue->rsps_lock); + + queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_queue; + } + + ret = nvmet_rdma_alloc_rsps(queue); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_ida_remove; + } + + if (!ndev->srq) { + queue->cmds = nvmet_rdma_alloc_cmds(ndev, + queue->recv_queue_size, + !queue->host_qid); + if (IS_ERR(queue->cmds)) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_responses; + } + } + + ret = nvmet_rdma_create_queue_ib(queue); + if (ret) { + pr_err("%s: creating RDMA queue failed (%d).\n", + __func__, ret); + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_cmds; + } + + return queue; + +out_free_cmds: + if (!ndev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } +out_free_responses: + nvmet_rdma_free_rsps(queue); +out_ida_remove: + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); +out_destroy_sq: + nvmet_sq_destroy(&queue->nvme_sq); +out_free_queue: + kfree(queue); +out_reject: + nvmet_rdma_cm_reject(cm_id, ret); + return NULL; +} + +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_queue *queue = priv; + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(queue->cm_id, event->event); + break; + default: + pr_err("received unrecognized IB QP event %d\n", event->event); + break; + } +} + +static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue, + struct rdma_conn_param *p) +{ + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_rep priv = { }; + int ret = -ENOMEM; + + param.rnr_retry_count = 7; + param.flow_control = 1; + param.initiator_depth = min_t(u8, p->initiator_depth, + queue->dev->device->attrs.max_qp_init_rd_atom); + param.private_data = &priv; + param.private_data_len = sizeof(priv); + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.crqsize = cpu_to_le16(queue->recv_queue_size); + + ret = rdma_accept(cm_id, ¶m); + if (ret) + pr_err("rdma_accept failed (error code = %d)\n", ret); + + return ret; +} + +static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_device *ndev; + struct nvmet_rdma_queue *queue; + int ret = -EINVAL; + + ndev = nvmet_rdma_find_get_device(cm_id); + if (!ndev) { + pr_err("no client data!\n"); + nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); + return -ECONNREFUSED; + } + + queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); + if (!queue) { + ret = -ENOMEM; + goto put_device; + } + queue->port = cm_id->context; + + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); + if (ret) + goto release_queue; + + mutex_lock(&nvmet_rdma_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + return 0; + +release_queue: + nvmet_rdma_free_queue(queue); +put_device: + kref_put(&ndev->ref, nvmet_rdma_free_dev); + + return ret; +} + +static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_CONNECTING) { + pr_warn("trying to establish a connected queue\n"); + goto out_unlock; + } + queue->state = NVMET_RDMA_Q_LIVE; + + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *cmd; + + cmd = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, wait_list); + list_del(&cmd->wait_list); + + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_handle_command(queue, cmd); + spin_lock_irqsave(&queue->state_lock, flags); + } + +out_unlock: + spin_unlock_irqrestore(&queue->state_lock, flags); +} + +static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + unsigned long flags; + + pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); + + spin_lock_irqsave(&queue->state_lock, flags); + switch (queue->state) { + case NVMET_RDMA_Q_CONNECTING: + case NVMET_RDMA_Q_LIVE: + disconnect = true; + queue->state = NVMET_RDMA_Q_DISCONNECTING; + break; + case NVMET_RDMA_Q_DISCONNECTING: + break; + } + spin_unlock_irqrestore(&queue->state_lock, flags); + + if (disconnect) { + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->cm_id->qp); + schedule_work(&queue->release_work); + } +} + +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) { + list_del_init(&queue->queue_list); + disconnect = true; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + if (disconnect) + __nvmet_rdma_queue_disconnect(queue); +} + +static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); + + pr_err("failed to connect queue\n"); + schedule_work(&queue->release_work); +} + +static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue = NULL; + int ret = 0; + + if (cm_id->qp) + queue = cm_id->qp->qp_context; + + pr_debug("%s (%d): status %d id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cm_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = nvmet_rdma_queue_connect(cm_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + nvmet_rdma_queue_established(queue); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* + * We can get the device removal callback even for a + * CM ID that we aren't actually using. In that case + * the context pointer is NULL, so we shouldn't try + * to disconnect a non-existing queue. But we also + * need to return 1 so that the core will destroy + * it's own ID. What a great API design.. + */ + if (queue) + nvmet_rdma_queue_disconnect(queue); + else + ret = 1; + break; + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + nvmet_rdma_queue_connect_fail(cm_id, queue); + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_queue *queue; + +restart: + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl == ctrl) { + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + __nvmet_rdma_queue_disconnect(queue); + goto restart; + } + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static int nvmet_rdma_add_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id; + struct sockaddr_in addr_in; + u16 port_in; + int ret; + + switch (port->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + break; + default: + pr_err("address family %d not supported\n", + port->disc_addr.adrfam); + return -EINVAL; + } + + ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in); + if (ret) + return ret; + + addr_in.sin_family = AF_INET; + addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr); + addr_in.sin_port = htons(port_in); + + cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("CM ID creation failed\n"); + return PTR_ERR(cm_id); + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in); + if (ret) { + pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(port->disc_addr.portid), &addr_in); + port->priv = cm_id; + return 0; + +out_destroy_id: + rdma_destroy_id(cm_id); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id = port->priv; + + rdma_destroy_id(cm_id); +} + +static struct nvmet_fabrics_ops nvmet_rdma_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_RDMA, + .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, + .msdbd = 1, + .has_keyed_sgls = 1, + .add_port = nvmet_rdma_add_port, + .remove_port = nvmet_rdma_remove_port, + .queue_response = nvmet_rdma_queue_response, + .delete_ctrl = nvmet_rdma_delete_ctrl, +}; + +static int __init nvmet_rdma_init(void) +{ + return nvmet_register_transport(&nvmet_rdma_ops); +} + +static void __exit nvmet_rdma_exit(void) +{ + struct nvmet_rdma_queue *queue; + + nvmet_unregister_transport(&nvmet_rdma_ops); + + flush_scheduled_work(); + + mutex_lock(&nvmet_rdma_queue_mutex); + while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, + struct nvmet_rdma_queue, queue_list))) { + list_del_init(&queue->queue_list); + + mutex_unlock(&nvmet_rdma_queue_mutex); + __nvmet_rdma_queue_disconnect(queue); + mutex_lock(&nvmet_rdma_queue_mutex); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); + ida_destroy(&nvmet_rdma_queue_ida); +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 31d544a..e2fa759 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -45,7 +45,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) gdp->major = DASD_MAJOR; gdp->first_minor = base->devindex << DASD_PARTN_BITS; gdp->fops = &dasd_device_operations; - gdp->driverfs_dev = &base->cdev->dev; /* * Set device name. @@ -76,7 +75,7 @@ int dasd_gendisk_alloc(struct dasd_block *block) gdp->queue = block->request_queue; block->gdp = gdp; set_capacity(block->gdp, 0); - add_disk(block->gdp); + device_add_disk(&base->cdev->dev, block->gdp); return 0; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 093e9e1..fac1b51 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -615,7 +615,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL); dev_info->gd->queue = dev_info->dcssblk_queue; dev_info->gd->private_data = dev_info; - dev_info->gd->driverfs_dev = &dev_info->dev; blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); queue_flag_set_unlocked(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); @@ -656,7 +655,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char goto put_dev; get_device(&dev_info->dev); - add_disk(dev_info->gd); + device_add_disk(&dev_info->dev, dev_info->gd); switch (dev_info->segment_type) { case SEG_TYPE_SR: diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index e6f54d3..9f16ea6 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -512,7 +512,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) goto out_queue; rq->queuedata = scmdev; - bdev->gendisk->driverfs_dev = &scmdev->dev; bdev->gendisk->private_data = scmdev; bdev->gendisk->fops = &scm_blk_devops; bdev->gendisk->queue = rq; @@ -531,7 +530,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) /* 512 byte sectors */ set_capacity(bdev->gendisk, scmdev->size >> 9); - add_disk(bdev->gendisk); + device_add_disk(&scmdev->dev, bdev->gendisk); return 0; out_queue: diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index daa4dc1..2f2a991 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1558,18 +1558,25 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or, static struct request *_make_request(struct request_queue *q, bool has_write, struct _osd_io_info *oii, gfp_t flags) { - if (oii->bio) - return blk_make_request(q, oii->bio, flags); - else { - struct request *req; - - req = blk_get_request(q, has_write ? WRITE : READ, flags); - if (IS_ERR(req)) - return req; + struct request *req; + struct bio *bio = oii->bio; + int ret; - blk_rq_set_block_pc(req); + req = blk_get_request(q, has_write ? WRITE : READ, flags); + if (IS_ERR(req)) return req; + blk_rq_set_block_pc(req); + + for_each_bio(bio) { + struct bio *bounce_bio = bio; + + blk_queue_bounce(req->q, &bounce_bio); + ret = blk_rq_append_bio(req, bounce_bio); + if (ret) + return ERR_PTR(ret); } + + return req; } static int _init_blk_request(struct osd_request *or, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0609d68..fa85d19 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2994,7 +2994,6 @@ static void sd_probe_async(void *data, async_cookie_t cookie) sd_revalidate_disk(gd); - gd->driverfs_dev = &sdp->sdev_gendev; gd->flags = GENHD_FL_EXT_DEVT; if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; @@ -3002,7 +3001,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie) } blk_pm_runtime_init(sdp->request_queue, dev); - add_disk(gd); + device_add_disk(dev, gd); if (sdkp->capacity) sd_dif_config_host(sdkp); diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 64c8674..ed17934 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -713,7 +713,6 @@ static int sr_probe(struct device *dev) get_capabilities(cd); sr_vendor_init(cd); - disk->driverfs_dev = &sdev->sdev_gendev; set_capacity(disk, cd->capacity); disk->private_data = &cd->driver; disk->queue = sdev->request_queue; @@ -730,7 +729,7 @@ static int sr_probe(struct device *dev) dev_set_drvdata(dev, cd); disk->flags |= GENHD_FL_REMOVABLE; - add_disk(disk); + device_add_disk(&sdev->sdev_gendev, disk); sdev_printk(KERN_DEBUG, sdev, "Attached scsi CD-ROM %s\n", cd->cdi.name); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 81564c8..9125d93 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -876,19 +876,19 @@ static inline struct bio *pscsi_get_bio(int nr_vecs) static sense_reason_t pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, - enum dma_data_direction data_direction, struct bio **hbio) + struct request *req) { struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev); - struct bio *bio = NULL, *tbio = NULL; + struct bio *bio = NULL; struct page *page; struct scatterlist *sg; u32 data_len = cmd->data_length, i, len, bytes, off; int nr_pages = (cmd->data_length + sgl[0].offset + PAGE_SIZE - 1) >> PAGE_SHIFT; int nr_vecs = 0, rc; - int rw = (data_direction == DMA_TO_DEVICE); + int rw = (cmd->data_direction == DMA_TO_DEVICE); - *hbio = NULL; + BUG_ON(!cmd->data_length); pr_debug("PSCSI: nr_pages: %d\n", nr_pages); @@ -927,16 +927,6 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, pr_debug("PSCSI: Allocated bio: %p," " dir: %s nr_vecs: %d\n", bio, (rw) ? "rw" : "r", nr_vecs); - /* - * Set *hbio pointer to handle the case: - * nr_pages > BIO_MAX_PAGES, where additional - * bios need to be added to complete a given - * command. - */ - if (!*hbio) - *hbio = tbio = bio; - else - tbio = tbio->bi_next = bio; } pr_debug("PSCSI: Calling bio_add_pc_page() i: %d" @@ -955,11 +945,16 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, pr_debug("PSCSI: Reached bio->bi_vcnt max:" " %d i: %d bio: %p, allocating another" " bio\n", bio->bi_vcnt, i, bio); + + rc = blk_rq_append_bio(req, bio); + if (rc) { + pr_err("pSCSI: failed to append bio\n"); + goto fail; + } + /* * Clear the pointer so that another bio will - * be allocated with pscsi_get_bio() above, the - * current bio has already been set *tbio and - * bio->bi_next. + * be allocated with pscsi_get_bio() above. */ bio = NULL; } @@ -968,13 +963,16 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, } } + if (bio) { + rc = blk_rq_append_bio(req, bio); + if (rc) { + pr_err("pSCSI: failed to append bio\n"); + goto fail; + } + } + return 0; fail: - while (*hbio) { - bio = *hbio; - *hbio = (*hbio)->bi_next; - bio_endio(bio); - } return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } @@ -992,11 +990,9 @@ pscsi_execute_cmd(struct se_cmd *cmd) { struct scatterlist *sgl = cmd->t_data_sg; u32 sgl_nents = cmd->t_data_nents; - enum dma_data_direction data_direction = cmd->data_direction; struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev); struct pscsi_plugin_task *pt; struct request *req; - struct bio *hbio; sense_reason_t ret; /* @@ -1012,31 +1008,21 @@ pscsi_execute_cmd(struct se_cmd *cmd) memcpy(pt->pscsi_cdb, cmd->t_task_cdb, scsi_command_size(cmd->t_task_cdb)); - if (!sgl) { - req = blk_get_request(pdv->pdv_sd->request_queue, - (data_direction == DMA_TO_DEVICE), - GFP_KERNEL); - if (IS_ERR(req)) { - pr_err("PSCSI: blk_get_request() failed\n"); - ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - goto fail; - } + req = blk_get_request(pdv->pdv_sd->request_queue, + (cmd->data_direction == DMA_TO_DEVICE), + GFP_KERNEL); + if (IS_ERR(req)) { + pr_err("PSCSI: blk_get_request() failed\n"); + ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + goto fail; + } - blk_rq_set_block_pc(req); - } else { - BUG_ON(!cmd->data_length); + blk_rq_set_block_pc(req); - ret = pscsi_map_sg(cmd, sgl, sgl_nents, data_direction, &hbio); + if (sgl) { + ret = pscsi_map_sg(cmd, sgl, sgl_nents, req); if (ret) - goto fail; - - req = blk_make_request(pdv->pdv_sd->request_queue, hbio, - GFP_KERNEL); - if (IS_ERR(req)) { - pr_err("pSCSI: blk_make_request() failed\n"); - ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - goto fail_free_bio; - } + goto fail_put_request; } req->end_io = pscsi_req_done; @@ -1057,13 +1043,8 @@ pscsi_execute_cmd(struct se_cmd *cmd) return 0; -fail_free_bio: - while (hbio) { - struct bio *bio = hbio; - hbio = hbio->bi_next; - bio_endio(bio); - } - ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; +fail_put_request: + blk_put_request(req); fail: kfree(pt); return ret; diff --git a/fs/buffer.c b/fs/buffer.c index e156a36..b9fa1be 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -153,7 +153,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - /* This happens, due to failed READA attempts. */ + /* This happens, due to failed read-ahead attempts. */ clear_buffer_uptodate(bh); } unlock_buffer(bh); @@ -1395,7 +1395,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ, READA, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -3053,14 +3053,14 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits or %READA (readahead) + * @op_flags: rq_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %READ or a %WRITE. The third - * %READA option is described in the documentation for generic_make_request() - * which ll_rw_block() calls. + * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. + * @op_flags contains flags modifying the detailed I/O behavior, most notably + * %REQ_RAHEAD. * * This function drops any buffer that it cannot get a lock on (with the * BH_Lock state bit), any buffer that appears to be clean when doing a write diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b6d600e..124b4a3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -159,7 +159,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3649d86..f06ed73 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -733,7 +733,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e534039..d186769 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1119,7 +1119,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index fd6389c..6e2bec1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -285,7 +285,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READA | REQ_META, rabh); + submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, + rabh); continue; } unlock_buffer(rabh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 4d68530..fcb59b2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1513,7 +1513,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READA | REQ_META, bh); + submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); continue; } brelse(bh); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 052c113..950b8be 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -459,7 +459,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, READA | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 64b29b5..4032d1e 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -551,7 +551,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_READ, READA, 1, bh + j); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j); } brelse(bh[j]); } diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 80c8a21..aaec13c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -113,7 +113,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ, READA, num, bha); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 71f3e0b..988d535 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -87,7 +87,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ, READA, num, bha); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/include/linux/bio.h b/include/linux/bio.h index b7e1a008..583c108 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -663,8 +663,6 @@ static inline void bio_inc_remaining(struct bio *bio) * and the bvec_slabs[]. */ #define BIO_POOL_SIZE 2 -#define BIOVEC_NR_POOLS 6 -#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { struct kmem_cache *bio_slab; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2498fdf..e43bbff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -96,6 +96,7 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int, unsigned int, unsigned int); typedef void (exit_request_fn)(void *, struct request *, unsigned int, unsigned int); +typedef int (reinit_request_fn)(void *, struct request *); typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); @@ -145,6 +146,7 @@ struct blk_mq_ops { */ init_request_fn *init_request; exit_request_fn *exit_request; + reinit_request_fn *reinit_request; }; enum { @@ -196,6 +198,8 @@ enum { struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, + unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); @@ -243,6 +247,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b588e96..f254eb2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -26,11 +26,11 @@ typedef void (bio_destructor_t) (struct bio *); struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - unsigned int bi_flags; /* status, command, etc */ int bi_error; unsigned int bi_rw; /* bottom bits req flags, * top bits REQ_OP */ + unsigned short bi_flags; /* status, command, etc */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -114,19 +114,25 @@ struct bio { /* * Flags starting here get preserved by bio_reset() - this includes - * BIO_POOL_IDX() + * BVEC_POOL_IDX() */ -#define BIO_RESET_BITS 13 -#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */ +#define BIO_RESET_BITS 10 /* - * top 4 bits of bio flags indicate the pool this bio came from + * We support 6 different bvec pools, the last one is magic in that it + * is backed by a mempool. */ -#define BIO_POOL_BITS (4) -#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1) -#define BIO_POOL_OFFSET (32 - BIO_POOL_BITS) -#define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) -#define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) +#define BVEC_POOL_NR 6 +#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) + +/* + * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * 1 to the actual index so that 0 indicates that there are no bvecs to be + * freed. + */ +#define BVEC_POOL_BITS (4) +#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) +#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) #endif /* CONFIG_BLOCK */ @@ -143,7 +149,6 @@ enum rq_flag_bits { __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_SECURE, /* secure discard (used with REQ_OP_DISCARD) */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ @@ -192,7 +197,7 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) + REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE) #define REQ_CLONE_MASK REQ_COMMON_MASK /* This mask is used for both bio and request merge checking */ @@ -219,7 +224,6 @@ enum rq_flag_bits { #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) -#define REQ_SECURE (1ULL << __REQ_SECURE) #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) @@ -228,6 +232,7 @@ enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ REQ_OP_WRITE_SAME, /* write same block many times */ REQ_OP_FLUSH, /* request for cache flush */ }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 48f05d7..c96db9c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -496,7 +496,7 @@ struct request_queue { #define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ -#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ +#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */ #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ @@ -593,8 +593,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) -#define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ - test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) +#define blk_queue_secure_erase(q) \ + (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) #define blk_noretry_request(rq) \ @@ -676,21 +676,6 @@ static inline bool rq_mergeable(struct request *rq) return true; } -static inline bool blk_check_merge_flags(unsigned int flags1, unsigned int op1, - unsigned int flags2, unsigned int op2) -{ - if ((op1 == REQ_OP_DISCARD) != (op2 == REQ_OP_DISCARD)) - return false; - - if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE)) - return false; - - if ((op1 == REQ_OP_WRITE_SAME) != (op2 == REQ_OP_WRITE_SAME)) - return false; - - return true; -} - static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { if (bio_data(a) == bio_data(b)) @@ -804,8 +789,6 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); -extern struct request *blk_make_request(struct request_queue *, struct bio *, - gfp_t); extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, @@ -818,6 +801,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); +extern int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **, struct bio_set *); @@ -1154,13 +1138,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } -#define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */ + +#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ +#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int op_flags, + sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index d6b3c99..002611c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.6" +#define REL_VERSION "8.4.7" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 @@ -370,6 +370,14 @@ enum drbd_notification_type { NOTIFY_FLAGS = NOTIFY_CONTINUES, }; +enum drbd_peer_state { + P_INCONSISTENT = 3, + P_OUTDATED = 4, + P_DOWN = 5, + P_PRIMARY = 6, + P_FENCING = 7, +}; + #define UUID_JUST_CREATED ((__u64)4) enum write_ordering_e { diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 2d0e5ad..c934d3a 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -123,15 +123,16 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) + __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF) __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) - __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) - /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) + __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 8ac8c5d..ddac684 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -126,8 +126,7 @@ #define DRBD_RESYNC_RATE_DEF 250 #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ - /* less than 7 would hit performance unnecessarily. */ -#define DRBD_AL_EXTENTS_MIN 7 +#define DRBD_AL_EXTENTS_MIN 67 /* we use u16 as "slot number", (u16)~0 is "FREE". * If you use >= 292 kB on-disk ring buffer, * this is the maximum you can use: */ @@ -210,6 +209,12 @@ #define DRBD_MD_FLUSHES_DEF 1 #define DRBD_TCP_CORK_DEF 1 #define DRBD_AL_UPDATES_DEF 1 +/* We used to ignore the discard_zeroes_data setting. + * To not change established (and expected) behaviour, + * by default assume that, for discard_zeroes_data=0, + * we can make that an effective discard_zeroes_data=1, + * if we only explicitly zero-out unaligned partial chunks. */ +#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1 #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 #define DRBD_ALWAYS_ASBP_DEF 0 @@ -230,4 +235,10 @@ #define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX #define DRBD_SOCKET_CHECK_TIMEO_DEF 0 #define DRBD_SOCKET_CHECK_TIMEO_SCALE '1' + +#define DRBD_RS_DISCARD_GRANULARITY_MIN 0 +#define DRBD_RS_DISCARD_GRANULARITY_MAX (1<<20) /* 1MiByte */ +#define DRBD_RS_DISCARD_GRANULARITY_DEF 0 /* disabled by default */ +#define DRBD_RS_DISCARD_GRANULARITY_SCALE '1' /* bytes */ + #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 1830245..dc48866 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -178,9 +178,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * READ_SYNC A synchronous read. Device is not plugged, caller can * immediately wait on this read without caring about * unplugging. - * READA Used for read-ahead operations. Lower priority, and the - * block layer could (in theory) choose to ignore this - * request if it runs into resource problems. * WRITE A normal async write. Device will be plugged. * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down * the hint that someone will be waiting on this IO @@ -195,11 +192,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * */ #define RW_MASK REQ_OP_WRITE -#define RWA_MASK REQ_RAHEAD #define READ REQ_OP_READ -#define WRITE RW_MASK -#define READA RWA_MASK +#define WRITE REQ_OP_WRITE #define READ_SYNC REQ_SYNC #define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) @@ -2471,17 +2466,6 @@ static inline bool op_is_write(unsigned int op) } /* - * return READ, READA, or WRITE - */ -static inline int bio_rw(struct bio *bio) -{ - if (op_is_write(bio_op(bio))) - return WRITE; - - return bio->bi_rw & RWA_MASK; -} - -/* * return data direction, READ or WRITE */ static inline int bio_data_dir(struct bio *bio) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 359a8e4..1dbf52f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -205,7 +205,6 @@ struct gendisk { void *private_data; int flags; - struct device *driverfs_dev; // FIXME: remove struct kobject *slave_dir; struct timer_rand_state *random; @@ -414,7 +413,12 @@ static inline void free_part_info(struct hd_struct *part) extern void part_round_stats(int cpu, struct hd_struct *part); /* block/genhd.c */ -extern void add_disk(struct gendisk *disk); +extern void device_add_disk(struct device *parent, struct gendisk *disk); +static inline void add_disk(struct gendisk *disk) +{ + device_add_disk(NULL, disk); +} + extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ef2c7d2..ba78b83 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,7 +1,9 @@ #ifndef NVM_H #define NVM_H +#include <linux/blkdev.h> #include <linux/types.h> +#include <uapi/linux/lightnvm.h> enum { NVM_IO_OK = 0, @@ -269,24 +271,15 @@ struct nvm_lun { int lun_id; int chnl_id; - /* It is up to the target to mark blocks as closed. If the target does - * not do it, all blocks are marked as open, and nr_open_blocks - * represents the number of blocks in use - */ - unsigned int nr_open_blocks; /* Number of used, writable blocks */ - unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ - unsigned int nr_free_blocks; /* Number of unused blocks */ - unsigned int nr_bad_blocks; /* Number of bad blocks */ - spinlock_t lock; + unsigned int nr_free_blocks; /* Number of unused blocks */ struct nvm_block *blocks; }; enum { NVM_BLK_ST_FREE = 0x1, /* Free block */ - NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ - NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; @@ -385,6 +378,7 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, { struct ppa_addr l; + l.ppa = 0; /* * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. */ @@ -455,6 +449,8 @@ struct nvm_tgt_type { struct list_head list; }; +extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); + extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -463,6 +459,9 @@ extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); typedef int (nvmm_register_fn)(struct nvm_dev *); typedef void (nvmm_unregister_fn)(struct nvm_dev *); + +typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); +typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, struct nvm_lun *, unsigned long); typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); @@ -488,9 +487,10 @@ struct nvmm_type { nvmm_register_fn *register_mgr; nvmm_unregister_fn *unregister_mgr; + nvmm_create_tgt_fn *create_tgt; + nvmm_remove_tgt_fn *remove_tgt; + /* Block administration callbacks */ - nvmm_get_blk_fn *get_blk_unlocked; - nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -520,10 +520,6 @@ struct nvmm_type { extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); -extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, - struct nvm_lun *, unsigned long); -extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); - extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); @@ -532,11 +528,13 @@ extern int nvm_register(struct request_queue *, char *, struct nvm_dev_ops *); extern void nvm_unregister(char *); +void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); + extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, - struct ppa_addr *, int, int); + const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h new file mode 100644 index 0000000..bf240a3 --- /dev/null +++ b/include/linux/nvme-rdma.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_RDMA_H +#define _LINUX_NVME_RDMA_H + +enum nvme_rdma_cm_fmt { + NVME_RDMA_CM_FMT_1_0 = 0x0, +}; + +enum nvme_rdma_cm_status { + NVME_RDMA_CM_INVALID_LEN = 0x01, + NVME_RDMA_CM_INVALID_RECFMT = 0x02, + NVME_RDMA_CM_INVALID_QID = 0x03, + NVME_RDMA_CM_INVALID_HSQSIZE = 0x04, + NVME_RDMA_CM_INVALID_HRQSIZE = 0x05, + NVME_RDMA_CM_NO_RSC = 0x06, + NVME_RDMA_CM_INVALID_IRD = 0x07, + NVME_RDMA_CM_INVALID_ORD = 0x08, +}; + +/** + * struct nvme_rdma_cm_req - rdma connect request + * + * @recfmt: format of the RDMA Private Data + * @qid: queue Identifier for the Admin or I/O Queue + * @hrqsize: host receive queue size to be created + * @hsqsize: host send queue size to be created + */ +struct nvme_rdma_cm_req { + __le16 recfmt; + __le16 qid; + __le16 hrqsize; + __le16 hsqsize; + u8 rsvd[24]; +}; + +/** + * struct nvme_rdma_cm_rep - rdma connect reply + * + * @recfmt: format of the RDMA Private Data + * @crqsize: controller receive queue size + */ +struct nvme_rdma_cm_rep { + __le16 recfmt; + __le16 crqsize; + u8 rsvd[28]; +}; + +/** + * struct nvme_rdma_cm_rej - rdma connect reject + * + * @recfmt: format of the RDMA Private Data + * @fsts: error status for the associated connect request + */ +struct nvme_rdma_cm_rej { + __le16 recfmt; + __le16 sts; +}; + +#endif /* _LINUX_NVME_RDMA_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7d51b29..d8b37ba 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -16,6 +16,78 @@ #define _LINUX_NVME_H #include <linux/types.h> +#include <linux/uuid.h> + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ +}; + +#define NVMF_AQ_DEPTH 32 enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ @@ -50,6 +122,13 @@ enum { #define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) #define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). + */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, @@ -61,8 +140,8 @@ enum { NVME_CC_SHN_NORMAL = 1 << 14, NVME_CC_SHN_ABRUPT = 2 << 14, NVME_CC_SHN_MASK = 3 << 14, - NVME_CC_IOSQES = 6 << 16, - NVME_CC_IOCQES = 4 << 20, + NVME_CC_IOSQES = NVME_NVM_IOSQES << 16, + NVME_CC_IOCQES = NVME_NVM_IOCQES << 20, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -107,7 +186,11 @@ struct nvme_id_ctrl { __u8 mdts; __le16 cntlid; __le32 ver; - __u8 rsvd84[172]; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __u8 rsvd100[156]; __le16 oacs; __u8 acl; __u8 aerl; @@ -119,10 +202,12 @@ struct nvme_id_ctrl { __u8 apsta; __le16 wctemp; __le16 cctemp; - __u8 rsvd270[242]; + __u8 rsvd270[50]; + __le16 kas; + __u8 rsvd322[190]; __u8 sqes; __u8 cqes; - __u8 rsvd514[2]; + __le16 maxcmd; __le32 nn; __le16 oncs; __le16 fuses; @@ -135,7 +220,15 @@ struct nvme_id_ctrl { __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[1508]; + __u8 rsvd540[228]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; struct nvme_id_power_state psd[32]; __u8 vs[1024]; }; @@ -274,6 +367,12 @@ struct nvme_reservation_status { } regctl_ds[]; }; +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + /* I/O commands */ enum nvme_opcode { @@ -290,6 +389,84 @@ enum nvme_opcode { nvme_cmd_resv_release = 0x15, }; +/* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + * + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). + */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + struct nvme_common_command { __u8 opcode; __u8 flags; @@ -297,8 +474,7 @@ struct nvme_common_command { __le32 nsid; __le32 cdw2[2]; __le64 metadata; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 cdw10[6]; }; @@ -309,8 +485,7 @@ struct nvme_rw_command { __le32 nsid; __u64 rsvd2; __le64 metadata; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le64 slba; __le16 length; __le16 control; @@ -350,8 +525,7 @@ struct nvme_dsm_cmd { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 nr; __le32 attributes; __u32 rsvd12[4]; @@ -384,6 +558,7 @@ enum nvme_admin_opcode { nvme_admin_async_event = 0x0c, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_keep_alive = 0x18, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, @@ -408,6 +583,7 @@ enum { NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -415,6 +591,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), NVME_FWACT_REPL_ACTV = (1 << 3), @@ -427,8 +604,7 @@ struct nvme_identify { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 cns; __u32 rsvd11[5]; }; @@ -439,8 +615,7 @@ struct nvme_features { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 fid; __le32 dword11; __u32 rsvd12[4]; @@ -499,8 +674,7 @@ struct nvme_download_firmware { __u8 flags; __u16 command_id; __u32 rsvd1[5]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 numd; __le32 offset; __u32 rsvd12[4]; @@ -516,6 +690,143 @@ struct nvme_format_cmd { __u32 rsvd11[5]; }; +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 lid; + __u8 rsvd10; + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 nqntype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_le hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -529,10 +840,30 @@ struct nvme_command { struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; }; }; +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.opcode & 1; + return cmd->common.opcode & 1; +} + enum { + /* + * Generic Command Status: + */ NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, NVME_SC_INVALID_FIELD = 0x2, @@ -551,10 +882,18 @@ enum { NVME_SC_SGL_INVALID_DATA = 0xf, NVME_SC_SGL_INVALID_METADATA = 0x10, NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -572,9 +911,29 @@ enum { NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, NVME_SC_FEATURE_NOT_PER_NS = 0x10f, NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, + + /* + * I/O Command Set Specific - NVM commands: + */ NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_INVALID_PI = 0x181, NVME_SC_READ_ONLY = 0x182, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, @@ -582,12 +941,19 @@ enum { NVME_SC_REFTAG_CHECK = 0x284, NVME_SC_COMPARE_FAILED = 0x285, NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_DNR = 0x4000, }; struct nvme_completion { - __le32 result; /* Used by admin commands to return data */ - __u32 rsvd; + /* + * Used by Admin and Fabrics commands to return data: + */ + union { + __le16 result16; + __le32 result; + __le64 result64; + }; __le16 sq_head; /* how much of this queue may be reclaimed */ __le16 sq_id; /* submission queue that generated this entry */ __u16 command_id; /* of the command which completed */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 878963a..ff95fd0 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_BIO_FLAG_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) +#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) #define show_bio_type(op, op_flags) show_bio_op(op), \ @@ -68,7 +68,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define show_bio_op_flags(flags) \ __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ - { READA, "READAHEAD" }, \ + { REQ_RAHEAD, "READAHEAD" }, \ { READ_SYNC, "READ_SYNC" }, \ { WRITE_SYNC, "WRITE_SYNC" }, \ { WRITE_FLUSH, "WRITE_FLUSH" }, \ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bedb84d..fb345cd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1792,6 +1792,10 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; + case REQ_OP_SECURE_ERASE: + rwbs[i++] = 'D'; + rwbs[i++] = 'E'; + break; case REQ_OP_FLUSH: rwbs[i++] = 'F'; break; @@ -1810,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'S'; if (rw & REQ_META) rwbs[i++] = 'M'; - if (rw & REQ_SECURE) - rwbs[i++] = 'E'; rwbs[i] = '\0'; } |