From 77032ca66f86fa631b38b2e7feeb2b953e59f2f0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 24 Nov 2015 17:30:34 -0600 Subject: Return EBUSY from BLKRRPART for mounted whole-dev fs Today, blockdev --rereadpt /dev/sda will fail with EBUSY if any partition of sda is mounted (and will fail with EINVAL if pointed at a partition). But it will pass if the entire block device is formatted with a filesystem and mounted. I don't think this makes sense; partitioning should surely not ever change out from under a mounted device. So check for bdev->bd_super, and fail that with -EBUSY as well. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 3b03015..746935a 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) struct hd_struct *part; int res; - if (bdev->bd_part_count) + if (bdev->bd_part_count || bdev->bd_super) return -EBUSY; res = invalidate_partition(disk, 0); if (res) -- cgit v1.1 From c4699e70d1db14119708dae76dac7c43e1e12988 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Sat, 28 Nov 2015 16:49:22 +0100 Subject: lightnvm: Simplify config when disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We shouldn't compile an object file to get empty implementations; conforms to linux coding style on conditional compilation. Signed-off-by: Keith Busch Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/Makefile | 3 ++- drivers/nvme/host/lightnvm.c | 13 ------------- drivers/nvme/host/nvme.h | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 219dc206..a5fe239 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -nvme-y += pci.o scsi.o lightnvm.o +lightnvm-$(CONFIG_NVM) := lightnvm.o +nvme-y += pci.o scsi.o $(lightnvm-y) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 9202d1a..07451d6 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -22,8 +22,6 @@ #include "nvme.h" -#ifdef CONFIG_NVM - #include #include #include @@ -588,14 +586,3 @@ int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) return 0; } -#else -int nvme_nvm_register(struct request_queue *q, char *disk_name) -{ - return 0; -} -void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; -int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) -{ - return 0; -} -#endif /* CONFIG_NVM */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fdb4e5b..044253d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -136,8 +136,22 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); int nvme_sg_get_version_num(int __user *ip); +#ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); int nvme_nvm_register(struct request_queue *q, char *disk_name); void nvme_nvm_unregister(struct request_queue *q, char *disk_name); +#else +static inline int nvme_nvm_register(struct request_queue *q, char *disk_name) +{ + return 0; +} + +static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; + +static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + return 0; +} +#endif /* CONFIG_NVM */ #endif /* _NVME_H */ -- cgit v1.1 From 8261bd48c6c9c36cd2c2e343a69e76a3be2b04a4 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sat, 28 Nov 2015 16:49:23 +0100 Subject: lightnvm: free memory when gennvm register fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit free allocated nvm block and gennvm lun structures when gennvm register fails, otherwise it will cause memory leak. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index e20e74e..3969a98 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -207,6 +207,14 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } +static void gennvm_free(struct nvm_dev *dev) +{ + gennvm_blocks_free(dev); + gennvm_luns_free(dev); + kfree(dev->mp); + dev->mp = NULL; +} + static int gennvm_register(struct nvm_dev *dev) { struct gen_nvm *gn; @@ -234,16 +242,13 @@ static int gennvm_register(struct nvm_dev *dev) return 1; err: - kfree(gn); + gennvm_free(dev); return ret; } static void gennvm_unregister(struct nvm_dev *dev) { - gennvm_blocks_free(dev); - gennvm_luns_free(dev); - kfree(dev->mp); - dev->mp = NULL; + gennvm_free(dev); } static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, -- cgit v1.1 From 76e25081b6ae60fb094328dedf900ec15f10a9fd Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Sat, 28 Nov 2015 16:49:24 +0100 Subject: lightnvm: fix ioctl memory leaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If copy_to_user() fails we returned error but we missed releasing devices. Signed-off-by: Sudip Mukherjee Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 5178645..ea50fa5 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -680,8 +680,10 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) info->tgtsize = tgt_iter; up_write(&nvm_lock); - if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) + if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { + kfree(info); return -EFAULT; + } kfree(info); return 0; @@ -724,8 +726,11 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg) devices->nr_devices = i; - if (copy_to_user(arg, devices, sizeof(struct nvm_ioctl_get_devices))) + if (copy_to_user(arg, devices, + sizeof(struct nvm_ioctl_get_devices))) { + kfree(devices); return -EFAULT; + } kfree(devices); return 0; -- cgit v1.1 From d160147b5c96ef5ec842c604ccd79f5f03306677 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sat, 28 Nov 2015 16:49:25 +0100 Subject: lightnvm: do device max sectors boundary check first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit do device max_phys_sect boundary check first, otherwise we will allocate dma_pools for devices whose max sectors are beyond lightnvm support and register them. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ea50fa5..ea6dba5 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -308,6 +308,12 @@ int nvm_register(struct request_queue *q, char *disk_name, if (ret) goto err_init; + if (dev->ops->max_phys_sect > 256) { + pr_info("nvm: max sectors supported is 256.\n"); + ret = -EINVAL; + goto err_init; + } + if (dev->ops->max_phys_sect > 1) { dev->ppalist_pool = dev->ops->create_dma_pool(dev->q, "ppalist"); @@ -316,10 +322,6 @@ int nvm_register(struct request_queue *q, char *disk_name, ret = -ENOMEM; goto err_init; } - } else if (dev->ops->max_phys_sect > 256) { - pr_info("nvm: max sectors supported is 256.\n"); - ret = -EINVAL; - goto err_init; } down_write(&nvm_lock); -- cgit v1.1 From 09f2e716096811081b204c6afd6264c2e64d1210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sat, 28 Nov 2015 16:49:26 +0100 Subject: lightnvm: refactor and change vendor id for qemu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The QEMU NVMe implementation uses Intel vendor, Intel device id, and the first vendor specific byte to identify a LightNVM compatible nvme instance. Instead of using the Intel specific, use a preallocated from CNEX Labs instead. This lets us uniquely identify a QEMU lightnvm device without breaking other vendor specific work in the qemu device driver. Reported-by: Christoph Hellwig Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 07451d6..b9e5cc7 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -569,18 +569,25 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name) nvm_unregister(disk_name); } +/* move to shared place when used in multiple places. */ +#define PCI_VENDOR_ID_CNEX 0x1d1d +#define PCI_DEVICE_ID_CNEX_WL 0x2807 +#define PCI_DEVICE_ID_CNEX_QEMU 0x1f1f + int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { struct nvme_dev *dev = ns->dev; struct pci_dev *pdev = to_pci_dev(dev->dev); /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ - if (pdev->vendor == PCI_VENDOR_ID_INTEL && pdev->device == 0x5845 && + if (pdev->vendor == PCI_VENDOR_ID_CNEX && + pdev->device == PCI_DEVICE_ID_CNEX_QEMU && id->vs[0] == 0x1) return 1; /* CNEX Labs - PCI ID + Vendor specific bit */ - if (pdev->vendor == 0x1d1d && pdev->device == 0x2807 && + if (pdev->vendor == PCI_VENDOR_ID_CNEX && + pdev->device == PCI_DEVICE_ID_CNEX_WL && id->vs[0] == 0x1) return 1; -- cgit v1.1 From 08236c6bb2980561fba657c58fdc76f2865f236c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sat, 28 Nov 2015 16:49:27 +0100 Subject: lightnvm: unconverted ppa returned in get_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_bb_tbl function takes ppa as a generic address, which is converted to the ppa device address within the device driver. When the update_bbtbl callback is called from get_bb_tbl, the device specific ppa is used, instead of the generic ppa. Make sure to pass the generic ppa. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 3 +-- drivers/nvme/host/lightnvm.c | 4 +++- include/linux/lightnvm.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 3969a98..35dde84 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -75,7 +75,6 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - ppa = dev_to_generic_addr(gn->dev, ppa); lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { @@ -187,7 +186,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.g.lun = lun->vlun.id; ppa = generic_to_dev_addr(dev, ppa); - ret = dev->ops->get_bb_tbl(dev->q, ppa, + ret = dev->ops->get_bb_tbl(dev, ppa, dev->blks_per_lun, gennvm_block_bb, gn); if (ret) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b9e5cc7..06c3364 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -355,10 +355,11 @@ out: return ret; } -static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, +static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, int nr_blocks, nvm_bb_update_fn *update_bbtbl, void *priv) { + struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; @@ -402,6 +403,7 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, goto out; } + ppa = dev_to_generic_addr(nvmdev, ppa); ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); if (ret) { ret = -EINTR; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 3db5552..c6916ae 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -179,7 +179,7 @@ typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -- cgit v1.1 From d0a712ceb83ebaea32d520825ee7b997f59b168f Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sat, 28 Nov 2015 16:49:28 +0100 Subject: lightnvm: missing nvm_lock acquire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid race conditions, traverse dev, media manager, and target lists and also register, unregister entries to/from them, should be always under the nvm_lock control. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 75 +++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ea6dba5..86ce887 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -123,6 +123,26 @@ void nvm_unregister_mgr(struct nvmm_type *mt) } EXPORT_SYMBOL(nvm_unregister_mgr); +/* register with device with a supported manager */ +static int register_mgr(struct nvm_dev *dev) +{ + struct nvmm_type *mt; + int ret = 0; + + list_for_each_entry(mt, &nvm_mgrs, list) { + ret = mt->register_mgr(dev); + if (ret > 0) { + dev->mt = mt; + break; /* successfully initialized */ + } + } + + if (!ret) + pr_info("nvm: no compatible nvm manager found.\n"); + + return ret; +} + static struct nvm_dev *nvm_find_nvm_dev(const char *name) { struct nvm_dev *dev; @@ -221,7 +241,6 @@ static void nvm_free(struct nvm_dev *dev) static int nvm_init(struct nvm_dev *dev) { - struct nvmm_type *mt; int ret = -EINVAL; if (!dev->q || !dev->ops) @@ -252,21 +271,13 @@ static int nvm_init(struct nvm_dev *dev) goto err; } - /* register with device with a supported manager */ - list_for_each_entry(mt, &nvm_mgrs, list) { - ret = mt->register_mgr(dev); - if (ret < 0) - goto err; /* initialization failed */ - if (ret > 0) { - dev->mt = mt; - break; /* successfully initialized */ - } - } - - if (!ret) { - pr_info("nvm: no compatible manager found.\n"); + down_write(&nvm_lock); + ret = register_mgr(dev); + up_write(&nvm_lock); + if (ret < 0) + goto err; + if (!ret) return 0; - } pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", dev->name, dev->sec_per_pg, dev->nr_planes, @@ -337,15 +348,17 @@ EXPORT_SYMBOL(nvm_register); void nvm_unregister(char *disk_name) { - struct nvm_dev *dev = nvm_find_nvm_dev(disk_name); + struct nvm_dev *dev; + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(disk_name); if (!dev) { pr_err("nvm: could not find device %s to unregister\n", disk_name); + up_write(&nvm_lock); return; } - down_write(&nvm_lock); list_del(&dev->devices); up_write(&nvm_lock); @@ -363,38 +376,30 @@ static int nvm_create_target(struct nvm_dev *dev, { struct nvm_ioctl_create_simple *s = &create->conf.s; struct request_queue *tqueue; - struct nvmm_type *mt; struct gendisk *tdisk; struct nvm_tgt_type *tt; struct nvm_target *t; void *targetdata; int ret = 0; + down_write(&nvm_lock); if (!dev->mt) { - /* register with device with a supported NVM manager */ - list_for_each_entry(mt, &nvm_mgrs, list) { - ret = mt->register_mgr(dev); - if (ret < 0) - return ret; /* initialization failed */ - if (ret > 0) { - dev->mt = mt; - break; /* successfully initialized */ - } - } - - if (!ret) { - pr_info("nvm: no compatible nvm manager found.\n"); - return -ENODEV; + ret = register_mgr(dev); + if (!ret) + ret = -ENODEV; + if (ret < 0) { + up_write(&nvm_lock); + return ret; } } tt = nvm_find_target_type(create->tgttype); if (!tt) { pr_err("nvm: target type %s not found\n", create->tgttype); + up_write(&nvm_lock); return -EINVAL; } - down_write(&nvm_lock); list_for_each_entry(t, &dev->online_targets, list) { if (!strcmp(create->tgtname, t->disk->disk_name)) { pr_err("nvm: target name already exists.\n"); @@ -478,7 +483,9 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) struct nvm_dev *dev; struct nvm_ioctl_create_simple *s; + down_write(&nvm_lock); dev = nvm_find_nvm_dev(create->dev); + up_write(&nvm_lock); if (!dev) { pr_err("nvm: device not found\n"); return -EINVAL; @@ -537,7 +544,9 @@ static int nvm_configure_show(const char *val) return -EINVAL; } + down_write(&nvm_lock); dev = nvm_find_nvm_dev(devname); + up_write(&nvm_lock); if (!dev) { pr_err("nvm: device not found\n"); return -EINVAL; -- cgit v1.1 From bf4e6b4e757488dee1b6a581f49c7ac34cd217f8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 26 Nov 2015 08:46:57 +0100 Subject: block: Always check queue limits for cloned requests When a cloned request is retried on other queues it always needs to be checked against the queue limits of that queue. Otherwise the calculations for nr_phys_segments might be wrong, leading to a crash in scsi_init_sgtable(). To clarify this the patch renames blk_rq_check_limits() to blk_cloned_rq_check_limits() and removes the symbol export, as the new function should only be used for cloned requests and never exported. Cc: Mike Snitzer Cc: Ewan Milne Cc: Jeff Moyer Signed-off-by: Hannes Reinecke Fixes: e2a60da74 ("block: Clean up special command handling logic") Cc: stable@vger.kernel.org # 3.7+ Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++-------------- include/linux/blkdev.h | 1 - 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5131993b..a0af404 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2114,7 +2114,8 @@ blk_qc_t submit_bio(int rw, struct bio *bio) EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -2125,20 +2126,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -2158,7 +2152,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -2170,7 +2163,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b79..c06f8ea 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -773,7 +773,6 @@ extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); -extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, -- cgit v1.1 From 74cedf9b6c603f2278a05bc91b140b32b434d0b5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 30 Nov 2015 10:15:42 -0700 Subject: direct-io: Fix negative return from dio read beyond eof Assume a filesystem with 4KB blocks. When a file has size 1000 bytes and we issue direct IO read at offset 1024, blockdev_direct_IO() reads the tail of the last block and the logic for handling short DIO reads in dio_complete() results in a return value -24 (1000 - 1024) which obviously confuses userspace. Fix the problem by bailing out early once we sample i_size and can reliably check that direct IO read starts beyond i_size. Reported-by: Avi Kivity Fixes: 9fe55eea7e4b444bafc42fa0000cc2d1d2847275 CC: stable@vger.kernel.org CC: Steven Whitehouse Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/direct-io.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index cb5337d..1c75a3a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1169,6 +1169,15 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } } + /* Once we sampled i_size check for reads beyond EOF */ + dio->i_size = i_size_read(inode); + if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { + if (dio->flags & DIO_LOCKING) + mutex_unlock(&inode->i_mutex); + kmem_cache_free(dio_cache, dio); + goto out; + } + /* * For file extending writes updating i_size before data writeouts * complete can expose uninitialized blocks in dumb filesystems. @@ -1222,7 +1231,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, sdio.next_block_for_io = -1; dio->iocb = iocb; - dio->i_size = i_size_read(inode); spin_lock_init(&dio->bio_lock); dio->refcount = 1; -- cgit v1.1 From a88d32af18b8a6616128c971f766eaf545966405 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 30 Nov 2015 16:05:49 +0800 Subject: blk-merge: fix computing bio->bi_seg_front_size in case of single segment When bio has only one physical segment, we should set bio's bi_seg_front_size as the real(final) size of the single segment. Fixes: 02e707424c2ea(blk-merge: fix blk_bio_segment_split) Reported-by: Markus Trippelsdorf Tested-by: Markus Trippelsdorf Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 41a55ba..e01405a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -103,6 +103,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bvprv = bv; bvprvp = &bvprv; sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; continue; } new_segment: -- cgit v1.1 From 3c395a969acc690f331d5fb91ec99ea8eb5155dd Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 1 Dec 2015 11:48:17 +0100 Subject: null_blk: set a separate timer for each command For the Timer IRQ mode (i.e., when command completions are delayed), there is one timer for each CPU. Each of these timers . has a completion queue associated with it, containing all the command completions to be executed when the timer fires; . is set, and a new completion-to-execute is inserted into its completion queue, every time the dispatch code for a new command happens to be executed on the CPU related to the timer. This implies that, if the dispatch of a new command happens to be executed on a CPU whose timer has already been set, but has not yet fired, then the timer is set again, to the completion time of the newly arrived command. When the timer eventually fires, all its queued completions are executed. This way of handling delayed command completions entails the following problem: if more than one command completion is inserted into the queue of a timer before the timer fires, then the expiration time for the timer is moved forward every time each of these completions is enqueued. As a consequence, only the last completion enqueued enjoys a correct execution time, while all previous completions are unjustly delayed until the last completion is executed (and at that time they are executed all together). Specifically, if all the above completions are enqueued almost at the same time, then the problem is negligible. On the opposite end, if every completion is enqueued a while after the previous completion was enqueued (in the extreme case, it is enqueued only right before the timer would have expired), then every enqueued completion, except for the last one, experiences an inflated delay, proportional to the number of completions enqueued after it. In the end, commands, and thus I/O requests, may be completed at an arbitrarily lower rate than the desired one. This commit addresses this issue by replacing per-CPU timers with per-command timers, i.e., by associating an individual timer with each command. Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 79 +++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 55 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 5c8ba54..08932f5 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -18,6 +18,7 @@ struct nullb_cmd { struct bio *bio; unsigned int tag; struct nullb_queue *nq; + struct hrtimer timer; }; struct nullb_queue { @@ -49,17 +50,6 @@ static int null_major; static int nullb_indexes; static struct kmem_cache *ppa_cache; -struct completion_queue { - struct llist_head list; - struct hrtimer timer; -}; - -/* - * These are per-cpu for now, they will need to be configured by the - * complete_queues parameter and appropriately mapped. - */ -static DEFINE_PER_CPU(struct completion_queue, completion_queues); - enum { NULL_IRQ_NONE = 0, NULL_IRQ_SOFTIRQ = 1, @@ -180,6 +170,8 @@ static void free_cmd(struct nullb_cmd *cmd) put_tag(cmd->nq, cmd->tag); } +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); + static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) { struct nullb_cmd *cmd; @@ -190,6 +182,11 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) cmd = &nq->cmds[tag]; cmd->tag = tag; cmd->nq = nq; + if (irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } return cmd; } @@ -238,47 +235,28 @@ static void end_cmd(struct nullb_cmd *cmd) static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) { - struct completion_queue *cq; - struct llist_node *entry; - struct nullb_cmd *cmd; - - cq = &per_cpu(completion_queues, smp_processor_id()); - - while ((entry = llist_del_all(&cq->list)) != NULL) { - entry = llist_reverse_order(entry); - do { - struct request_queue *q = NULL; + struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); + struct request_queue *q = NULL; - cmd = container_of(entry, struct nullb_cmd, ll_list); - entry = entry->next; - if (cmd->rq) - q = cmd->rq->q; - end_cmd(cmd); + if (cmd->rq) + q = cmd->rq->q; - if (q && !q->mq_ops && blk_queue_stopped(q)) { - spin_lock(q->queue_lock); - if (blk_queue_stopped(q)) - blk_start_queue(q); - spin_unlock(q->queue_lock); - } - } while (entry); + if (q && !q->mq_ops && blk_queue_stopped(q)) { + spin_lock(q->queue_lock); + if (blk_queue_stopped(q)) + blk_start_queue(q); + spin_unlock(q->queue_lock); } + end_cmd(cmd); return HRTIMER_NORESTART; } static void null_cmd_end_timer(struct nullb_cmd *cmd) { - struct completion_queue *cq = &per_cpu(completion_queues, get_cpu()); - - cmd->ll_list.next = NULL; - if (llist_add(&cmd->ll_list, &cq->list)) { - ktime_t kt = ktime_set(0, completion_nsec); + ktime_t kt = ktime_set(0, completion_nsec); - hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL_PINNED); - } - - put_cpu(); + hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } static void null_softirq_done_fn(struct request *rq) @@ -376,6 +354,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + if (irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } cmd->rq = bd->rq; cmd->nq = hctx->driver_data; @@ -813,19 +795,6 @@ static int __init null_init(void) mutex_init(&lock); - /* Initialize a separate list for each CPU for issuing softirqs */ - for_each_possible_cpu(i) { - struct completion_queue *cq = &per_cpu(completion_queues, i); - - init_llist_head(&cq->list); - - if (irqmode != NULL_IRQ_TIMER) - continue; - - hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cq->timer.function = null_cmd_timer_expired; - } - null_major = register_blkdev(0, "nullb"); if (null_major < 0) return null_major; -- cgit v1.1 From cf8ecc5a8455266f8d516426b2acd36f9bdfa061 Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Tue, 1 Dec 2015 11:48:18 +0100 Subject: null_blk: guarantee device restart in all irq modes In single-queue (block layer) mode,the function null_rq_prep_fn stops the device if alloc_cmd fails. Then, once stopped, the device must be restarted on the next command completion, so that the request(s) for which alloc_cmd failed can be requeued. Otherwise the device hangs. Unfortunately, device restart is currently performed only for delayed completions, i.e., in irqmode==2. This fact causes hangs, for the above reasons, with the other irqmodes in combination with single-queue block layer. This commits addresses this issue by making sure that, if stopped, the device is properly restarted for all irqmodes on completions. Signed-off-by: Paolo Valente Signed-off-by: Arianna AVanzini Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 08932f5..cf65619 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -217,6 +217,8 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) static void end_cmd(struct nullb_cmd *cmd) { + struct request_queue *q = NULL; + switch (queue_mode) { case NULL_Q_MQ: blk_mq_end_request(cmd->rq, 0); @@ -227,27 +229,28 @@ static void end_cmd(struct nullb_cmd *cmd) break; case NULL_Q_BIO: bio_endio(cmd->bio); - break; + goto free_cmd; } - free_cmd(cmd); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) -{ - struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); - struct request_queue *q = NULL; - if (cmd->rq) q = cmd->rq->q; + /* Restart queue if needed, as we are freeing a tag */ if (q && !q->mq_ops && blk_queue_stopped(q)) { - spin_lock(q->queue_lock); + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); if (blk_queue_stopped(q)) blk_start_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irqrestore(q->queue_lock, flags); } - end_cmd(cmd); +free_cmd: + free_cmd(cmd); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +{ + end_cmd(container_of(timer, struct nullb_cmd, timer)); return HRTIMER_NORESTART; } -- cgit v1.1 From dbac117542b7e814245c43daa638a3626230cb2a Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Tue, 1 Dec 2015 11:48:19 +0100 Subject: null_blk: change type of completion_nsec to unsigned long This commit at least doubles the maximum value for completion_nsec. This helps in special cases where one wants/needs to emulate an extremely slow I/O (for example to spot bugs). Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index cf65619..0c3940e 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -132,8 +132,8 @@ static const struct kernel_param_ops null_irqmode_param_ops = { device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); -static int completion_nsec = 10000; -module_param(completion_nsec, int, S_IRUGO); +static unsigned long completion_nsec = 10000; +module_param(completion_nsec, ulong, S_IRUGO); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int hw_queue_depth = 64; -- cgit v1.1 From 1f390c1fde3a96974784be53cb3a645da3e4849c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20G=C3=BCnther?= Date: Tue, 1 Dec 2015 13:23:22 -0700 Subject: nvme: temporary fix for Apple controller reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent patches added basic support for the Apple NVMe controller but still cause resets and data corruption on that particular controller when a specific pattern of read/flush commands occurs. Limiting the queue depth to 2 works around that issue. This patch enforces that limit only for the Apple controller and is considered a temporary fix until we find the root source of that problem. Signed-off-by: Stephan Günther Signed-off-by: Maurice Leclaire Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f3b53af..9e294ff 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2708,6 +2708,18 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); dev->dbs = ((void __iomem *)dev->bar) + 4096; + + /* + * Temporary fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. + */ + if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + dev->q_depth = 2; + dev_warn(dev->dev, "detected Apple NVMe controller, set " + "queue depth=%u to work around controller resets\n", + dev->q_depth); + } + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) dev->cmb = nvme_map_cmb(dev); -- cgit v1.1