From 8757ad65d30f009fe0beeb2d70d3cd834cb998f2 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Fri, 11 Apr 2014 10:37:39 -0400
Subject: NVMe: Update copyright headers

Make the copyright dates accurate and remove the final paragraph that
includes the address of the FSF.

Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 4 ----
 1 file changed, 4 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 7c64fa7..eacd64e 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -10,10 +10,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */

 #include
-- cgit v1.1

From 27e8166c31656f7780e682eaf6bc9c3b8dd03253 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Fri, 11 Apr 2014 11:58:45 -0400
Subject: NVMe: Improve error messages

Help people diagnose what is going wrong at initialisation time by
printing out which command has gone wrong and what the device returned.
Also fix the error message printed while waiting for reset.

Signed-off-by: Matthew Wilcox
Reviewed-by: Keith Busch
---
 drivers/block/nvme-core.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index eacd64e..074e982 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1354,7 +1354,8 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
                         return -EINTR;
                 if (time_after(jiffies, timeout)) {
                         dev_err(&dev->pci_dev->dev,
-                                "Device not ready; aborting initialisation\n");
+                                "Device not ready; aborting %s\n", enabled ?
+                                                "initialisation" : "reset");
                         return -ENODEV;
                 }
         }
@@ -2058,8 +2059,13 @@ static int set_queue_count(struct nvme_dev *dev, int count)
         status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
                                                                 &result);
-        if (status)
-                return status < 0 ? -EIO : -EBUSY;
+        if (status < 0)
+                return status;
+        if (status > 0) {
+                dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
+                                                                status);
+                return -EBUSY;
+        }
         return min(result & 0xffff, result >> 16) + 1;
 }
@@ -2180,6 +2186,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
         res = nvme_identify(dev, 0, 1, dma_addr);
         if (res) {
+                dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
                 res = -EIO;
                 goto out;
         }
-- cgit v1.1

From 94bbac4052eb93219ca0aa370ca741486b25fb98 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 24 Apr 2014 18:53:50 -0600
Subject: NVMe: Protect against badly formatted CQEs

If a misbehaving device posts a CQE with a command id < depth but for
one that was never allocated, the command info will have a callback
function set to NULL and we don't want to try invoking that.

Signed-off-by: Keith Busch
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 074e982..b9f07f8 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -243,8 +243,9 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
         void *ctx;
         struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

-        if (cmdid >= nvmeq->q_depth) {
-                *fn = special_completion;
+        if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
+                if (fn)
+                        *fn = special_completion;
                 return CMD_CTX_INVALID;
         }
         if (fn)
-- cgit v1.1

From 3291fa57cb1b004c1a4823beb28b5cc72555f1a5 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Mon, 28 Apr 2014 12:30:52 -0600
Subject: NVMe: Add tracepoints

Adding tracepoints for bio_complete and block_split into nvme to help
with gathering IO info using blktrace and blkparse.

Signed-off-by: Keith Busch
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b9f07f8..025dd4c 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -42,6 +42,8 @@
 #include
 #include

+#include
+
 #define NVME_Q_DEPTH 1024
 #define SQ_SIZE(depth)          (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)          (depth * sizeof(struct nvme_completion))
@@ -411,6 +413,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
         struct nvme_iod *iod = ctx;
         struct bio *bio = iod->private;
         u16 status = le16_to_cpup(&cqe->status) >> 1;
+        int error = 0;

         if (unlikely(status)) {
                 if (!(status & NVME_SC_DNR ||
@@ -423,6 +426,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
                         wake_up(&nvmeq->sq_full);
                         return;
                 }
+                error = -EIO;
         }
         if (iod->nents) {
                 dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
@@ -430,10 +434,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
                 nvme_end_io_acct(bio, iod->start_time);
         }
         nvme_free_iod(nvmeq->dev, iod);
-        if (status)
-                bio_endio(bio, -EIO);
-        else
-                bio_endio(bio, 0);
+
+        trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error);
+        bio_endio(bio, error);
 }

 /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@ -522,6 +525,8 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
         if (!split)
                 return -ENOMEM;

+        trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
+                                                split->bi_iter.bi_sector);
         bio_chain(split, bio);

         if (!waitqueue_active(&nvmeq->sq_full))
-- cgit v1.1

From a7d2ce2832d84e0182585f63bf96ca7323b3aee7 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 29 Apr 2014 11:41:28 -0600
Subject: NVMe: Configure support for block flush

This configures an nvme request_queue as flush capable if the device
has a volatile write cache present.

Signed-off-by: Keith Busch
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 3 +++
 1 file changed, 3 insertions(+)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 025dd4c..e7c4fdb 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1897,6 +1897,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
         blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
         if (dev->max_hw_sectors)
                 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+        if (dev->vwc & NVME_CTRL_VWC_PRESENT)
+                blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);

         disk->major = nvme_major;
         disk->first_minor = 0;
@@ -2201,6 +2203,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
         nn = le32_to_cpup(&ctrl->nn);
         dev->oncs = le16_to_cpup(&ctrl->oncs);
         dev->abort_limit = ctrl->acl + 1;
+        dev->vwc = ctrl->vwc;
         memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
         memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
         memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
-- cgit v1.1

From 53562be74bd06bbe74d2acf3caca5398f8eeb160 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 29 Apr 2014 11:41:29 -0600
Subject: NVMe: Flush with data support

It is possible a filesystem may send a flush flagged bio with write
data.  There is no such composite NVMe command, so the driver sends
flush and write separately.

The device is allowed to execute these commands in any order, so it was
possible the driver ends the bio after the write completes, but while
the flush is still active.  We don't want to let a filesystem believe
flush succeeded before it really has; this could cause data corruption
on a power loss between these events.

To fix, this patch splits the flush and write into chained bios.

Signed-off-by: Keith Busch
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e7c4fdb..cd8a8bc7 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -197,16 +197,13 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
 #define CMD_CTX_CANCELLED       (0x30C + CMD_CTX_BASE)
 #define CMD_CTX_COMPLETED       (0x310 + CMD_CTX_BASE)
 #define CMD_CTX_INVALID         (0x314 + CMD_CTX_BASE)
-#define CMD_CTX_FLUSH           (0x318 + CMD_CTX_BASE)
-#define CMD_CTX_ABORT           (0x31C + CMD_CTX_BASE)
+#define CMD_CTX_ABORT           (0x318 + CMD_CTX_BASE)

 static void special_completion(struct nvme_queue *nvmeq, void *ctx,
                                                 struct nvme_completion *cqe)
 {
         if (ctx == CMD_CTX_CANCELLED)
                 return;
-        if (ctx == CMD_CTX_FLUSH)
-                return;
         if (ctx == CMD_CTX_ABORT) {
                 ++nvmeq->dev->abort_limit;
                 return;
@@ -629,16 +626,6 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
         return 0;
 }

-int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
-{
-        int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
-                                        special_completion, NVME_IO_TIMEOUT);
-        if (unlikely(cmdid < 0))
-                return cmdid;
-
-        return nvme_submit_flush(nvmeq, ns, cmdid);
-}
-
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
 {
         struct bio *bio = iod->private;
@@ -654,7 +641,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
         if (bio->bi_rw & REQ_DISCARD)
                 return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
-        if ((bio->bi_rw & REQ_FLUSH) && !iod->nents)
+        if (bio->bi_rw & REQ_FLUSH)
                 return nvme_submit_flush(nvmeq, ns, cmdid);

         control = 0;
@@ -688,6 +675,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
         return 0;
 }

+static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio)
+{
+        struct bio *split = bio_clone(bio, GFP_ATOMIC);
+        if (!split)
+                return -ENOMEM;
+
+        split->bi_iter.bi_size = 0;
+        split->bi_phys_segments = 0;
+        bio->bi_rw &= ~REQ_FLUSH;
+        bio_chain(split, bio);
+
+        if (!waitqueue_active(&nvmeq->sq_full))
+                add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+        bio_list_add(&nvmeq->sq_cong, split);
+        bio_list_add(&nvmeq->sq_cong, bio);
+        wake_up_process(nvme_thread);
+
+        return 0;
+}
+
 /*
  * Called with local interrupts disabled and the q_lock held.  May not sleep.
  */
@@ -698,11 +705,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
         int psegs = bio_phys_segments(ns->queue, bio);
         int result;

-        if ((bio->bi_rw & REQ_FLUSH) && psegs) {
-                result = nvme_submit_flush_data(nvmeq, ns);
-                if (result)
-                        return result;
-        }
+        if ((bio->bi_rw & REQ_FLUSH) && psegs)
+                return nvme_split_flush_data(nvmeq, bio);

         iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
         if (!iod)
-- cgit v1.1

From 21bd78bcf4208e84deab0d34f9d4e034d0580d0c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Fri, 9 May 2014 22:42:26 -0400
Subject: NVMe: Enable BUILD_BUG_ON checks

Since _nvme_check_size() wasn't being called from anywhere, the
compiler was optimising it away ... along with all the link-time build
failures that would result if any of the structures were the wrong
size.  Call it from nvme_exit() for no particular reason.

Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 1 +
 1 file changed, 1 insertion(+)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index cd8a8bc7..b821558 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2912,6 +2912,7 @@ static void __exit nvme_exit(void)
         unregister_blkdev(nvme_major, "nvme");
         destroy_workqueue(nvme_workq);
         BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
+        _nvme_check_size();
 }

 MODULE_AUTHOR("Matthew Wilcox ");
-- cgit v1.1

From 6808c5fb7fc1574c7608a38c9819f1639d89c3d0 Mon Sep 17 00:00:00 2001
From: Santosh Y
Date: Thu, 29 May 2014 10:01:52 +0530
Subject: NVMe: Prevent possible NULL pointer dereference

kmalloc() used by the nvme_alloc_iod() to allocate memory for 'iod'
can fail.  So check the return value.

Signed-off-by: Santosh Y
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b821558..872d8e4 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1488,7 +1488,11 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
                 goto put_pages;
         }

+        err = -ENOMEM;
         iod = nvme_alloc_iod(count, length, GFP_KERNEL);
+        if (!iod)
+                goto put_pages;
+
         sg = iod->sg;
         sg_init_table(sg, count);
         for (i = 0; i < count; i++) {
@@ -1501,7 +1505,6 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
         sg_mark_end(&sg[i - 1]);
         iod->nents = count;

-        err = -ENOMEM;
         nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
                         write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
         if (!nents)
-- cgit v1.1

From 61e4ce086df0a64a555880089e3b782517c828c0 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 13 May 2014 11:42:01 -0600
Subject: NVMe: Make iod bio timeout a parameter

This was originally set to 4 times the IO timeout, but that was when
the IO timeout was 5 seconds instead of 30.  20 seconds for total time
to failure seemed more reasonable than 2 minutes for most, but other
users have requested to make this a module parameter instead.

Signed-off-by: Keith Busch
[renamed the module parameter to retry_time]
[made retry_time static]
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 872d8e4..1a91106 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -48,12 +48,16 @@
 #define SQ_SIZE(depth)          (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)          (depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT   (60 * HZ)
-#define IOD_TIMEOUT     (4 * NVME_IO_TIMEOUT)
+#define IOD_TIMEOUT     (retry_time * HZ)

 unsigned char io_timeout = 30;
 module_param(io_timeout, byte, 0644);
 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

+static unsigned char retry_time = 30;
+module_param(retry_time, byte, 0644);
+MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O");
+
 static int nvme_major;
 module_param(nvme_major, int, 0);
-- cgit v1.1

From 9d43cf646eab948db81913379dacb1dcecacb1eb Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 13 May 2014 11:42:02 -0600
Subject: NVMe: Make admin timeout a module parameter

Signed-off-by: Keith Busch
[made admin_timeout static]
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 1a91106..12c57eb 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -44,11 +44,15 @@
 #include

-#define NVME_Q_DEPTH 1024
+#define NVME_Q_DEPTH            1024
 #define SQ_SIZE(depth)          (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)          (depth * sizeof(struct nvme_completion))
-#define ADMIN_TIMEOUT   (60 * HZ)
-#define IOD_TIMEOUT     (retry_time * HZ)
+#define ADMIN_TIMEOUT           (admin_timeout * HZ)
+#define IOD_TIMEOUT             (retry_time * HZ)
+
+static unsigned char admin_timeout = 60;
+module_param(admin_timeout, byte, 0644);
+MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

 unsigned char io_timeout = 30;
 module_param(io_timeout, byte, 0644);
 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
-- cgit v1.1

From a51afb54339c5e9ee72df66ae0f2ac5aacfed365 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 13 May 2014 10:32:46 -0600
Subject: NVMe: Fix nvme get/put queue semantics

The routines to get and lock nvme queues required the caller to "put"
or "unlock" them even if getting one returned NULL.  This patch fixes
that.

Signed-off-by: Keith Busch
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 12c57eb..29a3e85 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -285,9 +285,17 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)

 static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
+        struct nvme_queue *nvmeq;
         unsigned queue_id = get_cpu_var(*dev->io_queue);
+
         rcu_read_lock();
-        return rcu_dereference(dev->queues[queue_id]);
+        nvmeq = rcu_dereference(dev->queues[queue_id]);
+        if (nvmeq)
+                return nvmeq;
+
+        rcu_read_unlock();
+        put_cpu_var(*dev->io_queue);
+        return NULL;
 }

 static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
@@ -299,8 +307,15 @@ static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
                                                         __acquires(RCU)
 {
+        struct nvme_queue *nvmeq;
+
         rcu_read_lock();
-        return rcu_dereference(dev->queues[q_idx]);
+        nvmeq = rcu_dereference(dev->queues[q_idx]);
+        if (nvmeq)
+                return nvmeq;
+
+        rcu_read_unlock();
+        return NULL;
 }

 static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
@@ -809,7 +824,6 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
         int result = -EBUSY;

         if (!nvmeq) {
-                put_nvmeq(NULL);
                 bio_endio(bio, -EIO);
                 return;
         }
@@ -884,10 +898,8 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
         struct nvme_queue *nvmeq;

         nvmeq = lock_nvmeq(dev, q_idx);
-        if (!nvmeq) {
-                unlock_nvmeq(nvmeq);
+        if (!nvmeq)
                 return -ENODEV;
-        }

         cmdinfo.task = current;
         cmdinfo.status = -EINTR;
@@ -912,9 +924,10 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,

         if (cmdinfo.status == -EINTR) {
                 nvmeq = lock_nvmeq(dev, q_idx);
-                if (nvmeq)
+                if (nvmeq) {
                         nvme_abort_command(nvmeq, cmdid);
-                unlock_nvmeq(nvmeq);
+                        unlock_nvmeq(nvmeq);
+                }
                 return -EINTR;
         }
-- cgit v1.1

From b4e75cbf1364c4bbce3599c3279892a55b6ede07 Mon Sep 17 00:00:00 2001
From: Sam Bradshaw
Date: Fri, 9 May 2014 13:27:07 -0700
Subject: NVMe: Adhere to request queue block accounting enable/disable

Recently, a new sysfs control "iostats" was added to selectively
enable or disable io statistics collection for request queues.  This
patch hooks that control.

IO statistics collection is rather expensive on large, multi-node
machines with drives pushing millions of iops.  Having the ability to
disable collection if not needed can improve throughput significantly.

As a data point, on a quad E5-4640, I see more than 50% throughput
improvement when io statistics accounting is disabled during heavily
multi-threaded small block random read benchmarks where device
performance is in the million iops+ range.

Signed-off-by: Sam Bradshaw
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 29a3e85..bb6ce31 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -406,25 +406,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 static void nvme_start_io_acct(struct bio *bio)
 {
         struct gendisk *disk = bio->bi_bdev->bd_disk;
-        const int rw = bio_data_dir(bio);
-        int cpu = part_stat_lock();
-        part_round_stats(cpu, &disk->part0);
-        part_stat_inc(cpu, &disk->part0, ios[rw]);
-        part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
-        part_inc_in_flight(&disk->part0, rw);
-        part_stat_unlock();
+        if (blk_queue_io_stat(disk->queue)) {
+                const int rw = bio_data_dir(bio);
+                int cpu = part_stat_lock();
+                part_round_stats(cpu, &disk->part0);
+                part_stat_inc(cpu, &disk->part0, ios[rw]);
+                part_stat_add(cpu, &disk->part0, sectors[rw],
+                                                        bio_sectors(bio));
+                part_inc_in_flight(&disk->part0, rw);
+                part_stat_unlock();
+        }
 }

 static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
 {
         struct gendisk *disk = bio->bi_bdev->bd_disk;
-        const int rw = bio_data_dir(bio);
-        unsigned long duration = jiffies - start_time;
-        int cpu = part_stat_lock();
-        part_stat_add(cpu, &disk->part0, ticks[rw], duration);
-        part_round_stats(cpu, &disk->part0);
-        part_dec_in_flight(&disk->part0, rw);
-        part_stat_unlock();
+        if (blk_queue_io_stat(disk->queue)) {
+                const int rw = bio_data_dir(bio);
+                unsigned long duration = jiffies - start_time;
+                int cpu = part_stat_lock();
+                part_stat_add(cpu, &disk->part0, ticks[rw], duration);
+                part_round_stats(cpu, &disk->part0);
+                part_dec_in_flight(&disk->part0, rw);
+                part_stat_unlock();
+        }
 }

 static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
-- cgit v1.1

From bd67608a6127c994e897c49cc4f72d9095925301 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Tue, 3 Jun 2014 23:04:30 -0400
Subject: NVMe: Rename io_timeout to nvme_io_timeout

It's positively immoral to have a global variable called 'io_timeout'.
Keep the module parameter called io_timeout, though.

Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index bb6ce31..2af079e 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -54,8 +54,8 @@ static unsigned char admin_timeout = 60;
 module_param(admin_timeout, byte, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

-unsigned char io_timeout = 30;
-module_param(io_timeout, byte, 0644);
+unsigned char nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

 static unsigned char retry_time = 30;
-- cgit v1.1

From f3db22feb5de6b98b7bae924c2d4b6c8d65bedae Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 11 Jun 2014 11:51:35 -0600
Subject: NVMe: Fix hot cpu notification dead lock

There is a potential dead lock if a cpu event occurs during nvme probe
since it registered with hot cpu notification.  This fixes the race by
having the module register with notification outside of probe rather
than have each device register.

The actual work is done in a scheduled work queue instead of in the
notifier since assigning IO queues has the potential to block if the
driver creates additional queues.

Signed-off-by: Keith Busch
Signed-off-by: Matthew Wilcox
---
 drivers/block/nvme-core.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)
(limited to 'drivers/block/nvme-core.c')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 2af079e..e0ac121 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -73,6 +73,7 @@ static LIST_HEAD(dev_list);
 static struct task_struct *nvme_thread;
 static struct workqueue_struct *nvme_workq;
 static wait_queue_head_t nvme_kthread_wait;
+static struct notifier_block nvme_nb;

 static void nvme_reset_failed_dev(struct work_struct *ws);

@@ -2115,14 +2116,25 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
         return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
 }

+static void nvme_cpu_workfn(struct work_struct *work)
+{
+        struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work);
+        if (dev->initialized)
+                nvme_assign_io_queues(dev);
+}
+
 static int nvme_cpu_notify(struct notifier_block *self,
                                 unsigned long action, void *hcpu)
 {
-        struct nvme_dev *dev = container_of(self, struct nvme_dev, nb);
+        struct nvme_dev *dev;
+
         switch (action) {
         case CPU_ONLINE:
         case CPU_DEAD:
-                nvme_assign_io_queues(dev);
+                spin_lock(&dev_list_lock);
+                list_for_each_entry(dev, &dev_list, node)
+                        schedule_work(&dev->cpu_work);
+                spin_unlock(&dev_list_lock);
                 break;
         }
         return NOTIFY_OK;
@@ -2191,11 +2203,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
         nvme_free_queues(dev, nr_io_queues + 1);
         nvme_assign_io_queues(dev);

-        dev->nb.notifier_call = &nvme_cpu_notify;
-        result = register_hotcpu_notifier(&dev->nb);
-        if (result)
-                goto free_queues;
-
         return 0;

  free_queues:
@@ -2495,8 +2502,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
         int i;

         dev->initialized = 0;
-        unregister_hotcpu_notifier(&dev->nb);
-
         nvme_dev_list_remove(dev);

         if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
@@ -2767,6 +2772,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
         INIT_LIST_HEAD(&dev->namespaces);
         dev->reset_workfn = nvme_reset_failed_dev;
         INIT_WORK(&dev->reset_work, nvme_reset_workfn);
+        INIT_WORK(&dev->cpu_work, nvme_cpu_workfn);
         dev->pci_dev = pdev;
         pci_set_drvdata(pdev, dev);
         result = nvme_set_instance(dev);
@@ -2836,6 +2842,7 @@ static void nvme_remove(struct pci_dev *pdev)
         pci_set_drvdata(pdev, NULL);
         flush_work(&dev->reset_work);
+        flush_work(&dev->cpu_work);
         misc_deregister(&dev->miscdev);
         nvme_dev_remove(dev);
         nvme_dev_shutdown(dev);
@@ -2923,11 +2930,18 @@ static int __init nvme_init(void)
         else if (result > 0)
                 nvme_major = result;

-        result = pci_register_driver(&nvme_driver);
+        nvme_nb.notifier_call = &nvme_cpu_notify;
+        result = register_hotcpu_notifier(&nvme_nb);
         if (result)
                 goto unregister_blkdev;
+
+        result = pci_register_driver(&nvme_driver);
+        if (result)
+                goto unregister_hotcpu;
         return 0;

+ unregister_hotcpu:
+        unregister_hotcpu_notifier(&nvme_nb);
  unregister_blkdev:
         unregister_blkdev(nvme_major, "nvme");
  kill_workq:
@@ -2938,6 +2952,7 @@ static int __init nvme_init(void)
 static void __exit nvme_exit(void)
 {
         pci_unregister_driver(&nvme_driver);
+        unregister_hotcpu_notifier(&nvme_nb);
         unregister_blkdev(nvme_major, "nvme");
         destroy_workqueue(nvme_workq);
         BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
-- cgit v1.1
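
Editorial note on the final patch above: it uses a common deferred-work pattern, where the
hot cpu notifier runs in a context that must not block, so it only schedules per-device work
and lets the work function do the potentially blocking queue reassignment. The following is
a rough, self-contained sketch of that pattern only, not the driver's real code; the names
demo_dev, demo_rebalance, demo_nb and the list/lock are illustrative stand-ins.

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Hypothetical per-device state; the real driver hangs cpu_work off struct nvme_dev. */
struct demo_dev {
        struct list_head node;
        struct work_struct cpu_work;    /* INIT_WORK(&dev->cpu_work, demo_cpu_workfn) at probe */
};

static LIST_HEAD(demo_dev_list);
static DEFINE_SPINLOCK(demo_dev_lock);

/* Stand-in for the blocking work (per-cpu queue reassignment in the real driver). */
static void demo_rebalance(struct demo_dev *dev)
{
        /* May sleep, allocate memory, create queues, etc. */
}

/* Runs in process context from the shared workqueue, so blocking is allowed here. */
static void demo_cpu_workfn(struct work_struct *work)
{
        struct demo_dev *dev = container_of(work, struct demo_dev, cpu_work);

        demo_rebalance(dev);
}

/* Notifier callback: never block here, just schedule the work for each device. */
static int demo_cpu_notify(struct notifier_block *self,
                                unsigned long action, void *hcpu)
{
        struct demo_dev *dev;

        switch (action) {
        case CPU_ONLINE:
        case CPU_DEAD:
                spin_lock(&demo_dev_lock);
                list_for_each_entry(dev, &demo_dev_list, node)
                        schedule_work(&dev->cpu_work);
                spin_unlock(&demo_dev_lock);
                break;
        }
        return NOTIFY_OK;
}

/* Registered once at module init with register_hotcpu_notifier(&demo_nb). */
static struct notifier_block demo_nb = {
        .notifier_call = demo_cpu_notify,
};

The patch applies the same split to the nvme driver: the single notifier is registered in
nvme_init(), each device gets its cpu_work initialized in nvme_probe(), and nvme_remove()
flushes that work before tearing the device down.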