diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 216 |
1 files changed, 111 insertions, 105 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4276ebf..6fe7af0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { - struct nvme_queue **queues; + struct nvme_queue *queues; struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; @@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); @@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; if (!nvmeq->tags) nvmeq->tags = &dev->tagset.tags[hctx_idx]; @@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = dev->queues[queue_idx]; + struct nvme_queue *nvmeq = &dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); @@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) */ bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - /* If there is a reset ongoing, we shouldn't reset again. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) + /* If there is a reset/reinit ongoing, we shouldn't reset again. */ + switch (dev->ctrl.state) { + case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: return false; + default: + break; + } /* We shouldn't reset unless the controller is on fatal error state * _or_ if we lost the communication with it. @@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) if (nvmeq->sq_cmds) dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); } static void nvme_free_queues(struct nvme_dev *dev, int lowest) @@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; dev->ctrl.queue_count--; - dev->queues[i] = NULL; - nvme_free_queue(nvmeq); + nvme_free_queue(&dev->queues[i]); } } @@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[0]; - - if (!nvmeq) - return; - if (nvme_suspend_queue(nvmeq)) - return; + struct nvme_queue *nvmeq = &dev->queues[0]; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); @@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; @@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int node) +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, - node); - if (!nvmeq) - return NULL; + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (dev->ctrl.queue_count > qid) + return 0; nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), &nvmeq->cq_dma_addr, GFP_KERNEL); @@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_depth = depth; nvmeq->qid = qid; nvmeq->cq_vector = -1; - dev->queues[qid] = nvmeq; dev->ctrl.queue_count++; - return nvmeq; + return 0; free_cqdma: dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: - kfree(nvmeq); - return NULL; + return -ENOMEM; } static int queue_request_irq(struct nvme_queue *nvmeq) @@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = dev->queues[0]; - if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, - dev_to_node(dev->dev)); - if (!nvmeq) - return -ENOMEM; - } + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); + if (result) + return result; + nvmeq = &dev->queues[0]; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; @@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { /* vector == qid - 1, match nvme_create_queue */ - if (!nvme_alloc_queue(dev, i, dev->q_depth, + if (nvme_alloc_queue(dev, i, dev->q_depth, pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; @@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev) max = min(dev->max_qid, dev->ctrl.queue_count - 1); for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(dev->queues[i], i); + ret = nvme_create_queue(&dev->queues[i], i); if (ret) break; } /* * Ignore failing Create SQ/CQ commands, we can continue with less - * than the desired aount of queues, and even a controller without - * I/O queues an still be used to issue admin commands. This might + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might * be useful to upgrade a buggy firmware for example. */ return ret >= 0 ? 0 : ret; @@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev, } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); -static void __iomem *nvme_map_cmb(struct nvme_dev *dev) +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { - u64 szu, size, offset; + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + +static void nvme_map_cmb(struct nvme_dev *dev) +{ + u64 size, offset; resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); - void __iomem *cmb; int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!(NVME_CMB_SZ(dev->cmbsz))) - return NULL; + if (!dev->cmbsz) + return; dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); if (!use_cmb_sqes) - return NULL; + return; - szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); - size = szu * NVME_CMB_SZ(dev->cmbsz); - offset = szu * NVME_CMB_OFST(dev->cmbloc); + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); bar = NVME_CMB_BIR(dev->cmbloc); bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) - return NULL; + return; /* * Controllers may support a CMB size larger than their BAR, @@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); - if (!cmb) - return NULL; - + dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); + if (!dev->cmb) + return; dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; - return cmb; + + if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, + &dev_attr_cmb.attr, NULL)) + dev_warn(dev->ctrl.device, + "failed to add sysfs attribute for CMB\n"); } static inline void nvme_release_cmb(struct nvme_dev *dev) @@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, dma_addr_t descs_dma; int i = 0; void **bufs; - u64 size = 0, tmp; + u64 size, tmp; tmp = (preferred + chunk_size - 1); do_div(tmp, chunk_size); @@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u64 preferred = (u64)dev->ctrl.hmpre * 4096; u64 min = (u64)dev->ctrl.hmmin * 4096; u32 enable_bits = NVME_HOST_MEM_ENABLE; - int ret = 0; + int ret; preferred = min(preferred, max); if (min > max) { @@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = dev->queues[0]; + struct nvme_queue *adminq = &dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); int result, nr_io_queues; unsigned long size; @@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); if (result > 0) @@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) +static void nvme_disable_io_queues(struct nvme_dev *dev) { - int pass; + int pass, queues = dev->online_queues - 1; unsigned long timeout; u8 opcode = nvme_admin_delete_sq; @@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) retry: timeout = ADMIN_TIMEOUT; for (; i > 0; i--, sent++) - if (nvme_delete_queue(dev->queues[i], opcode)) + if (nvme_delete_queue(&dev->queues[i], opcode)) break; while (sent--) { @@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) } /* - * Return: error value if an error occurred setting up the queues or calling - * Identify Device. 0 if these succeeded, even if adding some of the - * namespaces failed. At the moment, these failures are silent. TBD which - * failures should be reported. + * return error value only when tagset allocation failed */ static int nvme_dev_add(struct nvme_dev *dev) { + int ret; + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; - if (blk_mq_alloc_tag_set(&dev->tagset)) - return 0; + ret = blk_mq_alloc_tag_set(&dev->tagset); + if (ret) { + dev_warn(dev->ctrl.device, + "IO queues tagset allocation failed %d\n", ret); + return ret; + } dev->ctrl.tagset = &dev->tagset; nvme_dbbuf_set(dev); @@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } - /* - * CMBs can currently only exist on >=1.2 PCIe devices. We only - * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group - * has no name we can pass NULL as final argument to - * sysfs_add_file_to_group. - */ - - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { - dev->cmb = nvme_map_cmb(dev); - if (dev->cmb) { - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); - } - } + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); @@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - int i, queues; + int i; bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_stop_queues(&dev->ctrl); - queues = dev->online_queues - 1; - for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(dev->queues[i]); - - if (dead) { - /* A device might become IO incapable very soon during - * probe, before the admin queue is configured. Thus, - * queue_count can be 0 here. - */ - if (dev->ctrl.queue_count) - nvme_suspend_queue(dev->queues[0]); - } else { - nvme_disable_io_queues(dev, queues); + if (!dead) { + nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } + for (i = dev->ctrl.queue_count - 1; i >= 0; i--) + nvme_suspend_queue(&dev->queues[i]); + nvme_pci_disable(dev); blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); @@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work) container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; + enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) goto out; @@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + /* + * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the + * initializing procedure here. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller RECONNECTING\n"); + goto out; + } + result = nvme_pci_enable(dev); if (result) goto out; @@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->ctrl.device, "IO queues not created\n"); nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); + new_state = NVME_CTRL_ADMIN_ONLY; } else { nvme_start_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); - nvme_dev_add(dev); + /* hit this only when allocate tagset fails */ + if (nvme_dev_add(dev)) + new_state = NVME_CTRL_ADMIN_ONLY; nvme_unfreeze(&dev->ctrl); } - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { - dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + /* + * If only admin queue live, keep it to do further investigation or + * recovery. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { + dev_warn(dev->ctrl.device, + "failed to mark controller state %d\n", new_state); goto out; } @@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), - GFP_KERNEL, node); + + dev->queues = kcalloc_node(num_possible_cpus() + 1, + sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) goto free; @@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - queue_work(nvme_wq, &dev->ctrl.reset_work); + nvme_reset_ctrl(&dev->ctrl); + return 0; release_pools: @@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev) static void nvme_reset_done(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_reset_ctrl(&dev->ctrl); + nvme_reset_ctrl_sync(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) |